src/video/SDL_blit_A.c
changeset 2255 17b2369756be
parent 2232 6630fefab312
child 2257 340942cfda48
     1.1 --- a/src/video/SDL_blit_A.c	Thu Aug 16 21:54:26 2007 +0000
     1.2 +++ b/src/video/SDL_blit_A.c	Thu Aug 16 22:18:53 2007 +0000
     1.3 @@ -24,41 +24,6 @@
     1.4  #include "SDL_video.h"
     1.5  #include "SDL_blit.h"
     1.6  
     1.7 -/*
     1.8 -  In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
     1.9 -   Checking if _mm_free is #defined in malloc.h is is the only way to
    1.10 -   determine if the Processor Pack is installed, as far as I can tell.
    1.11 -*/
    1.12 -
    1.13 -#if SDL_ASSEMBLY_ROUTINES
    1.14 -#  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    1.15 -#    define MMX_ASMBLIT 1
    1.16 -#    define GCC_ASMBLIT 1
    1.17 -#  elif defined(_MSC_VER) && defined(_M_IX86)
    1.18 -#    if (_MSC_VER <= 1200)
    1.19 -#      include <malloc.h>
    1.20 -#      if defined(_mm_free)
    1.21 -#          define HAVE_MMINTRIN_H 1
    1.22 -#      endif
    1.23 -#    else /* Visual Studio > VC6 always has mmintrin.h */
    1.24 -#      define HAVE_MMINTRIN_H 1
    1.25 -#    endif
    1.26 -#    if HAVE_MMINTRIN_H
    1.27 -#      define MMX_ASMBLIT 1
    1.28 -#      define MSVC_ASMBLIT 1
    1.29 -#    endif
    1.30 -#  endif
    1.31 -#endif /* SDL_ASSEMBLY_ROUTINES */
    1.32 -
    1.33 -/* Function to check the CPU flags */
    1.34 -#include "SDL_cpuinfo.h"
    1.35 -#if GCC_ASMBLIT
    1.36 -#include "mmx.h"
    1.37 -#elif MSVC_ASMBLIT
    1.38 -#include <mmintrin.h>
    1.39 -#include <mm3dnow.h>
    1.40 -#endif
    1.41 -
    1.42  /* Functions to perform alpha blended blitting */
    1.43  
    1.44  /* N->1 blending with per-surface alpha */
    1.45 @@ -232,239 +197,8 @@
    1.46      }
    1.47  }
    1.48  
    1.49 -#if GCC_ASMBLIT
    1.50 -/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
    1.51 -static void
    1.52 -BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
    1.53 -{
    1.54 -    int width = info->d_width;
    1.55 -    int height = info->d_height;
    1.56 -    Uint32 *srcp = (Uint32 *) info->s_pixels;
    1.57 -    int srcskip = info->s_skip >> 2;
    1.58 -    Uint32 *dstp = (Uint32 *) info->d_pixels;
    1.59 -    int dstskip = info->d_skip >> 2;
    1.60 -    Uint32 dalpha = info->dst->Amask;
    1.61 -    Uint8 load[8];
    1.62 +#ifdef __MMX__
    1.63  
    1.64 -    *(Uint64 *) load = 0x00fefefe00fefefeULL;   /* alpha128 mask */
    1.65 -    movq_m2r(*load, mm4);       /* alpha128 mask -> mm4 */
    1.66 -    *(Uint64 *) load = 0x0001010100010101ULL;   /* !alpha128 mask */
    1.67 -    movq_m2r(*load, mm3);       /* !alpha128 mask -> mm3 */
    1.68 -    movd_m2r(dalpha, mm7);      /* dst alpha mask */
    1.69 -    punpckldq_r2r(mm7, mm7);    /* dst alpha mask | dst alpha mask -> mm7 */
    1.70 -    while (height--) {
    1.71 -		/* *INDENT-OFF* */
    1.72 -		DUFFS_LOOP_DOUBLE2(
    1.73 -		{
    1.74 -			Uint32 s = *srcp++;
    1.75 -			Uint32 d = *dstp;
    1.76 -			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
    1.77 -				   + (s & d & 0x00010101)) | dalpha;
    1.78 -		},{
    1.79 -			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
    1.80 -			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
    1.81 -
    1.82 -			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
    1.83 -			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
    1.84 -
    1.85 -			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
    1.86 -			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
    1.87 -			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
    1.88 -			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
    1.89 -			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
    1.90 -			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
    1.91 -			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
    1.92 -			
    1.93 -			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
    1.94 -			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
    1.95 -			dstp += 2;
    1.96 -			srcp += 2;
    1.97 -		}, width);
    1.98 -		/* *INDENT-ON* */
    1.99 -        srcp += srcskip;
   1.100 -        dstp += dstskip;
   1.101 -    }
   1.102 -    emms();
   1.103 -}
   1.104 -
   1.105 -/* fast RGB888->(A)RGB888 blending with surface alpha */
   1.106 -static void
   1.107 -BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   1.108 -{
   1.109 -    SDL_PixelFormat *df = info->dst;
   1.110 -    unsigned alpha = info->src->alpha;
   1.111 -
   1.112 -    if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   1.113 -        /* only call a128 version when R,G,B occupy lower bits */
   1.114 -        BlitRGBtoRGBSurfaceAlpha128MMX(info);
   1.115 -    } else {
   1.116 -        int width = info->d_width;
   1.117 -        int height = info->d_height;
   1.118 -        Uint32 *srcp = (Uint32 *) info->s_pixels;
   1.119 -        int srcskip = info->s_skip >> 2;
   1.120 -        Uint32 *dstp = (Uint32 *) info->d_pixels;
   1.121 -        int dstskip = info->d_skip >> 2;
   1.122 -
   1.123 -        pxor_r2r(mm5, mm5);     /* 0 -> mm5 */
   1.124 -        /* form the alpha mult */
   1.125 -        movd_m2r(alpha, mm4);   /* 0000000A -> mm4 */
   1.126 -        punpcklwd_r2r(mm4, mm4);        /* 00000A0A -> mm4 */
   1.127 -        punpckldq_r2r(mm4, mm4);        /* 0A0A0A0A -> mm4 */
   1.128 -        alpha =
   1.129 -            (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->
   1.130 -                                                           Bshift);
   1.131 -        movd_m2r(alpha, mm0);   /* 00000FFF -> mm0 */
   1.132 -        punpcklbw_r2r(mm0, mm0);        /* 00FFFFFF -> mm0 */
   1.133 -        pand_r2r(mm0, mm4);     /* 0A0A0A0A -> mm4, minus 1 chan */
   1.134 -        /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
   1.135 -        movd_m2r(df->Amask, mm7);       /* dst alpha mask */
   1.136 -        punpckldq_r2r(mm7, mm7);        /* dst alpha mask | dst alpha mask -> mm7 */
   1.137 -
   1.138 -        while (height--) {
   1.139 -			/* *INDENT-OFF* */
   1.140 -			DUFFS_LOOP_DOUBLE2({
   1.141 -				/* One Pixel Blend */
   1.142 -				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   1.143 -				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   1.144 -				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
   1.145 -				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
   1.146 -
   1.147 -				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   1.148 -				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   1.149 -				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   1.150 -				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   1.151 -
   1.152 -				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
   1.153 -				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   1.154 -				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
   1.155 -				++srcp;
   1.156 -				++dstp;
   1.157 -			},{
   1.158 -				/* Two Pixels Blend */
   1.159 -				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
   1.160 -				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   1.161 -				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
   1.162 -				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   1.163 -
   1.164 -				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
   1.165 -				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
   1.166 -				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
   1.167 -				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
   1.168 -
   1.169 -				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
   1.170 -				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
   1.171 -				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
   1.172 -				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
   1.173 -
   1.174 -				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
   1.175 -				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   1.176 -				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   1.177 -				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
   1.178 -
   1.179 -				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
   1.180 -				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
   1.181 -				
   1.182 -				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
   1.183 -
   1.184 -  				srcp += 2;
   1.185 -  				dstp += 2;
   1.186 -  			}, width);
   1.187 -			/* *INDENT-ON* */
   1.188 -            srcp += srcskip;
   1.189 -            dstp += dstskip;
   1.190 -        }
   1.191 -        emms();
   1.192 -    }
   1.193 -}
   1.194 -
   1.195 -/* fast ARGB888->(A)RGB888 blending with pixel alpha */
   1.196 -static void
   1.197 -BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   1.198 -{
   1.199 -    int width = info->d_width;
   1.200 -    int height = info->d_height;
   1.201 -    Uint32 *srcp = (Uint32 *) info->s_pixels;
   1.202 -    int srcskip = info->s_skip >> 2;
   1.203 -    Uint32 *dstp = (Uint32 *) info->d_pixels;
   1.204 -    int dstskip = info->d_skip >> 2;
   1.205 -    SDL_PixelFormat *sf = info->src;
   1.206 -    Uint32 amask = sf->Amask;
   1.207 -
   1.208 -    pxor_r2r(mm6, mm6);         /* 0 -> mm6 */
   1.209 -    /* form multiplication mask */
   1.210 -    movd_m2r(sf->Amask, mm7);   /* 0000F000 -> mm7 */
   1.211 -    punpcklbw_r2r(mm7, mm7);    /* FF000000 -> mm7 */
   1.212 -    pcmpeqb_r2r(mm0, mm0);      /* FFFFFFFF -> mm0 */
   1.213 -    movq_r2r(mm0, mm3);         /* FFFFFFFF -> mm3 (for later) */
   1.214 -    pxor_r2r(mm0, mm7);         /* 00FFFFFF -> mm7 (mult mask) */
   1.215 -    /* form channel masks */
   1.216 -    movq_r2r(mm7, mm0);         /* 00FFFFFF -> mm0 */
   1.217 -    packsswb_r2r(mm6, mm0);     /* 00000FFF -> mm0 (channel mask) */
   1.218 -    packsswb_r2r(mm6, mm3);     /* 0000FFFF -> mm3 */
   1.219 -    pxor_r2r(mm0, mm3);         /* 0000F000 -> mm3 (~channel mask) */
   1.220 -    /* get alpha channel shift */
   1.221 -    /* *INDENT-OFF* */
   1.222 -    __asm__ __volatile__ (
   1.223 -        "movd %0, %%mm5"
   1.224 -        : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
   1.225 -    /* *INDENT-ON* */
   1.226 -
   1.227 -    while (height--) {
   1.228 -	    /* *INDENT-OFF* */
   1.229 -	    DUFFS_LOOP4({
   1.230 -		Uint32 alpha = *srcp & amask;
   1.231 -		/* FIXME: Here we special-case opaque alpha since the
   1.232 -			compositioning used (>>8 instead of /255) doesn't handle
   1.233 -			it correctly. Also special-case alpha=0 for speed?
   1.234 -			Benchmark this! */
   1.235 -		if(alpha == 0) {
   1.236 -			/* do nothing */
   1.237 -		} else if(alpha == amask) {
   1.238 -			/* opaque alpha -- copy RGB, keep dst alpha */
   1.239 -			/* using MMX here to free up regular registers for other things */
   1.240 -			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   1.241 -			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   1.242 -			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
   1.243 -			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
   1.244 -			por_r2r(mm1, mm2); /* src | dst -> mm2 */
   1.245 -			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
   1.246 -		} else {
   1.247 -			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   1.248 -			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
   1.249 -
   1.250 -			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   1.251 -			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
   1.252 -
   1.253 -			__asm__ __volatile__ (
   1.254 -				"movd %0, %%mm4"
   1.255 -				: : "r" (alpha) ); /* 0000A000 -> mm4 */
   1.256 -			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
   1.257 -			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   1.258 -			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
   1.259 -			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
   1.260 -
   1.261 -			/* blend */		    
   1.262 -			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   1.263 -			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   1.264 -			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
   1.265 -			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   1.266 -			
   1.267 -			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
   1.268 -			movd_r2m(mm2, *dstp);/* mm2 -> dst */
   1.269 -		}
   1.270 -		++srcp;
   1.271 -		++dstp;
   1.272 -	    }, width);
   1.273 -	    /* *INDENT-ON* */
   1.274 -        srcp += srcskip;
   1.275 -        dstp += dstskip;
   1.276 -    }
   1.277 -    emms();
   1.278 -}
   1.279 -
   1.280 -/* End GCC_ASMBLIT */
   1.281 -
   1.282 -#elif MSVC_ASMBLIT
   1.283  /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   1.284  static void
   1.285  BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   1.286 @@ -637,9 +371,9 @@
   1.287      __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   1.288  
   1.289      mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   1.290 -	/* *INDENT-OFF* */
   1.291 -	multmask = ~(0xFFFFI64 << (ashift * 2));
   1.292 -	/* *INDENT-ON* */
   1.293 +	multmask = 0xFFFF;
   1.294 +    multmask <<= (ashift * 2);
   1.295 +    multmask = ~multmask;
   1.296      dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   1.297  
   1.298      while (height--) {
   1.299 @@ -683,9 +417,7 @@
   1.300      _mm_empty();
   1.301  }
   1.302  
   1.303 -/* End MSVC_ASMBLIT */
   1.304 -
   1.305 -#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   1.306 +#endif /* __MMX__ */
   1.307  
   1.308  #if SDL_ALTIVEC_BLITTERS
   1.309  #if __MWERKS__
   1.310 @@ -1639,123 +1371,7 @@
   1.311      }
   1.312  }
   1.313  
   1.314 -#if GCC_ASMBLIT
   1.315 -/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
   1.316 -static void
   1.317 -BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
   1.318 -{
   1.319 -    int width = info->d_width;
   1.320 -    int height = info->d_height;
   1.321 -    Uint32 *srcp = (Uint32 *) info->s_pixels;
   1.322 -    int srcskip = info->s_skip >> 2;
   1.323 -    Uint32 *dstp = (Uint32 *) info->d_pixels;
   1.324 -    int dstskip = info->d_skip >> 2;
   1.325 -    SDL_PixelFormat *sf = info->src;
   1.326 -    Uint32 amask = sf->Amask;
   1.327 -
   1.328 -    __asm__(
   1.329 -               /* make mm6 all zeros. */
   1.330 -               "pxor       %%mm6, %%mm6\n"
   1.331 -               /* Make a mask to preserve the alpha. */
   1.332 -               "movd      %0, %%mm7\n\t"        /* 0000F000 -> mm7 */
   1.333 -               "punpcklbw %%mm7, %%mm7\n\t"     /* FF000000 -> mm7 */
   1.334 -               "pcmpeqb   %%mm4, %%mm4\n\t"     /* FFFFFFFF -> mm4 */
   1.335 -               "movq      %%mm4, %%mm3\n\t"     /* FFFFFFFF -> mm3 (for later) */
   1.336 -               "pxor      %%mm4, %%mm7\n\t"     /* 00FFFFFF -> mm7 (mult mask) */
   1.337 -               /* form channel masks */
   1.338 -               "movq      %%mm7, %%mm4\n\t"     /* 00FFFFFF -> mm4 */
   1.339 -               "packsswb  %%mm6, %%mm4\n\t"     /* 00000FFF -> mm4 (channel mask) */
   1.340 -               "packsswb  %%mm6, %%mm3\n\t"     /* 0000FFFF -> mm3 */
   1.341 -               "pxor      %%mm4, %%mm3\n\t"     /* 0000F000 -> mm3 (~channel mask) */
   1.342 -               /* get alpha channel shift */
   1.343 -               "movd      %1, %%mm5\n\t"        /* Ashift -> mm5 */
   1.344 -  : /* nothing */ :            "rm"(amask), "rm"((Uint32) sf->Ashift));
   1.345 -
   1.346 -    while (height--) {
   1.347 -
   1.348 -	    /* *INDENT-OFF* */
   1.349 -	    DUFFS_LOOP4({
   1.350 -		Uint32 alpha;
   1.351 -
   1.352 -		__asm__ (
   1.353 -		"prefetch 64(%0)\n"
   1.354 -		"prefetch 64(%1)\n"
   1.355 -			: : "r" (srcp), "r" (dstp) );
   1.356 -
   1.357 -		alpha = *srcp & amask;
   1.358 -		/* FIXME: Here we special-case opaque alpha since the
   1.359 -		   compositioning used (>>8 instead of /255) doesn't handle
   1.360 -		   it correctly. Also special-case alpha=0 for speed?
   1.361 -		   Benchmark this! */
   1.362 -		if(alpha == 0) {
   1.363 -		    /* do nothing */
   1.364 -		}
   1.365 -		else if(alpha == amask) {
   1.366 -			/* opaque alpha -- copy RGB, keep dst alpha */
   1.367 -		    /* using MMX here to free up regular registers for other things */
   1.368 -			    __asm__ (
   1.369 -		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
   1.370 -		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
   1.371 -		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
   1.372 -		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
   1.373 -		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
   1.374 -		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
   1.375 -
   1.376 -		     : : "r" (srcp), "r" (dstp) );
   1.377 -		} 
   1.378 -
   1.379 -		else {
   1.380 -			    __asm__ (
   1.381 -		    /* load in the source, and dst. */
   1.382 -		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
   1.383 -		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
   1.384 -
   1.385 -		    /* Move the src alpha into mm2 */
   1.386 -
   1.387 -		    /* if supporting pshufw */
   1.388 -		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
   1.389 -		    /*"psrlw     $8, %%mm2\n" */
   1.390 -		    
   1.391 -		    /* else: */
   1.392 -		    "movd       %2,    %%mm2\n"
   1.393 -		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
   1.394 -		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
   1.395 -		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
   1.396 -		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
   1.397 -
   1.398 -		    /* move the colors into words. */
   1.399 -		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
   1.400 -		    "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
   1.401 -
   1.402 -		    /* src - dst */
   1.403 -		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
   1.404 -
   1.405 -		    /* A * (src-dst) */
   1.406 -		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
   1.407 -		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
   1.408 -		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
   1.409 -
   1.410 -		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
   1.411 -		    
   1.412 -		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
   1.413 -
   1.414 -		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
   1.415 -
   1.416 -		}
   1.417 -		++srcp;
   1.418 -		++dstp;
   1.419 -	    }, width);
   1.420 -	    /* *INDENT-ON* */
   1.421 -        srcp += srcskip;
   1.422 -        dstp += dstskip;
   1.423 -    }
   1.424 -
   1.425 -  __asm__("emms\n":);
   1.426 -}
   1.427 -
   1.428 -/* End GCC_ASMBLIT*/
   1.429 -
   1.430 -#elif MSVC_ASMBLIT
   1.431 +#ifdef __MMX__
   1.432  /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
   1.433  static void
   1.434  BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
   1.435 @@ -1775,9 +1391,9 @@
   1.436      __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   1.437  
   1.438      mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   1.439 -	/* *INDENT-OFF* */
   1.440 -    multmask = ~(0xFFFFI64 << (ashift * 2));
   1.441 -	/* *INDENT-ON* */
   1.442 +	multmask = 0xFFFF;
   1.443 +    multmask <<= (ashift * 2);
   1.444 +    multmask = ~multmask;
   1.445      dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   1.446  
   1.447      while (height--) {
   1.448 @@ -1826,9 +1442,7 @@
   1.449      _mm_empty();
   1.450  }
   1.451  
   1.452 -/* End MSVC_ASMBLIT */
   1.453 -
   1.454 -#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   1.455 +#endif /* __MMX__ */
   1.456  
   1.457  /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
   1.458  
   1.459 @@ -1940,299 +1554,8 @@
   1.460      }
   1.461  }
   1.462  
   1.463 -#if GCC_ASMBLIT
   1.464 -/* fast RGB565->RGB565 blending with surface alpha */
   1.465 -static void
   1.466 -Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
   1.467 -{
   1.468 -    unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
   1.469 -    if (alpha == 128) {
   1.470 -        Blit16to16SurfaceAlpha128(info, 0xf7de);
   1.471 -    } else {
   1.472 -        int width = info->d_width;
   1.473 -        int height = info->d_height;
   1.474 -        Uint16 *srcp = (Uint16 *) info->s_pixels;
   1.475 -        int srcskip = info->s_skip >> 1;
   1.476 -        Uint16 *dstp = (Uint16 *) info->d_pixels;
   1.477 -        int dstskip = info->d_skip >> 1;
   1.478 -        Uint32 s, d;
   1.479 -        Uint8 load[8];
   1.480 +#ifdef __MMX__
   1.481  
   1.482 -        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   1.483 -        *(Uint64 *) load = alpha;
   1.484 -        alpha >>= 3;            /* downscale alpha to 5 bits */
   1.485 -
   1.486 -        movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
   1.487 -        punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
   1.488 -        punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
   1.489 -        /* position alpha to allow for mullo and mulhi on diff channels
   1.490 -           to reduce the number of operations */
   1.491 -        psllq_i2r(3, mm0);
   1.492 -
   1.493 -        /* Setup the 565 color channel masks */
   1.494 -        *(Uint64 *) load = 0x07E007E007E007E0ULL;
   1.495 -        movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
   1.496 -        *(Uint64 *) load = 0x001F001F001F001FULL;
   1.497 -        movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
   1.498 -        while (height--) {
   1.499 -			/* *INDENT-OFF* */
   1.500 -			DUFFS_LOOP_QUATRO2(
   1.501 -			{
   1.502 -				s = *srcp++;
   1.503 -				d = *dstp;
   1.504 -				/*
   1.505 -				 * shift out the middle component (green) to
   1.506 -				 * the high 16 bits, and process all three RGB
   1.507 -				 * components at the same time.
   1.508 -				 */
   1.509 -				s = (s | s << 16) & 0x07e0f81f;
   1.510 -				d = (d | d << 16) & 0x07e0f81f;
   1.511 -				d += (s - d) * alpha >> 5;
   1.512 -				d &= 0x07e0f81f;
   1.513 -				*dstp++ = d | d >> 16;
   1.514 -			},{
   1.515 -				s = *srcp++;
   1.516 -				d = *dstp;
   1.517 -				/*
   1.518 -				 * shift out the middle component (green) to
   1.519 -				 * the high 16 bits, and process all three RGB
   1.520 -				 * components at the same time.
   1.521 -				 */
   1.522 -				s = (s | s << 16) & 0x07e0f81f;
   1.523 -				d = (d | d << 16) & 0x07e0f81f;
   1.524 -				d += (s - d) * alpha >> 5;
   1.525 -				d &= 0x07e0f81f;
   1.526 -				*dstp++ = d | d >> 16;
   1.527 -				s = *srcp++;
   1.528 -				d = *dstp;
   1.529 -				/*
   1.530 -				 * shift out the middle component (green) to
   1.531 -				 * the high 16 bits, and process all three RGB
   1.532 -				 * components at the same time.
   1.533 -				 */
   1.534 -				s = (s | s << 16) & 0x07e0f81f;
   1.535 -				d = (d | d << 16) & 0x07e0f81f;
   1.536 -				d += (s - d) * alpha >> 5;
   1.537 -				d &= 0x07e0f81f;
   1.538 -				*dstp++ = d | d >> 16;
   1.539 -			},{
   1.540 -				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
   1.541 -				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
   1.542 -
   1.543 -				/* red -- does not need a mask since the right shift clears
   1.544 -				   the uninteresting bits */
   1.545 -				movq_r2r(mm2, mm5); /* src -> mm5 */
   1.546 -				movq_r2r(mm3, mm6); /* dst -> mm6 */
   1.547 -				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
   1.548 -				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
   1.549 -
   1.550 -				/* blend */
   1.551 -				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   1.552 -				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   1.553 -				/* alpha used is actually 11 bits
   1.554 -				   11 + 5 = 16 bits, so the sign bits are lost */
   1.555 -				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
   1.556 -				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   1.557 -				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
   1.558 -
   1.559 -				movq_r2r(mm6, mm1); /* save new reds in dsts */
   1.560 -
   1.561 -				/* green -- process the bits in place */
   1.562 -				movq_r2r(mm2, mm5); /* src -> mm5 */
   1.563 -				movq_r2r(mm3, mm6); /* dst -> mm6 */
   1.564 -				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
   1.565 -				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
   1.566 -
   1.567 -				/* blend */
   1.568 -				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   1.569 -				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   1.570 -				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
   1.571 -				   bits are gone and the sign bits present */
   1.572 -				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
   1.573 -				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   1.574 -
   1.575 -				por_r2r(mm6, mm1); /* save new greens in dsts */
   1.576 -
   1.577 -				/* blue */
   1.578 -				movq_r2r(mm2, mm5); /* src -> mm5 */
   1.579 -				movq_r2r(mm3, mm6); /* dst -> mm6 */
   1.580 -				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
   1.581 -				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
   1.582 -
   1.583 -				/* blend */
   1.584 -				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   1.585 -				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   1.586 -				/* 11 + 5 = 16 bits, so the sign bits are lost and
   1.587 -				   the interesting bits will need to be MASKed */
   1.588 -				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
   1.589 -				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   1.590 -				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
   1.591 -
   1.592 -				por_r2r(mm6, mm1); /* save new blues in dsts */
   1.593 -
   1.594 -				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
   1.595 -
   1.596 -				srcp += 4;
   1.597 -				dstp += 4;
   1.598 -			}, width);			
   1.599 -			/* *INDENT-ON* */
   1.600 -            srcp += srcskip;
   1.601 -            dstp += dstskip;
   1.602 -        }
   1.603 -        emms();
   1.604 -    }
   1.605 -}
   1.606 -
   1.607 -/* fast RGB555->RGB555 blending with surface alpha */
   1.608 -static void
   1.609 -Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
   1.610 -{
   1.611 -    unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
   1.612 -    if (alpha == 128) {
   1.613 -        Blit16to16SurfaceAlpha128(info, 0xfbde);
   1.614 -    } else {
   1.615 -        int width = info->d_width;
   1.616 -        int height = info->d_height;
   1.617 -        Uint16 *srcp = (Uint16 *) info->s_pixels;
   1.618 -        int srcskip = info->s_skip >> 1;
   1.619 -        Uint16 *dstp = (Uint16 *) info->d_pixels;
   1.620 -        int dstskip = info->d_skip >> 1;
   1.621 -        Uint32 s, d;
   1.622 -        Uint8 load[8];
   1.623 -
   1.624 -        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   1.625 -        *(Uint64 *) load = alpha;
   1.626 -        alpha >>= 3;            /* downscale alpha to 5 bits */
   1.627 -
   1.628 -        movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
   1.629 -        punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
   1.630 -        punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
   1.631 -        /* position alpha to allow for mullo and mulhi on diff channels
   1.632 -           to reduce the number of operations */
   1.633 -        psllq_i2r(3, mm0);
   1.634 -
   1.635 -        /* Setup the 555 color channel masks */
   1.636 -        *(Uint64 *) load = 0x03E003E003E003E0ULL;
   1.637 -        movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
   1.638 -        *(Uint64 *) load = 0x001F001F001F001FULL;
   1.639 -        movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
   1.640 -        while (height--) {
   1.641 -			/* *INDENT-OFF* */
   1.642 -			DUFFS_LOOP_QUATRO2(
   1.643 -			{
   1.644 -				s = *srcp++;
   1.645 -				d = *dstp;
   1.646 -				/*
   1.647 -				 * shift out the middle component (green) to
   1.648 -				 * the high 16 bits, and process all three RGB
   1.649 -				 * components at the same time.
   1.650 -				 */
   1.651 -				s = (s | s << 16) & 0x03e07c1f;
   1.652 -				d = (d | d << 16) & 0x03e07c1f;
   1.653 -				d += (s - d) * alpha >> 5;
   1.654 -				d &= 0x03e07c1f;
   1.655 -				*dstp++ = d | d >> 16;
   1.656 -			},{
   1.657 -				s = *srcp++;
   1.658 -				d = *dstp;
   1.659 -				/*
   1.660 -				 * shift out the middle component (green) to
   1.661 -				 * the high 16 bits, and process all three RGB
   1.662 -				 * components at the same time.
   1.663 -				 */
   1.664 -				s = (s | s << 16) & 0x03e07c1f;
   1.665 -				d = (d | d << 16) & 0x03e07c1f;
   1.666 -				d += (s - d) * alpha >> 5;
   1.667 -				d &= 0x03e07c1f;
   1.668 -				*dstp++ = d | d >> 16;
   1.669 -			        s = *srcp++;
   1.670 -				d = *dstp;
   1.671 -				/*
   1.672 -				 * shift out the middle component (green) to
   1.673 -				 * the high 16 bits, and process all three RGB
   1.674 -				 * components at the same time.
   1.675 -				 */
   1.676 -				s = (s | s << 16) & 0x03e07c1f;
   1.677 -				d = (d | d << 16) & 0x03e07c1f;
   1.678 -				d += (s - d) * alpha >> 5;
   1.679 -				d &= 0x03e07c1f;
   1.680 -				*dstp++ = d | d >> 16;
   1.681 -			},{
   1.682 -				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
   1.683 -				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
   1.684 -
   1.685 -				/* red -- process the bits in place */
   1.686 -				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
   1.687 -					/* by reusing the GREEN mask we free up another mmx
   1.688 -					   register to accumulate the result */
   1.689 -
   1.690 -				movq_r2r(mm2, mm5); /* src -> mm5 */
   1.691 -				movq_r2r(mm3, mm6); /* dst -> mm6 */
   1.692 -				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
   1.693 -				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
   1.694 -
   1.695 -				/* blend */
   1.696 -				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   1.697 -				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   1.698 -				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
   1.699 -				   cleared by a MASK below */
   1.700 -				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
   1.701 -				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   1.702 -				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
   1.703 -
   1.704 -				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
   1.705 -
   1.706 -				movq_r2r(mm6, mm1); /* save new reds in dsts */
   1.707 -
   1.708 -				/* green -- process the bits in place */
   1.709 -				movq_r2r(mm2, mm5); /* src -> mm5 */
   1.710 -				movq_r2r(mm3, mm6); /* dst -> mm6 */
   1.711 -				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
   1.712 -				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
   1.713 -
   1.714 -				/* blend */
   1.715 -				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   1.716 -				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   1.717 -				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
   1.718 -				   bits are gone and the sign bits present */
   1.719 -				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
   1.720 -				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   1.721 -
   1.722 -				por_r2r(mm6, mm1); /* save new greens in dsts */
   1.723 -
   1.724 -				/* blue */
   1.725 -				movq_r2r(mm2, mm5); /* src -> mm5 */
   1.726 -				movq_r2r(mm3, mm6); /* dst -> mm6 */
   1.727 -				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
   1.728 -				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
   1.729 -
   1.730 -				/* blend */
   1.731 -				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   1.732 -				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   1.733 -				/* 11 + 5 = 16 bits, so the sign bits are lost and
   1.734 -				   the interesting bits will need to be MASKed */
   1.735 -				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
   1.736 -				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   1.737 -				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
   1.738 -
   1.739 -				por_r2r(mm6, mm1); /* save new blues in dsts */
   1.740 -
   1.741 -				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
   1.742 -
   1.743 -				srcp += 4;
   1.744 -				dstp += 4;
   1.745 -			}, width);
   1.746 -			/* *INDENT-ON* */
   1.747 -            srcp += srcskip;
   1.748 -            dstp += dstskip;
   1.749 -        }
   1.750 -        emms();
   1.751 -    }
   1.752 -}
   1.753 -
   1.754 -/* End GCC_ASMBLIT */
   1.755 -
   1.756 -#elif MSVC_ASMBLIT
   1.757  /* fast RGB565->RGB565 blending with surface alpha */
   1.758  static void
   1.759  Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
   1.760 @@ -2507,7 +1830,8 @@
   1.761          _mm_empty();
   1.762      }
   1.763  }
   1.764 -#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   1.765 +
   1.766 +#endif /* __MMX__ */
   1.767  
   1.768  /* fast RGB565->RGB565 blending with surface alpha */
   1.769  static void
   1.770 @@ -2852,14 +2176,14 @@
   1.771              case 2:
   1.772                  if (surface->map->identity) {
   1.773                      if (df->Gmask == 0x7e0) {
   1.774 -#if MMX_ASMBLIT
   1.775 +#ifdef __MMX__
   1.776                          if (SDL_HasMMX())
   1.777                              return Blit565to565SurfaceAlphaMMX;
   1.778                          else
   1.779  #endif
   1.780                              return Blit565to565SurfaceAlpha;
   1.781                      } else if (df->Gmask == 0x3e0) {
   1.782 -#if MMX_ASMBLIT
   1.783 +#ifdef __MMX__
   1.784                          if (SDL_HasMMX())
   1.785                              return Blit555to555SurfaceAlphaMMX;
   1.786                          else
   1.787 @@ -2873,7 +2197,7 @@
   1.788                  if (sf->Rmask == df->Rmask
   1.789                      && sf->Gmask == df->Gmask
   1.790                      && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
   1.791 -#if MMX_ASMBLIT
   1.792 +#ifdef __MMX__
   1.793                      if (sf->Rshift % 8 == 0
   1.794                          && sf->Gshift % 8 == 0
   1.795                          && sf->Bshift % 8 == 0 && SDL_HasMMX())
   1.796 @@ -2928,7 +2252,7 @@
   1.797              if (sf->Rmask == df->Rmask
   1.798                  && sf->Gmask == df->Gmask
   1.799                  && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
   1.800 -#if MMX_ASMBLIT
   1.801 +#ifdef __MMX__
   1.802                  if (sf->Rshift % 8 == 0
   1.803                      && sf->Gshift % 8 == 0
   1.804                      && sf->Bshift % 8 == 0