Use MMX intrinsics over GCC inline assembly
author Sam Lantinga <slouken@libsdl.org>
Thu, 16 Aug 2007 22:18:53 +0000
changeset 2255 17b2369756be
parent 2254 79e00f5561f4
child 2256 e893d24ad8db
Use MMX intrinsics over GCC inline assembly
src/video/SDL_blit.c
src/video/SDL_blit_A.c
     1.1 --- a/src/video/SDL_blit.c	Thu Aug 16 21:54:26 2007 +0000
     1.2 +++ b/src/video/SDL_blit.c	Thu Aug 16 22:18:53 2007 +0000
     1.3 @@ -28,15 +28,6 @@
     1.4  #include "SDL_RLEaccel_c.h"
     1.5  #include "SDL_pixels_c.h"
     1.6  
     1.7 -#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
     1.8 -#define MMX_ASMBLIT
     1.9 -#endif
    1.10 -
    1.11 -#if defined(MMX_ASMBLIT)
    1.12 -#include "SDL_cpuinfo.h"
    1.13 -#include "mmx.h"
    1.14 -#endif
    1.15 -
    1.16  /* The general purpose software blit routine */
    1.17  static int
    1.18  SDL_SoftBlit(SDL_Surface * src, SDL_Rect * srcrect,
     2.1 --- a/src/video/SDL_blit_A.c	Thu Aug 16 21:54:26 2007 +0000
     2.2 +++ b/src/video/SDL_blit_A.c	Thu Aug 16 22:18:53 2007 +0000
     2.3 @@ -24,41 +24,6 @@
     2.4  #include "SDL_video.h"
     2.5  #include "SDL_blit.h"
     2.6  
     2.7 -/*
     2.8 -  In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
     2.9 -   Checking if _mm_free is #defined in malloc.h is is the only way to
    2.10 -   determine if the Processor Pack is installed, as far as I can tell.
    2.11 -*/
    2.12 -
    2.13 -#if SDL_ASSEMBLY_ROUTINES
    2.14 -#  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    2.15 -#    define MMX_ASMBLIT 1
    2.16 -#    define GCC_ASMBLIT 1
    2.17 -#  elif defined(_MSC_VER) && defined(_M_IX86)
    2.18 -#    if (_MSC_VER <= 1200)
    2.19 -#      include <malloc.h>
    2.20 -#      if defined(_mm_free)
    2.21 -#          define HAVE_MMINTRIN_H 1
    2.22 -#      endif
    2.23 -#    else /* Visual Studio > VC6 always has mmintrin.h */
    2.24 -#      define HAVE_MMINTRIN_H 1
    2.25 -#    endif
    2.26 -#    if HAVE_MMINTRIN_H
    2.27 -#      define MMX_ASMBLIT 1
    2.28 -#      define MSVC_ASMBLIT 1
    2.29 -#    endif
    2.30 -#  endif
    2.31 -#endif /* SDL_ASSEMBLY_ROUTINES */
    2.32 -
    2.33 -/* Function to check the CPU flags */
    2.34 -#include "SDL_cpuinfo.h"
    2.35 -#if GCC_ASMBLIT
    2.36 -#include "mmx.h"
    2.37 -#elif MSVC_ASMBLIT
    2.38 -#include <mmintrin.h>
    2.39 -#include <mm3dnow.h>
    2.40 -#endif
    2.41 -
    2.42  /* Functions to perform alpha blended blitting */
    2.43  
    2.44  /* N->1 blending with per-surface alpha */
    2.45 @@ -232,239 +197,8 @@
    2.46      }
    2.47  }
    2.48  
    2.49 -#if GCC_ASMBLIT
    2.50 -/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
    2.51 -static void
    2.52 -BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
    2.53 -{
    2.54 -    int width = info->d_width;
    2.55 -    int height = info->d_height;
    2.56 -    Uint32 *srcp = (Uint32 *) info->s_pixels;
    2.57 -    int srcskip = info->s_skip >> 2;
    2.58 -    Uint32 *dstp = (Uint32 *) info->d_pixels;
    2.59 -    int dstskip = info->d_skip >> 2;
    2.60 -    Uint32 dalpha = info->dst->Amask;
    2.61 -    Uint8 load[8];
    2.62 +#ifdef __MMX__
    2.63  
    2.64 -    *(Uint64 *) load = 0x00fefefe00fefefeULL;   /* alpha128 mask */
    2.65 -    movq_m2r(*load, mm4);       /* alpha128 mask -> mm4 */
    2.66 -    *(Uint64 *) load = 0x0001010100010101ULL;   /* !alpha128 mask */
    2.67 -    movq_m2r(*load, mm3);       /* !alpha128 mask -> mm3 */
    2.68 -    movd_m2r(dalpha, mm7);      /* dst alpha mask */
    2.69 -    punpckldq_r2r(mm7, mm7);    /* dst alpha mask | dst alpha mask -> mm7 */
    2.70 -    while (height--) {
    2.71 -		/* *INDENT-OFF* */
    2.72 -		DUFFS_LOOP_DOUBLE2(
    2.73 -		{
    2.74 -			Uint32 s = *srcp++;
    2.75 -			Uint32 d = *dstp;
    2.76 -			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
    2.77 -				   + (s & d & 0x00010101)) | dalpha;
    2.78 -		},{
    2.79 -			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
    2.80 -			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
    2.81 -
    2.82 -			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
    2.83 -			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
    2.84 -
    2.85 -			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
    2.86 -			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
    2.87 -			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
    2.88 -			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
    2.89 -			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
    2.90 -			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
    2.91 -			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
    2.92 -			
    2.93 -			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
    2.94 -			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
    2.95 -			dstp += 2;
    2.96 -			srcp += 2;
    2.97 -		}, width);
    2.98 -		/* *INDENT-ON* */
    2.99 -        srcp += srcskip;
   2.100 -        dstp += dstskip;
   2.101 -    }
   2.102 -    emms();
   2.103 -}
   2.104 -
   2.105 -/* fast RGB888->(A)RGB888 blending with surface alpha */
   2.106 -static void
   2.107 -BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   2.108 -{
   2.109 -    SDL_PixelFormat *df = info->dst;
   2.110 -    unsigned alpha = info->src->alpha;
   2.111 -
   2.112 -    if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   2.113 -        /* only call a128 version when R,G,B occupy lower bits */
   2.114 -        BlitRGBtoRGBSurfaceAlpha128MMX(info);
   2.115 -    } else {
   2.116 -        int width = info->d_width;
   2.117 -        int height = info->d_height;
   2.118 -        Uint32 *srcp = (Uint32 *) info->s_pixels;
   2.119 -        int srcskip = info->s_skip >> 2;
   2.120 -        Uint32 *dstp = (Uint32 *) info->d_pixels;
   2.121 -        int dstskip = info->d_skip >> 2;
   2.122 -
   2.123 -        pxor_r2r(mm5, mm5);     /* 0 -> mm5 */
   2.124 -        /* form the alpha mult */
   2.125 -        movd_m2r(alpha, mm4);   /* 0000000A -> mm4 */
   2.126 -        punpcklwd_r2r(mm4, mm4);        /* 00000A0A -> mm4 */
   2.127 -        punpckldq_r2r(mm4, mm4);        /* 0A0A0A0A -> mm4 */
   2.128 -        alpha =
   2.129 -            (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->
   2.130 -                                                           Bshift);
   2.131 -        movd_m2r(alpha, mm0);   /* 00000FFF -> mm0 */
   2.132 -        punpcklbw_r2r(mm0, mm0);        /* 00FFFFFF -> mm0 */
   2.133 -        pand_r2r(mm0, mm4);     /* 0A0A0A0A -> mm4, minus 1 chan */
   2.134 -        /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
   2.135 -        movd_m2r(df->Amask, mm7);       /* dst alpha mask */
   2.136 -        punpckldq_r2r(mm7, mm7);        /* dst alpha mask | dst alpha mask -> mm7 */
   2.137 -
   2.138 -        while (height--) {
   2.139 -			/* *INDENT-OFF* */
   2.140 -			DUFFS_LOOP_DOUBLE2({
   2.141 -				/* One Pixel Blend */
   2.142 -				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   2.143 -				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   2.144 -				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
   2.145 -				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
   2.146 -
   2.147 -				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   2.148 -				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   2.149 -				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   2.150 -				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   2.151 -
   2.152 -				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
   2.153 -				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   2.154 -				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
   2.155 -				++srcp;
   2.156 -				++dstp;
   2.157 -			},{
   2.158 -				/* Two Pixels Blend */
   2.159 -				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
   2.160 -				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   2.161 -				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
   2.162 -				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   2.163 -
   2.164 -				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
   2.165 -				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
   2.166 -				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
   2.167 -				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
   2.168 -
   2.169 -				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
   2.170 -				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
   2.171 -				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
   2.172 -				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
   2.173 -
   2.174 -				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
   2.175 -				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   2.176 -				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   2.177 -				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
   2.178 -
   2.179 -				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
   2.180 -				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
   2.181 -				
   2.182 -				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
   2.183 -
   2.184 -  				srcp += 2;
   2.185 -  				dstp += 2;
   2.186 -  			}, width);
   2.187 -			/* *INDENT-ON* */
   2.188 -            srcp += srcskip;
   2.189 -            dstp += dstskip;
   2.190 -        }
   2.191 -        emms();
   2.192 -    }
   2.193 -}
   2.194 -
   2.195 -/* fast ARGB888->(A)RGB888 blending with pixel alpha */
   2.196 -static void
   2.197 -BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   2.198 -{
   2.199 -    int width = info->d_width;
   2.200 -    int height = info->d_height;
   2.201 -    Uint32 *srcp = (Uint32 *) info->s_pixels;
   2.202 -    int srcskip = info->s_skip >> 2;
   2.203 -    Uint32 *dstp = (Uint32 *) info->d_pixels;
   2.204 -    int dstskip = info->d_skip >> 2;
   2.205 -    SDL_PixelFormat *sf = info->src;
   2.206 -    Uint32 amask = sf->Amask;
   2.207 -
   2.208 -    pxor_r2r(mm6, mm6);         /* 0 -> mm6 */
   2.209 -    /* form multiplication mask */
   2.210 -    movd_m2r(sf->Amask, mm7);   /* 0000F000 -> mm7 */
   2.211 -    punpcklbw_r2r(mm7, mm7);    /* FF000000 -> mm7 */
   2.212 -    pcmpeqb_r2r(mm0, mm0);      /* FFFFFFFF -> mm0 */
   2.213 -    movq_r2r(mm0, mm3);         /* FFFFFFFF -> mm3 (for later) */
   2.214 -    pxor_r2r(mm0, mm7);         /* 00FFFFFF -> mm7 (mult mask) */
   2.215 -    /* form channel masks */
   2.216 -    movq_r2r(mm7, mm0);         /* 00FFFFFF -> mm0 */
   2.217 -    packsswb_r2r(mm6, mm0);     /* 00000FFF -> mm0 (channel mask) */
   2.218 -    packsswb_r2r(mm6, mm3);     /* 0000FFFF -> mm3 */
   2.219 -    pxor_r2r(mm0, mm3);         /* 0000F000 -> mm3 (~channel mask) */
   2.220 -    /* get alpha channel shift */
   2.221 -    /* *INDENT-OFF* */
   2.222 -    __asm__ __volatile__ (
   2.223 -        "movd %0, %%mm5"
   2.224 -        : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
   2.225 -    /* *INDENT-ON* */
   2.226 -
   2.227 -    while (height--) {
   2.228 -	    /* *INDENT-OFF* */
   2.229 -	    DUFFS_LOOP4({
   2.230 -		Uint32 alpha = *srcp & amask;
   2.231 -		/* FIXME: Here we special-case opaque alpha since the
   2.232 -			compositioning used (>>8 instead of /255) doesn't handle
   2.233 -			it correctly. Also special-case alpha=0 for speed?
   2.234 -			Benchmark this! */
   2.235 -		if(alpha == 0) {
   2.236 -			/* do nothing */
   2.237 -		} else if(alpha == amask) {
   2.238 -			/* opaque alpha -- copy RGB, keep dst alpha */
   2.239 -			/* using MMX here to free up regular registers for other things */
   2.240 -			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   2.241 -			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   2.242 -			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
   2.243 -			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
   2.244 -			por_r2r(mm1, mm2); /* src | dst -> mm2 */
   2.245 -			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
   2.246 -		} else {
   2.247 -			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   2.248 -			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
   2.249 -
   2.250 -			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   2.251 -			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
   2.252 -
   2.253 -			__asm__ __volatile__ (
   2.254 -				"movd %0, %%mm4"
   2.255 -				: : "r" (alpha) ); /* 0000A000 -> mm4 */
   2.256 -			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
   2.257 -			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   2.258 -			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
   2.259 -			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
   2.260 -
   2.261 -			/* blend */		    
   2.262 -			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   2.263 -			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   2.264 -			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
   2.265 -			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   2.266 -			
   2.267 -			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
   2.268 -			movd_r2m(mm2, *dstp);/* mm2 -> dst */
   2.269 -		}
   2.270 -		++srcp;
   2.271 -		++dstp;
   2.272 -	    }, width);
   2.273 -	    /* *INDENT-ON* */
   2.274 -        srcp += srcskip;
   2.275 -        dstp += dstskip;
   2.276 -    }
   2.277 -    emms();
   2.278 -}
   2.279 -
   2.280 -/* End GCC_ASMBLIT */
   2.281 -
   2.282 -#elif MSVC_ASMBLIT
   2.283  /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   2.284  static void
   2.285  BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   2.286 @@ -637,9 +371,9 @@
   2.287      __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   2.288  
   2.289      mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   2.290 -	/* *INDENT-OFF* */
   2.291 -	multmask = ~(0xFFFFI64 << (ashift * 2));
   2.292 -	/* *INDENT-ON* */
   2.293 +	multmask = 0xFFFF;
   2.294 +    multmask <<= (ashift * 2);
   2.295 +    multmask = ~multmask;
   2.296      dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   2.297  
   2.298      while (height--) {
   2.299 @@ -683,9 +417,7 @@
   2.300      _mm_empty();
   2.301  }
   2.302  
   2.303 -/* End MSVC_ASMBLIT */
   2.304 -
   2.305 -#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   2.306 +#endif /* __MMX__ */
   2.307  
   2.308  #if SDL_ALTIVEC_BLITTERS
   2.309  #if __MWERKS__
   2.310 @@ -1639,123 +1371,7 @@
   2.311      }
   2.312  }
   2.313  
   2.314 -#if GCC_ASMBLIT
   2.315 -/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
   2.316 -static void
   2.317 -BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
   2.318 -{
   2.319 -    int width = info->d_width;
   2.320 -    int height = info->d_height;
   2.321 -    Uint32 *srcp = (Uint32 *) info->s_pixels;
   2.322 -    int srcskip = info->s_skip >> 2;
   2.323 -    Uint32 *dstp = (Uint32 *) info->d_pixels;
   2.324 -    int dstskip = info->d_skip >> 2;
   2.325 -    SDL_PixelFormat *sf = info->src;
   2.326 -    Uint32 amask = sf->Amask;
   2.327 -
   2.328 -    __asm__(
   2.329 -               /* make mm6 all zeros. */
   2.330 -               "pxor       %%mm6, %%mm6\n"
   2.331 -               /* Make a mask to preserve the alpha. */
   2.332 -               "movd      %0, %%mm7\n\t"        /* 0000F000 -> mm7 */
   2.333 -               "punpcklbw %%mm7, %%mm7\n\t"     /* FF000000 -> mm7 */
   2.334 -               "pcmpeqb   %%mm4, %%mm4\n\t"     /* FFFFFFFF -> mm4 */
   2.335 -               "movq      %%mm4, %%mm3\n\t"     /* FFFFFFFF -> mm3 (for later) */
   2.336 -               "pxor      %%mm4, %%mm7\n\t"     /* 00FFFFFF -> mm7 (mult mask) */
   2.337 -               /* form channel masks */
   2.338 -               "movq      %%mm7, %%mm4\n\t"     /* 00FFFFFF -> mm4 */
   2.339 -               "packsswb  %%mm6, %%mm4\n\t"     /* 00000FFF -> mm4 (channel mask) */
   2.340 -               "packsswb  %%mm6, %%mm3\n\t"     /* 0000FFFF -> mm3 */
   2.341 -               "pxor      %%mm4, %%mm3\n\t"     /* 0000F000 -> mm3 (~channel mask) */
   2.342 -               /* get alpha channel shift */
   2.343 -               "movd      %1, %%mm5\n\t"        /* Ashift -> mm5 */
   2.344 -  : /* nothing */ :            "rm"(amask), "rm"((Uint32) sf->Ashift));
   2.345 -
   2.346 -    while (height--) {
   2.347 -
   2.348 -	    /* *INDENT-OFF* */
   2.349 -	    DUFFS_LOOP4({
   2.350 -		Uint32 alpha;
   2.351 -
   2.352 -		__asm__ (
   2.353 -		"prefetch 64(%0)\n"
   2.354 -		"prefetch 64(%1)\n"
   2.355 -			: : "r" (srcp), "r" (dstp) );
   2.356 -
   2.357 -		alpha = *srcp & amask;
   2.358 -		/* FIXME: Here we special-case opaque alpha since the
   2.359 -		   compositioning used (>>8 instead of /255) doesn't handle
   2.360 -		   it correctly. Also special-case alpha=0 for speed?
   2.361 -		   Benchmark this! */
   2.362 -		if(alpha == 0) {
   2.363 -		    /* do nothing */
   2.364 -		}
   2.365 -		else if(alpha == amask) {
   2.366 -			/* opaque alpha -- copy RGB, keep dst alpha */
   2.367 -		    /* using MMX here to free up regular registers for other things */
   2.368 -			    __asm__ (
   2.369 -		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
   2.370 -		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
   2.371 -		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
   2.372 -		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
   2.373 -		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
   2.374 -		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
   2.375 -
   2.376 -		     : : "r" (srcp), "r" (dstp) );
   2.377 -		} 
   2.378 -
   2.379 -		else {
   2.380 -			    __asm__ (
   2.381 -		    /* load in the source, and dst. */
   2.382 -		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
   2.383 -		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
   2.384 -
   2.385 -		    /* Move the src alpha into mm2 */
   2.386 -
   2.387 -		    /* if supporting pshufw */
   2.388 -		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
   2.389 -		    /*"psrlw     $8, %%mm2\n" */
   2.390 -		    
   2.391 -		    /* else: */
   2.392 -		    "movd       %2,    %%mm2\n"
   2.393 -		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
   2.394 -		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
   2.395 -		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
   2.396 -		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
   2.397 -
   2.398 -		    /* move the colors into words. */
   2.399 -		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
   2.400 -		    "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
   2.401 -
   2.402 -		    /* src - dst */
   2.403 -		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
   2.404 -
   2.405 -		    /* A * (src-dst) */
   2.406 -		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
   2.407 -		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
   2.408 -		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
   2.409 -
   2.410 -		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
   2.411 -		    
   2.412 -		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
   2.413 -
   2.414 -		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
   2.415 -
   2.416 -		}
   2.417 -		++srcp;
   2.418 -		++dstp;
   2.419 -	    }, width);
   2.420 -	    /* *INDENT-ON* */
   2.421 -        srcp += srcskip;
   2.422 -        dstp += dstskip;
   2.423 -    }
   2.424 -
   2.425 -  __asm__("emms\n":);
   2.426 -}
   2.427 -
   2.428 -/* End GCC_ASMBLIT*/
   2.429 -
   2.430 -#elif MSVC_ASMBLIT
   2.431 +#ifdef __MMX__
   2.432  /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
   2.433  static void
   2.434  BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
   2.435 @@ -1775,9 +1391,9 @@
   2.436      __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   2.437  
   2.438      mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   2.439 -	/* *INDENT-OFF* */
   2.440 -    multmask = ~(0xFFFFI64 << (ashift * 2));
   2.441 -	/* *INDENT-ON* */
   2.442 +	multmask = 0xFFFF;
   2.443 +    multmask <<= (ashift * 2);
   2.444 +    multmask = ~multmask;
   2.445      dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   2.446  
   2.447      while (height--) {
   2.448 @@ -1826,9 +1442,7 @@
   2.449      _mm_empty();
   2.450  }
   2.451  
   2.452 -/* End MSVC_ASMBLIT */
   2.453 -
   2.454 -#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   2.455 +#endif /* __MMX__ */
   2.456  
   2.457  /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
   2.458  
   2.459 @@ -1940,299 +1554,8 @@
   2.460      }
   2.461  }
   2.462  
   2.463 -#if GCC_ASMBLIT
   2.464 -/* fast RGB565->RGB565 blending with surface alpha */
   2.465 -static void
   2.466 -Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
   2.467 -{
   2.468 -    unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
   2.469 -    if (alpha == 128) {
   2.470 -        Blit16to16SurfaceAlpha128(info, 0xf7de);
   2.471 -    } else {
   2.472 -        int width = info->d_width;
   2.473 -        int height = info->d_height;
   2.474 -        Uint16 *srcp = (Uint16 *) info->s_pixels;
   2.475 -        int srcskip = info->s_skip >> 1;
   2.476 -        Uint16 *dstp = (Uint16 *) info->d_pixels;
   2.477 -        int dstskip = info->d_skip >> 1;
   2.478 -        Uint32 s, d;
   2.479 -        Uint8 load[8];
   2.480 +#ifdef __MMX__
   2.481  
   2.482 -        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   2.483 -        *(Uint64 *) load = alpha;
   2.484 -        alpha >>= 3;            /* downscale alpha to 5 bits */
   2.485 -
   2.486 -        movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
   2.487 -        punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
   2.488 -        punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
   2.489 -        /* position alpha to allow for mullo and mulhi on diff channels
   2.490 -           to reduce the number of operations */
   2.491 -        psllq_i2r(3, mm0);
   2.492 -
   2.493 -        /* Setup the 565 color channel masks */
   2.494 -        *(Uint64 *) load = 0x07E007E007E007E0ULL;
   2.495 -        movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
   2.496 -        *(Uint64 *) load = 0x001F001F001F001FULL;
   2.497 -        movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
   2.498 -        while (height--) {
   2.499 -			/* *INDENT-OFF* */
   2.500 -			DUFFS_LOOP_QUATRO2(
   2.501 -			{
   2.502 -				s = *srcp++;
   2.503 -				d = *dstp;
   2.504 -				/*
   2.505 -				 * shift out the middle component (green) to
   2.506 -				 * the high 16 bits, and process all three RGB
   2.507 -				 * components at the same time.
   2.508 -				 */
   2.509 -				s = (s | s << 16) & 0x07e0f81f;
   2.510 -				d = (d | d << 16) & 0x07e0f81f;
   2.511 -				d += (s - d) * alpha >> 5;
   2.512 -				d &= 0x07e0f81f;
   2.513 -				*dstp++ = d | d >> 16;
   2.514 -			},{
   2.515 -				s = *srcp++;
   2.516 -				d = *dstp;
   2.517 -				/*
   2.518 -				 * shift out the middle component (green) to
   2.519 -				 * the high 16 bits, and process all three RGB
   2.520 -				 * components at the same time.
   2.521 -				 */
   2.522 -				s = (s | s << 16) & 0x07e0f81f;
   2.523 -				d = (d | d << 16) & 0x07e0f81f;
   2.524 -				d += (s - d) * alpha >> 5;
   2.525 -				d &= 0x07e0f81f;
   2.526 -				*dstp++ = d | d >> 16;
   2.527 -				s = *srcp++;
   2.528 -				d = *dstp;
   2.529 -				/*
   2.530 -				 * shift out the middle component (green) to
   2.531 -				 * the high 16 bits, and process all three RGB
   2.532 -				 * components at the same time.
   2.533 -				 */
   2.534 -				s = (s | s << 16) & 0x07e0f81f;
   2.535 -				d = (d | d << 16) & 0x07e0f81f;
   2.536 -				d += (s - d) * alpha >> 5;
   2.537 -				d &= 0x07e0f81f;
   2.538 -				*dstp++ = d | d >> 16;
   2.539 -			},{
   2.540 -				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
   2.541 -				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
   2.542 -
   2.543 -				/* red -- does not need a mask since the right shift clears
   2.544 -				   the uninteresting bits */
   2.545 -				movq_r2r(mm2, mm5); /* src -> mm5 */
   2.546 -				movq_r2r(mm3, mm6); /* dst -> mm6 */
   2.547 -				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
   2.548 -				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
   2.549 -
   2.550 -				/* blend */
   2.551 -				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   2.552 -				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   2.553 -				/* alpha used is actually 11 bits
   2.554 -				   11 + 5 = 16 bits, so the sign bits are lost */
   2.555 -				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
   2.556 -				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   2.557 -				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
   2.558 -
   2.559 -				movq_r2r(mm6, mm1); /* save new reds in dsts */
   2.560 -
   2.561 -				/* green -- process the bits in place */
   2.562 -				movq_r2r(mm2, mm5); /* src -> mm5 */
   2.563 -				movq_r2r(mm3, mm6); /* dst -> mm6 */
   2.564 -				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
   2.565 -				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
   2.566 -
   2.567 -				/* blend */
   2.568 -				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   2.569 -				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   2.570 -				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
   2.571 -				   bits are gone and the sign bits present */
   2.572 -				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
   2.573 -				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   2.574 -
   2.575 -				por_r2r(mm6, mm1); /* save new greens in dsts */
   2.576 -
   2.577 -				/* blue */
   2.578 -				movq_r2r(mm2, mm5); /* src -> mm5 */
   2.579 -				movq_r2r(mm3, mm6); /* dst -> mm6 */
   2.580 -				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
   2.581 -				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
   2.582 -
   2.583 -				/* blend */
   2.584 -				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   2.585 -				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   2.586 -				/* 11 + 5 = 16 bits, so the sign bits are lost and
   2.587 -				   the interesting bits will need to be MASKed */
   2.588 -				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
   2.589 -				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   2.590 -				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
   2.591 -
   2.592 -				por_r2r(mm6, mm1); /* save new blues in dsts */
   2.593 -
   2.594 -				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
   2.595 -
   2.596 -				srcp += 4;
   2.597 -				dstp += 4;
   2.598 -			}, width);			
   2.599 -			/* *INDENT-ON* */
   2.600 -            srcp += srcskip;
   2.601 -            dstp += dstskip;
   2.602 -        }
   2.603 -        emms();
   2.604 -    }
   2.605 -}
   2.606 -
   2.607 -/* fast RGB555->RGB555 blending with surface alpha */
   2.608 -static void
   2.609 -Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
   2.610 -{
   2.611 -    unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
   2.612 -    if (alpha == 128) {
   2.613 -        Blit16to16SurfaceAlpha128(info, 0xfbde);
   2.614 -    } else {
   2.615 -        int width = info->d_width;
   2.616 -        int height = info->d_height;
   2.617 -        Uint16 *srcp = (Uint16 *) info->s_pixels;
   2.618 -        int srcskip = info->s_skip >> 1;
   2.619 -        Uint16 *dstp = (Uint16 *) info->d_pixels;
   2.620 -        int dstskip = info->d_skip >> 1;
   2.621 -        Uint32 s, d;
   2.622 -        Uint8 load[8];
   2.623 -
   2.624 -        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   2.625 -        *(Uint64 *) load = alpha;
   2.626 -        alpha >>= 3;            /* downscale alpha to 5 bits */
   2.627 -
   2.628 -        movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
   2.629 -        punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
   2.630 -        punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
   2.631 -        /* position alpha to allow for mullo and mulhi on diff channels
   2.632 -           to reduce the number of operations */
   2.633 -        psllq_i2r(3, mm0);
   2.634 -
   2.635 -        /* Setup the 555 color channel masks */
   2.636 -        *(Uint64 *) load = 0x03E003E003E003E0ULL;
   2.637 -        movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
   2.638 -        *(Uint64 *) load = 0x001F001F001F001FULL;
   2.639 -        movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
   2.640 -        while (height--) {
   2.641 -			/* *INDENT-OFF* */
   2.642 -			DUFFS_LOOP_QUATRO2(
   2.643 -			{
   2.644 -				s = *srcp++;
   2.645 -				d = *dstp;
   2.646 -				/*
   2.647 -				 * shift out the middle component (green) to
   2.648 -				 * the high 16 bits, and process all three RGB
   2.649 -				 * components at the same time.
   2.650 -				 */
   2.651 -				s = (s | s << 16) & 0x03e07c1f;
   2.652 -				d = (d | d << 16) & 0x03e07c1f;
   2.653 -				d += (s - d) * alpha >> 5;
   2.654 -				d &= 0x03e07c1f;
   2.655 -				*dstp++ = d | d >> 16;
   2.656 -			},{
   2.657 -				s = *srcp++;
   2.658 -				d = *dstp;
   2.659 -				/*
   2.660 -				 * shift out the middle component (green) to
   2.661 -				 * the high 16 bits, and process all three RGB
   2.662 -				 * components at the same time.
   2.663 -				 */
   2.664 -				s = (s | s << 16) & 0x03e07c1f;
   2.665 -				d = (d | d << 16) & 0x03e07c1f;
   2.666 -				d += (s - d) * alpha >> 5;
   2.667 -				d &= 0x03e07c1f;
   2.668 -				*dstp++ = d | d >> 16;
   2.669 -			        s = *srcp++;
   2.670 -				d = *dstp;
   2.671 -				/*
   2.672 -				 * shift out the middle component (green) to
   2.673 -				 * the high 16 bits, and process all three RGB
   2.674 -				 * components at the same time.
   2.675 -				 */
   2.676 -				s = (s | s << 16) & 0x03e07c1f;
   2.677 -				d = (d | d << 16) & 0x03e07c1f;
   2.678 -				d += (s - d) * alpha >> 5;
   2.679 -				d &= 0x03e07c1f;
   2.680 -				*dstp++ = d | d >> 16;
   2.681 -			},{
   2.682 -				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
   2.683 -				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
   2.684 -
   2.685 -				/* red -- process the bits in place */
   2.686 -				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
   2.687 -					/* by reusing the GREEN mask we free up another mmx
   2.688 -					   register to accumulate the result */
   2.689 -
   2.690 -				movq_r2r(mm2, mm5); /* src -> mm5 */
   2.691 -				movq_r2r(mm3, mm6); /* dst -> mm6 */
   2.692 -				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
   2.693 -				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
   2.694 -
   2.695 -				/* blend */
   2.696 -				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   2.697 -				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   2.698 -				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
   2.699 -				   cleared by a MASK below */
   2.700 -				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
   2.701 -				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   2.702 -				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
   2.703 -
   2.704 -				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
   2.705 -
   2.706 -				movq_r2r(mm6, mm1); /* save new reds in dsts */
   2.707 -
   2.708 -				/* green -- process the bits in place */
   2.709 -				movq_r2r(mm2, mm5); /* src -> mm5 */
   2.710 -				movq_r2r(mm3, mm6); /* dst -> mm6 */
   2.711 -				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
   2.712 -				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
   2.713 -
   2.714 -				/* blend */
   2.715 -				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   2.716 -				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   2.717 -				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
   2.718 -				   bits are gone and the sign bits present */
   2.719 -				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
   2.720 -				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   2.721 -
   2.722 -				por_r2r(mm6, mm1); /* save new greens in dsts */
   2.723 -
   2.724 -				/* blue */
   2.725 -				movq_r2r(mm2, mm5); /* src -> mm5 */
   2.726 -				movq_r2r(mm3, mm6); /* dst -> mm6 */
   2.727 -				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
   2.728 -				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
   2.729 -
   2.730 -				/* blend */
   2.731 -				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
   2.732 -				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
   2.733 -				/* 11 + 5 = 16 bits, so the sign bits are lost and
   2.734 -				   the interesting bits will need to be MASKed */
   2.735 -				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
   2.736 -				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
   2.737 -				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
   2.738 -
   2.739 -				por_r2r(mm6, mm1); /* save new blues in dsts */
   2.740 -
   2.741 -				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
   2.742 -
   2.743 -				srcp += 4;
   2.744 -				dstp += 4;
   2.745 -			}, width);
   2.746 -			/* *INDENT-ON* */
   2.747 -            srcp += srcskip;
   2.748 -            dstp += dstskip;
   2.749 -        }
   2.750 -        emms();
   2.751 -    }
   2.752 -}
   2.753 -
   2.754 -/* End GCC_ASMBLIT */
   2.755 -
   2.756 -#elif MSVC_ASMBLIT
   2.757  /* fast RGB565->RGB565 blending with surface alpha */
   2.758  static void
   2.759  Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
   2.760 @@ -2507,7 +1830,8 @@
   2.761          _mm_empty();
   2.762      }
   2.763  }
   2.764 -#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   2.765 +
   2.766 +#endif /* __MMX__ */
   2.767  
   2.768  /* fast RGB565->RGB565 blending with surface alpha */
   2.769  static void
   2.770 @@ -2852,14 +2176,14 @@
   2.771              case 2:
   2.772                  if (surface->map->identity) {
   2.773                      if (df->Gmask == 0x7e0) {
   2.774 -#if MMX_ASMBLIT
   2.775 +#ifdef __MMX__
   2.776                          if (SDL_HasMMX())
   2.777                              return Blit565to565SurfaceAlphaMMX;
   2.778                          else
   2.779  #endif
   2.780                              return Blit565to565SurfaceAlpha;
   2.781                      } else if (df->Gmask == 0x3e0) {
   2.782 -#if MMX_ASMBLIT
   2.783 +#ifdef __MMX__
   2.784                          if (SDL_HasMMX())
   2.785                              return Blit555to555SurfaceAlphaMMX;
   2.786                          else
   2.787 @@ -2873,7 +2197,7 @@
   2.788                  if (sf->Rmask == df->Rmask
   2.789                      && sf->Gmask == df->Gmask
   2.790                      && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
   2.791 -#if MMX_ASMBLIT
   2.792 +#ifdef __MMX__
   2.793                      if (sf->Rshift % 8 == 0
   2.794                          && sf->Gshift % 8 == 0
   2.795                          && sf->Bshift % 8 == 0 && SDL_HasMMX())
   2.796 @@ -2928,7 +2252,7 @@
   2.797              if (sf->Rmask == df->Rmask
   2.798                  && sf->Gmask == df->Gmask
   2.799                  && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
   2.800 -#if MMX_ASMBLIT
   2.801 +#ifdef __MMX__
   2.802                  if (sf->Rshift % 8 == 0
   2.803                      && sf->Gshift % 8 == 0
   2.804                      && sf->Bshift % 8 == 0