src/video/SDL_blit_A.c
author Sam Lantinga
Tue, 03 Jul 2007 09:55:29 +0000
changeset 2141 e1a70460c354
parent 2132 46648dc418ec
child 2231 a353684c7cc1
permissions -rw-r--r--
stupid indent
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2006 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 /*
    28   In Visual C, VC6 only gets mmintrin.h from the "Processor Pack" add-on.
    29    Checking if _mm_free is #defined in malloc.h is the only way to
    30    determine if the Processor Pack is installed, as far as I can tell.
    31 */
    32 
    33 #if SDL_ASSEMBLY_ROUTINES
    34 #  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    35 #    define MMX_ASMBLIT 1
    36 #    define GCC_ASMBLIT 1
    37 #  elif defined(_MSC_VER) && defined(_M_IX86)
    38 #    if (_MSC_VER <= 1200)
    39 #      include <malloc.h>
    40 #      if defined(_mm_free)
    41 #          define HAVE_MMINTRIN_H 1
    42 #      endif
    43 #    else /* Visual Studio > VC6 always has mmintrin.h */
    44 #      define HAVE_MMINTRIN_H 1
    45 #    endif
    46 #    if HAVE_MMINTRIN_H
    47 #      define MMX_ASMBLIT 1
    48 #      define MSVC_ASMBLIT 1
    49 #    endif
    50 #  endif
    51 #endif /* SDL_ASSEMBLY_ROUTINES */
    52 
    53 /* Function to check the CPU flags */
    54 #include "SDL_cpuinfo.h"
    55 #if GCC_ASMBLIT
    56 #include "mmx.h"
    57 #elif MSVC_ASMBLIT
    58 #include <mmintrin.h>
    59 #include <mm3dnow.h>
    60 #endif
    61 
    62 /* Functions to perform alpha blended blitting */
    63 
    64 /* N->1 blending with per-surface alpha */
    65 static void
    66 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    67 {
    68     int width = info->d_width;
    69     int height = info->d_height;
    70     Uint8 *src = info->s_pixels;
    71     int srcskip = info->s_skip;
    72     Uint8 *dst = info->d_pixels;
    73     int dstskip = info->d_skip;
    74     Uint8 *palmap = info->table;
    75     SDL_PixelFormat *srcfmt = info->src;
    76     SDL_PixelFormat *dstfmt = info->dst;
    77     int srcbpp = srcfmt->BytesPerPixel;
    78 
    79     const unsigned A = srcfmt->alpha;
    80 
    81     while (height--) {
    82 	    /* *INDENT-OFF* */
    83 	    DUFFS_LOOP4(
    84 	    {
    85 		Uint32 Pixel;
    86 		unsigned sR;
    87 		unsigned sG;
    88 		unsigned sB;
    89 		unsigned dR;
    90 		unsigned dG;
    91 		unsigned dB;
    92 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    93 		dR = dstfmt->palette->colors[*dst].r;
    94 		dG = dstfmt->palette->colors[*dst].g;
    95 		dB = dstfmt->palette->colors[*dst].b;
    96 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    97 		dR &= 0xff;
    98 		dG &= 0xff;
    99 		dB &= 0xff;
   100 		/* Pack RGB into 8bit pixel */
   101 		if ( palmap == NULL ) {
   102 		    *dst =((dR>>5)<<(3+2))|
   103 			  ((dG>>5)<<(2))|
   104 			  ((dB>>6)<<(0));
   105 		} else {
   106 		    *dst = palmap[((dR>>5)<<(3+2))|
   107 				  ((dG>>5)<<(2))  |
   108 				  ((dB>>6)<<(0))];
   109 		}
   110 		dst++;
   111 		src += srcbpp;
   112 	    },
   113 	    width);
   114 	    /* *INDENT-ON* */
   115         src += srcskip;
   116         dst += dstskip;
   117     }
   118 }
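       /*
        * The "Pack RGB into 8bit pixel" step in the N->1 blitters assumes an
        * RGB332 layout when no palette map is supplied: 3 bits of red, 3 of
        * green, 2 of blue.  A hypothetical helper showing the same packing:
        *
        *     static Uint8 pack_rgb332(unsigned r, unsigned g, unsigned b)
        *     {
        *         return (Uint8) (((r >> 5) << 5) | ((g >> 5) << 2) | (b >> 6));
        *     }
        */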
   119 
   120 /* N->1 blending with pixel alpha */
   121 static void
   122 BlitNto1PixelAlpha(SDL_BlitInfo * info)
   123 {
   124     int width = info->d_width;
   125     int height = info->d_height;
   126     Uint8 *src = info->s_pixels;
   127     int srcskip = info->s_skip;
   128     Uint8 *dst = info->d_pixels;
   129     int dstskip = info->d_skip;
   130     Uint8 *palmap = info->table;
   131     SDL_PixelFormat *srcfmt = info->src;
   132     SDL_PixelFormat *dstfmt = info->dst;
   133     int srcbpp = srcfmt->BytesPerPixel;
   134 
   135     /* FIXME: fix alpha bit field expansion here too? */
   136     while (height--) {
   137 	    /* *INDENT-OFF* */
   138 	    DUFFS_LOOP4(
   139 	    {
   140 		Uint32 Pixel;
   141 		unsigned sR;
   142 		unsigned sG;
   143 		unsigned sB;
   144 		unsigned sA;
   145 		unsigned dR;
   146 		unsigned dG;
   147 		unsigned dB;
   148 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   149 		dR = dstfmt->palette->colors[*dst].r;
   150 		dG = dstfmt->palette->colors[*dst].g;
   151 		dB = dstfmt->palette->colors[*dst].b;
   152 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   153 		dR &= 0xff;
   154 		dG &= 0xff;
   155 		dB &= 0xff;
   156 		/* Pack RGB into 8bit pixel */
   157 		if ( palmap == NULL ) {
   158 		    *dst =((dR>>5)<<(3+2))|
   159 			  ((dG>>5)<<(2))|
   160 			  ((dB>>6)<<(0));
   161 		} else {
   162 		    *dst = palmap[((dR>>5)<<(3+2))|
   163 				  ((dG>>5)<<(2))  |
   164 				  ((dB>>6)<<(0))  ];
   165 		}
   166 		dst++;
   167 		src += srcbpp;
   168 	    },
   169 	    width);
   170 	    /* *INDENT-ON* */
   171         src += srcskip;
   172         dst += dstskip;
   173     }
   174 }
   175 
   176 /* colorkeyed N->1 blending with per-surface alpha */
   177 static void
   178 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   179 {
   180     int width = info->d_width;
   181     int height = info->d_height;
   182     Uint8 *src = info->s_pixels;
   183     int srcskip = info->s_skip;
   184     Uint8 *dst = info->d_pixels;
   185     int dstskip = info->d_skip;
   186     Uint8 *palmap = info->table;
   187     SDL_PixelFormat *srcfmt = info->src;
   188     SDL_PixelFormat *dstfmt = info->dst;
   189     int srcbpp = srcfmt->BytesPerPixel;
   190     Uint32 ckey = srcfmt->colorkey;
   191 
   192     const int A = srcfmt->alpha;
   193 
   194     while (height--) {
   195 	    /* *INDENT-OFF* */
   196 	    DUFFS_LOOP(
   197 	    {
   198 		Uint32 Pixel;
   199 		unsigned sR;
   200 		unsigned sG;
   201 		unsigned sB;
   202 		unsigned dR;
   203 		unsigned dG;
   204 		unsigned dB;
   205 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   206 		if ( Pixel != ckey ) {
   207 		    dR = dstfmt->palette->colors[*dst].r;
   208 		    dG = dstfmt->palette->colors[*dst].g;
   209 		    dB = dstfmt->palette->colors[*dst].b;
   210 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   211 		    dR &= 0xff;
   212 		    dG &= 0xff;
   213 		    dB &= 0xff;
   214 		    /* Pack RGB into 8bit pixel */
   215 		    if ( palmap == NULL ) {
   216 			*dst =((dR>>5)<<(3+2))|
   217 			      ((dG>>5)<<(2)) |
   218 			      ((dB>>6)<<(0));
   219 		    } else {
   220 			*dst = palmap[((dR>>5)<<(3+2))|
   221 				      ((dG>>5)<<(2))  |
   222 				      ((dB>>6)<<(0))  ];
   223 		    }
   224 		}
   225 		dst++;
   226 		src += srcbpp;
   227 	    },
   228 	    width);
   229 	    /* *INDENT-ON* */
   230         src += srcskip;
   231         dst += dstskip;
   232     }
   233 }
   234 
   235 #if GCC_ASMBLIT
   236 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   237 static void
   238 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   239 {
   240     int width = info->d_width;
   241     int height = info->d_height;
   242     Uint32 *srcp = (Uint32 *) info->s_pixels;
   243     int srcskip = info->s_skip >> 2;
   244     Uint32 *dstp = (Uint32 *) info->d_pixels;
   245     int dstskip = info->d_skip >> 2;
   246     Uint32 dalpha = info->dst->Amask;
   247     Uint8 load[8];
   248 
   249     *(Uint64 *) load = 0x00fefefe00fefefeULL;   /* alpha128 mask */
   250     movq_m2r(*load, mm4);       /* alpha128 mask -> mm4 */
   251     *(Uint64 *) load = 0x0001010100010101ULL;   /* !alpha128 mask */
   252     movq_m2r(*load, mm3);       /* !alpha128 mask -> mm3 */
   253     movd_m2r(dalpha, mm7);      /* dst alpha mask */
   254     punpckldq_r2r(mm7, mm7);    /* dst alpha mask | dst alpha mask -> mm7 */
   255     while (height--) {
   256 		/* *INDENT-OFF* */
   257 		DUFFS_LOOP_DOUBLE2(
   258 		{
   259 			Uint32 s = *srcp++;
   260 			Uint32 d = *dstp;
   261 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   262 				   + (s & d & 0x00010101)) | dalpha;
   263 		},{
   264 			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   265 			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   266 
   267 			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
   268 			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
   269 
   270 			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
   271 			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
   272 			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
   273 			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
   274 			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
   275 			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
   276 			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
   277 			
   278 			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   279 			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
   280 			dstp += 2;
   281 			srcp += 2;
   282 		}, width);
   283 		/* *INDENT-ON* */
   284         srcp += srcskip;
   285         dstp += dstskip;
   286     }
   287     emms();
   288 }
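       /*
        * Why the alpha==128 special case works: per byte,
        *
        *     (s + d) / 2  ==  (((s & 0xfe) + (d & 0xfe)) >> 1) + (s & d & 0x01)
        *
        * Masking off the low bit of every byte lets the whole 0x00RRGGBB word be
        * added in one operation with no carries leaking between channels; the
        * (s & d & 0x00010101) term restores the rounding lost by dropping those
        * low bits.  The scalar form of one pixel (the same expression used for
        * the odd pixel in the loop above):
        *
        *     out = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
        *            + (s & d & 0x00010101)) | dalpha;
        */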
   289 
   290 /* fast RGB888->(A)RGB888 blending with surface alpha */
   291 static void
   292 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   293 {
   294     SDL_PixelFormat *df = info->dst;
   295     unsigned alpha = info->src->alpha;
   296 
   297     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   298         /* only call a128 version when R,G,B occupy lower bits */
   299         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   300     } else {
   301         int width = info->d_width;
   302         int height = info->d_height;
   303         Uint32 *srcp = (Uint32 *) info->s_pixels;
   304         int srcskip = info->s_skip >> 2;
   305         Uint32 *dstp = (Uint32 *) info->d_pixels;
   306         int dstskip = info->d_skip >> 2;
   307 
   308         pxor_r2r(mm5, mm5);     /* 0 -> mm5 */
   309         /* form the alpha mult */
   310         movd_m2r(alpha, mm4);   /* 0000000A -> mm4 */
   311         punpcklwd_r2r(mm4, mm4);        /* 00000A0A -> mm4 */
   312         punpckldq_r2r(mm4, mm4);        /* 0A0A0A0A -> mm4 */
   313         alpha =
   314             (0xff << df->Rshift) | (0xff << df->Gshift) |
   315             (0xff << df->Bshift);
   316         movd_m2r(alpha, mm0);   /* 00000FFF -> mm0 */
   317         punpcklbw_r2r(mm0, mm0);        /* 00FFFFFF -> mm0 */
   318         pand_r2r(mm0, mm4);     /* 0A0A0A0A -> mm4, minus 1 chan */
   319         /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
   320         movd_m2r(df->Amask, mm7);       /* dst alpha mask */
   321         punpckldq_r2r(mm7, mm7);        /* dst alpha mask | dst alpha mask -> mm7 */
   322 
   323         while (height--) {
   324 			/* *INDENT-OFF* */
   325 			DUFFS_LOOP_DOUBLE2({
   326 				/* One Pixel Blend */
   327 				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   328 				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   329 				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
   330 				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
   331 
   332 				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   333 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   334 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   335 				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   336 
   337 				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
   338 				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   339 				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
   340 				++srcp;
   341 				++dstp;
   342 			},{
   343 				/* Two Pixels Blend */
   344 				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
   345 				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   346 				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
   347 				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   348 
   349 				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
   350 				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
   351 				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
   352 				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
   353 
   354 				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
   355 				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
   356 				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
   357 				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
   358 
   359 				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
   360 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   361 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   362 				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
   363 
   364 				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
   365 				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
   366 				
   367 				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
   368 
   369   				srcp += 2;
   370   				dstp += 2;
   371   			}, width);
   372 			/* *INDENT-ON* */
   373             srcp += srcskip;
   374             dstp += dstskip;
   375         }
   376         emms();
   377     }
   378 }
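       /*
        * In the surface-alpha loop above, mm4 holds the alpha replicated into
        * the R, G and B 16-bit lanes but zeroed in the alpha lane (the pand with
        * the channel mask), so the (src - dst) * alpha step never disturbs the
        * destination alpha; the final por with Amask then forces it opaque.
        * An illustrative scalar equivalent of one iteration, assuming the usual
        * ARGB8888 layout (hypothetical helper, not part of this file):
        *
        *     static Uint32 blend_surface_alpha(Uint32 s, Uint32 d,
        *                                       Uint32 alpha, Uint32 amask)
        *     {
        *         Uint32 sr = (s >> 16) & 0xff, sg = (s >> 8) & 0xff, sb = s & 0xff;
        *         Uint32 dr = (d >> 16) & 0xff, dg = (d >> 8) & 0xff, db = d & 0xff;
        *         dr = (dr + (((sr - dr) * alpha) >> 8)) & 0xff;
        *         dg = (dg + (((sg - dg) * alpha) >> 8)) & 0xff;
        *         db = (db + (((sb - db) * alpha) >> 8)) & 0xff;
        *         return (dr << 16) | (dg << 8) | db | amask;
        *     }
        */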
   379 
   380 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   381 static void
   382 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   383 {
   384     int width = info->d_width;
   385     int height = info->d_height;
   386     Uint32 *srcp = (Uint32 *) info->s_pixels;
   387     int srcskip = info->s_skip >> 2;
   388     Uint32 *dstp = (Uint32 *) info->d_pixels;
   389     int dstskip = info->d_skip >> 2;
   390     SDL_PixelFormat *sf = info->src;
   391     Uint32 amask = sf->Amask;
   392 
   393     pxor_r2r(mm6, mm6);         /* 0 -> mm6 */
   394     /* form multiplication mask */
   395     movd_m2r(sf->Amask, mm7);   /* 0000F000 -> mm7 */
   396     punpcklbw_r2r(mm7, mm7);    /* FF000000 -> mm7 */
   397     pcmpeqb_r2r(mm0, mm0);      /* FFFFFFFF -> mm0 */
   398     movq_r2r(mm0, mm3);         /* FFFFFFFF -> mm3 (for later) */
   399     pxor_r2r(mm0, mm7);         /* 00FFFFFF -> mm7 (mult mask) */
   400     /* form channel masks */
   401     movq_r2r(mm7, mm0);         /* 00FFFFFF -> mm0 */
   402     packsswb_r2r(mm6, mm0);     /* 00000FFF -> mm0 (channel mask) */
   403     packsswb_r2r(mm6, mm3);     /* 0000FFFF -> mm3 */
   404     pxor_r2r(mm0, mm3);         /* 0000F000 -> mm3 (~channel mask) */
   405     /* get alpha channel shift */
   406     /* *INDENT-OFF* */
   407     __asm__ __volatile__ (
   408         "movd %0, %%mm5"
   409         : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
   410     /* *INDENT-ON* */
   411 
   412     while (height--) {
   413 	    /* *INDENT-OFF* */
   414 	    DUFFS_LOOP4({
   415 		Uint32 alpha = *srcp & amask;
   416 		/* FIXME: Here we special-case opaque alpha since the
   417 			compositing used (>>8 instead of /255) doesn't handle
   418 			it correctly. Also special-case alpha=0 for speed?
   419 			Benchmark this! */
   420 		if(alpha == 0) {
   421 			/* do nothing */
   422 		} else if(alpha == amask) {
   423 			/* opaque alpha -- copy RGB, keep dst alpha */
   424 			/* using MMX here to free up regular registers for other things */
   425 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   426 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   427 			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
   428 			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
   429 			por_r2r(mm1, mm2); /* src | dst -> mm2 */
   430 			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
   431 		} else {
   432 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   433 			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
   434 
   435 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   436 			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
   437 
   438 			__asm__ __volatile__ (
   439 				"movd %0, %%mm4"
   440 				: : "r" (alpha) ); /* 0000A000 -> mm4 */
   441 			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
   442 			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   443 			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
   444 			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
   445 
   446 			/* blend */		    
   447 			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   448 			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   449 			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
   450 			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   451 			
   452 			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
   453 			movd_r2m(mm2, *dstp);/* mm2 -> dst */
   454 		}
   455 		++srcp;
   456 		++dstp;
   457 	    }, width);
   458 	    /* *INDENT-ON* */
   459         srcp += srcskip;
   460         dstp += dstskip;
   461     }
   462     emms();
   463 }
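       /*
        * Shape of the per-pixel alpha case analysis above, in scalar form (the
        * MSVC variant further down spells the opaque case exactly this way):
        *
        *     Uint32 alpha = *srcp & amask;
        *     if (alpha == 0) {
        *         ;   // transparent: leave dst alone
        *     } else if (alpha == amask) {
        *         // opaque: copy src RGB, keep dst alpha
        *         *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
        *     } else {
        *         // per-channel blend, d += ((s - d) * a) >> 8, dst alpha kept
        *     }
        *
        * The opaque case has to be split out because the >>8 approximation of
        * /255 slightly darkens fully opaque pixels (255 * 255 >> 8 == 254),
        * which is what the FIXME above refers to.
        */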
   464 
   465 /* End GCC_ASMBLIT */
   466 
   467 #elif MSVC_ASMBLIT
   468 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   469 static void
   470 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   471 {
   472     int width = info->d_width;
   473     int height = info->d_height;
   474     Uint32 *srcp = (Uint32 *) info->s_pixels;
   475     int srcskip = info->s_skip >> 2;
   476     Uint32 *dstp = (Uint32 *) info->d_pixels;
   477     int dstskip = info->d_skip >> 2;
   478     Uint32 dalpha = info->dst->Amask;
   479 
   480     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   481 
   482     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   483     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   484     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   485 
   486     while (height--) {
   487         int n = width;
   488         if (n & 1) {
   489             Uint32 s = *srcp++;
   490             Uint32 d = *dstp;
   491             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   492                        + (s & d & 0x00010101)) | dalpha;
   493             n--;
   494         }
   495 
   496         for (n >>= 1; n > 0; --n) {
   497             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   498             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   499 
   500             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   501             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   502 
   503             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   504             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   505             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   506             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   507 
   508             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   509             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   510             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   511             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   512 
   513             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   514             dstp += 2;
   515             srcp += 2;
   516         }
   517 
   518         srcp += srcskip;
   519         dstp += dstskip;
   520     }
   521     _mm_empty();
   522 }
   523 
   524 /* fast RGB888->(A)RGB888 blending with surface alpha */
   525 static void
   526 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   527 {
   528     SDL_PixelFormat *df = info->dst;
   529     Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   530     unsigned alpha = info->src->alpha;
   531 
   532     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   533         /* only call a128 version when R,G,B occupy lower bits */
   534         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   535     } else {
   536         int width = info->d_width;
   537         int height = info->d_height;
   538         Uint32 *srcp = (Uint32 *) info->s_pixels;
   539         int srcskip = info->s_skip >> 2;
   540         Uint32 *dstp = (Uint32 *) info->d_pixels;
   541         int dstskip = info->d_skip >> 2;
   542         Uint32 dalpha = df->Amask;
   543         Uint32 amult;
   544 
   545         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   546 
   547         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   548         /* form the alpha mult */
   549         amult = alpha | (alpha << 8);
   550         amult = amult | (amult << 16);
   551         chanmask =
   552             (0xff << df->Rshift) | (0xff << df->Gshift) |
   553             (0xff << df->Bshift);
   554         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   555         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   556         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   557         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   558 
   559         while (height--) {
   560             int n = width;
   561             if (n & 1) {
   562                 /* One Pixel Blend */
   563                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   564                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   565 
   566                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   567                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   568 
   569                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst1 -> src2 */
   570                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   571                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   572                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   573 
   574                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   575                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   576                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   577 
   578                 ++srcp;
   579                 ++dstp;
   580 
   581                 n--;
   582             }
   583 
   584             for (n >>= 1; n > 0; --n) {
   585                 /* Two Pixels Blend */
   586                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   587                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   588                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   589                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   590 
   591                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   592                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   593                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   594                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   595 
   596                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   597                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   598                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   599                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1 */
   600 
   601                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   602                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   603                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   604                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2 -> dst2 */
   605 
   606                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   607                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   608 
   609                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   610 
   611                 srcp += 2;
   612                 dstp += 2;
   613             }
   614             srcp += srcskip;
   615             dstp += dstskip;
   616         }
   617         _mm_empty();
   618     }
   619 }
   620 
   621 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   622 static void
   623 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   624 {
   625     int width = info->d_width;
   626     int height = info->d_height;
   627     Uint32 *srcp = (Uint32 *) info->s_pixels;
   628     int srcskip = info->s_skip >> 2;
   629     Uint32 *dstp = (Uint32 *) info->d_pixels;
   630     int dstskip = info->d_skip >> 2;
   631     SDL_PixelFormat *sf = info->src;
   632     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   633     Uint32 amask = sf->Amask;
   634     Uint32 ashift = sf->Ashift;
   635     Uint64 multmask;
   636 
   637     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   638 
   639     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   640 	/* *INDENT-OFF* */
   641 	multmask = ~(0xFFFFI64 << (ashift * 2));
   642 	/* *INDENT-ON* */
   643     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   644 
   645     while (height--) {
   646 		/* *INDENT-OFF* */
   647 		DUFFS_LOOP4({
   648 		Uint32 alpha = *srcp & amask;
   649 		if (alpha == 0) {
   650 			/* do nothing */
   651 		} else if (alpha == amask) {
   652 			/* opaque alpha -- copy RGB, keep dst alpha */
   653 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   654 		} else {
   655 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   656 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   657 
   658 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   659 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   660 
   661 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   662 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   663 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   664 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   665 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   666 
   667 			/* blend */		    
   668 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   669 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   670 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   671 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   672 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   673 			
   674 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   675 		}
   676 		++srcp;
   677 		++dstp;
   678 	    }, width);
   679 		/* *INDENT-ON* */
   680         srcp += srcskip;
   681         dstp += dstskip;
   682     }
   683     _mm_empty();
   684 }
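       /*
        * Note on multmask above: after _mm_unpacklo_pi8 each 8-bit channel
        * occupies a 16-bit lane, so the alpha channel sits at bit position
        * ashift * 2.  Clearing that lane in mm_alpha keeps the destination
        * alpha untouched by the add, the same job the pand with mm7 does in
        * the GCC version of this blitter.
        */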
   685 
   686 /* End MSVC_ASMBLIT */
   687 
   688 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   689 
   690 #if SDL_ALTIVEC_BLITTERS
   691 #if __MWERKS__
   692 #pragma altivec_model on
   693 #endif
   694 #if HAVE_ALTIVEC_H
   695 #include <altivec.h>
   696 #endif
   697 #include <assert.h>
   698 
   699 #if (defined(__MACOSX__) && (__GNUC__ < 4))
   700 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   701         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
   702 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   703         (vector unsigned short) ( a,b,c,d,e,f,g,h )
   704 #else
   705 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   706         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
   707 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   708         (vector unsigned short) { a,b,c,d,e,f,g,h }
   709 #endif
   710 
   711 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
   712 #define VECPRINT(msg, v) do { \
   713     vector unsigned int tmpvec = (vector unsigned int)(v); \
   714     unsigned int *vp = (unsigned int *)&tmpvec; \
   715     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   716 } while (0)
   717 
   718 /* the permutation vector that takes the high bytes out of all the appropriate shorts
   719     (vector unsigned char)(
   720         0x00, 0x10, 0x02, 0x12,
   721         0x04, 0x14, 0x06, 0x16,
   722         0x08, 0x18, 0x0A, 0x1A,
   723         0x0C, 0x1C, 0x0E, 0x1E );
   724 */
   725 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
   726 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
   727 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
   728 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
   729     ? vec_lvsl(0, src) \
   730     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
   731 
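       /*
        * For reference, the helpers above evaluate to the following on the
        * big-endian AltiVec layout these blitters assume:
        *   VEC_U32_24()      - 24 in every 32-bit element (the alpha byte shift).
        *   VEC_ALPHA_MASK()  - 0xFF000000 in every pixel, i.e. the ARGB alpha byte.
        *   VEC_ALIGNER(src)  - a vec_perm control used with the vec_ld(0)/vec_ld(15)
        *                       pairs below to assemble unaligned source loads.
        * VEC_MERGE_PERMUTE() is documented below; picking the high byte of every
        * 16-bit lane is what performs the final >>8 of the blend.
        */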
   732 
   733 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
   734     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
   735     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
   736     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
   737     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
   738     /* valpha2 is 255-alpha */ \
   739     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
   740     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
   741     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
   742     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
   743     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
   744     /* add source and dest */ \
   745     vtemp1 = vec_add(vtemp1, vtemp3); \
   746     vtemp2 = vec_add(vtemp2, vtemp4); \
   747     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
   748     vtemp1 = vec_add(vtemp1, v1_16); \
   749     vtemp3 = vec_sr(vtemp1, v8_16); \
   750     vtemp1 = vec_add(vtemp1, vtemp3); \
   751     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
   752     vtemp2 = vec_add(vtemp2, v1_16); \
   753     vtemp4 = vec_sr(vtemp2, v8_16); \
   754     vtemp2 = vec_add(vtemp2, vtemp4); \
   755     /* (>>8) and get ARGBARGBARGBARGB */ \
   756     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   757 } while (0)
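       /*
        * VEC_MULTIPLY_ALPHA computes, per 8-bit channel,
        *
        *     result = (s * alpha + d * (255 - alpha)) / 255
        *
        * with the division by 255 approximated as ((x + 1) + ((x + 1) >> 8)) >> 8;
        * the final >>8 happens implicitly when mergePermute keeps only the high
        * byte of each 16-bit lane.  The correction term makes the result exact at
        * the endpoints, e.g. x = 255 * 255 gives (65026 + 254) >> 8 == 255.
        */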
   758 
   759 /* Calculate the permute vector used for 32->32 swizzling */
   760 static vector unsigned char
   761 calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
   762 {
   763     /*
   764      * We have to assume that the bits that aren't used by the other
   765      *  colors are alpha, and that it's one complete byte, since some formats
   766      *  leave alpha with a zero mask, but we should still swizzle the bits.
   767      */
   768     /* ARGB */
   769     const static struct SDL_PixelFormat default_pixel_format = {
   770         NULL, 0, 0,
   771         0, 0, 0, 0,
   772         16, 8, 0, 24,
   773         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
   774         0, 0
   775     };
   776     if (!srcfmt) {
   777         srcfmt = &default_pixel_format;
   778     }
   779     if (!dstfmt) {
   780         dstfmt = &default_pixel_format;
   781     }
   782     const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
   783                                                        0x04, 0x04, 0x04, 0x04,
   784                                                        0x08, 0x08, 0x08, 0x08,
   785                                                        0x0C, 0x0C, 0x0C,
   786                                                        0x0C);
   787     vector unsigned char vswiz;
   788     vector unsigned int srcvec;
   789 #define RESHIFT(X) (3 - ((X) >> 3))
   790     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   791     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   792     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   793     Uint32 amask;
   794     /* Use zero for alpha if either surface doesn't have alpha */
   795     if (dstfmt->Amask) {
   796         amask =
   797             ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10)
   798             << (dstfmt->Ashift);
   799     } else {
   800         amask =
   801             0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
   802                           0xFFFFFFFF);
   803     }
   804 #undef RESHIFT
   805     ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
   806     vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
   807     return (vswiz);
   808 }
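       /*
        * How calc_swizzle32 works: RESHIFT converts a channel's bit shift
        * (0/8/16/24) into its big-endian byte index (3/2/1/0) within a pixel;
        * that source byte index is then written into the byte the channel
        * occupies in the destination pixel.  Splatting the resulting 32-bit
        * word across the vector and adding the "plus" vector (0,0,0,0,4,4,4,4,...)
        * extends the per-pixel permute to all four pixels.  For example, an
        * ARGB8888 source and destination produce the word 0x00010203, i.e. the
        * identity permute.
        */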
   809 
   810 static void
   811 Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
   812 {
   813     int height = info->d_height;
   814     Uint8 *src = (Uint8 *) info->s_pixels;
   815     int srcskip = info->s_skip;
   816     Uint8 *dst = (Uint8 *) info->d_pixels;
   817     int dstskip = info->d_skip;
   818     SDL_PixelFormat *srcfmt = info->src;
   819 
   820     vector unsigned char v0 = vec_splat_u8(0);
   821     vector unsigned short v8_16 = vec_splat_u16(8);
   822     vector unsigned short v1_16 = vec_splat_u16(1);
   823     vector unsigned short v2_16 = vec_splat_u16(2);
   824     vector unsigned short v3_16 = vec_splat_u16(3);
   825     vector unsigned int v8_32 = vec_splat_u32(8);
   826     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
   827     vector unsigned short v3f =
   828         VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
   829                           0x003f, 0x003f, 0x003f, 0x003f);
   830     vector unsigned short vfc =
   831         VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
   832                           0x00fc, 0x00fc, 0x00fc, 0x00fc);
   833 
   834     /* 
   835        0x10 - 0x1f is the alpha
   836        0x00 - 0x0e evens are the red
   837        0x01 - 0x0f odds are zero
   838      */
   839     vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
   840                                                        0x10, 0x02, 0x01, 0x01,
   841                                                        0x10, 0x04, 0x01, 0x01,
   842                                                        0x10, 0x06, 0x01,
   843                                                        0x01);
   844     vector unsigned char vredalpha2 =
   845         (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
   846                                         vec_sl(v8_32, v16_32))
   847         );
   848     /*
   849        0x00 - 0x0f is ARxx ARxx ARxx ARxx
   850        0x11 - 0x1f odds are blue
   851      */
   852     vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
   853                                                    0x04, 0x05, 0x06, 0x13,
   854                                                    0x08, 0x09, 0x0a, 0x15,
   855                                                    0x0c, 0x0d, 0x0e, 0x17);
   856     vector unsigned char vblue2 =
   857         (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
   858         );
   859     /*
   860        0x00 - 0x0f is ARxB ARxB ARxB ARxB
   861        0x10 - 0x1e evens are green
   862      */
   863     vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
   864                                                     0x04, 0x05, 0x12, 0x07,
   865                                                     0x08, 0x09, 0x14, 0x0b,
   866                                                     0x0c, 0x0d, 0x16, 0x0f);
   867     vector unsigned char vgreen2 =
   868         (vector unsigned char) (vec_add((vector unsigned int) vgreen1,
   869                                         vec_sl(v8_32, v8_32))
   870         );
   871     vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
   872                                                     0x00, 0x0a, 0x00, 0x0e,
   873                                                     0x00, 0x12, 0x00, 0x16,
   874                                                     0x00, 0x1a, 0x00, 0x1e);
   875     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
   876     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
   877     vector unsigned char valphaPermute =
   878         vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   879 
   880     vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
   881     vf800 = vec_sl(vf800, vec_splat_u16(8));
   882 
   883     while (height--) {
   884         int extrawidth;
   885         vector unsigned char valigner;
   886         vector unsigned char vsrc;
   887         vector unsigned char voverflow;
   888         int width = info->d_width;
   889 
   890 #define ONE_PIXEL_BLEND(condition, widthvar) \
   891         while (condition) { \
   892             Uint32 Pixel; \
   893             unsigned sR, sG, sB, dR, dG, dB, sA; \
   894             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   895             if(sA) { \
   896                 unsigned short dstpixel = *((unsigned short *)dst); \
   897                 dR = (dstpixel >> 8) & 0xf8; \
   898                 dG = (dstpixel >> 3) & 0xfc; \
   899                 dB = (dstpixel << 3) & 0xf8; \
   900                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   901                 *((unsigned short *)dst) = ( \
   902                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
   903                 ); \
   904             } \
   905             src += 4; \
   906             dst += 2; \
   907             widthvar--; \
   908         }
   909         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
   910         extrawidth = (width % 8);
   911         valigner = VEC_ALIGNER(src);
   912         vsrc = (vector unsigned char) vec_ld(0, src);
   913         width -= extrawidth;
   914         while (width) {
   915             vector unsigned char valpha;
   916             vector unsigned char vsrc1, vsrc2;
   917             vector unsigned char vdst1, vdst2;
   918             vector unsigned short vR, vG, vB;
   919             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   920 
   921             /* Load 8 pixels from src as ARGB */
   922             voverflow = (vector unsigned char) vec_ld(15, src);
   923             vsrc = vec_perm(vsrc, voverflow, valigner);
   924             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
   925             src += 16;
   926             vsrc = (vector unsigned char) vec_ld(15, src);
   927             voverflow = vec_perm(voverflow, vsrc, valigner);
   928             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
   929             src += 16;
   930 
   931             /* Load 8 pixels from dst as XRGB */
   932             voverflow = vec_ld(0, dst);
   933             vR = vec_and((vector unsigned short) voverflow, vf800);
   934             vB = vec_sl((vector unsigned short) voverflow, v3_16);
   935             vG = vec_sl(vB, v2_16);
   936             vdst1 =
   937                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   938                                                 (vector unsigned char) vR,
   939                                                 vredalpha1);
   940             vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
   941             vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
   942             vdst2 =
   943                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   944                                                 (vector unsigned char) vR,
   945                                                 vredalpha2);
   946             vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
   947             vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
   948 
   949             /* Alpha blend 8 pixels as ARGB */
   950             valpha = vec_perm(vsrc1, v0, valphaPermute);
   951             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
   952                                v8_16);
   953             valpha = vec_perm(vsrc2, v0, valphaPermute);
   954             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
   955                                v8_16);
   956 
   957             /* Convert 8 pixels to 565 */
   958             vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
   959                                                         vdst1,
   960                                                         (vector unsigned int)
   961                                                         vdst2);
   962             vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
   963             vgpixel = vec_and(vgpixel, vfc);
   964             vgpixel = vec_sl(vgpixel, v3_16);
   965             vrpixel = vec_sl(vpixel, v1_16);
   966             vrpixel = vec_and(vrpixel, vf800);
   967             vbpixel = vec_and(vpixel, v3f);
   968             vdst1 =
   969                 vec_or((vector unsigned char) vrpixel,
   970                        (vector unsigned char) vgpixel);
   971             vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);
   972 
   973             /* Store 8 pixels */
   974             vec_st(vdst1, 0, dst);
   975 
   976             width -= 8;
   977             dst += 16;
   978         }
   979         ONE_PIXEL_BLEND((extrawidth), extrawidth);
   980 #undef ONE_PIXEL_BLEND
   981         src += srcskip;
   982         dst += dstskip;
   983     }
   984 }
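       /*
        * Outline of the vector path above: eight destination RGB565 pixels are
        * widened to ARGB8888 using the vredalpha/vblue/vgreen permute vectors
        * built at the top of the function, blended against the ARGB source with
        * VEC_MULTIPLY_ALPHA, and then packed back to 565.  vec_packpx provides
        * the red and blue fields, while the green field is rebuilt separately
        * (vgmerge/vfc) so that all 6 green bits survive the repacking.
        */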
   985 
   986 static void
   987 Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
   988 {
   989     unsigned alpha = info->src->alpha;
   990     int height = info->d_height;
   991     Uint32 *srcp = (Uint32 *) info->s_pixels;
   992     int srcskip = info->s_skip >> 2;
   993     Uint32 *dstp = (Uint32 *) info->d_pixels;
   994     int dstskip = info->d_skip >> 2;
   995     SDL_PixelFormat *srcfmt = info->src;
   996     SDL_PixelFormat *dstfmt = info->dst;
   997     unsigned sA = srcfmt->alpha;
   998     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   999     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
  1000     Uint32 ckey = info->src->colorkey;
  1001     vector unsigned char mergePermute;
  1002     vector unsigned char vsrcPermute;
  1003     vector unsigned char vdstPermute;
  1004     vector unsigned char vsdstPermute;
  1005     vector unsigned char valpha;
  1006     vector unsigned char valphamask;
  1007     vector unsigned char vbits;
  1008     vector unsigned char v0;
  1009     vector unsigned short v1;
  1010     vector unsigned short v8;
  1011     vector unsigned int vckey;
  1012     vector unsigned int vrgbmask;
  1013 
  1014     mergePermute = VEC_MERGE_PERMUTE();
  1015     v0 = vec_splat_u8(0);
  1016     v1 = vec_splat_u16(1);
  1017     v8 = vec_splat_u16(8);
  1018 
  1019     /* set the alpha to 255 on the destination surf */
  1020     valphamask = VEC_ALPHA_MASK();
  1021 
  1022     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1023     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1024     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1025 
  1026     /* set a vector full of alpha and 255-alpha */
  1027     ((unsigned char *) &valpha)[0] = alpha;
  1028     valpha = vec_splat(valpha, 0);
  1029     vbits = (vector unsigned char) vec_splat_s8(-1);
  1030 
  1031     ckey &= rgbmask;
  1032     ((unsigned int *) (char *) &vckey)[0] = ckey;
  1033     vckey = vec_splat(vckey, 0);
  1034     ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
  1035     vrgbmask = vec_splat(vrgbmask, 0);
  1036 
  1037     while (height--) {
  1038         int width = info->d_width;
  1039 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1040         while (condition) { \
  1041             Uint32 Pixel; \
  1042             unsigned sR, sG, sB, dR, dG, dB; \
  1043             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
  1044             if(sA && Pixel != ckey) { \
  1045                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
  1046                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1047                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1048                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1049             } \
  1050             dstp++; \
  1051             srcp++; \
  1052             widthvar--; \
  1053         }
  1054         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1055         if (width > 0) {
  1056             int extrawidth = (width % 4);
  1057             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1058             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1059             width -= extrawidth;
  1060             while (width) {
  1061                 vector unsigned char vsel;
  1062                 vector unsigned char voverflow;
  1063                 vector unsigned char vd;
  1064                 vector unsigned char vd_orig;
  1065 
  1066                 /* s = *srcp */
  1067                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1068                 vs = vec_perm(vs, voverflow, valigner);
  1069 
  1070                 /* vsel is set for items that match the key */
  1071                 vsel =
  1072                     (vector unsigned char) vec_and((vector unsigned int) vs,
  1073                                                    vrgbmask);
  1074                 vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
  1075                                                         vsel, vckey);
  1076 
  1077                 /* permute to source format */
  1078                 vs = vec_perm(vs, valpha, vsrcPermute);
  1079 
  1080                 /* d = *dstp */
  1081                 vd = (vector unsigned char) vec_ld(0, dstp);
  1082                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
  1083 
  1084                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1085 
  1086                 /* set the alpha channel to full on */
  1087                 vd = vec_or(vd, valphamask);
  1088 
  1089                 /* mask out color key */
  1090                 vd = vec_sel(vd, vd_orig, vsel);
  1091 
  1092                 /* permute to dest format */
  1093                 vd = vec_perm(vd, vbits, vdstPermute);
  1094 
  1095                 /* *dstp = res */
  1096                 vec_st((vector unsigned int) vd, 0, dstp);
  1097 
  1098                 srcp += 4;
  1099                 dstp += 4;
  1100                 width -= 4;
  1101                 vs = voverflow;
  1102             }
  1103             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1104         }
  1105 #undef ONE_PIXEL_BLEND
  1106 
  1107         srcp += srcskip;
  1108         dstp += dstskip;
  1109     }
  1110 }
  1111 
  1112 
  1113 static void
  1114 Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)
  1115 {
  1116     int width = info->d_width;
  1117     int height = info->d_height;
  1118     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1119     int srcskip = info->s_skip >> 2;
  1120     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1121     int dstskip = info->d_skip >> 2;
  1122     SDL_PixelFormat *srcfmt = info->src;
  1123     SDL_PixelFormat *dstfmt = info->dst;
  1124     vector unsigned char mergePermute;
  1125     vector unsigned char valphaPermute;
  1126     vector unsigned char vsrcPermute;
  1127     vector unsigned char vdstPermute;
  1128     vector unsigned char vsdstPermute;
  1129     vector unsigned char valphamask;
  1130     vector unsigned char vpixelmask;
  1131     vector unsigned char v0;
  1132     vector unsigned short v1;
  1133     vector unsigned short v8;
  1134 
  1135     v0 = vec_splat_u8(0);
  1136     v1 = vec_splat_u16(1);
  1137     v8 = vec_splat_u16(8);
  1138     mergePermute = VEC_MERGE_PERMUTE();
  1139     valphamask = VEC_ALPHA_MASK();
  1140     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  1141     vpixelmask = vec_nor(valphamask, v0);
  1142     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1143     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1144     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1145 
  1146     while (height--) {
  1147         width = info->d_width;
  1148 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1149             Uint32 Pixel; \
  1150             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
  1151             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
  1152             if(sA) { \
  1153               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
  1154               ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1155               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
  1156             } \
  1157             ++srcp; \
  1158             ++dstp; \
  1159             widthvar--; \
  1160         }
  1161         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1162         if (width > 0) {
  1163             /* vsrcPermute */
  1164             /* vdstPermute */
  1165             int extrawidth = (width % 4);
  1166             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1167             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1168             width -= extrawidth;
  1169             while (width) {
  1170                 vector unsigned char voverflow;
  1171                 vector unsigned char vd;
  1172                 vector unsigned char valpha;
  1173                 vector unsigned char vdstalpha;
  1174                 /* s = *srcp */
  1175                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1176                 vs = vec_perm(vs, voverflow, valigner);
  1177                 vs = vec_perm(vs, v0, vsrcPermute);
  1178 
  1179                 valpha = vec_perm(vs, v0, valphaPermute);
  1180 
  1181                 /* d = *dstp */
  1182                 vd = (vector unsigned char) vec_ld(0, dstp);
  1183                 vd = vec_perm(vd, v0, vsdstPermute);
  1184                 vdstalpha = vec_and(vd, valphamask);
  1185 
  1186                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1187 
  1188                 /* set the alpha to the dest alpha */
  1189                 vd = vec_and(vd, vpixelmask);
  1190                 vd = vec_or(vd, vdstalpha);
  1191                 vd = vec_perm(vd, v0, vdstPermute);
  1192 
  1193                 /* *dstp = res */
  1194                 vec_st((vector unsigned int) vd, 0, dstp);
  1195 
  1196                 srcp += 4;
  1197                 dstp += 4;
  1198                 width -= 4;
  1199                 vs = voverflow;
  1200 
  1201             }
  1202             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1203         }
  1204         srcp += srcskip;
  1205         dstp += dstskip;
  1206 #undef ONE_PIXEL_BLEND
  1207     }
  1208 }
  1209 
  1210 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1211 static void
  1212 BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
  1213 {
  1214     int width = info->d_width;
  1215     int height = info->d_height;
  1216     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1217     int srcskip = info->s_skip >> 2;
  1218     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1219     int dstskip = info->d_skip >> 2;
  1220     vector unsigned char mergePermute;
  1221     vector unsigned char valphaPermute;
  1222     vector unsigned char valphamask;
  1223     vector unsigned char vpixelmask;
  1224     vector unsigned char v0;
  1225     vector unsigned short v1;
  1226     vector unsigned short v8;
  1227     v0 = vec_splat_u8(0);
  1228     v1 = vec_splat_u16(1);
  1229     v8 = vec_splat_u16(8);
  1230     mergePermute = VEC_MERGE_PERMUTE();
  1231     valphamask = VEC_ALPHA_MASK();
  1232     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  1233 
  1234 
  1235     vpixelmask = vec_nor(valphamask, v0);
  1236     while (height--) {
  1237         width = info->d_width;
  1238 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1239         while ((condition)) { \
  1240             Uint32 dalpha; \
  1241             Uint32 d; \
  1242             Uint32 s1; \
  1243             Uint32 d1; \
  1244             Uint32 s = *srcp; \
  1245             Uint32 alpha = s >> 24; \
  1246             if(alpha) { \
  1247               if(alpha == SDL_ALPHA_OPAQUE) { \
  1248                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
  1249               } else { \
  1250                 d = *dstp; \
  1251                 dalpha = d & 0xff000000; \
  1252                 s1 = s & 0xff00ff; \
  1253                 d1 = d & 0xff00ff; \
  1254                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
  1255                 s &= 0xff00; \
  1256                 d &= 0xff00; \
  1257                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1258                 *dstp = d1 | d | dalpha; \
  1259               } \
  1260             } \
  1261             ++srcp; \
  1262             ++dstp; \
  1263             widthvar--; \
  1264 	    }
  1265         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1266         if (width > 0) {
  1267             int extrawidth = (width % 4);
  1268             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1269             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1270             width -= extrawidth;
  1271             while (width) {
  1272                 vector unsigned char voverflow;
  1273                 vector unsigned char vd;
  1274                 vector unsigned char valpha;
  1275                 vector unsigned char vdstalpha;
  1276                 /* s = *srcp */
  1277                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1278                 vs = vec_perm(vs, voverflow, valigner);
  1279 
  1280                 valpha = vec_perm(vs, v0, valphaPermute);
  1281 
  1282                 /* d = *dstp */
  1283                 vd = (vector unsigned char) vec_ld(0, dstp);
  1284                 vdstalpha = vec_and(vd, valphamask);
  1285 
  1286                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1287 
  1288                 /* set the alpha to the dest alpha */
  1289                 vd = vec_and(vd, vpixelmask);
  1290                 vd = vec_or(vd, vdstalpha);
  1291 
  1292                 /* *dstp = res */
  1293                 vec_st((vector unsigned int) vd, 0, dstp);
  1294 
  1295                 srcp += 4;
  1296                 dstp += 4;
  1297                 width -= 4;
  1298                 vs = voverflow;
  1299             }
  1300             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1301         }
  1302         srcp += srcskip;
  1303         dstp += dstskip;
  1304     }
  1305 #undef ONE_PIXEL_BLEND
  1306 }
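       /*
        * The scalar fallback in ONE_PIXEL_BLEND above blends two channels per
        * 32-bit operation: red and blue live in the 0x00ff00ff lanes, and since
        * an 8x8-bit product needs at most 16 bits, the blue result cannot carry
        * into the red lane; green is handled the same way in its own 0x0000ff00
        * lane.  This is the classic two-channels-at-a-time ARGB blend, using the
        * same d + ((s - d) * a >> 8) approximation as the rest of the file.
        */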
  1307 
  1308 static void
  1309 Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info)
  1310 {
  1311     /* XXX : 6 */
  1312     unsigned alpha = info->src->alpha;
  1313     int height = info->d_height;
  1314     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1315     int srcskip = info->s_skip >> 2;
  1316     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1317     int dstskip = info->d_skip >> 2;
  1318     SDL_PixelFormat *srcfmt = info->src;
  1319     SDL_PixelFormat *dstfmt = info->dst;
  1320     unsigned sA = srcfmt->alpha;
  1321     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1322     vector unsigned char mergePermute;
  1323     vector unsigned char vsrcPermute;
  1324     vector unsigned char vdstPermute;
  1325     vector unsigned char vsdstPermute;
  1326     vector unsigned char valpha;
  1327     vector unsigned char valphamask;
  1328     vector unsigned char vbits;
  1329     vector unsigned short v1;
  1330     vector unsigned short v8;
  1331 
  1332     mergePermute = VEC_MERGE_PERMUTE();
  1333     v1 = vec_splat_u16(1);
  1334     v8 = vec_splat_u16(8);
  1335 
  1336     /* set the alpha to 255 on the destination surf */
  1337     valphamask = VEC_ALPHA_MASK();
  1338 
  1339     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1340     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1341     vsdstPermute = calc_swizzle32(dstfmt, NULL);
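           /*
            * calc_swizzle32() builds vec_perm control vectors: vsrcPermute and
            * vsdstPermute bring source and destination pixels into the common
            * channel order used by the vector math below, and vdstPermute puts
            * the blended result back into the destination's own layout.
            */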
  1342 
  1343     /* set a vector with every byte equal to alpha */
  1344     ((unsigned char *) &valpha)[0] = alpha;
  1345     valpha = vec_splat(valpha, 0);
  1346     vbits = (vector unsigned char) vec_splat_s8(-1);
  1347 
  1348     while (height--) {
  1349         int width = info->d_width;
  1350 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1351             Uint32 Pixel; \
  1352             unsigned sR, sG, sB, dR, dG, dB; \
  1353             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  1354             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1355             ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1356             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1357             ++srcp; \
  1358             ++dstp; \
  1359             widthvar--; \
  1360         }
  1361         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1362         if (width > 0) {
  1363             int extrawidth = (width % 4);
  1364             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1365             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1366             width -= extrawidth;
  1367             while (width) {
  1368                 vector unsigned char voverflow;
  1369                 vector unsigned char vd;
  1370 
  1371                 /* s = *srcp */
  1372                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1373                 vs = vec_perm(vs, voverflow, valigner);
  1374                 vs = vec_perm(vs, valpha, vsrcPermute);
  1375 
  1376                 /* d = *dstp */
  1377                 vd = (vector unsigned char) vec_ld(0, dstp);
  1378                 vd = vec_perm(vd, vd, vsdstPermute);
  1379 
  1380                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1381 
  1382                 /* set the alpha channel to full on */
  1383                 vd = vec_or(vd, valphamask);
  1384                 vd = vec_perm(vd, vbits, vdstPermute);
  1385 
  1386                 /* *dstp = res */
  1387                 vec_st((vector unsigned int) vd, 0, dstp);
  1388 
  1389                 srcp += 4;
  1390                 dstp += 4;
  1391                 width -= 4;
  1392                 vs = voverflow;
  1393             }
  1394             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1395         }
  1396 #undef ONE_PIXEL_BLEND
  1397 
  1398         srcp += srcskip;
  1399         dstp += dstskip;
  1400     }
  1401 
  1402 }
  1403 
  1404 
  1405 /* fast RGB888->(A)RGB888 blending */
  1406 static void
  1407 BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
  1408 {
  1409     unsigned alpha = info->src->alpha;
  1410     int height = info->d_height;
  1411     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1412     int srcskip = info->s_skip >> 2;
  1413     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1414     int dstskip = info->d_skip >> 2;
  1415     vector unsigned char mergePermute;
  1416     vector unsigned char valpha;
  1417     vector unsigned char valphamask;
  1418     vector unsigned short v1;
  1419     vector unsigned short v8;
  1420 
  1421     mergePermute = VEC_MERGE_PERMUTE();
  1422     v1 = vec_splat_u16(1);
  1423     v8 = vec_splat_u16(8);
  1424 
  1425     /* set the alpha to 255 on the destination surf */
  1426     valphamask = VEC_ALPHA_MASK();
  1427 
  1428     /* set a vector with every byte equal to alpha */
  1429     ((unsigned char *) &valpha)[0] = alpha;
  1430     valpha = vec_splat(valpha, 0);
  1431 
  1432     while (height--) {
  1433         int width = info->d_width;
  1434 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1435             Uint32 s = *srcp; \
  1436             Uint32 d = *dstp; \
  1437             Uint32 s1 = s & 0xff00ff; \
  1438             Uint32 d1 = d & 0xff00ff; \
  1439             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  1440                  & 0xff00ff; \
  1441             s &= 0xff00; \
  1442             d &= 0xff00; \
  1443             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1444             *dstp = d1 | d | 0xff000000; \
  1445             ++srcp; \
  1446             ++dstp; \
  1447             widthvar--; \
  1448         }
  1449         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1450         if (width > 0) {
  1451             int extrawidth = (width % 4);
  1452             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1453             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1454             width -= extrawidth;
  1455             while (width) {
  1456                 vector unsigned char voverflow;
  1457                 vector unsigned char vd;
  1458 
  1459                 /* s = *srcp */
  1460                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1461                 vs = vec_perm(vs, voverflow, valigner);
  1462 
  1463                 /* d = *dstp */
  1464                 vd = (vector unsigned char) vec_ld(0, dstp);
  1465 
  1466                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1467 
  1468                 /* set the alpha channel to full on */
  1469                 vd = vec_or(vd, valphamask);
  1470 
  1471                 /* *dstp = res */
  1472                 vec_st((vector unsigned int) vd, 0, dstp);
  1473 
  1474                 srcp += 4;
  1475                 dstp += 4;
  1476                 width -= 4;
  1477                 vs = voverflow;
  1478             }
  1479             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1480         }
  1481 #undef ONE_PIXEL_BLEND
  1482 
  1483         srcp += srcskip;
  1484         dstp += dstskip;
  1485     }
  1486 }
  1487 
  1488 #if __MWERKS__
  1489 #pragma altivec_model off
  1490 #endif
  1491 #endif /* SDL_ALTIVEC_BLITTERS */
  1492 
  1493 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
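       /*
        * The loop below averages all three channels of a pixel in one 32-bit
        * operation: masking with 0x00fefefe clears each channel's low bit so
        * the >> 1 cannot borrow across channel boundaries, and the term
        * (s & d & 0x00010101) restores the rounding bit lost when both low
        * bits were set.  For a single channel, e.g. s = 0x7f and d = 0x81:
        *   ((0x7e + 0x80) >> 1) + (0x7f & 0x81 & 1) = 0x7f + 1 = 0x80 = (s + d) / 2.
        */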
  1494 static void
  1495 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
  1496 {
  1497     int width = info->d_width;
  1498     int height = info->d_height;
  1499     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1500     int srcskip = info->s_skip >> 2;
  1501     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1502     int dstskip = info->d_skip >> 2;
  1503 
  1504     while (height--) {
  1505 	    /* *INDENT-OFF* */
  1506 	    DUFFS_LOOP4({
  1507 		    Uint32 s = *srcp++;
  1508 		    Uint32 d = *dstp;
  1509 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1510 			       + (s & d & 0x00010101)) | 0xff000000;
  1511 	    }, width);
  1512 	    /* *INDENT-ON* */
  1513         srcp += srcskip;
  1514         dstp += dstskip;
  1515     }
  1516 }
  1517 
  1518 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1519 static void
  1520 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
  1521 {
  1522     unsigned alpha = info->src->alpha;
  1523     if (alpha == 128) {
  1524         BlitRGBtoRGBSurfaceAlpha128(info);
  1525     } else {
  1526         int width = info->d_width;
  1527         int height = info->d_height;
  1528         Uint32 *srcp = (Uint32 *) info->s_pixels;
  1529         int srcskip = info->s_skip >> 2;
  1530         Uint32 *dstp = (Uint32 *) info->d_pixels;
  1531         int dstskip = info->d_skip >> 2;
  1532         Uint32 s;
  1533         Uint32 d;
  1534         Uint32 s1;
  1535         Uint32 d1;
  1536 
  1537         while (height--) {
  1538 			/* *INDENT-OFF* */
  1539 			DUFFS_LOOP_DOUBLE2({
  1540 				/* One Pixel Blend */
  1541 				s = *srcp;
  1542 				d = *dstp;
  1543 				s1 = s & 0xff00ff;
  1544 				d1 = d & 0xff00ff;
  1545 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
  1546 				     & 0xff00ff;
  1547 				s &= 0xff00;
  1548 				d &= 0xff00;
  1549 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1550 				*dstp = d1 | d | 0xff000000;
  1551 				++srcp;
  1552 				++dstp;
  1553 			},{
  1554 			        /* Two Pixels Blend */
  1555 				s = *srcp;
  1556 				d = *dstp;
  1557 				s1 = s & 0xff00ff;
  1558 				d1 = d & 0xff00ff;
  1559 				d1 += (s1 - d1) * alpha >> 8;
  1560 				d1 &= 0xff00ff;
  1561 				     
  1562 				s = ((s & 0xff00) >> 8) | 
  1563 					((srcp[1] & 0xff00) << 8);
  1564 				d = ((d & 0xff00) >> 8) |
  1565 					((dstp[1] & 0xff00) << 8);
  1566 				d += (s - d) * alpha >> 8;
  1567 				d &= 0x00ff00ff;
  1568 				
  1569 				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
  1570 				++srcp;
  1571 				
  1572 			        s1 = *srcp;
  1573 				d1 = *dstp;
  1574 				s1 &= 0xff00ff;
  1575 				d1 &= 0xff00ff;
  1576 				d1 += (s1 - d1) * alpha >> 8;
  1577 				d1 &= 0xff00ff;
  1578 				
  1579 				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
  1580 				++srcp;
  1581 				++dstp;
  1582 			}, width);
  1583 			/* *INDENT-ON* */
  1584             srcp += srcskip;
  1585             dstp += dstskip;
  1586         }
  1587     }
  1588 }
  1589 
  1590 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
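       /*
        * The blend below packs red and blue into one register (s & 0xff00ff)
        * so that d1 + ((s1 - d1) * alpha >> 8) advances both channels with a
        * single multiply; green is blended the same way in the 0xff00 lane,
        * and the trailing masks discard any bits that leak between lanes.
        * E.g. with alpha = 0x80, s = 0x00ff00ff and d = 0x00010001:
        *   0x00010001 + ((0x00fe00fe * 0x80) >> 8) = 0x00010001 + 0x007f007f
        *                                           = 0x00800080.
        */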
  1591 static void
  1592 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
  1593 {
  1594     int width = info->d_width;
  1595     int height = info->d_height;
  1596     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1597     int srcskip = info->s_skip >> 2;
  1598     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1599     int dstskip = info->d_skip >> 2;
  1600 
  1601     while (height--) {
  1602 	    /* *INDENT-OFF* */
  1603 	    DUFFS_LOOP4({
  1604 		Uint32 dalpha;
  1605 		Uint32 d;
  1606 		Uint32 s1;
  1607 		Uint32 d1;
  1608 		Uint32 s = *srcp;
  1609 		Uint32 alpha = s >> 24;
  1610 		/* FIXME: Here we special-case opaque alpha since the
  1611 		   compositing used (>>8 instead of /255) doesn't handle
  1612 		   it correctly. Also special-case alpha=0 for speed?
  1613 		   Benchmark this! */
  1614 		if(alpha) {   
  1615 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1616 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1617 		  } else {
  1618 		    /*
  1619 		     * take out the middle component (green), and process
  1620 		     * the other two in parallel. One multiply less.
  1621 		     */
  1622 		    d = *dstp;
  1623 		    dalpha = d & 0xff000000;
  1624 		    s1 = s & 0xff00ff;
  1625 		    d1 = d & 0xff00ff;
  1626 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1627 		    s &= 0xff00;
  1628 		    d &= 0xff00;
  1629 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1630 		    *dstp = d1 | d | dalpha;
  1631 		  }
  1632 		}
  1633 		++srcp;
  1634 		++dstp;
  1635 	    }, width);
  1636 	    /* *INDENT-ON* */
  1637         srcp += srcskip;
  1638         dstp += dstskip;
  1639     }
  1640 }
  1641 
  1642 #if GCC_ASMBLIT
  1643 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1644 static void
  1645 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1646 {
  1647     int width = info->d_width;
  1648     int height = info->d_height;
  1649     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1650     int srcskip = info->s_skip >> 2;
  1651     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1652     int dstskip = info->d_skip >> 2;
  1653     SDL_PixelFormat *sf = info->src;
  1654     Uint32 amask = sf->Amask;
  1655 
  1656     __asm__(
  1657                /* make mm6 all zeros. */
  1658                "pxor       %%mm6, %%mm6\n"
  1659                /* Make a mask to preserve the alpha. */
  1660                "movd      %0, %%mm7\n\t"        /* 0000F000 -> mm7 */
  1661                "punpcklbw %%mm7, %%mm7\n\t"     /* FF000000 -> mm7 */
  1662                "pcmpeqb   %%mm4, %%mm4\n\t"     /* FFFFFFFF -> mm4 */
  1663                "movq      %%mm4, %%mm3\n\t"     /* FFFFFFFF -> mm3 (for later) */
  1664                "pxor      %%mm4, %%mm7\n\t"     /* 00FFFFFF -> mm7 (mult mask) */
  1665                /* form channel masks */
  1666                "movq      %%mm7, %%mm4\n\t"     /* 00FFFFFF -> mm4 */
  1667                "packsswb  %%mm6, %%mm4\n\t"     /* 00000FFF -> mm4 (channel mask) */
  1668                "packsswb  %%mm6, %%mm3\n\t"     /* 0000FFFF -> mm3 */
  1669                "pxor      %%mm4, %%mm3\n\t"     /* 0000F000 -> mm3 (~channel mask) */
  1670                /* get alpha channel shift */
  1671                "movd      %1, %%mm5\n\t"        /* Ashift -> mm5 */
  1672   : /* nothing */ :            "rm"(amask), "rm"((Uint32) sf->Ashift));
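           /*
            * The setup above leaves mm6 = 0, mm4 = mask of the colour
            * channels, mm3 = mask of the alpha channel, mm7 = per-word
            * multiplier mask with the alpha word cleared, and mm5 = the
            * alpha shift.  In the blend case below each pixel is widened to
            * four 16-bit words and computed as d + ((s - d) * alpha >> 8);
            * because mm7 zeroes the alpha multiplier, the destination alpha
            * passes through unchanged.
            */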
  1673 
  1674     while (height--) {
  1675 
  1676 	    /* *INDENT-OFF* */
  1677 	    DUFFS_LOOP4({
  1678 		Uint32 alpha;
  1679 
  1680 		__asm__ (
  1681 		"prefetch 64(%0)\n"
  1682 		"prefetch 64(%1)\n"
  1683 			: : "r" (srcp), "r" (dstp) );
  1684 
  1685 		alpha = *srcp & amask;
  1686 		/* FIXME: Here we special-case opaque alpha since the
  1687 		   compositing used (>>8 instead of /255) doesn't handle
  1688 		   it correctly. Also special-case alpha=0 for speed?
  1689 		   Benchmark this! */
  1690 		if(alpha == 0) {
  1691 		    /* do nothing */
  1692 		}
  1693 		else if(alpha == amask) {
  1694 			/* opaque alpha -- copy RGB, keep dst alpha */
  1695 		    /* using MMX here to free up regular registers for other things */
  1696 			    __asm__ (
  1697 		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
  1698 		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
  1699 		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
  1700 		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
  1701 		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
  1702 		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
  1703 
  1704 		     : : "r" (srcp), "r" (dstp) );
  1705 		} 
  1706 
  1707 		else {
  1708 			    __asm__ (
  1709 		    /* load in the source, and dst. */
  1710 		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
  1711 		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
  1712 
  1713 		    /* Move the src alpha into mm2 */
  1714 
  1715 		    /* if supporting pshufw */
  1716 		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
  1717 		    /*"psrlw     $8, %%mm2\n" */
  1718 		    
  1719 		    /* else: */
  1720 		    "movd       %2,    %%mm2\n"
  1721 		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
  1722 		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
  1723 		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
  1724 		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
  1725 
  1726 		    /* move the colors into words. */
  1727 		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
  1728 		    "punpcklbw %%mm6, %%mm1\n"              /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
  1729 
  1730 		    /* src - dst */
  1731 		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
  1732 
  1733 		    /* A * (src-dst) */
  1734 		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*(As-Ad) As*(Rs-Rd) | As*(Gs-Gd) As*(Bs-Bd) */
  1735 		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
  1736 		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
  1737 
  1738 		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
  1739 		    
  1740 		    "movd      %%mm0, (%1)\n"               /* mm0 -> dst pixel */
  1741 
  1742 		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
  1743 
  1744 		}
  1745 		++srcp;
  1746 		++dstp;
  1747 	    }, width);
  1748 	    /* *INDENT-ON* */
  1749         srcp += srcskip;
  1750         dstp += dstskip;
  1751     }
  1752 
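         /* The MMX registers alias the x87 FPU stack, so emms must run
            before any floating point code executes again. */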
  1753   __asm__("emms\n":);
  1754 }
  1755 
  1756 /* End GCC_ASMBLIT*/
  1757 
  1758 #elif MSVC_ASMBLIT
  1759 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1760 static void
  1761 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1762 {
  1763     int width = info->d_width;
  1764     int height = info->d_height;
  1765     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1766     int srcskip = info->s_skip >> 2;
  1767     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1768     int dstskip = info->d_skip >> 2;
  1769     SDL_PixelFormat *sf = info->src;
  1770     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  1771     Uint32 amask = sf->Amask;
  1772     Uint32 ashift = sf->Ashift;
  1773     Uint64 multmask;
  1774 
  1775     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1776 
  1777     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
  1778 	/* *INDENT-OFF* */
  1779     multmask = ~(0xFFFFI64 << (ashift * 2));
  1780 	/* *INDENT-ON* */
  1781     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
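           /*
            * multmask clears the 16-bit lane holding the alpha channel:
            * ashift is the alpha position in bits within the 32-bit pixel,
            * so ashift * 2 is its position once every byte is widened to a
            * word.  ANDing the replicated alpha with dmask zeroes the alpha
            * multiplier, so the destination alpha survives the blend.
            */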
  1782 
  1783     while (height--) {
  1784 	    /* *INDENT-OFF* */
  1785 	    DUFFS_LOOP4({
  1786 		Uint32 alpha;
  1787 
  1788 		_m_prefetch(srcp + 16);
  1789 		_m_prefetch(dstp + 16);
  1790 
  1791 		alpha = *srcp & amask;
  1792 		if (alpha == 0) {
  1793 			/* do nothing */
  1794 		} else if (alpha == amask) {
  1795 			/* copy RGB, keep dst alpha */
  1796 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  1797 		} else {
  1798 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  1799 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  1800 
  1801 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  1802 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  1803 
  1804 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  1805 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  1806 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  1807 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  1808 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  1809 
  1810 			/* blend */		    
  1811 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  1812 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  1813 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  1814 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  1815 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  1816 			
  1817 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  1818 		}
  1819 		++srcp;
  1820 		++dstp;
  1821 	    }, width);
  1822 	    /* *INDENT-ON* */
  1823         srcp += srcskip;
  1824         dstp += dstskip;
  1825     }
  1826     _mm_empty();
  1827 }
  1828 
  1829 /* End MSVC_ASMBLIT */
  1830 
  1831 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  1832 
  1833 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  1834 
  1835 /* blend a single 16 bit pixel at 50% */
  1836 #define BLEND16_50(d, s, mask)						\
  1837 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
  1838 
  1839 /* blend two 16 bit pixels at 50% */
  1840 #define BLEND2x16_50(d, s, mask)					     \
  1841 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1842 	 + (s & d & (~(mask | mask << 16))))
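       /*
        * Same trick as the 32-bit alpha=128 case: "mask" is the pixel format
        * with the low bit of every channel cleared (0xf7de for RGB565), so
        * the >> 1 halves each channel without borrowing from its neighbour,
        * and the (s & d & ~mask) term restores the rounding bit.
        * BLEND2x16_50 simply widens the mask to cover two 16-bit pixels
        * packed into one 32-bit word.
        */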
  1843 
  1844 static void
  1845 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
  1846 {
  1847     int width = info->d_width;
  1848     int height = info->d_height;
  1849     Uint16 *srcp = (Uint16 *) info->s_pixels;
  1850     int srcskip = info->s_skip >> 1;
  1851     Uint16 *dstp = (Uint16 *) info->d_pixels;
  1852     int dstskip = info->d_skip >> 1;
  1853 
  1854     while (height--) {
  1855         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
  1856             /*
  1857              * Source and destination not aligned, pipeline it.
  1858              * This is mostly a win for big blits but no loss for
  1859              * small ones
  1860              */
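                   /*
                    * srcp is advanced to a 32-bit boundary and each aligned
                    * source word is combined with the previous one (prev_sw)
                    * to rebuild two pixels that line up with the aligned
                    * destination word, so every 32-bit load and store below
                    * stays aligned.
                    */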
  1861             Uint32 prev_sw;
  1862             int w = width;
  1863 
  1864             /* handle odd destination */
  1865             if ((uintptr_t) dstp & 2) {
  1866                 Uint16 d = *dstp, s = *srcp;
  1867                 *dstp = BLEND16_50(d, s, mask);
  1868                 dstp++;
  1869                 srcp++;
  1870                 w--;
  1871             }
  1872             srcp++;             /* srcp is now 32-bit aligned */
  1873 
  1874             /* bootstrap pipeline with first halfword */
  1875             prev_sw = ((Uint32 *) srcp)[-1];
  1876 
  1877             while (w > 1) {
  1878                 Uint32 sw, dw, s;
  1879                 sw = *(Uint32 *) srcp;
  1880                 dw = *(Uint32 *) dstp;
  1881 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1882                 s = (prev_sw << 16) + (sw >> 16);
  1883 #else
  1884                 s = (prev_sw >> 16) + (sw << 16);
  1885 #endif
  1886                 prev_sw = sw;
  1887                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
  1888                 dstp += 2;
  1889                 srcp += 2;
  1890                 w -= 2;
  1891             }
  1892 
  1893             /* final pixel if any */
  1894             if (w) {
  1895                 Uint16 d = *dstp, s;
  1896 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1897                 s = (Uint16) prev_sw;
  1898 #else
  1899                 s = (Uint16) (prev_sw >> 16);
  1900 #endif
  1901                 *dstp = BLEND16_50(d, s, mask);
  1902                 srcp++;
  1903                 dstp++;
  1904             }
  1905             srcp += srcskip - 1;
  1906             dstp += dstskip;
  1907         } else {
  1908             /* source and destination are aligned */
  1909             int w = width;
  1910 
  1911             /* first odd pixel? */
  1912             if ((uintptr_t) srcp & 2) {
  1913                 Uint16 d = *dstp, s = *srcp;
  1914                 *dstp = BLEND16_50(d, s, mask);
  1915                 srcp++;
  1916                 dstp++;
  1917                 w--;
  1918             }
  1919             /* srcp and dstp are now 32-bit aligned */
  1920 
  1921             while (w > 1) {
  1922                 Uint32 sw = *(Uint32 *) srcp;
  1923                 Uint32 dw = *(Uint32 *) dstp;
  1924                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
  1925                 srcp += 2;
  1926                 dstp += 2;
  1927                 w -= 2;
  1928             }
  1929 
  1930             /* last odd pixel? */
  1931             if (w) {
  1932                 Uint16 d = *dstp, s = *srcp;
  1933                 *dstp = BLEND16_50(d, s, mask);
  1934                 srcp++;
  1935                 dstp++;
  1936             }
  1937             srcp += srcskip;
  1938             dstp += dstskip;
  1939         }
  1940     }
  1941 }
  1942 
  1943 #if GCC_ASMBLIT
  1944 /* fast RGB565->RGB565 blending with surface alpha */
  1945 static void
  1946 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  1947 {
  1948     unsigned alpha = info->src->alpha;
  1949     if (alpha == 128) {
  1950         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1951     } else {
  1952         int width = info->d_width;
  1953         int height = info->d_height;
  1954         Uint16 *srcp = (Uint16 *) info->s_pixels;
  1955         int srcskip = info->s_skip >> 1;
  1956         Uint16 *dstp = (Uint16 *) info->d_pixels;
  1957         int dstskip = info->d_skip >> 1;
  1958         Uint32 s, d;
  1959         Uint8 load[8];
  1960 
  1961         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the scalar and MMX pixels blend identically */
  1962         *(Uint64 *) load = alpha;
  1963         alpha >>= 3;            /* downscale alpha to 5 bits */
  1964 
  1965         movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
  1966         punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
  1967         punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
  1968         /* position alpha to allow for mullo and mulhi on diff channels
  1969            to reduce the number of operations */
  1970         psllq_i2r(3, mm0);
  1971 
  1972         /* Setup the 565 color channel masks */
  1973         *(Uint64 *) load = 0x07E007E007E007E0ULL;
  1974         movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
  1975         *(Uint64 *) load = 0x001F001F001F001FULL;
  1976         movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
  1977         while (height--) {
  1978 			/* *INDENT-OFF* */
  1979 			DUFFS_LOOP_QUATRO2(
  1980 			{
  1981 				s = *srcp++;
  1982 				d = *dstp;
  1983 				/*
  1984 				 * shift out the middle component (green) to
  1985 				 * the high 16 bits, and process all three RGB
  1986 				 * components at the same time.
  1987 				 */
  1988 				s = (s | s << 16) & 0x07e0f81f;
  1989 				d = (d | d << 16) & 0x07e0f81f;
  1990 				d += (s - d) * alpha >> 5;
  1991 				d &= 0x07e0f81f;
  1992 				*dstp++ = d | d >> 16;
  1993 			},{
  1994 				s = *srcp++;
  1995 				d = *dstp;
  1996 				/*
  1997 				 * shift out the middle component (green) to
  1998 				 * the high 16 bits, and process all three RGB
  1999 				 * components at the same time.
  2000 				 */
  2001 				s = (s | s << 16) & 0x07e0f81f;
  2002 				d = (d | d << 16) & 0x07e0f81f;
  2003 				d += (s - d) * alpha >> 5;
  2004 				d &= 0x07e0f81f;
  2005 				*dstp++ = d | d >> 16;
  2006 				s = *srcp++;
  2007 				d = *dstp;
  2008 				/*
  2009 				 * shift out the middle component (green) to
  2010 				 * the high 16 bits, and process all three RGB
  2011 				 * components at the same time.
  2012 				 */
  2013 				s = (s | s << 16) & 0x07e0f81f;
  2014 				d = (d | d << 16) & 0x07e0f81f;
  2015 				d += (s - d) * alpha >> 5;
  2016 				d &= 0x07e0f81f;
  2017 				*dstp++ = d | d >> 16;
  2018 			},{
  2019 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2020 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2021 
  2022 				/* red -- does not need a mask since the right shift clears
  2023 				   the uninteresting bits */
  2024 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2025 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2026 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
  2027 				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
  2028 
  2029 				/* blend */
  2030 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2031 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2032 				/* alpha used is actually 11 bits
  2033 				   11 + 5 = 16 bits, so the sign bits are lost */
  2034 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2035 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2036 				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
  2037 
  2038 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2039 
  2040 				/* green -- process the bits in place */
  2041 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2042 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2043 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2044 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2045 
  2046 				/* blend */
  2047 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2048 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2049 				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
  2050 				   bits are gone and the sign bits are preserved */
  2051 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2052 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2053 
  2054 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2055 
  2056 				/* blue */
  2057 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2058 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2059 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2060 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2061 
  2062 				/* blend */
  2063 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2064 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2065 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2066 				   the interesting bits will need to be MASKed */
  2067 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2068 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2069 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2070 
  2071 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2072 
  2073 				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
  2074 
  2075 				srcp += 4;
  2076 				dstp += 4;
  2077 			}, width);			
  2078 			/* *INDENT-ON* */
  2079             srcp += srcskip;
  2080             dstp += dstskip;
  2081         }
  2082         emms();
  2083     }
  2084 }
  2085 
  2086 /* fast RGB555->RGB555 blending with surface alpha */
  2087 static void
  2088 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  2089 {
  2090     unsigned alpha = info->src->alpha;
  2091     if (alpha == 128) {
  2092         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2093     } else {
  2094         int width = info->d_width;
  2095         int height = info->d_height;
  2096         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2097         int srcskip = info->s_skip >> 1;
  2098         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2099         int dstskip = info->d_skip >> 1;
  2100         Uint32 s, d;
  2101         Uint8 load[8];
  2102 
  2103         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the scalar and MMX pixels blend identically */
  2104         *(Uint64 *) load = alpha;
  2105         alpha >>= 3;            /* downscale alpha to 5 bits */
  2106 
  2107         movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
  2108         punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
  2109         punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
  2110         /* position alpha to allow for mullo and mulhi on diff channels
  2111            to reduce the number of operations */
  2112         psllq_i2r(3, mm0);
  2113 
  2114         /* Setup the 555 color channel masks */
  2115         *(Uint64 *) load = 0x03E003E003E003E0ULL;
  2116         movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
  2117         *(Uint64 *) load = 0x001F001F001F001FULL;
  2118         movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
  2119         while (height--) {
  2120 			/* *INDENT-OFF* */
  2121 			DUFFS_LOOP_QUATRO2(
  2122 			{
  2123 				s = *srcp++;
  2124 				d = *dstp;
  2125 				/*
  2126 				 * shift out the middle component (green) to
  2127 				 * the high 16 bits, and process all three RGB
  2128 				 * components at the same time.
  2129 				 */
  2130 				s = (s | s << 16) & 0x03e07c1f;
  2131 				d = (d | d << 16) & 0x03e07c1f;
  2132 				d += (s - d) * alpha >> 5;
  2133 				d &= 0x03e07c1f;
  2134 				*dstp++ = d | d >> 16;
  2135 			},{
  2136 				s = *srcp++;
  2137 				d = *dstp;
  2138 				/*
  2139 				 * shift out the middle component (green) to
  2140 				 * the high 16 bits, and process all three RGB
  2141 				 * components at the same time.
  2142 				 */
  2143 				s = (s | s << 16) & 0x03e07c1f;
  2144 				d = (d | d << 16) & 0x03e07c1f;
  2145 				d += (s - d) * alpha >> 5;
  2146 				d &= 0x03e07c1f;
  2147 				*dstp++ = d | d >> 16;
  2148 			        s = *srcp++;
  2149 				d = *dstp;
  2150 				/*
  2151 				 * shift out the middle component (green) to
  2152 				 * the high 16 bits, and process all three RGB
  2153 				 * components at the same time.
  2154 				 */
  2155 				s = (s | s << 16) & 0x03e07c1f;
  2156 				d = (d | d << 16) & 0x03e07c1f;
  2157 				d += (s - d) * alpha >> 5;
  2158 				d &= 0x03e07c1f;
  2159 				*dstp++ = d | d >> 16;
  2160 			},{
  2161 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2162 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2163 
  2164 				/* red -- process the bits in place */
  2165 				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
  2166 					/* by reusing the GREEN mask we free up another mmx
  2167 					   register to accumulate the result */
  2168 
  2169 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2170 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2171 				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
  2172 				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
  2173 
  2174 				/* blend */
  2175 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2176 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2177 				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
  2178 				   cleared by a MASK below */
  2179 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2180 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2181 				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
  2182 
  2183 				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
  2184 
  2185 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2186 
  2187 				/* green -- process the bits in place */
  2188 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2189 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2190 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2191 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2192 
  2193 				/* blend */
  2194 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2195 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2196 				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
  2197 				   bits are gone and the sign bits are preserved */
  2198 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2199 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2200 
  2201 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2202 
  2203 				/* blue */
  2204 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2205 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2206 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2207 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2208 
  2209 				/* blend */
  2210 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2211 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2212 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2213 				   the interesting bits will need to be MASKed */
  2214 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2215 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2216 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2217 
  2218 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2219 
  2220 				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
  2221 
  2222 				srcp += 4;
  2223 				dstp += 4;
  2224 			}, width);
  2225 			/* *INDENT-ON* */
  2226             srcp += srcskip;
  2227             dstp += dstskip;
  2228         }
  2229         emms();
  2230     }
  2231 }
  2232 
  2233 /* End GCC_ASMBLIT */
  2234 
  2235 #elif MSVC_ASMBLIT
  2236 /* fast RGB565->RGB565 blending with surface alpha */
  2237 static void
  2238 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  2239 {
  2240     unsigned alpha = info->src->alpha;
  2241     if (alpha == 128) {
  2242         Blit16to16SurfaceAlpha128(info, 0xf7de);
  2243     } else {
  2244         int width = info->d_width;
  2245         int height = info->d_height;
  2246         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2247         int srcskip = info->s_skip >> 1;
  2248         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2249         int dstskip = info->d_skip >> 1;
  2250         Uint32 s, d;
  2251 
  2252         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  2253 
  2254         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the scalar and MMX pixels blend identically */
  2255         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  2256         alpha >>= 3;            /* downscale alpha to 5 bits */
  2257 
  2258         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  2259         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  2260         /* position alpha to allow for mullo and mulhi on diff channels
  2261            to reduce the number of operations */
  2262         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2263 
  2264         /* Setup the 565 color channel masks */
  2265         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
  2266         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  2267 
  2268         while (height--) {
  2269 			/* *INDENT-OFF* */
  2270 			DUFFS_LOOP_QUATRO2(
  2271 			{
  2272 				s = *srcp++;
  2273 				d = *dstp;
  2274 				/*
  2275 				 * shift out the middle component (green) to
  2276 				 * the high 16 bits, and process all three RGB
  2277 				 * components at the same time.
  2278 				 */
  2279 				s = (s | s << 16) & 0x07e0f81f;
  2280 				d = (d | d << 16) & 0x07e0f81f;
  2281 				d += (s - d) * alpha >> 5;
  2282 				d &= 0x07e0f81f;
  2283 				*dstp++ = (Uint16)(d | d >> 16);
  2284 			},{
  2285 				s = *srcp++;
  2286 				d = *dstp;
  2287 				/*
  2288 				 * shift out the middle component (green) to
  2289 				 * the high 16 bits, and process all three RGB
  2290 				 * components at the same time.
  2291 				 */
  2292 				s = (s | s << 16) & 0x07e0f81f;
  2293 				d = (d | d << 16) & 0x07e0f81f;
  2294 				d += (s - d) * alpha >> 5;
  2295 				d &= 0x07e0f81f;
  2296 				*dstp++ = (Uint16)(d | d >> 16);
  2297 				s = *srcp++;
  2298 				d = *dstp;
  2299 				/*
  2300 				 * shift out the middle component (green) to
  2301 				 * the high 16 bits, and process all three RGB
  2302 				 * components at the same time.
  2303 				 */
  2304 				s = (s | s << 16) & 0x07e0f81f;
  2305 				d = (d | d << 16) & 0x07e0f81f;
  2306 				d += (s - d) * alpha >> 5;
  2307 				d &= 0x07e0f81f;
  2308 				*dstp++ = (Uint16)(d | d >> 16);
  2309 			},{
  2310 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2311 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2312 
  2313 				/* red */
  2314 				src2 = src1;
  2315 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
  2316 
  2317 				dst2 = dst1;
  2318 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
  2319 
  2320 				/* blend */
  2321 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2322 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2323 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2324 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2325 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
  2326 
  2327 				mm_res = dst2; /* RED -> mm_res */
  2328 
  2329 				/* green -- process the bits in place */
  2330 				src2 = src1;
  2331 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2332 
  2333 				dst2 = dst1;
  2334 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2335 
  2336 				/* blend */
  2337 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2338 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2339 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2340 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2341 
  2342 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2343 
  2344 				/* blue */
  2345 				src2 = src1;
  2346 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2347 
  2348 				dst2 = dst1;
  2349 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2350 
  2351 				/* blend */
  2352 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2353 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2354 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2355 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2356 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2357 
  2358 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2359 
  2360 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2361 
  2362 				srcp += 4;
  2363 				dstp += 4;
  2364 			}, width);
  2365 			/* *INDENT-ON* */
  2366             srcp += srcskip;
  2367             dstp += dstskip;
  2368         }
  2369         _mm_empty();
  2370     }
  2371 }
  2372 
  2373 /* fast RGB555->RGB555 blending with surface alpha */
  2374 static void
  2375 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  2376 {
  2377     unsigned alpha = info->src->alpha;
  2378     if (alpha == 128) {
  2379         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2380     } else {
  2381         int width = info->d_width;
  2382         int height = info->d_height;
  2383         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2384         int srcskip = info->s_skip >> 1;
  2385         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2386         int dstskip = info->d_skip >> 1;
  2387         Uint32 s, d;
  2388 
  2389         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  2390 
  2391         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the scalar and MMX pixels blend identically */
  2392         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  2393         alpha >>= 3;            /* downscale alpha to 5 bits */
  2394 
  2395         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  2396         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  2397         /* position alpha to allow for mullo and mulhi on diff channels
  2398            to reduce the number of operations */
  2399         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2400 
  2401         /* Setup the 555 color channel masks */
  2402         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
  2403         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
  2404         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  2405 
  2406         while (height--) {
  2407 			/* *INDENT-OFF* */
  2408 			DUFFS_LOOP_QUATRO2(
  2409 			{
  2410 				s = *srcp++;
  2411 				d = *dstp;
  2412 				/*
  2413 				 * shift out the middle component (green) to
  2414 				 * the high 16 bits, and process all three RGB
  2415 				 * components at the same time.
  2416 				 */
  2417 				s = (s | s << 16) & 0x03e07c1f;
  2418 				d = (d | d << 16) & 0x03e07c1f;
  2419 				d += (s - d) * alpha >> 5;
  2420 				d &= 0x03e07c1f;
  2421 				*dstp++ = (Uint16)(d | d >> 16);
  2422 			},{
  2423 				s = *srcp++;
  2424 				d = *dstp;
  2425 				/*
  2426 				 * shift out the middle component (green) to
  2427 				 * the high 16 bits, and process all three RGB
  2428 				 * components at the same time.
  2429 				 */
  2430 				s = (s | s << 16) & 0x03e07c1f;
  2431 				d = (d | d << 16) & 0x03e07c1f;
  2432 				d += (s - d) * alpha >> 5;
  2433 				d &= 0x03e07c1f;
  2434 				*dstp++ = (Uint16)(d | d >> 16);
  2435 			        s = *srcp++;
  2436 				d = *dstp;
  2437 				/*
  2438 				 * shift out the middle component (green) to
  2439 				 * the high 16 bits, and process all three RGB
  2440 				 * components at the same time.
  2441 				 */
  2442 				s = (s | s << 16) & 0x03e07c1f;
  2443 				d = (d | d << 16) & 0x03e07c1f;
  2444 				d += (s - d) * alpha >> 5;
  2445 				d &= 0x03e07c1f;
  2446 				*dstp++ = (Uint16)(d | d >> 16);
  2447 			},{
  2448 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2449 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2450 
  2451 				/* red -- process the bits in place */
  2452 				src2 = src1;
  2453 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
  2454 
  2455 				dst2 = dst1;
  2456 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
  2457 
  2458 				/* blend */
  2459 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2460 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2461 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2462 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2463 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
  2464 
  2465 				mm_res = dst2; /* RED -> mm_res */
  2466 				
  2467 				/* green -- process the bits in place */
  2468 				src2 = src1;
  2469 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2470 
  2471 				dst2 = dst1;
  2472 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2473 
  2474 				/* blend */
  2475 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2476 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2477 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2478 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2479 
  2480 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2481 
  2482 				/* blue */
  2483 				src2 = src1; /* src -> src2 */
  2484 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2485 
  2486 				dst2 = dst1; /* dst -> dst2 */
  2487 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2488 
  2489 				/* blend */
  2490 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2491 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2492 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2493 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2494 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2495 
  2496 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2497 
  2498 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2499 
  2500 				srcp += 4;
  2501 				dstp += 4;
  2502 			}, width);
  2503 			/* *INDENT-ON* */
  2504             srcp += srcskip;
  2505             dstp += dstskip;
  2506         }
  2507         _mm_empty();
  2508     }
  2509 }
  2510 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  2511 
  2512 /* fast RGB565->RGB565 blending with surface alpha */
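       /*
        * (x | x << 16) & 0x07e0f81f spreads a 16-bit 565 pixel over a 32-bit
        * word with green in the high half and red/blue in the low half, each
        * field followed by at least five zero bits.  One multiply by the
        * 5-bit alpha and a >> 5 therefore blends all three channels at once,
        * and d | d >> 16 folds the result back to 16 bits.  E.g. blending
        * white (0xffff) over black with a 5-bit alpha of 16 yields 0x7bef,
        * i.e. every channel at roughly half intensity.
        */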
  2513 static void
  2514 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
  2515 {
  2516     unsigned alpha = info->src->alpha;
  2517     if (alpha == 128) {
  2518         Blit16to16SurfaceAlpha128(info, 0xf7de);
  2519     } else {
  2520         int width = info->d_width;
  2521         int height = info->d_height;
  2522         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2523         int srcskip = info->s_skip >> 1;
  2524         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2525         int dstskip = info->d_skip >> 1;
  2526         alpha >>= 3;            /* downscale alpha to 5 bits */
  2527 
  2528         while (height--) {
  2529 			/* *INDENT-OFF* */
  2530 			DUFFS_LOOP4({
  2531 				Uint32 s = *srcp++;
  2532 				Uint32 d = *dstp;
  2533 				/*
  2534 				 * shift out the middle component (green) to
  2535 				 * the high 16 bits, and process all three RGB
  2536 				 * components at the same time.
  2537 				 */
  2538 				s = (s | s << 16) & 0x07e0f81f;
  2539 				d = (d | d << 16) & 0x07e0f81f;
  2540 				d += (s - d) * alpha >> 5;
  2541 				d &= 0x07e0f81f;
  2542 				*dstp++ = (Uint16)(d | d >> 16);
  2543 			}, width);
  2544 			/* *INDENT-ON* */
  2545             srcp += srcskip;
  2546             dstp += dstskip;
  2547         }
  2548     }
  2549 }
  2550 
  2551 /* fast RGB555->RGB555 blending with surface alpha */
  2552 static void
  2553 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  2554 {
  2555     unsigned alpha = info->src->alpha;
  2556     if (alpha == 128) {
  2557         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2558     } else {
  2559         int width = info->d_width;
  2560         int height = info->d_height;
  2561         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2562         int srcskip = info->s_skip >> 1;
  2563         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2564         int dstskip = info->d_skip >> 1;
  2565         alpha >>= 3;            /* downscale alpha to 5 bits */
  2566 
  2567         while (height--) {
  2568 			/* *INDENT-OFF* */
  2569 			DUFFS_LOOP4({
  2570 				Uint32 s = *srcp++;
  2571 				Uint32 d = *dstp;
  2572 				/*
  2573 				 * shift out the middle component (green) to
  2574 				 * the high 16 bits, and process all three RGB
  2575 				 * components at the same time.
  2576 				 */
  2577 				s = (s | s << 16) & 0x03e07c1f;
  2578 				d = (d | d << 16) & 0x03e07c1f;
  2579 				d += (s - d) * alpha >> 5;
  2580 				d &= 0x03e07c1f;
  2581 				*dstp++ = (Uint16)(d | d >> 16);
  2582 			}, width);
  2583 			/* *INDENT-ON* */
  2584             srcp += srcskip;
  2585             dstp += dstskip;
  2586         }
  2587     }
  2588 }
  2589 
  2590 /* fast ARGB8888->RGB565 blending with pixel alpha */
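       /*
        * The source pixel is converted from ARGB8888 directly into the same
        * split layout used by the 565 surface-alpha blitters above: the top
        * six bits of green go to the high halfword and the top five bits of
        * red and blue to the low halfword, matching (d | d << 16) &
        * 0x07e0f81f, so a single multiply by the 5-bit alpha blends all
        * three channels.
        */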
  2591 static void
  2592 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  2593 {
  2594     int width = info->d_width;
  2595     int height = info->d_height;
  2596     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2597     int srcskip = info->s_skip >> 2;
  2598     Uint16 *dstp = (Uint16 *) info->d_pixels;
  2599     int dstskip = info->d_skip >> 1;
  2600 
  2601     while (height--) {
  2602 	    /* *INDENT-OFF* */
  2603 	    DUFFS_LOOP4({
  2604 		Uint32 s = *srcp;
  2605 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  2606 		/* FIXME: Here we special-case opaque alpha since the
  2607 		   compositing used (>>8 instead of /255) doesn't handle
  2608 		   it correctly. Also special-case alpha=0 for speed?
  2609 		   Benchmark this! */
  2610 		if(alpha) {   
  2611 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2612 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  2613 		  } else {
  2614 		    Uint32 d = *dstp;
  2615 		    /*
  2616 		     * convert source and destination to G0RAB65565
  2617 		     * and blend all components at the same time
  2618 		     */
  2619 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  2620 		      + (s >> 3 & 0x1f);
  2621 		    d = (d | d << 16) & 0x07e0f81f;
  2622 		    d += (s - d) * alpha >> 5;
  2623 		    d &= 0x07e0f81f;
  2624 		    *dstp = (Uint16)(d | d >> 16);
  2625 		  }
  2626 		}
  2627 		srcp++;
  2628 		dstp++;
  2629 	    }, width);
  2630 	    /* *INDENT-ON* */
  2631         srcp += srcskip;
  2632         dstp += dstskip;
  2633     }
  2634 }
  2635 
  2636 /* fast ARGB8888->RGB555 blending with pixel alpha */
  2637 static void
  2638 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  2639 {
  2640     int width = info->d_width;
  2641     int height = info->d_height;
  2642     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2643     int srcskip = info->s_skip >> 2;
  2644     Uint16 *dstp = (Uint16 *) info->d_pixels;
  2645     int dstskip = info->d_skip >> 1;
  2646 
  2647     while (height--) {
  2648 	    /* *INDENT-OFF* */
  2649 	    DUFFS_LOOP4({
  2650 		unsigned alpha;
  2651 		Uint32 s = *srcp;
  2652 		alpha = s >> 27; /* downscale alpha to 5 bits */
  2653 		/* FIXME: Here we special-case opaque alpha since the
  2654 		   compositing used (>>8 instead of /255) doesn't handle
  2655 		   it correctly. Also special-case alpha=0 for speed?
  2656 		   Benchmark this! */
  2657 		if(alpha) {   
  2658 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2659 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  2660 		  } else {
  2661 		    Uint32 d = *dstp;
  2662 		    /*
  2663 		     * convert source and destination to G0RAB65565
  2664 		     * and blend all components at the same time
  2665 		     */
  2666 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  2667 		      + (s >> 3 & 0x1f);
  2668 		    d = (d | d << 16) & 0x03e07c1f;
  2669 		    d += (s - d) * alpha >> 5;
  2670 		    d &= 0x03e07c1f;
  2671 		    *dstp = (Uint16)(d | d >> 16);
  2672 		  }
  2673 		}
  2674 		srcp++;
  2675 		dstp++;
  2676 	    }, width);
  2677 	    /* *INDENT-ON* */
  2678         srcp += srcskip;
  2679         dstp += dstskip;
  2680     }
  2681 }
  2682 
  2683 /* General (slow) N->N blending with per-surface alpha */
  2684 static void
  2685 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  2686 {
  2687     int width = info->d_width;
  2688     int height = info->d_height;
  2689     Uint8 *src = info->s_pixels;
  2690     int srcskip = info->s_skip;
  2691     Uint8 *dst = info->d_pixels;
  2692     int dstskip = info->d_skip;
  2693     SDL_PixelFormat *srcfmt = info->src;
  2694     SDL_PixelFormat *dstfmt = info->dst;
  2695     int srcbpp = srcfmt->BytesPerPixel;
  2696     int dstbpp = dstfmt->BytesPerPixel;
  2697     unsigned sA = srcfmt->alpha;
  2698     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2699 
  2700     if (sA) {
  2701         while (height--) {
  2702 	    /* *INDENT-OFF* */
  2703 	    DUFFS_LOOP4(
  2704 	    {
  2705 		Uint32 Pixel;
  2706 		unsigned sR;
  2707 		unsigned sG;
  2708 		unsigned sB;
  2709 		unsigned dR;
  2710 		unsigned dG;
  2711 		unsigned dB;
  2712 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2713 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2714 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2715 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2716 		src += srcbpp;
  2717 		dst += dstbpp;
  2718 	    },
  2719 	    width);
  2720 	    /* *INDENT-ON* */
  2721             src += srcskip;
  2722             dst += dstskip;
  2723         }
  2724     }
  2725 }
  2726 
  2727 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2728 static void
  2729 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  2730 {
  2731     int width = info->d_width;
  2732     int height = info->d_height;
  2733     Uint8 *src = info->s_pixels;
  2734     int srcskip = info->s_skip;
  2735     Uint8 *dst = info->d_pixels;
  2736     int dstskip = info->d_skip;
  2737     SDL_PixelFormat *srcfmt = info->src;
  2738     SDL_PixelFormat *dstfmt = info->dst;
  2739     Uint32 ckey = srcfmt->colorkey;
  2740     int srcbpp = srcfmt->BytesPerPixel;
  2741     int dstbpp = dstfmt->BytesPerPixel;
  2742     unsigned sA = srcfmt->alpha;
  2743     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2744 
  2745     while (height--) {
  2746 	    /* *INDENT-OFF* */
  2747 	    DUFFS_LOOP4(
  2748 	    {
  2749 		Uint32 Pixel;
  2750 		unsigned sR;
  2751 		unsigned sG;
  2752 		unsigned sB;
  2753 		unsigned dR;
  2754 		unsigned dG;
  2755 		unsigned dB;
  2756 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2757 		if(sA && Pixel != ckey) {
  2758 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2759 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2760 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2761 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2762 		}
  2763 		src += srcbpp;
  2764 		dst += dstbpp;
  2765 	    },
  2766 	    width);
  2767 	    /* *INDENT-ON* */
  2768         src += srcskip;
  2769         dst += dstskip;
  2770     }
  2771 }
  2772 
  2773 /* General (slow) N->N blending with pixel alpha */
  2774 static void
  2775 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  2776 {
  2777     int width = info->d_width;
  2778     int height = info->d_height;
  2779     Uint8 *src = info->s_pixels;
  2780     int srcskip = info->s_skip;
  2781     Uint8 *dst = info->d_pixels;
  2782     int dstskip = info->d_skip;
  2783     SDL_PixelFormat *srcfmt = info->src;
  2784     SDL_PixelFormat *dstfmt = info->dst;
  2785 
  2786     int srcbpp;
  2787     int dstbpp;
  2788 
  2789     /* Set up some basic variables */
  2790     srcbpp = srcfmt->BytesPerPixel;
  2791     dstbpp = dstfmt->BytesPerPixel;
  2792 
  2793     /* FIXME: for 8bpp source alpha, this doesn't get opaque values
  2794        quite right. for <8bpp source alpha, it gets them very wrong
  2795        (check all macros!)
  2796        It is unclear whether there is a good general solution that doesn't
  2797        need a branch (or a divide). */
  2798     while (height--) {
  2799 	    /* *INDENT-OFF* */
  2800 	    DUFFS_LOOP4(
  2801 	    {
  2802 		Uint32 Pixel;
  2803 		unsigned sR;
  2804 		unsigned sG;
  2805 		unsigned sB;
  2806 		unsigned dR;
  2807 		unsigned dG;
  2808 		unsigned dB;
  2809 		unsigned sA;
  2810 		unsigned dA;
  2811 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  2812 		if(sA) {
  2813 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  2814 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2815 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2816 		}
  2817 		src += srcbpp;
  2818 		dst += dstbpp;
  2819 	    },
  2820 	    width);
  2821 	    /* *INDENT-ON* */
  2822         src += srcskip;
  2823         dst += dstskip;
  2824     }
  2825 }
  2826 
  2827 
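/*
 * Select an alpha blitter for this surface/destination pair.  Sources
 * without an Amask use the per-surface alpha routines (colorkeyed or
 * not); sources with an Amask use the per-pixel alpha routines.  MMX,
 * 3DNow! and AltiVec specializations are chosen when the CPU supports
 * them and the pixel layout allows it, with the general BlitNtoN*
 * routines as the fallback.
 */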
  2828 SDL_loblit
  2829 SDL_CalculateAlphaBlit(SDL_Surface * surface, int blit_index)
  2830 {
  2831     SDL_PixelFormat *sf = surface->format;
  2832     SDL_PixelFormat *df = surface->map->dst->format;
  2833 
  2834     if (sf->Amask == 0) {
  2835         if ((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
  2836             if (df->BytesPerPixel == 1)
  2837                 return BlitNto1SurfaceAlphaKey;
  2838             else
  2839 #if SDL_ALTIVEC_BLITTERS
  2840                 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2841                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2842                     && SDL_HasAltiVec())
  2843                 return Blit32to32SurfaceAlphaKeyAltivec;
  2844             else
  2845 #endif
  2846                 return BlitNtoNSurfaceAlphaKey;
  2847         } else {
  2848             /* Per-surface alpha blits */
  2849             switch (df->BytesPerPixel) {
  2850             case 1:
  2851                 return BlitNto1SurfaceAlpha;
  2852 
  2853             case 2:
  2854                 if (surface->map->identity) {
  2855                     if (df->Gmask == 0x7e0) {
  2856 #if MMX_ASMBLIT
  2857                         if (SDL_HasMMX())
  2858                             return Blit565to565SurfaceAlphaMMX;
  2859                         else
  2860 #endif
  2861                             return Blit565to565SurfaceAlpha;
  2862                     } else if (df->Gmask == 0x3e0) {
  2863 #if MMX_ASMBLIT
  2864                         if (SDL_HasMMX())
  2865                             return Blit555to555SurfaceAlphaMMX;
  2866                         else
  2867 #endif
  2868                             return Blit555to555SurfaceAlpha;
  2869                     }
  2870                 }
  2871                 return BlitNtoNSurfaceAlpha;
  2872 
  2873             case 4:
  2874                 if (sf->Rmask == df->Rmask
  2875                     && sf->Gmask == df->Gmask
  2876                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2877 #if MMX_ASMBLIT
  2878                     if (sf->Rshift % 8 == 0
  2879                         && sf->Gshift % 8 == 0
  2880                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  2881                         return BlitRGBtoRGBSurfaceAlphaMMX;
  2882 #endif
  2883                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  2884 #if SDL_ALTIVEC_BLITTERS
  2885                         if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2886                             && SDL_HasAltiVec())
  2887                             return BlitRGBtoRGBSurfaceAlphaAltivec;
  2888 #endif
  2889                         return BlitRGBtoRGBSurfaceAlpha;
  2890                     }
  2891                 }
  2892 #if SDL_ALTIVEC_BLITTERS
  2893                 if ((sf->BytesPerPixel == 4) &&
  2894                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2895                     && SDL_HasAltiVec())
  2896                     return Blit32to32SurfaceAlphaAltivec;
  2897                 else
  2898 #endif
  2899                     return BlitNtoNSurfaceAlpha;
  2900 
  2901             case 3:
  2902             default:
  2903                 return BlitNtoNSurfaceAlpha;
  2904             }
  2905         }
  2906     } else {
  2907         /* Per-pixel alpha blits */
  2908         switch (df->BytesPerPixel) {
  2909         case 1:
  2910             return BlitNto1PixelAlpha;
  2911 
  2912         case 2:
  2913 #if SDL_ALTIVEC_BLITTERS
  2914             if (sf->BytesPerPixel == 4
  2915                 && !(surface->map->dst->flags & SDL_HWSURFACE)
  2916                 && df->Gmask == 0x7e0 && df->Bmask == 0x1f
  2917                 && SDL_HasAltiVec())
  2918                 return Blit32to565PixelAlphaAltivec;
  2919             else
  2920 #endif
  2921                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2922                     && sf->Gmask == 0xff00
  2923                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2924                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  2925                 if (df->Gmask == 0x7e0)
  2926                     return BlitARGBto565PixelAlpha;
  2927                 else if (df->Gmask == 0x3e0)
  2928                     return BlitARGBto555PixelAlpha;
  2929             }
  2930             return BlitNtoNPixelAlpha;
  2931 
  2932         case 4:
  2933             if (sf->Rmask == df->Rmask
  2934                 && sf->Gmask == df->Gmask
  2935                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2936 #if MMX_ASMBLIT
  2937                 if (sf->Rshift % 8 == 0
  2938                     && sf->Gshift % 8 == 0
  2939                     && sf->Bshift % 8 == 0
  2940                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  2941                     if (SDL_Has3DNow())
  2942                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2943                     if (SDL_HasMMX())
  2944                         return BlitRGBtoRGBPixelAlphaMMX;
  2945                 }
  2946 #endif
  2947                 if (sf->Amask == 0xff000000) {
  2948 #if SDL_ALTIVEC_BLITTERS
  2949                     if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2950                         && SDL_HasAltiVec())
  2951                         return BlitRGBtoRGBPixelAlphaAltivec;
  2952 #endif
  2953                     return BlitRGBtoRGBPixelAlpha;
  2954                 }
  2955             }
  2956 #if SDL_ALTIVEC_BLITTERS
  2957             if (sf->Amask && sf->BytesPerPixel == 4 &&
  2958                 !(surface->map->dst->flags & SDL_HWSURFACE)
  2959                 && SDL_HasAltiVec())
  2960                 return Blit32to32PixelAlphaAltivec;
  2961             else
  2962 #endif
  2963                 return BlitNtoNPixelAlpha;
  2964 
  2965         case 3:
  2966         default:
  2967             return BlitNtoNPixelAlpha;
  2968         }
  2969     }
  2970 }
  2971 
  2972 /* vi: set ts=4 sw=4 expandtab: */