src/video/SDL_blit_A.c
author Sam Lantinga <slouken@libsdl.org>
Sat, 23 Sep 2006 23:15:56 +0000
changeset 2038 eb5aedc79992
parent 1895 c121d94672cb
child 2074 9e6dc39f48b6
permissions -rw-r--r--
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2006 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 #if SDL_ASSEMBLY_ROUTINES
    28 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    29 #define MMX_ASMBLIT 1
    30 #define GCC_ASMBLIT 1
    31 #elif defined(_MSC_VER) && (_MSC_VER >= 1200) && defined(_M_IX86)
    32 #define MMX_ASMBLIT 1
    33 #define MSVC_ASMBLIT 1
    34 #endif
    35 #endif /* SDL_ASSEMBLY_ROUTINES */
    36 
    37 /* Function to check the CPU flags */
    38 #include "SDL_cpuinfo.h"
    39 #if GCC_ASMBLIT
    40 #include "mmx.h"
    41 #elif MSVC_ASMBLIT
    42 #include <mmintrin.h>
    43 #include <mm3dnow.h>
    44 #endif
    45 
    46 /* Functions to perform alpha blended blitting */
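/*
   Common structure of these blitters: each row is walked with a
   Duff's-device style unrolled loop (the DUFFS_LOOP* macros from
   SDL_blit.h), and each pixel is blended per channel as roughly

       d = d + ((s - d) * alpha >> 8)

   where >>8 is a fast stand-in for a true /255 (see the FIXME in
   BlitRGBtoRGBPixelAlphaMMX below).  The MMX and AltiVec paths apply the
   same formula to several pixels per iteration.
*/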
    47 
    48 /* N->1 blending with per-surface alpha */
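/* The 8-bit destination cannot hold the blended RGB directly: after
   blending, each result is reduced to a 3-3-2 RRRGGGBB value and either
   stored as-is or pushed through the palmap lookup table (info->table)
   to get the destination palette index. */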
    49 static void
    50 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    51 {
    52     int width = info->d_width;
    53     int height = info->d_height;
    54     Uint8 *src = info->s_pixels;
    55     int srcskip = info->s_skip;
    56     Uint8 *dst = info->d_pixels;
    57     int dstskip = info->d_skip;
    58     Uint8 *palmap = info->table;
    59     SDL_PixelFormat *srcfmt = info->src;
    60     SDL_PixelFormat *dstfmt = info->dst;
    61     int srcbpp = srcfmt->BytesPerPixel;
    62 
    63     const unsigned A = srcfmt->alpha;
    64 
    65     while (height--) {
    66 	    /* *INDENT-OFF* */
    67 	    DUFFS_LOOP4(
    68 	    {
    69 		Uint32 Pixel;
    70 		unsigned sR;
    71 		unsigned sG;
    72 		unsigned sB;
    73 		unsigned dR;
    74 		unsigned dG;
    75 		unsigned dB;
    76 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    77 		dR = dstfmt->palette->colors[*dst].r;
    78 		dG = dstfmt->palette->colors[*dst].g;
    79 		dB = dstfmt->palette->colors[*dst].b;
    80 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    81 		dR &= 0xff;
    82 		dG &= 0xff;
    83 		dB &= 0xff;
    84 		/* Pack RGB into 8bit pixel */
    85 		if ( palmap == NULL ) {
    86 		    *dst =((dR>>5)<<(3+2))|
    87 			  ((dG>>5)<<(2))|
    88 			  ((dB>>6)<<(0));
    89 		} else {
    90 		    *dst = palmap[((dR>>5)<<(3+2))|
    91 				  ((dG>>5)<<(2))  |
    92 				  ((dB>>6)<<(0))];
    93 		}
    94 		dst++;
    95 		src += srcbpp;
    96 	    },
    97 	    width);
    98 	    /* *INDENT-ON* */
    99         src += srcskip;
   100         dst += dstskip;
   101     }
   102 }
   103 
   104 /* N->1 blending with pixel alpha */
   105 static void
   106 BlitNto1PixelAlpha(SDL_BlitInfo * info)
   107 {
   108     int width = info->d_width;
   109     int height = info->d_height;
   110     Uint8 *src = info->s_pixels;
   111     int srcskip = info->s_skip;
   112     Uint8 *dst = info->d_pixels;
   113     int dstskip = info->d_skip;
   114     Uint8 *palmap = info->table;
   115     SDL_PixelFormat *srcfmt = info->src;
   116     SDL_PixelFormat *dstfmt = info->dst;
   117     int srcbpp = srcfmt->BytesPerPixel;
   118 
   119     /* FIXME: fix alpha bit field expansion here too? */
   120     while (height--) {
   121 	    /* *INDENT-OFF* */
   122 	    DUFFS_LOOP4(
   123 	    {
   124 		Uint32 Pixel;
   125 		unsigned sR;
   126 		unsigned sG;
   127 		unsigned sB;
   128 		unsigned sA;
   129 		unsigned dR;
   130 		unsigned dG;
   131 		unsigned dB;
   132 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   133 		dR = dstfmt->palette->colors[*dst].r;
   134 		dG = dstfmt->palette->colors[*dst].g;
   135 		dB = dstfmt->palette->colors[*dst].b;
   136 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   137 		dR &= 0xff;
   138 		dG &= 0xff;
   139 		dB &= 0xff;
   140 		/* Pack RGB into 8bit pixel */
   141 		if ( palmap == NULL ) {
   142 		    *dst =((dR>>5)<<(3+2))|
   143 			  ((dG>>5)<<(2))|
   144 			  ((dB>>6)<<(0));
   145 		} else {
   146 		    *dst = palmap[((dR>>5)<<(3+2))|
   147 				  ((dG>>5)<<(2))  |
   148 				  ((dB>>6)<<(0))  ];
   149 		}
   150 		dst++;
   151 		src += srcbpp;
   152 	    },
   153 	    width);
   154 	    /* *INDENT-ON* */
   155         src += srcskip;
   156         dst += dstskip;
   157     }
   158 }
   159 
   160 /* colorkeyed N->1 blending with per-surface alpha */
   161 static void
   162 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   163 {
   164     int width = info->d_width;
   165     int height = info->d_height;
   166     Uint8 *src = info->s_pixels;
   167     int srcskip = info->s_skip;
   168     Uint8 *dst = info->d_pixels;
   169     int dstskip = info->d_skip;
   170     Uint8 *palmap = info->table;
   171     SDL_PixelFormat *srcfmt = info->src;
   172     SDL_PixelFormat *dstfmt = info->dst;
   173     int srcbpp = srcfmt->BytesPerPixel;
   174     Uint32 ckey = srcfmt->colorkey;
   175 
   176     const int A = srcfmt->alpha;
   177 
   178     while (height--) {
   179 	    /* *INDENT-OFF* */
   180 	    DUFFS_LOOP(
   181 	    {
   182 		Uint32 Pixel;
   183 		unsigned sR;
   184 		unsigned sG;
   185 		unsigned sB;
   186 		unsigned dR;
   187 		unsigned dG;
   188 		unsigned dB;
   189 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   190 		if ( Pixel != ckey ) {
   191 		    dR = dstfmt->palette->colors[*dst].r;
   192 		    dG = dstfmt->palette->colors[*dst].g;
   193 		    dB = dstfmt->palette->colors[*dst].b;
   194 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   195 		    dR &= 0xff;
   196 		    dG &= 0xff;
   197 		    dB &= 0xff;
   198 		    /* Pack RGB into 8bit pixel */
   199 		    if ( palmap == NULL ) {
   200 			*dst =((dR>>5)<<(3+2))|
   201 			      ((dG>>5)<<(2)) |
   202 			      ((dB>>6)<<(0));
   203 		    } else {
   204 			*dst = palmap[((dR>>5)<<(3+2))|
   205 				      ((dG>>5)<<(2))  |
   206 				      ((dB>>6)<<(0))  ];
   207 		    }
   208 		}
   209 		dst++;
   210 		src += srcbpp;
   211 	    },
   212 	    width);
   213 	    /* *INDENT-ON* */
   214         src += srcskip;
   215         dst += dstskip;
   216     }
   217 }
   218 
   219 #if GCC_ASMBLIT
   220 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
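/* For alpha == 128 the blend reduces to a straight average of src and dst.
   Masking both with 0x00fefefe clears each channel's low bit so the three
   channels can be summed and halved in a single 32-bit add and shift
   without carries spilling between channels; (s & d & 0x00010101) adds the
   dropped low bit back when both inputs had it set, matching the rounding
   of the exact per-channel average. */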
   221 static void
   222 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   223 {
   224     int width = info->d_width;
   225     int height = info->d_height;
   226     Uint32 *srcp = (Uint32 *) info->s_pixels;
   227     int srcskip = info->s_skip >> 2;
   228     Uint32 *dstp = (Uint32 *) info->d_pixels;
   229     int dstskip = info->d_skip >> 2;
   230     Uint32 dalpha = info->dst->Amask;
   231     Uint8 load[8];
   232 
   233     *(Uint64 *) load = 0x00fefefe00fefefeULL;   /* alpha128 mask */
   234     movq_m2r(*load, mm4);       /* alpha128 mask -> mm4 */
   235     *(Uint64 *) load = 0x0001010100010101ULL;   /* !alpha128 mask */
   236     movq_m2r(*load, mm3);       /* !alpha128 mask -> mm3 */
   237     movd_m2r(dalpha, mm7);      /* dst alpha mask */
   238     punpckldq_r2r(mm7, mm7);    /* dst alpha mask | dst alpha mask -> mm7 */
   239     while (height--) {
   240 		/* *INDENT-OFF* */
   241 		DUFFS_LOOP_DOUBLE2(
   242 		{
   243 			Uint32 s = *srcp++;
   244 			Uint32 d = *dstp;
   245 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   246 				   + (s & d & 0x00010101)) | dalpha;
   247 		},{
   248 			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   249 			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   250 
   251 			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
   252 			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
   253 
   254 			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
   255 			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
   256 			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
   257 			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
   258 			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
   259 			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
   260 			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
   261 			
   262 			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   263 			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
   264 			dstp += 2;
   265 			srcp += 2;
   266 		}, width);
   267 		/* *INDENT-ON* */
   268         srcp += srcskip;
   269         dstp += dstskip;
   270     }
   271     emms();
   272 }
   273 
   274 /* fast RGB888->(A)RGB888 blending with surface alpha */
   275 static void
   276 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   277 {
   278     SDL_PixelFormat *df = info->dst;
   279     unsigned alpha = info->src->alpha;
   280 
   281     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   282         /* only call a128 version when R,G,B occupy lower bits */
   283         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   284     } else {
   285         int width = info->d_width;
   286         int height = info->d_height;
   287         Uint32 *srcp = (Uint32 *) info->s_pixels;
   288         int srcskip = info->s_skip >> 2;
   289         Uint32 *dstp = (Uint32 *) info->d_pixels;
   290         int dstskip = info->d_skip >> 2;
   291 
   292         pxor_r2r(mm5, mm5);     /* 0 -> mm5 */
   293         /* form the alpha mult */
   294         movd_m2r(alpha, mm4);   /* 0000000A -> mm4 */
   295         punpcklwd_r2r(mm4, mm4);        /* 00000A0A -> mm4 */
   296         punpckldq_r2r(mm4, mm4);        /* 0A0A0A0A -> mm4 */
    297         alpha =
    298             (0xff << df->Rshift) | (0xff << df->Gshift) |
    299             (0xff << df->Bshift);
   300         movd_m2r(alpha, mm0);   /* 00000FFF -> mm0 */
   301         punpcklbw_r2r(mm0, mm0);        /* 00FFFFFF -> mm0 */
   302         pand_r2r(mm0, mm4);     /* 0A0A0A0A -> mm4, minus 1 chan */
   303         /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
   304         movd_m2r(df->Amask, mm7);       /* dst alpha mask */
   305         punpckldq_r2r(mm7, mm7);        /* dst alpha mask | dst alpha mask -> mm7 */
   306 
   307         while (height--) {
   308 			/* *INDENT-OFF* */
   309 			DUFFS_LOOP_DOUBLE2({
   310 				/* One Pixel Blend */
   311 				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   312 				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   313 				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
   314 				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
   315 
   316 				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   317 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   318 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   319 				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   320 
   321 				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
   322 				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   323 				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
   324 				++srcp;
   325 				++dstp;
   326 			},{
   327 				/* Two Pixels Blend */
   328 				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
   329 				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   330 				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
   331 				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   332 
   333 				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
   334 				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
   335 				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
   336 				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
   337 
   338 				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
   339 				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
    340 				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
   341 				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
   342 
   343 				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
   344 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   345 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   346 				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
   347 
   348 				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
   349 				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
   350 				
   351 				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
   352 
   353   				srcp += 2;
   354   				dstp += 2;
   355   			}, width);
   356 			/* *INDENT-ON* */
   357             srcp += srcskip;
   358             dstp += dstskip;
   359         }
   360         emms();
   361     }
   362 }
   363 
   364 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   365 static void
   366 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   367 {
   368     int width = info->d_width;
   369     int height = info->d_height;
   370     Uint32 *srcp = (Uint32 *) info->s_pixels;
   371     int srcskip = info->s_skip >> 2;
   372     Uint32 *dstp = (Uint32 *) info->d_pixels;
   373     int dstskip = info->d_skip >> 2;
   374     SDL_PixelFormat *sf = info->src;
   375     Uint32 amask = sf->Amask;
   376 
   377     pxor_r2r(mm6, mm6);         /* 0 -> mm6 */
   378     /* form multiplication mask */
   379     movd_m2r(sf->Amask, mm7);   /* 0000F000 -> mm7 */
   380     punpcklbw_r2r(mm7, mm7);    /* FF000000 -> mm7 */
   381     pcmpeqb_r2r(mm0, mm0);      /* FFFFFFFF -> mm0 */
   382     movq_r2r(mm0, mm3);         /* FFFFFFFF -> mm3 (for later) */
   383     pxor_r2r(mm0, mm7);         /* 00FFFFFF -> mm7 (mult mask) */
   384     /* form channel masks */
   385     movq_r2r(mm7, mm0);         /* 00FFFFFF -> mm0 */
   386     packsswb_r2r(mm6, mm0);     /* 00000FFF -> mm0 (channel mask) */
   387     packsswb_r2r(mm6, mm3);     /* 0000FFFF -> mm3 */
   388     pxor_r2r(mm0, mm3);         /* 0000F000 -> mm3 (~channel mask) */
   389     /* get alpha channel shift */
   390     movd_m2r(sf->Ashift, mm5);  /* Ashift -> mm5 */
   391 
   392     while (height--) {
   393 	    /* *INDENT-OFF* */
   394 	    DUFFS_LOOP4({
   395 		Uint32 alpha = *srcp & amask;
   396 		/* FIXME: Here we special-case opaque alpha since the
    397 			compositing used (>>8 instead of /255) doesn't handle
   398 			it correctly. Also special-case alpha=0 for speed?
   399 			Benchmark this! */
   400 		if(alpha == 0) {
   401 			/* do nothing */
   402 		} else if(alpha == amask) {
   403 			/* opaque alpha -- copy RGB, keep dst alpha */
   404 			/* using MMX here to free up regular registers for other things */
   405 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   406 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   407 			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
   408 			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
   409 			por_r2r(mm1, mm2); /* src | dst -> mm2 */
   410 			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
   411 		} else {
   412 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   413 			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
   414 
   415 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   416 			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
   417 
   418 			__asm__ __volatile__ (
   419 				"movd %0, %%mm4"
   420 				: : "r" (alpha) ); /* 0000A000 -> mm4 */
   421 			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
   422 			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   423 			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
   424 			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
   425 
   426 			/* blend */		    
   427 			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   428 			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   429 			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
   430 			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   431 			
   432 			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
   433 			movd_r2m(mm2, *dstp);/* mm2 -> dst */
   434 		}
   435 		++srcp;
   436 		++dstp;
   437 	    }, width);
   438 	    /* *INDENT-ON* */
   439         srcp += srcskip;
   440         dstp += dstskip;
   441     }
   442     emms();
   443 }
   444 
   445 /* End GCC_ASMBLIT */
   446 
   447 #elif MSVC_ASMBLIT
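/* The MSVC versions below implement the same blends as the GCC inline
   assembly above, using the MMX intrinsics from <mmintrin.h>.  Instead of
   DUFFS_LOOP, each row blends one leading pixel when the width is odd and
   then processes the remaining pixels in pairs. */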
   448 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   449 static void
   450 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   451 {
   452     int width = info->d_width;
   453     int height = info->d_height;
   454     Uint32 *srcp = (Uint32 *) info->s_pixels;
   455     int srcskip = info->s_skip >> 2;
   456     Uint32 *dstp = (Uint32 *) info->d_pixels;
   457     int dstskip = info->d_skip >> 2;
   458     Uint32 dalpha = info->dst->Amask;
   459 
   460     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   461 
   462     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   463     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   464     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   465 
   466     while (height--) {
   467         int n = width;
   468         if (n & 1) {
   469             Uint32 s = *srcp++;
   470             Uint32 d = *dstp;
   471             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   472                        + (s & d & 0x00010101)) | dalpha;
   473             n--;
   474         }
   475 
   476         for (n >>= 1; n > 0; --n) {
   477             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   478             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   479 
   480             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   481             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   482 
   483             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   484             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   485             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   486             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   487 
   488             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   489             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   490             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   491             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   492 
   493             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   494             dstp += 2;
   495             srcp += 2;
   496         }
   497 
   498         srcp += srcskip;
   499         dstp += dstskip;
   500     }
   501     _mm_empty();
   502 }
   503 
   504 /* fast RGB888->(A)RGB888 blending with surface alpha */
   505 static void
   506 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   507 {
   508     SDL_PixelFormat *df = info->dst;
   509     Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   510     unsigned alpha = info->src->alpha;
   511 
   512     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   513         /* only call a128 version when R,G,B occupy lower bits */
   514         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   515     } else {
   516         int width = info->d_width;
   517         int height = info->d_height;
   518         Uint32 *srcp = (Uint32 *) info->s_pixels;
   519         int srcskip = info->s_skip >> 2;
   520         Uint32 *dstp = (Uint32 *) info->d_pixels;
   521         int dstskip = info->d_skip >> 2;
   522         Uint32 dalpha = df->Amask;
   523         Uint32 amult;
   524 
   525         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   526 
   527         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   528         /* form the alpha mult */
   529         amult = alpha | (alpha << 8);
   530         amult = amult | (amult << 16);
    531         chanmask =
    532             (0xff << df->Rshift) | (0xff << df->Gshift) |
    533             (0xff << df->Bshift);
   534         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   535         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   536         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   537         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   538 
   539         while (height--) {
   540             int n = width;
   541             if (n & 1) {
   542                 /* One Pixel Blend */
   543                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   544                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   545 
   546                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   547                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   548 
    549                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst1 -> src2 */
   550                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   551                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   552                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   553 
   554                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   555                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   556                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   557 
   558                 ++srcp;
   559                 ++dstp;
   560 
   561                 n--;
   562             }
   563 
   564             for (n >>= 1; n > 0; --n) {
   565                 /* Two Pixels Blend */
   566                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   567                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   568                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   569                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   570 
   571                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   572                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   573                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   574                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   575 
   576                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   577                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   578                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   579                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   580 
   581                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   582                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   583                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   584                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   585 
   586                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   587                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   588 
   589                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   590 
   591                 srcp += 2;
   592                 dstp += 2;
   593             }
   594             srcp += srcskip;
   595             dstp += dstskip;
   596         }
   597         _mm_empty();
   598     }
   599 }
   600 
   601 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   602 static void
   603 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   604 {
   605     int width = info->d_width;
   606     int height = info->d_height;
   607     Uint32 *srcp = (Uint32 *) info->s_pixels;
   608     int srcskip = info->s_skip >> 2;
   609     Uint32 *dstp = (Uint32 *) info->d_pixels;
   610     int dstskip = info->d_skip >> 2;
   611     SDL_PixelFormat *sf = info->src;
   612     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   613     Uint32 amask = sf->Amask;
   614     Uint32 ashift = sf->Ashift;
   615     Uint64 multmask;
   616 
   617     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   618 
   619     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   620 	/* *INDENT-OFF* */
   621 	multmask = ~(0xFFFFI64 << (ashift * 2));
   622 	/* *INDENT-ON* */
   623     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
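    /* multmask has 0xFFFF in every 16-bit lane except the alpha lane
       (a channel at bit position "shift" lands in the 16-bit lane at
       "shift * 2" once the bytes are unpacked to words), so ANDing the
       expanded alpha with dmask zeroes its alpha lane and the destination
       alpha survives the blend unchanged.  0xFFFFI64 is MSVC's 64-bit
       integer-literal suffix. */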
   624 
   625     while (height--) {
   626 		/* *INDENT-OFF* */
   627 		DUFFS_LOOP4({
   628 		Uint32 alpha = *srcp & amask;
   629 		if (alpha == 0) {
   630 			/* do nothing */
   631 		} else if (alpha == amask) {
   632 			/* opaque alpha -- copy RGB, keep dst alpha */
   633 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   634 		} else {
   635 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   636 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   637 
   638 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   639 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   640 
   641 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   642 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   643 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   644 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   645 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   646 
   647 			/* blend */		    
   648 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   649 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   650 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   651 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   652 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   653 			
   654 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   655 		}
   656 		++srcp;
   657 		++dstp;
   658 	    }, width);
   659 		/* *INDENT-ON* */
   660         srcp += srcskip;
   661         dstp += dstskip;
   662     }
   663     _mm_empty();
   664 }
   665 
   666 /* End MSVC_ASMBLIT */
   667 
   668 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   669 
   670 #if SDL_ALTIVEC_BLITTERS
   671 #if __MWERKS__
   672 #pragma altivec_model on
   673 #endif
   674 #if HAVE_ALTIVEC_H
   675 #include <altivec.h>
   676 #endif
   677 #include <assert.h>
   678 
   679 #if (defined(__MACOSX__) && (__GNUC__ < 4))
   680 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   681         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
   682 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   683         (vector unsigned short) ( a,b,c,d,e,f,g,h )
   684 #else
   685 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   686         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
   687 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   688         (vector unsigned short) { a,b,c,d,e,f,g,h }
   689 #endif
   690 
   691 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
   692 #define VECPRINT(msg, v) do { \
   693     vector unsigned int tmpvec = (vector unsigned int)(v); \
   694     unsigned int *vp = (unsigned int *)&tmpvec; \
   695     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   696 } while (0)
   697 
    698 /* the permutation vector that takes the high bytes out of all the appropriate shorts
   699     (vector unsigned char)(
   700         0x00, 0x10, 0x02, 0x12,
   701         0x04, 0x14, 0x06, 0x16,
   702         0x08, 0x18, 0x0A, 0x1A,
   703         0x0C, 0x1C, 0x0E, 0x1E );
   704 */
   705 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
   706 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
   707 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
   708 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
   709     ? vec_lvsl(0, src) \
   710     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
   711 
   712 
   713 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
   714     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
   715     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
   716     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
   717     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
   718     /* valpha2 is 255-alpha */ \
   719     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
   720     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
   721     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
   722     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
   723     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
   724     /* add source and dest */ \
   725     vtemp1 = vec_add(vtemp1, vtemp3); \
   726     vtemp2 = vec_add(vtemp2, vtemp4); \
   727     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
   728     vtemp1 = vec_add(vtemp1, v1_16); \
   729     vtemp3 = vec_sr(vtemp1, v8_16); \
   730     vtemp1 = vec_add(vtemp1, vtemp3); \
   731     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
   732     vtemp2 = vec_add(vtemp2, v1_16); \
   733     vtemp4 = vec_sr(vtemp2, v8_16); \
   734     vtemp2 = vec_add(vtemp2, vtemp4); \
   735     /* (>>8) and get ARGBARGBARGBARGB */ \
   736     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   737 } while (0)
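/* VEC_MULTIPLY_ALPHA computes, per byte, (s * a + d * (255 - a)) and scales
   it back to 0..255: vec_nor(valpha, valpha) gives 255 - a, the even/odd
   multiplies produce 16-bit products split across two vectors, and the
   "add 1, add the high byte, take the high byte" sequence approximates /255
   more closely than a plain >>8 would.  mergePermute pulls the high byte of
   every 16-bit result back into ARGB byte order. */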
   738 
   739 /* Calculate the permute vector used for 32->32 swizzling */
   740 static vector unsigned char
   741 calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
   742 {
   743     /*
    744      * We have to assume that the bits that aren't used by the other
    745      *  colors are alpha, and that it's one complete byte, since some formats
    746      *  leave alpha with a zero mask, but we should still swizzle the bits.
   747      */
   748     /* ARGB */
   749     const static struct SDL_PixelFormat default_pixel_format = {
   750         NULL, 0, 0,
   751         0, 0, 0, 0,
   752         16, 8, 0, 24,
   753         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
   754         0, 0
   755     };
   756     if (!srcfmt) {
   757         srcfmt = &default_pixel_format;
   758     }
   759     if (!dstfmt) {
   760         dstfmt = &default_pixel_format;
   761     }
   762     const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
   763                                                        0x04, 0x04, 0x04, 0x04,
   764                                                        0x08, 0x08, 0x08, 0x08,
   765                                                        0x0C, 0x0C, 0x0C,
   766                                                        0x0C);
   767     vector unsigned char vswiz;
   768     vector unsigned int srcvec;
   769 #define RESHIFT(X) (3 - ((X) >> 3))
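    /* RESHIFT turns a channel's bit shift (0, 8, 16, 24) into its byte
       index (3..0) within a big-endian 32-bit pixel.  Placing those indices
       at the destination's shifts builds, per pixel, the four source-byte
       positions; splatted and added to 'plus' this becomes a vec_perm
       control vector.  Indices of 0x10 and up select bytes from vec_perm's
       second operand, which callers use to inject the surface alpha or a
       constant. */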
   770     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   771     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   772     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   773     Uint32 amask;
   774     /* Use zero for alpha if either surface doesn't have alpha */
   775     if (dstfmt->Amask) {
    776         amask =
    777             ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) <<
    778             (dstfmt->Ashift);
   779     } else {
   780         amask =
   781             0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
   782                           0xFFFFFFFF);
   783     }
   784 #undef RESHIFT
   785     ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
   786     vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
   787     return (vswiz);
   788 }
   789 
   790 static void
   791 Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
   792 {
   793     int height = info->d_height;
   794     Uint8 *src = (Uint8 *) info->s_pixels;
   795     int srcskip = info->s_skip;
   796     Uint8 *dst = (Uint8 *) info->d_pixels;
   797     int dstskip = info->d_skip;
   798     SDL_PixelFormat *srcfmt = info->src;
   799 
   800     vector unsigned char v0 = vec_splat_u8(0);
   801     vector unsigned short v8_16 = vec_splat_u16(8);
   802     vector unsigned short v1_16 = vec_splat_u16(1);
   803     vector unsigned short v2_16 = vec_splat_u16(2);
   804     vector unsigned short v3_16 = vec_splat_u16(3);
   805     vector unsigned int v8_32 = vec_splat_u32(8);
   806     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
   807     vector unsigned short v3f =
   808         VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
   809                           0x003f, 0x003f, 0x003f, 0x003f);
   810     vector unsigned short vfc =
   811         VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
   812                           0x00fc, 0x00fc, 0x00fc, 0x00fc);
   813 
   814     /* 
   815        0x10 - 0x1f is the alpha
   816        0x00 - 0x0e evens are the red
   817        0x01 - 0x0f odds are zero
   818      */
   819     vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
   820                                                        0x10, 0x02, 0x01, 0x01,
   821                                                        0x10, 0x04, 0x01, 0x01,
   822                                                        0x10, 0x06, 0x01,
   823                                                        0x01);
   824     vector unsigned char vredalpha2 =
   825         (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
   826                                         vec_sl(v8_32, v16_32))
   827         );
   828     /*
   829        0x00 - 0x0f is ARxx ARxx ARxx ARxx
    830        0x11 - 0x1f odds are blue
   831      */
   832     vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
   833                                                    0x04, 0x05, 0x06, 0x13,
   834                                                    0x08, 0x09, 0x0a, 0x15,
   835                                                    0x0c, 0x0d, 0x0e, 0x17);
   836     vector unsigned char vblue2 =
   837         (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
   838         );
   839     /*
   840        0x00 - 0x0f is ARxB ARxB ARxB ARxB
    841        0x10 - 0x1e evens are green
   842      */
   843     vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
   844                                                     0x04, 0x05, 0x12, 0x07,
   845                                                     0x08, 0x09, 0x14, 0x0b,
   846                                                     0x0c, 0x0d, 0x16, 0x0f);
    847     vector unsigned char vgreen2 =
    848         (vector unsigned char) (vec_add((vector unsigned int) vgreen1,
    849                                         vec_sl(v8_32, v8_32))
    850         );
   851     vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
   852                                                     0x00, 0x0a, 0x00, 0x0e,
   853                                                     0x00, 0x12, 0x00, 0x16,
   854                                                     0x00, 0x1a, 0x00, 0x1e);
   855     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
   856     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
   857     vector unsigned char valphaPermute =
   858         vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   859 
   860     vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
   861     vf800 = vec_sl(vf800, vec_splat_u16(8));
   862 
   863     while (height--) {
   864         int extrawidth;
   865         vector unsigned char valigner;
   866         vector unsigned char vsrc;
   867         vector unsigned char voverflow;
   868         int width = info->d_width;
   869 
   870 #define ONE_PIXEL_BLEND(condition, widthvar) \
   871         while (condition) { \
   872             Uint32 Pixel; \
   873             unsigned sR, sG, sB, dR, dG, dB, sA; \
   874             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   875             if(sA) { \
   876                 unsigned short dstpixel = *((unsigned short *)dst); \
   877                 dR = (dstpixel >> 8) & 0xf8; \
   878                 dG = (dstpixel >> 3) & 0xfc; \
   879                 dB = (dstpixel << 3) & 0xf8; \
   880                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   881                 *((unsigned short *)dst) = ( \
   882                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
   883                 ); \
   884             } \
   885             src += 4; \
   886             dst += 2; \
   887             widthvar--; \
   888         }
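        /* Per row: blend single pixels until dst is 16-byte aligned, then
           blend 8 pixels per pass (two 16-byte loads of ARGB source, one
           16-byte load holding eight 565 destination pixels, which are
           expanded to ARGB, blended, and repacked), then blend the leftover
           pixels one at a time. */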
   889         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
   890         extrawidth = (width % 8);
   891         valigner = VEC_ALIGNER(src);
   892         vsrc = (vector unsigned char) vec_ld(0, src);
   893         width -= extrawidth;
   894         while (width) {
   895             vector unsigned char valpha;
   896             vector unsigned char vsrc1, vsrc2;
   897             vector unsigned char vdst1, vdst2;
   898             vector unsigned short vR, vG, vB;
   899             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   900 
   901             /* Load 8 pixels from src as ARGB */
   902             voverflow = (vector unsigned char) vec_ld(15, src);
   903             vsrc = vec_perm(vsrc, voverflow, valigner);
   904             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
   905             src += 16;
   906             vsrc = (vector unsigned char) vec_ld(15, src);
   907             voverflow = vec_perm(voverflow, vsrc, valigner);
   908             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
   909             src += 16;
   910 
   911             /* Load 8 pixels from dst as XRGB */
   912             voverflow = vec_ld(0, dst);
   913             vR = vec_and((vector unsigned short) voverflow, vf800);
   914             vB = vec_sl((vector unsigned short) voverflow, v3_16);
   915             vG = vec_sl(vB, v2_16);
   916             vdst1 =
   917                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   918                                                 (vector unsigned char) vR,
   919                                                 vredalpha1);
   920             vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
   921             vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
   922             vdst2 =
   923                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   924                                                 (vector unsigned char) vR,
   925                                                 vredalpha2);
   926             vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
   927             vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
   928 
   929             /* Alpha blend 8 pixels as ARGB */
   930             valpha = vec_perm(vsrc1, v0, valphaPermute);
   931             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
   932                                v8_16);
   933             valpha = vec_perm(vsrc2, v0, valphaPermute);
   934             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
   935                                v8_16);
   936 
   937             /* Convert 8 pixels to 565 */
   938             vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
   939                                                         vdst1,
   940                                                         (vector unsigned int)
   941                                                         vdst2);
   942             vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
   943             vgpixel = vec_and(vgpixel, vfc);
   944             vgpixel = vec_sl(vgpixel, v3_16);
   945             vrpixel = vec_sl(vpixel, v1_16);
   946             vrpixel = vec_and(vrpixel, vf800);
   947             vbpixel = vec_and(vpixel, v3f);
   948             vdst1 =
   949                 vec_or((vector unsigned char) vrpixel,
   950                        (vector unsigned char) vgpixel);
   951             vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);
   952 
   953             /* Store 8 pixels */
   954             vec_st(vdst1, 0, dst);
   955 
   956             width -= 8;
   957             dst += 16;
   958         }
   959         ONE_PIXEL_BLEND((extrawidth), extrawidth);
   960 #undef ONE_PIXEL_BLEND
   961         src += srcskip;
   962         dst += dstskip;
   963     }
   964 }
   965 
   966 static void
   967 Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
   968 {
   969     unsigned alpha = info->src->alpha;
   970     int height = info->d_height;
   971     Uint32 *srcp = (Uint32 *) info->s_pixels;
   972     int srcskip = info->s_skip >> 2;
   973     Uint32 *dstp = (Uint32 *) info->d_pixels;
   974     int dstskip = info->d_skip >> 2;
   975     SDL_PixelFormat *srcfmt = info->src;
   976     SDL_PixelFormat *dstfmt = info->dst;
   977     unsigned sA = srcfmt->alpha;
   978     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   979     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   980     Uint32 ckey = info->src->colorkey;
   981     vector unsigned char mergePermute;
   982     vector unsigned char vsrcPermute;
   983     vector unsigned char vdstPermute;
   984     vector unsigned char vsdstPermute;
   985     vector unsigned char valpha;
   986     vector unsigned char valphamask;
   987     vector unsigned char vbits;
   988     vector unsigned char v0;
   989     vector unsigned short v1;
   990     vector unsigned short v8;
   991     vector unsigned int vckey;
   992     vector unsigned int vrgbmask;
   993 
   994     mergePermute = VEC_MERGE_PERMUTE();
   995     v0 = vec_splat_u8(0);
   996     v1 = vec_splat_u16(1);
   997     v8 = vec_splat_u16(8);
   998 
   999     /* set the alpha to 255 on the destination surf */
  1000     valphamask = VEC_ALPHA_MASK();
  1001 
  1002     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1003     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1004     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1005 
  1006     /* set a vector full of alpha and 255-alpha */
  1007     ((unsigned char *) &valpha)[0] = alpha;
  1008     valpha = vec_splat(valpha, 0);
  1009     vbits = (vector unsigned char) vec_splat_s8(-1);
  1010 
  1011     ckey &= rgbmask;
  1012     ((unsigned int *) (char *) &vckey)[0] = ckey;
  1013     vckey = vec_splat(vckey, 0);
  1014     ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
  1015     vrgbmask = vec_splat(vrgbmask, 0);
  1016 
  1017     while (height--) {
  1018         int width = info->d_width;
  1019 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1020         while (condition) { \
  1021             Uint32 Pixel; \
  1022             unsigned sR, sG, sB, dR, dG, dB; \
  1023             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
  1024             if(sA && Pixel != ckey) { \
  1025                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
  1026                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1027                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1028                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1029             } \
  1030             dstp++; \
  1031             srcp++; \
  1032             widthvar--; \
  1033         }
  1034         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1035         if (width > 0) {
  1036             int extrawidth = (width % 4);
  1037             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1038             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1039             width -= extrawidth;
  1040             while (width) {
  1041                 vector unsigned char vsel;
  1042                 vector unsigned char voverflow;
  1043                 vector unsigned char vd;
  1044                 vector unsigned char vd_orig;
  1045 
  1046                 /* s = *srcp */
  1047                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1048                 vs = vec_perm(vs, voverflow, valigner);
  1049 
  1050                 /* vsel is set for items that match the key */
  1051                 vsel =
  1052                     (vector unsigned char) vec_and((vector unsigned int) vs,
  1053                                                    vrgbmask);
  1054                 vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
  1055                                                         vsel, vckey);
  1056 
  1057                 /* permute to source format */
  1058                 vs = vec_perm(vs, valpha, vsrcPermute);
  1059 
  1060                 /* d = *dstp */
  1061                 vd = (vector unsigned char) vec_ld(0, dstp);
  1062                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
  1063 
  1064                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1065 
  1066                 /* set the alpha channel to full on */
  1067                 vd = vec_or(vd, valphamask);
  1068 
  1069                 /* mask out color key */
  1070                 vd = vec_sel(vd, vd_orig, vsel);
  1071 
  1072                 /* permute to dest format */
  1073                 vd = vec_perm(vd, vbits, vdstPermute);
  1074 
  1075                 /* *dstp = res */
  1076                 vec_st((vector unsigned int) vd, 0, dstp);
  1077 
  1078                 srcp += 4;
  1079                 dstp += 4;
  1080                 width -= 4;
  1081                 vs = voverflow;
  1082             }
  1083             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1084         }
  1085 #undef ONE_PIXEL_BLEND
  1086 
  1087         srcp += srcskip;
  1088         dstp += dstskip;
  1089     }
  1090 }
  1091 
  1092 
  1093 static void
  1094 Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)
  1095 {
  1096     int width = info->d_width;
  1097     int height = info->d_height;
  1098     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1099     int srcskip = info->s_skip >> 2;
  1100     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1101     int dstskip = info->d_skip >> 2;
  1102     SDL_PixelFormat *srcfmt = info->src;
  1103     SDL_PixelFormat *dstfmt = info->dst;
  1104     vector unsigned char mergePermute;
  1105     vector unsigned char valphaPermute;
  1106     vector unsigned char vsrcPermute;
  1107     vector unsigned char vdstPermute;
  1108     vector unsigned char vsdstPermute;
  1109     vector unsigned char valphamask;
  1110     vector unsigned char vpixelmask;
  1111     vector unsigned char v0;
  1112     vector unsigned short v1;
  1113     vector unsigned short v8;
  1114 
  1115     v0 = vec_splat_u8(0);
  1116     v1 = vec_splat_u16(1);
  1117     v8 = vec_splat_u16(8);
  1118     mergePermute = VEC_MERGE_PERMUTE();
  1119     valphamask = VEC_ALPHA_MASK();
  1120     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  1121     vpixelmask = vec_nor(valphamask, v0);
  1122     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1123     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1124     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1125 
  1126     while (height--) {
  1127         width = info->d_width;
  1128 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1129             Uint32 Pixel; \
  1130             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
  1131             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
  1132             if(sA) { \
  1133               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
  1134               ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1135               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
  1136             } \
  1137             ++srcp; \
  1138             ++dstp; \
  1139             widthvar--; \
  1140         }
  1141         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1142         if (width > 0) {
  1143             /* vsrcPermute */
  1144             /* vdstPermute */
  1145             int extrawidth = (width % 4);
  1146             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1147             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1148             width -= extrawidth;
  1149             while (width) {
  1150                 vector unsigned char voverflow;
  1151                 vector unsigned char vd;
  1152                 vector unsigned char valpha;
  1153                 vector unsigned char vdstalpha;
  1154                 /* s = *srcp */
  1155                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1156                 vs = vec_perm(vs, voverflow, valigner);
  1157                 vs = vec_perm(vs, v0, vsrcPermute);
  1158 
  1159                 valpha = vec_perm(vs, v0, valphaPermute);
  1160 
  1161                 /* d = *dstp */
  1162                 vd = (vector unsigned char) vec_ld(0, dstp);
  1163                 vd = vec_perm(vd, v0, vsdstPermute);
  1164                 vdstalpha = vec_and(vd, valphamask);
  1165 
  1166                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1167 
  1168                 /* set the alpha to the dest alpha */
  1169                 vd = vec_and(vd, vpixelmask);
  1170                 vd = vec_or(vd, vdstalpha);
  1171                 vd = vec_perm(vd, v0, vdstPermute);
  1172 
  1173                 /* *dstp = res */
  1174                 vec_st((vector unsigned int) vd, 0, dstp);
  1175 
  1176                 srcp += 4;
  1177                 dstp += 4;
  1178                 width -= 4;
  1179                 vs = voverflow;
  1180 
  1181             }
  1182             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1183         }
  1184         srcp += srcskip;
  1185         dstp += dstskip;
  1186 #undef ONE_PIXEL_BLEND
  1187     }
  1188 }
  1189 
  1190 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1191 static void
  1192 BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
  1193 {
  1194     int width = info->d_width;
  1195     int height = info->d_height;
  1196     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1197     int srcskip = info->s_skip >> 2;
  1198     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1199     int dstskip = info->d_skip >> 2;
  1200     vector unsigned char mergePermute;
  1201     vector unsigned char valphaPermute;
  1202     vector unsigned char valphamask;
  1203     vector unsigned char vpixelmask;
  1204     vector unsigned char v0;
  1205     vector unsigned short v1;
  1206     vector unsigned short v8;
  1207     v0 = vec_splat_u8(0);
  1208     v1 = vec_splat_u16(1);
  1209     v8 = vec_splat_u16(8);
  1210     mergePermute = VEC_MERGE_PERMUTE();
  1211     valphamask = VEC_ALPHA_MASK();
  1212     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  1213 
  1214 
  1215     vpixelmask = vec_nor(valphamask, v0);
  1216     while (height--) {
  1217         width = info->d_width;
  1218 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1219         while ((condition)) { \
  1220             Uint32 dalpha; \
  1221             Uint32 d; \
  1222             Uint32 s1; \
  1223             Uint32 d1; \
  1224             Uint32 s = *srcp; \
  1225             Uint32 alpha = s >> 24; \
  1226             if(alpha) { \
  1227               if(alpha == SDL_ALPHA_OPAQUE) { \
  1228                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
  1229               } else { \
  1230                 d = *dstp; \
  1231                 dalpha = d & 0xff000000; \
  1232                 s1 = s & 0xff00ff; \
  1233                 d1 = d & 0xff00ff; \
  1234                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
  1235                 s &= 0xff00; \
  1236                 d &= 0xff00; \
  1237                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1238                 *dstp = d1 | d | dalpha; \
  1239               } \
  1240             } \
  1241             ++srcp; \
  1242             ++dstp; \
  1243             widthvar--; \
  1244 	    }
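        /* The scalar fallback blends red and blue together in the
           0x00ff00ff lanes of one 32-bit multiply and green separately in
           the 0x0000ff00 lane, preserving the destination alpha; opaque and
           fully transparent source pixels are special-cased since the >>8
           blend does not reproduce them exactly. */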
  1245         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1246         if (width > 0) {
  1247             int extrawidth = (width % 4);
  1248             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1249             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1250             width -= extrawidth;
  1251             while (width) {
  1252                 vector unsigned char voverflow;
  1253                 vector unsigned char vd;
  1254                 vector unsigned char valpha;
  1255                 vector unsigned char vdstalpha;
  1256                 /* s = *srcp */
  1257                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1258                 vs = vec_perm(vs, voverflow, valigner);
  1259 
  1260                 valpha = vec_perm(vs, v0, valphaPermute);
  1261 
  1262                 /* d = *dstp */
  1263                 vd = (vector unsigned char) vec_ld(0, dstp);
  1264                 vdstalpha = vec_and(vd, valphamask);
  1265 
  1266                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1267 
  1268                 /* set the alpha to the dest alpha */
  1269                 vd = vec_and(vd, vpixelmask);
  1270                 vd = vec_or(vd, vdstalpha);
  1271 
  1272                 /* *dstp = res */
  1273                 vec_st((vector unsigned int) vd, 0, dstp);
  1274 
  1275                 srcp += 4;
  1276                 dstp += 4;
  1277                 width -= 4;
  1278                 vs = voverflow;
  1279             }
  1280             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1281         }
  1282         srcp += srcskip;
  1283         dstp += dstskip;
  1284     }
  1285 #undef ONE_PIXEL_BLEND
  1286 }
  1287 
  1288 static void
  1289 Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info)
  1290 {
  1291     /* XXX : 6 */
  1292     unsigned alpha = info->src->alpha;
  1293     int height = info->d_height;
  1294     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1295     int srcskip = info->s_skip >> 2;
  1296     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1297     int dstskip = info->d_skip >> 2;
  1298     SDL_PixelFormat *srcfmt = info->src;
  1299     SDL_PixelFormat *dstfmt = info->dst;
  1300     unsigned sA = srcfmt->alpha;
  1301     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1302     vector unsigned char mergePermute;
  1303     vector unsigned char vsrcPermute;
  1304     vector unsigned char vdstPermute;
  1305     vector unsigned char vsdstPermute;
  1306     vector unsigned char valpha;
  1307     vector unsigned char valphamask;
  1308     vector unsigned char vbits;
  1309     vector unsigned short v1;
  1310     vector unsigned short v8;
  1311 
  1312     mergePermute = VEC_MERGE_PERMUTE();
  1313     v1 = vec_splat_u16(1);
  1314     v8 = vec_splat_u16(8);
  1315 
  1316     /* set the alpha to 255 on the destination surf */
  1317     valphamask = VEC_ALPHA_MASK();
  1318 
  1319     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1320     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1321     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1322 
  1323     /* set a vector full of alpha and 255-alpha */
  1324     ((unsigned char *) &valpha)[0] = alpha;
  1325     valpha = vec_splat(valpha, 0);
  1326     vbits = (vector unsigned char) vec_splat_s8(-1);
  1327 
  1328     while (height--) {
  1329         int width = info->d_width;
  1330 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1331             Uint32 Pixel; \
  1332             unsigned sR, sG, sB, dR, dG, dB; \
  1333             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  1334             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1335             ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1336             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1337             ++srcp; \
  1338             ++dstp; \
  1339             widthvar--; \
  1340         }
  1341         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1342         if (width > 0) {
  1343             int extrawidth = (width % 4);
  1344             vector unsigned char valigner = vec_lvsl(0, srcp);
  1345             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1346             width -= extrawidth;
  1347             while (width) {
  1348                 vector unsigned char voverflow;
  1349                 vector unsigned char vd;
  1350 
  1351                 /* s = *srcp */
  1352                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1353                 vs = vec_perm(vs, voverflow, valigner);
  1354                 vs = vec_perm(vs, valpha, vsrcPermute);
  1355 
  1356                 /* d = *dstp */
  1357                 vd = (vector unsigned char) vec_ld(0, dstp);
  1358                 vd = vec_perm(vd, vd, vsdstPermute);
  1359 
  1360                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1361 
  1362                 /* set the alpha channel to full on */
  1363                 vd = vec_or(vd, valphamask);
  1364                 vd = vec_perm(vd, vbits, vdstPermute);
  1365 
  1366                 /* *dstp = res */
  1367                 vec_st((vector unsigned int) vd, 0, dstp);
  1368 
  1369                 srcp += 4;
  1370                 dstp += 4;
  1371                 width -= 4;
  1372                 vs = voverflow;
  1373             }
  1374             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1375         }
  1376 #undef ONE_PIXEL_BLEND
  1377 
  1378         srcp += srcskip;
  1379         dstp += dstskip;
  1380     }
  1381 
  1382 }
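
/*
 * Note on the permutes used above: calc_swizzle32() builds vec_perm() masks
 * that rearrange the source and destination pixels into a common channel
 * order before VEC_MULTIPLY_ALPHA() operates on them (vsrcPermute,
 * vsdstPermute), and back into the destination order before the store
 * (vdstPermute).  This is what lets the same vector blend core serve 32-bit
 * formats whose channel layouts differ.
 */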
  1383 
  1384 
  1385 /* fast RGB888->(A)RGB888 blending */
  1386 static void
  1387 BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
  1388 {
  1389     unsigned alpha = info->src->alpha;
  1390     int height = info->d_height;
  1391     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1392     int srcskip = info->s_skip >> 2;
  1393     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1394     int dstskip = info->d_skip >> 2;
  1395     vector unsigned char mergePermute;
  1396     vector unsigned char valpha;
  1397     vector unsigned char valphamask;
  1398     vector unsigned short v1;
  1399     vector unsigned short v8;
  1400 
  1401     mergePermute = VEC_MERGE_PERMUTE();
  1402     v1 = vec_splat_u16(1);
  1403     v8 = vec_splat_u16(8);
  1404 
  1405     /* set the alpha to 255 on the destination surf */
  1406     valphamask = VEC_ALPHA_MASK();
  1407 
  1408     /* set a vector full of alpha and 255-alpha */
  1409     ((unsigned char *) &valpha)[0] = alpha;
  1410     valpha = vec_splat(valpha, 0);
  1411 
  1412     while (height--) {
  1413         int width = info->d_width;
  1414 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1415             Uint32 s = *srcp; \
  1416             Uint32 d = *dstp; \
  1417             Uint32 s1 = s & 0xff00ff; \
  1418             Uint32 d1 = d & 0xff00ff; \
  1419             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  1420                  & 0xff00ff; \
  1421             s &= 0xff00; \
  1422             d &= 0xff00; \
  1423             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1424             *dstp = d1 | d | 0xff000000; \
  1425             ++srcp; \
  1426             ++dstp; \
  1427             widthvar--; \
  1428         }
  1429         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1430         if (width > 0) {
  1431             int extrawidth = (width % 4);
  1432             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1433             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1434             width -= extrawidth;
  1435             while (width) {
  1436                 vector unsigned char voverflow;
  1437                 vector unsigned char vd;
  1438 
  1439                 /* s = *srcp */
  1440                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1441                 vs = vec_perm(vs, voverflow, valigner);
  1442 
  1443                 /* d = *dstp */
  1444                 vd = (vector unsigned char) vec_ld(0, dstp);
  1445 
  1446                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1447 
  1448                 /* set the alpha channel to full on */
  1449                 vd = vec_or(vd, valphamask);
  1450 
  1451                 /* *dstp = res */
  1452                 vec_st((vector unsigned int) vd, 0, dstp);
  1453 
  1454                 srcp += 4;
  1455                 dstp += 4;
  1456                 width -= 4;
  1457                 vs = voverflow;
  1458             }
  1459             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1460         }
  1461 #undef ONE_PIXEL_BLEND
  1462 
  1463         srcp += srcskip;
  1464         dstp += dstskip;
  1465     }
  1466 }
  1467 
  1468 #if __MWERKS__
  1469 #pragma altivec_model off
  1470 #endif
  1471 #endif /* SDL_ALTIVEC_BLITTERS */
  1472 
  1473 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1474 static void
  1475 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
  1476 {
  1477     int width = info->d_width;
  1478     int height = info->d_height;
  1479     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1480     int srcskip = info->s_skip >> 2;
  1481     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1482     int dstskip = info->d_skip >> 2;
  1483 
  1484     while (height--) {
  1485 	    /* *INDENT-OFF* */
  1486 	    DUFFS_LOOP4({
  1487 		    Uint32 s = *srcp++;
  1488 		    Uint32 d = *dstp;
  1489 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1490 			       + (s & d & 0x00010101)) | 0xff000000;
  1491 	    }, width);
  1492 	    /* *INDENT-ON* */
  1493         srcp += srcskip;
  1494         dstp += dstskip;
  1495     }
  1496 }
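
/*
 * Worked example of the 50% blend above (illustrative only): masking with
 * 0x00fefefe drops the low bit of every channel so the per-channel sums in
 * (s + d) cannot carry into the neighbouring channel, and (s & d & 0x00010101)
 * restores the rounding bit that is lost when both low bits were set.
 * For one channel, s = 0x23 and d = 0x45:
 *   ((0x22 + 0x44) >> 1) + (0x23 & 0x45 & 1) = 0x33 + 1 = 0x34 = (0x23 + 0x45) / 2
 */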
  1497 
  1498 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1499 static void
  1500 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
  1501 {
  1502     unsigned alpha = info->src->alpha;
  1503     if (alpha == 128) {
  1504         BlitRGBtoRGBSurfaceAlpha128(info);
  1505     } else {
  1506         int width = info->d_width;
  1507         int height = info->d_height;
  1508         Uint32 *srcp = (Uint32 *) info->s_pixels;
  1509         int srcskip = info->s_skip >> 2;
  1510         Uint32 *dstp = (Uint32 *) info->d_pixels;
  1511         int dstskip = info->d_skip >> 2;
  1512         Uint32 s;
  1513         Uint32 d;
  1514         Uint32 s1;
  1515         Uint32 d1;
  1516 
  1517         while (height--) {
  1518 			/* *INDENT-OFF* */
  1519 			DUFFS_LOOP_DOUBLE2({
  1520 				/* One Pixel Blend */
  1521 				s = *srcp;
  1522 				d = *dstp;
  1523 				s1 = s & 0xff00ff;
  1524 				d1 = d & 0xff00ff;
  1525 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
  1526 				     & 0xff00ff;
  1527 				s &= 0xff00;
  1528 				d &= 0xff00;
  1529 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1530 				*dstp = d1 | d | 0xff000000;
  1531 				++srcp;
  1532 				++dstp;
  1533 			},{
  1534 			        /* Two Pixels Blend */
  1535 				s = *srcp;
  1536 				d = *dstp;
  1537 				s1 = s & 0xff00ff;
  1538 				d1 = d & 0xff00ff;
  1539 				d1 += (s1 - d1) * alpha >> 8;
  1540 				d1 &= 0xff00ff;
  1541 				     
  1542 				s = ((s & 0xff00) >> 8) | 
  1543 					((srcp[1] & 0xff00) << 8);
  1544 				d = ((d & 0xff00) >> 8) |
  1545 					((dstp[1] & 0xff00) << 8);
  1546 				d += (s - d) * alpha >> 8;
  1547 				d &= 0x00ff00ff;
  1548 				
  1549 				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
  1550 				++srcp;
  1551 				
  1552 			        s1 = *srcp;
  1553 				d1 = *dstp;
  1554 				s1 &= 0xff00ff;
  1555 				d1 &= 0xff00ff;
  1556 				d1 += (s1 - d1) * alpha >> 8;
  1557 				d1 &= 0xff00ff;
  1558 				
  1559 				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
  1560 				++srcp;
  1561 				++dstp;
  1562 			}, width);
  1563 			/* *INDENT-ON* */
  1564             srcp += srcskip;
  1565             dstp += dstskip;
  1566         }
  1567     }
  1568 }
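
/*
 * Reference sketch of the arithmetic used above (not compiled; the helper
 * name is illustrative only, assuming 0x00RRGGBB pixels and alpha in 0..255):
 * red and blue are kept in the 0x00ff00ff positions so one multiply blends
 * both, and green is blended separately.  The two-pixel variant additionally
 * packs the greens of two neighbouring pixels into one word so a single
 * multiply handles both.
 */
#if 0
static Uint32
BlendOnePixel_Reference(Uint32 s, Uint32 d, unsigned alpha)
{
    Uint32 s1 = s & 0x00ff00ff;     /* source red and blue */
    Uint32 d1 = d & 0x00ff00ff;     /* destination red and blue */
    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0x00ff00ff;
    s &= 0x0000ff00;                /* source green */
    d &= 0x0000ff00;                /* destination green */
    d = (d + ((s - d) * alpha >> 8)) & 0x0000ff00;
    return d1 | d | 0xff000000;     /* force the result opaque */
}
#endif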
  1569 
  1570 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1571 static void
  1572 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
  1573 {
  1574     int width = info->d_width;
  1575     int height = info->d_height;
  1576     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1577     int srcskip = info->s_skip >> 2;
  1578     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1579     int dstskip = info->d_skip >> 2;
  1580 
  1581     while (height--) {
  1582 	    /* *INDENT-OFF* */
  1583 	    DUFFS_LOOP4({
  1584 		Uint32 dalpha;
  1585 		Uint32 d;
  1586 		Uint32 s1;
  1587 		Uint32 d1;
  1588 		Uint32 s = *srcp;
  1589 		Uint32 alpha = s >> 24;
  1590 		/* FIXME: Here we special-case opaque alpha since the
   1591 		   compositing used (>>8 instead of /255) doesn't handle
  1592 		   it correctly. Also special-case alpha=0 for speed?
  1593 		   Benchmark this! */
  1594 		if(alpha) {   
  1595 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1596 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1597 		  } else {
  1598 		    /*
  1599 		     * take out the middle component (green), and process
  1600 		     * the other two in parallel. One multiply less.
  1601 		     */
  1602 		    d = *dstp;
  1603 		    dalpha = d & 0xff000000;
  1604 		    s1 = s & 0xff00ff;
  1605 		    d1 = d & 0xff00ff;
  1606 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1607 		    s &= 0xff00;
  1608 		    d &= 0xff00;
  1609 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1610 		    *dstp = d1 | d | dalpha;
  1611 		  }
  1612 		}
  1613 		++srcp;
  1614 		++dstp;
  1615 	    }, width);
  1616 	    /* *INDENT-ON* */
  1617         srcp += srcskip;
  1618         dstp += dstskip;
  1619     }
  1620 }
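
/*
 * Why the SDL_ALPHA_OPAQUE special case above is needed: the blend uses
 * ">> 8" as an approximation of "/ 255", so with alpha = 255 a channel comes
 * out as d + ((s - d) * 255 >> 8); e.g. s = 0xff, d = 0x00 gives
 * (255 * 255) >> 8 = 254 instead of 255.  Copying the source RGB directly for
 * opaque pixels avoids that off-by-one darkening.
 */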
  1621 
  1622 #if GCC_ASMBLIT
  1623 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1624 static void
  1625 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1626 {
  1627     int width = info->d_width;
  1628     int height = info->d_height;
  1629     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1630     int srcskip = info->s_skip >> 2;
  1631     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1632     int dstskip = info->d_skip >> 2;
  1633     SDL_PixelFormat *sf = info->src;
  1634     Uint32 amask = sf->Amask;
  1635 
  1636     __asm__(
  1637                /* make mm6 all zeros. */
  1638                "pxor       %%mm6, %%mm6\n"
  1639                /* Make a mask to preserve the alpha. */
  1640                "movd      %0, %%mm7\n\t"        /* 0000F000 -> mm7 */
  1641                "punpcklbw %%mm7, %%mm7\n\t"     /* FF000000 -> mm7 */
  1642                "pcmpeqb   %%mm4, %%mm4\n\t"     /* FFFFFFFF -> mm4 */
  1643                "movq      %%mm4, %%mm3\n\t"     /* FFFFFFFF -> mm3 (for later) */
  1644                "pxor      %%mm4, %%mm7\n\t"     /* 00FFFFFF -> mm7 (mult mask) */
  1645                /* form channel masks */
  1646                "movq      %%mm7, %%mm4\n\t"     /* 00FFFFFF -> mm4 */
  1647                "packsswb  %%mm6, %%mm4\n\t"     /* 00000FFF -> mm4 (channel mask) */
  1648                "packsswb  %%mm6, %%mm3\n\t"     /* 0000FFFF -> mm3 */
  1649                "pxor      %%mm4, %%mm3\n\t"     /* 0000F000 -> mm3 (~channel mask) */
  1650                /* get alpha channel shift */
  1651                "movd      %1, %%mm5\n\t"        /* Ashift -> mm5 */
  1652   : /* nothing */ :            "m"(sf->Amask), "m"(sf->Ashift));
  1653 
  1654     while (height--) {
  1655 
  1656 	    /* *INDENT-OFF* */
  1657 	    DUFFS_LOOP4({
  1658 		Uint32 alpha;
  1659 
  1660 		__asm__ (
  1661 		"prefetch 64(%0)\n"
  1662 		"prefetch 64(%1)\n"
  1663 			: : "r" (srcp), "r" (dstp) );
  1664 
  1665 		alpha = *srcp & amask;
  1666 		/* FIXME: Here we special-case opaque alpha since the
   1667 		   compositing used (>>8 instead of /255) doesn't handle
  1668 		   it correctly. Also special-case alpha=0 for speed?
  1669 		   Benchmark this! */
  1670 		if(alpha == 0) {
  1671 		    /* do nothing */
  1672 		}
  1673 		else if(alpha == amask) {
  1674 			/* opaque alpha -- copy RGB, keep dst alpha */
  1675 		    /* using MMX here to free up regular registers for other things */
  1676 			    __asm__ (
  1677 		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
  1678 		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
  1679 		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
   1680 		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
  1681 		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
  1682 		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
  1683 
  1684 		     : : "r" (srcp), "r" (dstp) );
  1685 		} 
  1686 
  1687 		else {
  1688 			    __asm__ (
  1689 		    /* load in the source, and dst. */
  1690 		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
  1691 		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
  1692 
  1693 		    /* Move the src alpha into mm2 */
  1694 
  1695 		    /* if supporting pshufw */
  1696 		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
  1697 		    /*"psrlw     $8, %%mm2\n" */
  1698 		    
  1699 		    /* else: */
  1700 		    "movd       %2,    %%mm2\n"
  1701 		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
  1702 		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
  1703 		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
  1704 		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
  1705 
  1706 		    /* move the colors into words. */
  1707 		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
   1708 		    "punpcklbw %%mm6, %%mm1\n"              /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
  1709 
  1710 		    /* src - dst */
  1711 		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
  1712 
  1713 		    /* A * (src-dst) */
   1714 		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0 As*(Rs-Rd) | As*(Gs-Gd) As*(Bs-Bd) */
   1715 		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0 Rc | Gc Bc, where Xc = As*(Xs-Xd)>>8 */
   1716 		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = Ad Rc+Rd | Gc+Gd Bc+Bd */
  1717 
  1718 		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
  1719 		    
  1720 		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
  1721 
  1722 		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
  1723 
  1724 		}
  1725 		++srcp;
  1726 		++dstp;
  1727 	    }, width);
  1728 	    /* *INDENT-ON* */
  1729         srcp += srcskip;
  1730         dstp += dstskip;
  1731     }
  1732 
  1733   __asm__("emms\n":);
  1734 }
  1735 
  1736 /* End GCC_ASMBLIT*/
   1737 /* End GCC_ASMBLIT */
  1738 #elif MSVC_ASMBLIT
  1739 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1740 static void
  1741 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1742 {
  1743     int width = info->d_width;
  1744     int height = info->d_height;
  1745     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1746     int srcskip = info->s_skip >> 2;
  1747     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1748     int dstskip = info->d_skip >> 2;
  1749     SDL_PixelFormat *sf = info->src;
  1750     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  1751     Uint32 amask = sf->Amask;
  1752     Uint32 ashift = sf->Ashift;
  1753     Uint64 multmask;
  1754 
  1755     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1756 
  1757     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
  1758 	/* *INDENT-OFF* */
  1759     multmask = ~(0xFFFFI64 << (ashift * 2));
  1760 	/* *INDENT-ON* */
  1761     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
  1762 
  1763     while (height--) {
  1764 	    /* *INDENT-OFF* */
  1765 	    DUFFS_LOOP4({
  1766 		Uint32 alpha;
  1767 
  1768 		_m_prefetch(srcp + 16);
  1769 		_m_prefetch(dstp + 16);
  1770 
  1771 		alpha = *srcp & amask;
  1772 		if (alpha == 0) {
  1773 			/* do nothing */
  1774 		} else if (alpha == amask) {
  1775 			/* copy RGB, keep dst alpha */
  1776 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  1777 		} else {
  1778 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  1779 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  1780 
  1781 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  1782 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  1783 
  1784 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  1785 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  1786 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  1787 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  1788 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  1789 
  1790 			/* blend */		    
  1791 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  1792 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  1793 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  1794 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  1795 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  1796 			
  1797 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  1798 		}
  1799 		++srcp;
  1800 		++dstp;
  1801 	    }, width);
  1802 	    /* *INDENT-ON* */
  1803         srcp += srcskip;
  1804         dstp += dstskip;
  1805     }
  1806     _mm_empty();
  1807 }
  1808 
  1809 /* End MSVC_ASMBLIT */
  1810 
  1811 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  1812 
  1813 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  1814 
  1815 /* blend a single 16 bit pixel at 50% */
  1816 #define BLEND16_50(d, s, mask)						\
  1817 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
  1818 
  1819 /* blend two 16 bit pixels at 50% */
  1820 #define BLEND2x16_50(d, s, mask)					     \
  1821 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1822 	 + (s & d & (~(mask | mask << 16))))
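
/*
 * Worked example for BLEND16_50 (illustrative only): for RGB565 the mask is
 * 0xf7de, i.e. each of the three fields with its lowest bit cleared, so
 * ((s & mask) + (d & mask)) >> 1 halves every field without carries leaking
 * into the neighbouring field; ~mask & 0xffff = 0x0821 selects exactly those
 * three low bits, and (s & d & 0x0821) adds the rounding bit back.  The
 * RGB555 callers use 0xfbde the same way (~0xfbde & 0xffff = 0x0421).
 */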
  1823 
  1824 static void
  1825 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
  1826 {
  1827     int width = info->d_width;
  1828     int height = info->d_height;
  1829     Uint16 *srcp = (Uint16 *) info->s_pixels;
  1830     int srcskip = info->s_skip >> 1;
  1831     Uint16 *dstp = (Uint16 *) info->d_pixels;
  1832     int dstskip = info->d_skip >> 1;
  1833 
  1834     while (height--) {
  1835         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
  1836             /*
  1837              * Source and destination not aligned, pipeline it.
  1838              * This is mostly a win for big blits but no loss for
  1839              * small ones
  1840              */
  1841             Uint32 prev_sw;
  1842             int w = width;
  1843 
  1844             /* handle odd destination */
  1845             if ((uintptr_t) dstp & 2) {
  1846                 Uint16 d = *dstp, s = *srcp;
  1847                 *dstp = BLEND16_50(d, s, mask);
  1848                 dstp++;
  1849                 srcp++;
  1850                 w--;
  1851             }
  1852             srcp++;             /* srcp is now 32-bit aligned */
  1853 
  1854             /* bootstrap pipeline with first halfword */
  1855             prev_sw = ((Uint32 *) srcp)[-1];
  1856 
  1857             while (w > 1) {
  1858                 Uint32 sw, dw, s;
  1859                 sw = *(Uint32 *) srcp;
  1860                 dw = *(Uint32 *) dstp;
  1861 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1862                 s = (prev_sw << 16) + (sw >> 16);
  1863 #else
  1864                 s = (prev_sw >> 16) + (sw << 16);
  1865 #endif
  1866                 prev_sw = sw;
  1867                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
  1868                 dstp += 2;
  1869                 srcp += 2;
  1870                 w -= 2;
  1871             }
  1872 
  1873             /* final pixel if any */
  1874             if (w) {
  1875                 Uint16 d = *dstp, s;
  1876 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1877                 s = (Uint16) prev_sw;
  1878 #else
  1879                 s = (Uint16) (prev_sw >> 16);
  1880 #endif
  1881                 *dstp = BLEND16_50(d, s, mask);
  1882                 srcp++;
  1883                 dstp++;
  1884             }
  1885             srcp += srcskip - 1;
  1886             dstp += dstskip;
  1887         } else {
  1888             /* source and destination are aligned */
  1889             int w = width;
  1890 
  1891             /* first odd pixel? */
  1892             if ((uintptr_t) srcp & 2) {
  1893                 Uint16 d = *dstp, s = *srcp;
  1894                 *dstp = BLEND16_50(d, s, mask);
  1895                 srcp++;
  1896                 dstp++;
  1897                 w--;
  1898             }
  1899             /* srcp and dstp are now 32-bit aligned */
  1900 
  1901             while (w > 1) {
  1902                 Uint32 sw = *(Uint32 *) srcp;
  1903                 Uint32 dw = *(Uint32 *) dstp;
  1904                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
  1905                 srcp += 2;
  1906                 dstp += 2;
  1907                 w -= 2;
  1908             }
  1909 
  1910             /* last odd pixel? */
  1911             if (w) {
  1912                 Uint16 d = *dstp, s = *srcp;
  1913                 *dstp = BLEND16_50(d, s, mask);
  1914                 srcp++;
  1915                 dstp++;
  1916             }
  1917             srcp += srcskip;
  1918             dstp += dstskip;
  1919         }
  1920     }
  1921 }
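
/*
 * Note on the unaligned branch above: when srcp and dstp differ in 16-bit
 * alignment, the destination is first brought to (or is already at) 32-bit
 * alignment and the source is then read as aligned 32-bit words, carrying the
 * leftover halfword in prev_sw.  Each destination word is blended against a
 * source word reassembled from two consecutive source reads, e.g. on
 * little-endian machines s = (prev_sw >> 16) + (sw << 16), so every 32-bit
 * load and store stays aligned.
 */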
  1922 
  1923 #if GCC_ASMBLIT
  1924 /* fast RGB565->RGB565 blending with surface alpha */
  1925 static void
  1926 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  1927 {
   1928     unsigned alpha = info->src->alpha;
  1929     if (alpha == 128) {
  1930         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1931     } else {
  1932         int width = info->d_width;
  1933         int height = info->d_height;
  1934         Uint16 *srcp = (Uint16 *) info->s_pixels;
  1935         int srcskip = info->s_skip >> 1;
  1936         Uint16 *dstp = (Uint16 *) info->d_pixels;
  1937         int dstskip = info->d_skip >> 1;
  1938         Uint32 s, d;
  1939         Uint8 load[8];
  1940 
  1941         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  1942         *(Uint64 *) load = alpha;
  1943         alpha >>= 3;            /* downscale alpha to 5 bits */
  1944 
  1945         movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
  1946         punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
  1947         punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
  1948         /* position alpha to allow for mullo and mulhi on diff channels
  1949            to reduce the number of operations */
  1950         psllq_i2r(3, mm0);
  1951 
  1952         /* Setup the 565 color channel masks */
  1953         *(Uint64 *) load = 0x07E007E007E007E0ULL;
  1954         movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
  1955         *(Uint64 *) load = 0x001F001F001F001FULL;
  1956         movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
  1957         while (height--) {
  1958 			/* *INDENT-OFF* */
  1959 			DUFFS_LOOP_QUATRO2(
  1960 			{
  1961 				s = *srcp++;
  1962 				d = *dstp;
  1963 				/*
  1964 				 * shift out the middle component (green) to
  1965 				 * the high 16 bits, and process all three RGB
  1966 				 * components at the same time.
  1967 				 */
  1968 				s = (s | s << 16) & 0x07e0f81f;
  1969 				d = (d | d << 16) & 0x07e0f81f;
  1970 				d += (s - d) * alpha >> 5;
  1971 				d &= 0x07e0f81f;
  1972 				*dstp++ = d | d >> 16;
  1973 			},{
  1974 				s = *srcp++;
  1975 				d = *dstp;
  1976 				/*
  1977 				 * shift out the middle component (green) to
  1978 				 * the high 16 bits, and process all three RGB
  1979 				 * components at the same time.
  1980 				 */
  1981 				s = (s | s << 16) & 0x07e0f81f;
  1982 				d = (d | d << 16) & 0x07e0f81f;
  1983 				d += (s - d) * alpha >> 5;
  1984 				d &= 0x07e0f81f;
  1985 				*dstp++ = d | d >> 16;
  1986 				s = *srcp++;
  1987 				d = *dstp;
  1988 				/*
  1989 				 * shift out the middle component (green) to
  1990 				 * the high 16 bits, and process all three RGB
  1991 				 * components at the same time.
  1992 				 */
  1993 				s = (s | s << 16) & 0x07e0f81f;
  1994 				d = (d | d << 16) & 0x07e0f81f;
  1995 				d += (s - d) * alpha >> 5;
  1996 				d &= 0x07e0f81f;
  1997 				*dstp++ = d | d >> 16;
  1998 			},{
  1999 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2000 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2001 
  2002 				/* red -- does not need a mask since the right shift clears
  2003 				   the uninteresting bits */
  2004 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2005 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2006 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
  2007 				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
  2008 
  2009 				/* blend */
  2010 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2011 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2012 				/* alpha used is actually 11 bits
  2013 				   11 + 5 = 16 bits, so the sign bits are lost */
  2014 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2015 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2016 				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
  2017 
  2018 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2019 
  2020 				/* green -- process the bits in place */
  2021 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2022 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2023 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2024 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2025 
  2026 				/* blend */
  2027 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2028 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2029 				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
  2030 				   bits are gone and the sign bits present */
  2031 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2032 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2033 
  2034 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2035 
  2036 				/* blue */
  2037 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2038 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2039 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2040 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2041 
  2042 				/* blend */
  2043 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2044 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2045 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2046 				   the interesting bits will need to be MASKed */
  2047 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2048 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2049 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2050 
  2051 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2052 
  2053 				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
  2054 
  2055 				srcp += 4;
  2056 				dstp += 4;
  2057 			}, width);			
  2058 			/* *INDENT-ON* */
  2059             srcp += srcskip;
  2060             dstp += dstskip;
  2061         }
  2062         emms();
  2063     }
  2064 }
  2065 
  2066 /* fast RGB555->RGB555 blending with surface alpha */
  2067 static void
  2068 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  2069 {
   2070     unsigned alpha = info->src->alpha;
  2071     if (alpha == 128) {
  2072         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2073     } else {
  2074         int width = info->d_width;
  2075         int height = info->d_height;
  2076         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2077         int srcskip = info->s_skip >> 1;
  2078         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2079         int dstskip = info->d_skip >> 1;
  2080         Uint32 s, d;
  2081         Uint8 load[8];
  2082 
  2083         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  2084         *(Uint64 *) load = alpha;
  2085         alpha >>= 3;            /* downscale alpha to 5 bits */
  2086 
  2087         movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
  2088         punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
  2089         punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
  2090         /* position alpha to allow for mullo and mulhi on diff channels
  2091            to reduce the number of operations */
  2092         psllq_i2r(3, mm0);
  2093 
  2094         /* Setup the 555 color channel masks */
  2095         *(Uint64 *) load = 0x03E003E003E003E0ULL;
  2096         movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
  2097         *(Uint64 *) load = 0x001F001F001F001FULL;
  2098         movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
  2099         while (height--) {
  2100 			/* *INDENT-OFF* */
  2101 			DUFFS_LOOP_QUATRO2(
  2102 			{
  2103 				s = *srcp++;
  2104 				d = *dstp;
  2105 				/*
  2106 				 * shift out the middle component (green) to
  2107 				 * the high 16 bits, and process all three RGB
  2108 				 * components at the same time.
  2109 				 */
  2110 				s = (s | s << 16) & 0x03e07c1f;
  2111 				d = (d | d << 16) & 0x03e07c1f;
  2112 				d += (s - d) * alpha >> 5;
  2113 				d &= 0x03e07c1f;
  2114 				*dstp++ = d | d >> 16;
  2115 			},{
  2116 				s = *srcp++;
  2117 				d = *dstp;
  2118 				/*
  2119 				 * shift out the middle component (green) to
  2120 				 * the high 16 bits, and process all three RGB
  2121 				 * components at the same time.
  2122 				 */
  2123 				s = (s | s << 16) & 0x03e07c1f;
  2124 				d = (d | d << 16) & 0x03e07c1f;
  2125 				d += (s - d) * alpha >> 5;
  2126 				d &= 0x03e07c1f;
  2127 				*dstp++ = d | d >> 16;
  2128 			        s = *srcp++;
  2129 				d = *dstp;
  2130 				/*
  2131 				 * shift out the middle component (green) to
  2132 				 * the high 16 bits, and process all three RGB
  2133 				 * components at the same time.
  2134 				 */
  2135 				s = (s | s << 16) & 0x03e07c1f;
  2136 				d = (d | d << 16) & 0x03e07c1f;
  2137 				d += (s - d) * alpha >> 5;
  2138 				d &= 0x03e07c1f;
  2139 				*dstp++ = d | d >> 16;
  2140 			},{
  2141 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2142 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2143 
  2144 				/* red -- process the bits in place */
  2145 				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
  2146 					/* by reusing the GREEN mask we free up another mmx
  2147 					   register to accumulate the result */
  2148 
  2149 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2150 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2151 				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
  2152 				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
  2153 
  2154 				/* blend */
  2155 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2156 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2157 				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
  2158 				   cleared by a MASK below */
  2159 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2160 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2161 				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
  2162 
  2163 				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
  2164 
  2165 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2166 
  2167 				/* green -- process the bits in place */
  2168 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2169 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2170 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2171 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2172 
  2173 				/* blend */
  2174 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2175 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2176 				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
  2177 				   bits are gone and the sign bits present */
  2178 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2179 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2180 
  2181 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2182 
  2183 				/* blue */
  2184 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2185 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2186 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2187 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2188 
  2189 				/* blend */
  2190 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2191 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2192 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2193 				   the interesting bits will need to be MASKed */
  2194 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2195 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2196 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2197 
  2198 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2199 
  2200 				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
  2201 
  2202 				srcp += 4;
  2203 				dstp += 4;
  2204 			}, width);
  2205 			/* *INDENT-ON* */
  2206             srcp += srcskip;
  2207             dstp += dstskip;
  2208         }
  2209         emms();
  2210     }
  2211 }
  2212 
  2213 /* End GCC_ASMBLIT */
  2214 
  2215 #elif MSVC_ASMBLIT
  2216 /* fast RGB565->RGB565 blending with surface alpha */
  2217 static void
  2218 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  2219 {
  2220     unsigned alpha = info->src->alpha;
  2221     if (alpha == 128) {
  2222         Blit16to16SurfaceAlpha128(info, 0xf7de);
  2223     } else {
  2224         int width = info->d_width;
  2225         int height = info->d_height;
  2226         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2227         int srcskip = info->s_skip >> 1;
  2228         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2229         int dstskip = info->d_skip >> 1;
  2230         Uint32 s, d;
  2231 
  2232         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  2233 
  2234         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  2235         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  2236         alpha >>= 3;            /* downscale alpha to 5 bits */
  2237 
  2238         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  2239         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  2240         /* position alpha to allow for mullo and mulhi on diff channels
  2241            to reduce the number of operations */
  2242         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2243 
  2244         /* Setup the 565 color channel masks */
  2245         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
  2246         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  2247 
  2248         while (height--) {
  2249 			/* *INDENT-OFF* */
  2250 			DUFFS_LOOP_QUATRO2(
  2251 			{
  2252 				s = *srcp++;
  2253 				d = *dstp;
  2254 				/*
  2255 				 * shift out the middle component (green) to
  2256 				 * the high 16 bits, and process all three RGB
  2257 				 * components at the same time.
  2258 				 */
  2259 				s = (s | s << 16) & 0x07e0f81f;
  2260 				d = (d | d << 16) & 0x07e0f81f;
  2261 				d += (s - d) * alpha >> 5;
  2262 				d &= 0x07e0f81f;
  2263 				*dstp++ = (Uint16)(d | d >> 16);
  2264 			},{
  2265 				s = *srcp++;
  2266 				d = *dstp;
  2267 				/*
  2268 				 * shift out the middle component (green) to
  2269 				 * the high 16 bits, and process all three RGB
  2270 				 * components at the same time.
  2271 				 */
  2272 				s = (s | s << 16) & 0x07e0f81f;
  2273 				d = (d | d << 16) & 0x07e0f81f;
  2274 				d += (s - d) * alpha >> 5;
  2275 				d &= 0x07e0f81f;
  2276 				*dstp++ = (Uint16)(d | d >> 16);
  2277 				s = *srcp++;
  2278 				d = *dstp;
  2279 				/*
  2280 				 * shift out the middle component (green) to
  2281 				 * the high 16 bits, and process all three RGB
  2282 				 * components at the same time.
  2283 				 */
  2284 				s = (s | s << 16) & 0x07e0f81f;
  2285 				d = (d | d << 16) & 0x07e0f81f;
  2286 				d += (s - d) * alpha >> 5;
  2287 				d &= 0x07e0f81f;
  2288 				*dstp++ = (Uint16)(d | d >> 16);
  2289 			},{
  2290 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2291 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2292 
  2293 				/* red */
  2294 				src2 = src1;
  2295 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
  2296 
  2297 				dst2 = dst1;
  2298 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
  2299 
  2300 				/* blend */
  2301 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2302 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2303 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2304 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2305 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
  2306 
  2307 				mm_res = dst2; /* RED -> mm_res */
  2308 
  2309 				/* green -- process the bits in place */
  2310 				src2 = src1;
  2311 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2312 
  2313 				dst2 = dst1;
  2314 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2315 
  2316 				/* blend */
  2317 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2318 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2319 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2320 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2321 
  2322 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2323 
  2324 				/* blue */
  2325 				src2 = src1;
  2326 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2327 
  2328 				dst2 = dst1;
  2329 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2330 
  2331 				/* blend */
  2332 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2333 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2334 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2335 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2336 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2337 
  2338 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2339 
  2340 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2341 
  2342 				srcp += 4;
  2343 				dstp += 4;
  2344 			}, width);
  2345 			/* *INDENT-ON* */
  2346             srcp += srcskip;
  2347             dstp += dstskip;
  2348         }
  2349         _mm_empty();
  2350     }
  2351 }
  2352 
  2353 /* fast RGB555->RGB555 blending with surface alpha */
  2354 static void
  2355 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  2356 {
  2357     unsigned alpha = info->src->alpha;
  2358     if (alpha == 128) {
  2359         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2360     } else {
  2361         int width = info->d_width;
  2362         int height = info->d_height;
  2363         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2364         int srcskip = info->s_skip >> 1;
  2365         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2366         int dstskip = info->d_skip >> 1;
  2367         Uint32 s, d;
  2368 
  2369         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  2370 
  2371         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  2372         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  2373         alpha >>= 3;            /* downscale alpha to 5 bits */
  2374 
  2375         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  2376         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  2377         /* position alpha to allow for mullo and mulhi on diff channels
  2378            to reduce the number of operations */
  2379         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2380 
  2381         /* Setup the 555 color channel masks */
  2382         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
  2383         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
  2384         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  2385 
  2386         while (height--) {
  2387 			/* *INDENT-OFF* */
  2388 			DUFFS_LOOP_QUATRO2(
  2389 			{
  2390 				s = *srcp++;
  2391 				d = *dstp;
  2392 				/*
  2393 				 * shift out the middle component (green) to
  2394 				 * the high 16 bits, and process all three RGB
  2395 				 * components at the same time.
  2396 				 */
  2397 				s = (s | s << 16) & 0x03e07c1f;
  2398 				d = (d | d << 16) & 0x03e07c1f;
  2399 				d += (s - d) * alpha >> 5;
  2400 				d &= 0x03e07c1f;
  2401 				*dstp++ = (Uint16)(d | d >> 16);
  2402 			},{
  2403 				s = *srcp++;
  2404 				d = *dstp;
  2405 				/*
  2406 				 * shift out the middle component (green) to
  2407 				 * the high 16 bits, and process all three RGB
  2408 				 * components at the same time.
  2409 				 */
  2410 				s = (s | s << 16) & 0x03e07c1f;
  2411 				d = (d | d << 16) & 0x03e07c1f;
  2412 				d += (s - d) * alpha >> 5;
  2413 				d &= 0x03e07c1f;
  2414 				*dstp++ = (Uint16)(d | d >> 16);
  2415 			        s = *srcp++;
  2416 				d = *dstp;
  2417 				/*
  2418 				 * shift out the middle component (green) to
  2419 				 * the high 16 bits, and process all three RGB
  2420 				 * components at the same time.
  2421 				 */
  2422 				s = (s | s << 16) & 0x03e07c1f;
  2423 				d = (d | d << 16) & 0x03e07c1f;
  2424 				d += (s - d) * alpha >> 5;
  2425 				d &= 0x03e07c1f;
  2426 				*dstp++ = (Uint16)(d | d >> 16);
  2427 			},{
  2428 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2429 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2430 
  2431 				/* red -- process the bits in place */
  2432 				src2 = src1;
  2433 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
  2434 
  2435 				dst2 = dst1;
  2436 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
  2437 
  2438 				/* blend */
  2439 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2440 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2441 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2442 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2443 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
  2444 
  2445 				mm_res = dst2; /* RED -> mm_res */
  2446 				
  2447 				/* green -- process the bits in place */
  2448 				src2 = src1;
  2449 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2450 
  2451 				dst2 = dst1;
  2452 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2453 
  2454 				/* blend */
  2455 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2456 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2457 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2458 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2459 
  2460 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2461 
  2462 				/* blue */
  2463 				src2 = src1; /* src -> src2 */
  2464 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2465 
  2466 				dst2 = dst1; /* dst -> dst2 */
  2467 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2468 
  2469 				/* blend */
  2470 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2471 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2472 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2473 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2474 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2475 
  2476 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2477 
  2478 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2479 
  2480 				srcp += 4;
  2481 				dstp += 4;
  2482 			}, width);
  2483 			/* *INDENT-ON* */
  2484             srcp += srcskip;
  2485             dstp += dstskip;
  2486         }
  2487         _mm_empty();
  2488     }
  2489 }
  2490 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  2491 
  2492 /* fast RGB565->RGB565 blending with surface alpha */
  2493 static void
  2494 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
  2495 {
  2496     unsigned alpha = info->src->alpha;
  2497     if (alpha == 128) {
  2498         Blit16to16SurfaceAlpha128(info, 0xf7de);
  2499     } else {
  2500         int width = info->d_width;
  2501         int height = info->d_height;
  2502         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2503         int srcskip = info->s_skip >> 1;
  2504         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2505         int dstskip = info->d_skip >> 1;
  2506         alpha >>= 3;            /* downscale alpha to 5 bits */
  2507 
  2508         while (height--) {
  2509 			/* *INDENT-OFF* */
  2510 			DUFFS_LOOP4({
  2511 				Uint32 s = *srcp++;
  2512 				Uint32 d = *dstp;
  2513 				/*
  2514 				 * shift out the middle component (green) to
  2515 				 * the high 16 bits, and process all three RGB
  2516 				 * components at the same time.
  2517 				 */
  2518 				s = (s | s << 16) & 0x07e0f81f;
  2519 				d = (d | d << 16) & 0x07e0f81f;
  2520 				d += (s - d) * alpha >> 5;
  2521 				d &= 0x07e0f81f;
  2522 				*dstp++ = (Uint16)(d | d >> 16);
  2523 			}, width);
  2524 			/* *INDENT-ON* */
  2525             srcp += srcskip;
  2526             dstp += dstskip;
  2527         }
  2528     }
  2529 }
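
/*
 * Reference sketch of the 565 trick used above (not compiled; the helper name
 * is illustrative only): the pixel is duplicated into both halves of a 32-bit
 * word and masked with 0x07e0f81f, which leaves green in bits 21-26 and
 * red/blue in bits 11-15 and 0-4.  The field spacing keeps the per-field
 * partial results from corrupting their neighbours before the final mask, so
 * a single multiply by a 5-bit alpha blends all three channels, and
 * "d | d >> 16" folds green back into the low halfword.
 */
#if 0
static Uint16
Blend565_Reference(Uint16 sp, Uint16 dp, unsigned alpha5)  /* alpha5 in 0..31 */
{
    Uint32 s = sp;
    Uint32 d = dp;
    s = (s | s << 16) & 0x07e0f81f;   /* 00000gggggg00000rrrrr000000bbbbb */
    d = (d | d << 16) & 0x07e0f81f;
    d += (s - d) * alpha5 >> 5;       /* blend all three fields at once */
    d &= 0x07e0f81f;
    return (Uint16) (d | d >> 16);    /* fold green back into the low half */
}
#endif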
  2530 
  2531 /* fast RGB555->RGB555 blending with surface alpha */
  2532 static void
  2533 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  2534 {
   2535     unsigned alpha = info->src->alpha;
  2536     if (alpha == 128) {
  2537         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2538     } else {
  2539         int width = info->d_width;
  2540         int height = info->d_height;
  2541         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2542         int srcskip = info->s_skip >> 1;
  2543         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2544         int dstskip = info->d_skip >> 1;
  2545         alpha >>= 3;            /* downscale alpha to 5 bits */
  2546 
  2547         while (height--) {
  2548 			/* *INDENT-OFF* */
  2549 			DUFFS_LOOP4({
  2550 				Uint32 s = *srcp++;
  2551 				Uint32 d = *dstp;
  2552 				/*
  2553 				 * shift out the middle component (green) to
  2554 				 * the high 16 bits, and process all three RGB
  2555 				 * components at the same time.
  2556 				 */
  2557 				s = (s | s << 16) & 0x03e07c1f;
  2558 				d = (d | d << 16) & 0x03e07c1f;
  2559 				d += (s - d) * alpha >> 5;
  2560 				d &= 0x03e07c1f;
  2561 				*dstp++ = (Uint16)(d | d >> 16);
  2562 			}, width);
  2563 			/* *INDENT-ON* */
  2564             srcp += srcskip;
  2565             dstp += dstskip;
  2566         }
  2567     }
  2568 }
  2569 
  2570 /* fast ARGB8888->RGB565 blending with pixel alpha */
  2571 static void
  2572 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  2573 {
  2574     int width = info->d_width;
  2575     int height = info->d_height;
  2576     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2577     int srcskip = info->s_skip >> 2;
  2578     Uint16 *dstp = (Uint16 *) info->d_pixels;
  2579     int dstskip = info->d_skip >> 1;
  2580 
  2581     while (height--) {
  2582 	    /* *INDENT-OFF* */
  2583 	    DUFFS_LOOP4({
  2584 		Uint32 s = *srcp;
  2585 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  2586 		/* FIXME: Here we special-case opaque alpha since the
   2587 		   compositing used (>>8 instead of /255) doesn't handle
  2588 		   it correctly. Also special-case alpha=0 for speed?
  2589 		   Benchmark this! */
  2590 		if(alpha) {   
  2591 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2592 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  2593 		  } else {
  2594 		    Uint32 d = *dstp;
  2595 		    /*
  2596 		     * convert source and destination to G0RAB65565
  2597 		     * and blend all components at the same time
  2598 		     */
  2599 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  2600 		      + (s >> 3 & 0x1f);
  2601 		    d = (d | d << 16) & 0x07e0f81f;
  2602 		    d += (s - d) * alpha >> 5;
  2603 		    d &= 0x07e0f81f;
  2604 		    *dstp = (Uint16)(d | d >> 16);
  2605 		  }
  2606 		}
  2607 		srcp++;
  2608 		dstp++;
  2609 	    }, width);
  2610 	    /* *INDENT-ON* */
  2611         srcp += srcskip;
  2612         dstp += dstskip;
  2613     }
  2614 }
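
/*
 * Note on the conversion above: the ARGB8888 source is mapped straight into
 * the same interleaved form the 565 blenders use: (s & 0xfc00) << 11 puts the
 * top 6 bits of green into bits 21-26, (s >> 8) & 0xf800 puts the top 5 bits
 * of red into bits 11-15, and (s >> 3) & 0x1f puts the top 5 bits of blue
 * into bits 0-4, matching the destination expanded with
 * (d | d << 16) & 0x07e0f81f.
 */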
  2615 
  2616 /* fast ARGB8888->RGB555 blending with pixel alpha */
  2617 static void
  2618 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  2619 {
  2620     int width = info->d_width;
  2621     int height = info->d_height;
  2622     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2623     int srcskip = info->s_skip >> 2;
  2624     Uint16 *dstp = (Uint16 *) info->d_pixels;
  2625     int dstskip = info->d_skip >> 1;
  2626 
  2627     while (height--) {
  2628 	    /* *INDENT-OFF* */
  2629 	    DUFFS_LOOP4({
  2630 		unsigned alpha;
  2631 		Uint32 s = *srcp;
  2632 		alpha = s >> 27; /* downscale alpha to 5 bits */
  2633 		/* FIXME: Here we special-case opaque alpha since the
   2634 		   compositing used (>>8 instead of /255) doesn't handle
  2635 		   it correctly. Also special-case alpha=0 for speed?
  2636 		   Benchmark this! */
  2637 		if(alpha) {   
  2638 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2639 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  2640 		  } else {
  2641 		    Uint32 d = *dstp;
  2642 		    /*
   2643 		     * convert source and destination to the 0x03e07c1f layout
  2644 		     * and blend all components at the same time
  2645 		     */
  2646 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  2647 		      + (s >> 3 & 0x1f);
  2648 		    d = (d | d << 16) & 0x03e07c1f;
  2649 		    d += (s - d) * alpha >> 5;
  2650 		    d &= 0x03e07c1f;
  2651 		    *dstp = (Uint16)(d | d >> 16);
  2652 		  }
  2653 		}
  2654 		srcp++;
  2655 		dstp++;
  2656 	    }, width);
  2657 	    /* *INDENT-ON* */
  2658         srcp += srcskip;
  2659         dstp += dstskip;
  2660     }
  2661 }
  2662 
  2663 /* General (slow) N->N blending with per-surface alpha */
  2664 static void
  2665 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  2666 {
  2667     int width = info->d_width;
  2668     int height = info->d_height;
  2669     Uint8 *src = info->s_pixels;
  2670     int srcskip = info->s_skip;
  2671     Uint8 *dst = info->d_pixels;
  2672     int dstskip = info->d_skip;
  2673     SDL_PixelFormat *srcfmt = info->src;
  2674     SDL_PixelFormat *dstfmt = info->dst;
  2675     int srcbpp = srcfmt->BytesPerPixel;
  2676     int dstbpp = dstfmt->BytesPerPixel;
  2677     unsigned sA = srcfmt->alpha;
  2678     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2679 
  2680     if (sA) {
  2681         while (height--) {
  2682 	    /* *INDENT-OFF* */
  2683 	    DUFFS_LOOP4(
  2684 	    {
  2685 		Uint32 Pixel;
  2686 		unsigned sR;
  2687 		unsigned sG;
  2688 		unsigned sB;
  2689 		unsigned dR;
  2690 		unsigned dG;
  2691 		unsigned dB;
  2692 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2693 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2694 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2695 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2696 		src += srcbpp;
  2697 		dst += dstbpp;
  2698 	    },
  2699 	    width);
  2700 	    /* *INDENT-ON* */
  2701             src += srcskip;
  2702             dst += dstskip;
  2703         }
  2704     }
  2705 }
  2706 
  2707 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2708 static void
  2709 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  2710 {
  2711     int width = info->d_width;
  2712     int height = info->d_height;
  2713     Uint8 *src = info->s_pixels;
  2714     int srcskip = info->s_skip;
  2715     Uint8 *dst = info->d_pixels;
  2716     int dstskip = info->d_skip;
  2717     SDL_PixelFormat *srcfmt = info->src;
  2718     SDL_PixelFormat *dstfmt = info->dst;
  2719     Uint32 ckey = srcfmt->colorkey;
  2720     int srcbpp = srcfmt->BytesPerPixel;
  2721     int dstbpp = dstfmt->BytesPerPixel;
  2722     unsigned sA = srcfmt->alpha;
  2723     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2724 
  2725     while (height--) {
  2726 	    /* *INDENT-OFF* */
  2727 	    DUFFS_LOOP4(
  2728 	    {
  2729 		Uint32 Pixel;
  2730 		unsigned sR;
  2731 		unsigned sG;
  2732 		unsigned sB;
  2733 		unsigned dR;
  2734 		unsigned dG;
  2735 		unsigned dB;
  2736 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2737 		if(sA && Pixel != ckey) {
  2738 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2739 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2740 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2741 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2742 		}
  2743 		src += srcbpp;
  2744 		dst += dstbpp;
  2745 	    },
  2746 	    width);
  2747 	    /* *INDENT-ON* */
  2748         src += srcskip;
  2749         dst += dstskip;
  2750     }
  2751 }
  2752 
  2753 /* General (slow) N->N blending with pixel alpha */
  2754 static void
  2755 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  2756 {
  2757     int width = info->d_width;
  2758     int height = info->d_height;
  2759     Uint8 *src = info->s_pixels;
  2760     int srcskip = info->s_skip;
  2761     Uint8 *dst = info->d_pixels;
  2762     int dstskip = info->d_skip;
  2763     SDL_PixelFormat *srcfmt = info->src;
  2764     SDL_PixelFormat *dstfmt = info->dst;
  2765 
  2766     int srcbpp;
  2767     int dstbpp;
  2768 
  2769     /* Set up some basic variables */
  2770     srcbpp = srcfmt->BytesPerPixel;
  2771     dstbpp = dstfmt->BytesPerPixel;
  2772 
  2773     /* FIXME: for 8bpp source alpha, this doesn't get opaque values
  2774        quite right. for <8bpp source alpha, it gets them very wrong
  2775        (check all macros!)
  2776        It is unclear whether there is a good general solution that doesn't
  2777        need a branch (or a divide). */
  2778     while (height--) {
  2779 	    /* *INDENT-OFF* */
  2780 	    DUFFS_LOOP4(
  2781 	    {
  2782 		Uint32 Pixel;
  2783 		unsigned sR;
  2784 		unsigned sG;
  2785 		unsigned sB;
  2786 		unsigned dR;
  2787 		unsigned dG;
  2788 		unsigned dB;
  2789 		unsigned sA;
  2790 		unsigned dA;
  2791 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  2792 		if(sA) {
  2793 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  2794 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2795 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2796 		}
  2797 		src += srcbpp;
  2798 		dst += dstbpp;
  2799 	    },
  2800 	    width);
  2801 	    /* *INDENT-ON* */
  2802         src += srcskip;
  2803         dst += dstskip;
  2804     }
  2805 }
  2806 
  2807 
  2808 SDL_loblit
  2809 SDL_CalculateAlphaBlit(SDL_Surface * surface, int blit_index)
  2810 {
  2811     SDL_PixelFormat *sf = surface->format;
  2812     SDL_PixelFormat *df = surface->map->dst->format;
  2813 
  2814     if (sf->Amask == 0) {
  2815         if ((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
  2816             if (df->BytesPerPixel == 1)
  2817                 return BlitNto1SurfaceAlphaKey;
  2818             else
  2819 #if SDL_ALTIVEC_BLITTERS
  2820                 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2821                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2822                     && SDL_HasAltiVec())
  2823                 return Blit32to32SurfaceAlphaKeyAltivec;
  2824             else
  2825 #endif
  2826                 return BlitNtoNSurfaceAlphaKey;
  2827         } else {
  2828             /* Per-surface alpha blits */
  2829             switch (df->BytesPerPixel) {
  2830             case 1:
  2831                 return BlitNto1SurfaceAlpha;
  2832 
  2833             case 2:
  2834                 if (surface->map->identity) {
  2835                     if (df->Gmask == 0x7e0) {
  2836 #if MMX_ASMBLIT
  2837                         if (SDL_HasMMX())
  2838                             return Blit565to565SurfaceAlphaMMX;
  2839                         else
  2840 #endif
  2841                             return Blit565to565SurfaceAlpha;
  2842                     } else if (df->Gmask == 0x3e0) {
  2843 #if MMX_ASMBLIT
  2844                         if (SDL_HasMMX())
  2845                             return Blit555to555SurfaceAlphaMMX;
  2846                         else
  2847 #endif
  2848                             return Blit555to555SurfaceAlpha;
  2849                     }
  2850                 }
  2851                 return BlitNtoNSurfaceAlpha;
  2852 
  2853             case 4:
  2854                 if (sf->Rmask == df->Rmask
  2855                     && sf->Gmask == df->Gmask
  2856                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2857 #if MMX_ASMBLIT
  2858                     if (sf->Rshift % 8 == 0
  2859                         && sf->Gshift % 8 == 0
  2860                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  2861                         return BlitRGBtoRGBSurfaceAlphaMMX;
  2862 #endif
  2863                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  2864 #if SDL_ALTIVEC_BLITTERS
  2865                         if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2866                             && SDL_HasAltiVec())
  2867                             return BlitRGBtoRGBSurfaceAlphaAltivec;
  2868 #endif
  2869                         return BlitRGBtoRGBSurfaceAlpha;
  2870                     }
  2871                 }
  2872 #if SDL_ALTIVEC_BLITTERS
  2873                 if ((sf->BytesPerPixel == 4) &&
  2874                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2875                     && SDL_HasAltiVec())
  2876                     return Blit32to32SurfaceAlphaAltivec;
  2877                 else
  2878 #endif
  2879                     return BlitNtoNSurfaceAlpha;
  2880 
  2881             case 3:
  2882             default:
  2883                 return BlitNtoNSurfaceAlpha;
  2884             }
  2885         }
  2886     } else {
  2887         /* Per-pixel alpha blits */
  2888         switch (df->BytesPerPixel) {
  2889         case 1:
  2890             return BlitNto1PixelAlpha;
  2891 
  2892         case 2:
  2893 #if SDL_ALTIVEC_BLITTERS
  2894             if (sf->BytesPerPixel == 4
  2895                 && !(surface->map->dst->flags & SDL_HWSURFACE)
  2896                 && df->Gmask == 0x7e0 && df->Bmask == 0x1f
  2897                 && SDL_HasAltiVec())
  2898                 return Blit32to565PixelAlphaAltivec;
  2899             else
  2900 #endif
  2901                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2902                     && sf->Gmask == 0xff00
  2903                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2904                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  2905                 if (df->Gmask == 0x7e0)
  2906                     return BlitARGBto565PixelAlpha;
  2907                 else if (df->Gmask == 0x3e0)
  2908                     return BlitARGBto555PixelAlpha;
  2909             }
  2910             return BlitNtoNPixelAlpha;
  2911 
  2912         case 4:
  2913             if (sf->Rmask == df->Rmask
  2914                 && sf->Gmask == df->Gmask
  2915                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2916 #if MMX_ASMBLIT
  2917                 if (sf->Rshift % 8 == 0
  2918                     && sf->Gshift % 8 == 0
  2919                     && sf->Bshift % 8 == 0
  2920                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  2921                     if (SDL_Has3DNow())
  2922                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2923                     if (SDL_HasMMX())
  2924                         return BlitRGBtoRGBPixelAlphaMMX;
  2925                 }
  2926 #endif
  2927                 if (sf->Amask == 0xff000000) {
  2928 #if SDL_ALTIVEC_BLITTERS
  2929                     if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2930                         && SDL_HasAltiVec())
  2931                         return BlitRGBtoRGBPixelAlphaAltivec;
  2932 #endif
  2933                     return BlitRGBtoRGBPixelAlpha;
  2934                 }
  2935             }
  2936 #if SDL_ALTIVEC_BLITTERS
  2937             if (sf->Amask && sf->BytesPerPixel == 4 &&
  2938                 !(surface->map->dst->flags & SDL_HWSURFACE)
  2939                 && SDL_HasAltiVec())
  2940                 return Blit32to32PixelAlphaAltivec;
  2941             else
  2942 #endif
  2943                 return BlitNtoNPixelAlpha;
  2944 
  2945         case 3:
  2946         default:
  2947             return BlitNtoNPixelAlpha;
  2948         }
  2949     }
  2950 }
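       /* The call site is not part of this file; SDL_CalculateBlit() in
          SDL_blit.c is the expected caller, invoking this when a software
          blit has SDL_SRCALPHA set and installing the returned SDL_loblit as
          the low-level blit function; that routing is inferred rather than
          shown here. */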
  2951 
  2952 /* vi: set ts=4 sw=4 expandtab: */