src/video/SDL_blit_A.c
author Ryan C. Gordon <icculus@icculus.org>
Tue, 21 Nov 2006 23:24:33 +0000
changeset 2074 9e6dc39f48b6
parent 2038 eb5aedc79992
child 2086 fffea8d6bf92
permissions -rw-r--r--
Merged r2913:2914 from SDL-1.2 branch into trunk: alpha blit GCC MMX asm fix.
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2006 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 #if SDL_ASSEMBLY_ROUTINES
    28 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    29 #define MMX_ASMBLIT 1
    30 #define GCC_ASMBLIT 1
    31 #elif defined(_MSC_VER) && (_MSC_VER >= 1200) && defined(_M_IX86)
    32 #define MMX_ASMBLIT 1
    33 #define MSVC_ASMBLIT 1
    34 #endif
    35 #endif /* SDL_ASSEMBLY_ROUTINES */
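        /*
         * MMX_ASMBLIT turns on the MMX blitters below; GCC_ASMBLIT selects the
         * GNU C inline-assembly versions (via mmx.h), while MSVC_ASMBLIT selects
         * the intrinsics-based versions (mmintrin.h / mm3dnow.h).
         */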
    36 
    37 /* Function to check the CPU flags */
    38 #include "SDL_cpuinfo.h"
    39 #if GCC_ASMBLIT
    40 #include "mmx.h"
    41 #elif MSVC_ASMBLIT
    42 #include <mmintrin.h>
    43 #include <mm3dnow.h>
    44 #endif
    45 
    46 /* Functions to perform alpha blended blitting */
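        /*
         * A note on the helpers used throughout this file (see SDL_blit.h):
         * DUFFS_LOOP4() / DUFFS_LOOP_DOUBLE2() are Duff's-device style loop
         * unrollers, DISEMBLE_RGB(A) / ASSEMBLE_RGBA unpack and repack pixels,
         * and ALPHA_BLEND() blends one pixel roughly as, per channel,
         *     d = d + (((s - d) * A) >> 8)
         * e.g. s = 255, d = 0, A = 128 gives 127.  ACCURATE_ALPHA_BLEND(),
         * used by the AltiVec blitters, does the same but with a more precise
         * division by 255.
         */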
    47 
    48 /* N->1 blending with per-surface alpha */
    49 static void
    50 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    51 {
    52     int width = info->d_width;
    53     int height = info->d_height;
    54     Uint8 *src = info->s_pixels;
    55     int srcskip = info->s_skip;
    56     Uint8 *dst = info->d_pixels;
    57     int dstskip = info->d_skip;
    58     Uint8 *palmap = info->table;
    59     SDL_PixelFormat *srcfmt = info->src;
    60     SDL_PixelFormat *dstfmt = info->dst;
    61     int srcbpp = srcfmt->BytesPerPixel;
    62 
    63     const unsigned A = srcfmt->alpha;
    64 
    65     while (height--) {
    66 	    /* *INDENT-OFF* */
    67 	    DUFFS_LOOP4(
    68 	    {
    69 		Uint32 Pixel;
    70 		unsigned sR;
    71 		unsigned sG;
    72 		unsigned sB;
    73 		unsigned dR;
    74 		unsigned dG;
    75 		unsigned dB;
    76 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    77 		dR = dstfmt->palette->colors[*dst].r;
    78 		dG = dstfmt->palette->colors[*dst].g;
    79 		dB = dstfmt->palette->colors[*dst].b;
    80 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    81 		dR &= 0xff;
    82 		dG &= 0xff;
    83 		dB &= 0xff;
    84 		/* Pack RGB into 8bit pixel */
    85 		if ( palmap == NULL ) {
    86 		    *dst =((dR>>5)<<(3+2))|
    87 			  ((dG>>5)<<(2))|
    88 			  ((dB>>6)<<(0));
    89 		} else {
    90 		    *dst = palmap[((dR>>5)<<(3+2))|
    91 				  ((dG>>5)<<(2))  |
    92 				  ((dB>>6)<<(0))];
    93 		}
    94 		dst++;
    95 		src += srcbpp;
    96 	    },
    97 	    width);
    98 	    /* *INDENT-ON* */
    99         src += srcskip;
   100         dst += dstskip;
   101     }
   102 }
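        /*
         * The 8-bit packing used by the N->1 blitters here is a 3-3-2 layout:
         * the top 3 bits of red and green and the top 2 bits of blue, i.e.
         *     (R>>5)<<5 | (G>>5)<<2 | (B>>6)
         * For example, a blended result of R=200, G=100, B=50 packs to
         * (6<<5)|(3<<2)|0 = 0xCC.  When 'palmap' is non-NULL, that value is used
         * as an index into the precomputed palette mapping table instead.
         */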
   103 
   104 /* N->1 blending with pixel alpha */
   105 static void
   106 BlitNto1PixelAlpha(SDL_BlitInfo * info)
   107 {
   108     int width = info->d_width;
   109     int height = info->d_height;
   110     Uint8 *src = info->s_pixels;
   111     int srcskip = info->s_skip;
   112     Uint8 *dst = info->d_pixels;
   113     int dstskip = info->d_skip;
   114     Uint8 *palmap = info->table;
   115     SDL_PixelFormat *srcfmt = info->src;
   116     SDL_PixelFormat *dstfmt = info->dst;
   117     int srcbpp = srcfmt->BytesPerPixel;
   118 
   119     /* FIXME: fix alpha bit field expansion here too? */
   120     while (height--) {
   121 	    /* *INDENT-OFF* */
   122 	    DUFFS_LOOP4(
   123 	    {
   124 		Uint32 Pixel;
   125 		unsigned sR;
   126 		unsigned sG;
   127 		unsigned sB;
   128 		unsigned sA;
   129 		unsigned dR;
   130 		unsigned dG;
   131 		unsigned dB;
   132 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   133 		dR = dstfmt->palette->colors[*dst].r;
   134 		dG = dstfmt->palette->colors[*dst].g;
   135 		dB = dstfmt->palette->colors[*dst].b;
   136 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   137 		dR &= 0xff;
   138 		dG &= 0xff;
   139 		dB &= 0xff;
   140 		/* Pack RGB into 8bit pixel */
   141 		if ( palmap == NULL ) {
   142 		    *dst =((dR>>5)<<(3+2))|
   143 			  ((dG>>5)<<(2))|
   144 			  ((dB>>6)<<(0));
   145 		} else {
   146 		    *dst = palmap[((dR>>5)<<(3+2))|
   147 				  ((dG>>5)<<(2))  |
   148 				  ((dB>>6)<<(0))  ];
   149 		}
   150 		dst++;
   151 		src += srcbpp;
   152 	    },
   153 	    width);
   154 	    /* *INDENT-ON* */
   155         src += srcskip;
   156         dst += dstskip;
   157     }
   158 }
   159 
   160 /* colorkeyed N->1 blending with per-surface alpha */
   161 static void
   162 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   163 {
   164     int width = info->d_width;
   165     int height = info->d_height;
   166     Uint8 *src = info->s_pixels;
   167     int srcskip = info->s_skip;
   168     Uint8 *dst = info->d_pixels;
   169     int dstskip = info->d_skip;
   170     Uint8 *palmap = info->table;
   171     SDL_PixelFormat *srcfmt = info->src;
   172     SDL_PixelFormat *dstfmt = info->dst;
   173     int srcbpp = srcfmt->BytesPerPixel;
   174     Uint32 ckey = srcfmt->colorkey;
   175 
   176     const int A = srcfmt->alpha;
   177 
   178     while (height--) {
   179 	    /* *INDENT-OFF* */
   180 	    DUFFS_LOOP(
   181 	    {
   182 		Uint32 Pixel;
   183 		unsigned sR;
   184 		unsigned sG;
   185 		unsigned sB;
   186 		unsigned dR;
   187 		unsigned dG;
   188 		unsigned dB;
   189 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   190 		if ( Pixel != ckey ) {
   191 		    dR = dstfmt->palette->colors[*dst].r;
   192 		    dG = dstfmt->palette->colors[*dst].g;
   193 		    dB = dstfmt->palette->colors[*dst].b;
   194 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   195 		    dR &= 0xff;
   196 		    dG &= 0xff;
   197 		    dB &= 0xff;
   198 		    /* Pack RGB into 8bit pixel */
   199 		    if ( palmap == NULL ) {
   200 			*dst =((dR>>5)<<(3+2))|
   201 			      ((dG>>5)<<(2)) |
   202 			      ((dB>>6)<<(0));
   203 		    } else {
   204 			*dst = palmap[((dR>>5)<<(3+2))|
   205 				      ((dG>>5)<<(2))  |
   206 				      ((dB>>6)<<(0))  ];
   207 		    }
   208 		}
   209 		dst++;
   210 		src += srcbpp;
   211 	    },
   212 	    width);
   213 	    /* *INDENT-ON* */
   214         src += srcskip;
   215         dst += dstskip;
   216     }
   217 }
   218 
   219 #if GCC_ASMBLIT
   220 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
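        /*
         * alpha == 128 reduces to a straight per-channel average:
         *     result = (((s & 0xfefefe) + (d & 0xfefefe)) >> 1) + (s & d & 0x010101)
         * Masking with 0xfefefe clears the low bit of every channel so the add
         * cannot carry across channel boundaries; the (s & d & 0x010101) term
         * restores the rounding lost from those low bits.  The destination alpha
         * is then forced with '| dalpha'.  E.g. s = 0x40 and d = 0x21 in one
         * channel gives (0x40 + 0x20)/2 + 0 = 0x30, the average of 64 and 33
         * rounded down.
         */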
   221 static void
   222 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   223 {
   224     int width = info->d_width;
   225     int height = info->d_height;
   226     Uint32 *srcp = (Uint32 *) info->s_pixels;
   227     int srcskip = info->s_skip >> 2;
   228     Uint32 *dstp = (Uint32 *) info->d_pixels;
   229     int dstskip = info->d_skip >> 2;
   230     Uint32 dalpha = info->dst->Amask;
   231     Uint8 load[8];
   232 
   233     *(Uint64 *) load = 0x00fefefe00fefefeULL;   /* alpha128 mask */
   234     movq_m2r(*load, mm4);       /* alpha128 mask -> mm4 */
   235     *(Uint64 *) load = 0x0001010100010101ULL;   /* !alpha128 mask */
   236     movq_m2r(*load, mm3);       /* !alpha128 mask -> mm3 */
   237     movd_m2r(dalpha, mm7);      /* dst alpha mask */
   238     punpckldq_r2r(mm7, mm7);    /* dst alpha mask | dst alpha mask -> mm7 */
   239     while (height--) {
   240 		/* *INDENT-OFF* */
   241 		DUFFS_LOOP_DOUBLE2(
   242 		{
   243 			Uint32 s = *srcp++;
   244 			Uint32 d = *dstp;
   245 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   246 				   + (s & d & 0x00010101)) | dalpha;
   247 		},{
   248 			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   249 			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   250 
   251 			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
   252 			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
   253 
   254 			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
   255 			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
   256 			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
   257 			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
   258 			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
   259 			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
   260 			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
   261 			
   262 			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   263 			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
   264 			dstp += 2;
   265 			srcp += 2;
   266 		}, width);
   267 		/* *INDENT-ON* */
   268         srcp += srcskip;
   269         dstp += dstskip;
   270     }
   271     emms();
   272 }
   273 
   274 /* fast RGB888->(A)RGB888 blending with surface alpha */
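        /*
         * General case: the pixels are widened to 16-bit lanes and each channel
         * is blended as d = d + (((s - d) * alpha) >> 8).  The alpha multiplier
         * in mm4 is masked down to the R/G/B lanes only, so the source alpha
         * lane is multiplied by zero and the destination alpha survives the add;
         * it is then forced to opaque by ORing in the destination Amask (mm7),
         * which is a no-op for destinations without an alpha channel.
         */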
   275 static void
   276 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   277 {
   278     SDL_PixelFormat *df = info->dst;
   279     unsigned alpha = info->src->alpha;
   280 
   281     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   282         /* only call a128 version when R,G,B occupy lower bits */
   283         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   284     } else {
   285         int width = info->d_width;
   286         int height = info->d_height;
   287         Uint32 *srcp = (Uint32 *) info->s_pixels;
   288         int srcskip = info->s_skip >> 2;
   289         Uint32 *dstp = (Uint32 *) info->d_pixels;
   290         int dstskip = info->d_skip >> 2;
   291 
   292         pxor_r2r(mm5, mm5);     /* 0 -> mm5 */
   293         /* form the alpha mult */
   294         movd_m2r(alpha, mm4);   /* 0000000A -> mm4 */
   295         punpcklwd_r2r(mm4, mm4);        /* 00000A0A -> mm4 */
   296         punpckldq_r2r(mm4, mm4);        /* 0A0A0A0A -> mm4 */
   297         alpha =
    298             (0xff << df->Rshift) | (0xff << df->Gshift) |
    299             (0xff << df->Bshift);
   300         movd_m2r(alpha, mm0);   /* 00000FFF -> mm0 */
   301         punpcklbw_r2r(mm0, mm0);        /* 00FFFFFF -> mm0 */
   302         pand_r2r(mm0, mm4);     /* 0A0A0A0A -> mm4, minus 1 chan */
   303         /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
   304         movd_m2r(df->Amask, mm7);       /* dst alpha mask */
   305         punpckldq_r2r(mm7, mm7);        /* dst alpha mask | dst alpha mask -> mm7 */
   306 
   307         while (height--) {
   308 			/* *INDENT-OFF* */
   309 			DUFFS_LOOP_DOUBLE2({
   310 				/* One Pixel Blend */
   311 				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   312 				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   313 				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
   314 				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
   315 
   316 				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   317 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   318 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   319 				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   320 
   321 				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
   322 				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   323 				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
   324 				++srcp;
   325 				++dstp;
   326 			},{
   327 				/* Two Pixels Blend */
   328 				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
   329 				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   330 				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
   331 				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   332 
   333 				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
   334 				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
   335 				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
   336 				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
   337 
   338 				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
   339 				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
    340 				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
   341 				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
   342 
   343 				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
   344 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   345 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   346 				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
   347 
   348 				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
   349 				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
   350 				
   351 				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
   352 
   353   				srcp += 2;
   354   				dstp += 2;
   355   			}, width);
   356 			/* *INDENT-ON* */
   357             srcp += srcskip;
   358             dstp += dstskip;
   359         }
   360         emms();
   361     }
   362 }
   363 
   364 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
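        /*
         * Per-pixel alpha.  alpha == 0 and alpha == Amask are special-cased
         * because the >>8 blend is only an approximation of /255: for example
         * s = 255, d = 0, a = 255 gives ((255 - 0) * 255) >> 8 = 254, not 255,
         * so fully opaque pixels would come out slightly dark without the
         * copy-through path.
         */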
   365 static void
   366 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   367 {
   368     int width = info->d_width;
   369     int height = info->d_height;
   370     Uint32 *srcp = (Uint32 *) info->s_pixels;
   371     int srcskip = info->s_skip >> 2;
   372     Uint32 *dstp = (Uint32 *) info->d_pixels;
   373     int dstskip = info->d_skip >> 2;
   374     SDL_PixelFormat *sf = info->src;
   375     Uint32 amask = sf->Amask;
   376 
   377     pxor_r2r(mm6, mm6);         /* 0 -> mm6 */
   378     /* form multiplication mask */
   379     movd_m2r(sf->Amask, mm7);   /* 0000F000 -> mm7 */
   380     punpcklbw_r2r(mm7, mm7);    /* FF000000 -> mm7 */
   381     pcmpeqb_r2r(mm0, mm0);      /* FFFFFFFF -> mm0 */
   382     movq_r2r(mm0, mm3);         /* FFFFFFFF -> mm3 (for later) */
   383     pxor_r2r(mm0, mm7);         /* 00FFFFFF -> mm7 (mult mask) */
   384     /* form channel masks */
   385     movq_r2r(mm7, mm0);         /* 00FFFFFF -> mm0 */
   386     packsswb_r2r(mm6, mm0);     /* 00000FFF -> mm0 (channel mask) */
   387     packsswb_r2r(mm6, mm3);     /* 0000FFFF -> mm3 */
   388     pxor_r2r(mm0, mm3);         /* 0000F000 -> mm3 (~channel mask) */
   389     /* get alpha channel shift */
   390     movd_m2r(sf->Ashift, mm5);  /* Ashift -> mm5 */
   391 
   392     while (height--) {
   393 	    /* *INDENT-OFF* */
   394 	    DUFFS_LOOP4({
   395 		Uint32 alpha = *srcp & amask;
   396 		/* FIXME: Here we special-case opaque alpha since the
    397 			compositing used (>>8 instead of /255) doesn't handle
   398 			it correctly. Also special-case alpha=0 for speed?
   399 			Benchmark this! */
   400 		if(alpha == 0) {
   401 			/* do nothing */
   402 		} else if(alpha == amask) {
   403 			/* opaque alpha -- copy RGB, keep dst alpha */
   404 			/* using MMX here to free up regular registers for other things */
   405 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   406 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   407 			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
   408 			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
   409 			por_r2r(mm1, mm2); /* src | dst -> mm2 */
   410 			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
   411 		} else {
   412 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   413 			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
   414 
   415 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   416 			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
   417 
   418 			__asm__ __volatile__ (
   419 				"movd %0, %%mm4"
   420 				: : "r" (alpha) ); /* 0000A000 -> mm4 */
   421 			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
   422 			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   423 			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
   424 			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
   425 
   426 			/* blend */		    
   427 			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   428 			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   429 			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
   430 			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   431 			
   432 			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
   433 			movd_r2m(mm2, *dstp);/* mm2 -> dst */
   434 		}
   435 		++srcp;
   436 		++dstp;
   437 	    }, width);
   438 	    /* *INDENT-ON* */
   439         srcp += srcskip;
   440         dstp += dstskip;
   441     }
   442     emms();
   443 }
   444 
   445 /* End GCC_ASMBLIT */
   446 
   447 #elif MSVC_ASMBLIT
   448 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
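        /*
         * The MSVC_ASMBLIT blitters below mirror the GCC inline-asm versions
         * above, using the MMX intrinsics from <mmintrin.h>.  Instead of
         * DUFFS_LOOP_DOUBLE2, an odd leading pixel is handled in plain C and
         * the remaining pairs in the for() loop.
         */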
   449 static void
   450 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   451 {
   452     int width = info->d_width;
   453     int height = info->d_height;
   454     Uint32 *srcp = (Uint32 *) info->s_pixels;
   455     int srcskip = info->s_skip >> 2;
   456     Uint32 *dstp = (Uint32 *) info->d_pixels;
   457     int dstskip = info->d_skip >> 2;
   458     Uint32 dalpha = info->dst->Amask;
   459 
   460     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   461 
   462     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   463     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   464     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   465 
   466     while (height--) {
   467         int n = width;
   468         if (n & 1) {
   469             Uint32 s = *srcp++;
   470             Uint32 d = *dstp;
   471             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   472                        + (s & d & 0x00010101)) | dalpha;
   473             n--;
   474         }
   475 
   476         for (n >>= 1; n > 0; --n) {
   477             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   478             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   479 
   480             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   481             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   482 
   483             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   484             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   485             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   486             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   487 
   488             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   489             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   490             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   491             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   492 
   493             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   494             dstp += 2;
   495             srcp += 2;
   496         }
   497 
   498         srcp += srcskip;
   499         dstp += dstskip;
   500     }
   501     _mm_empty();
   502 }
   503 
   504 /* fast RGB888->(A)RGB888 blending with surface alpha */
   505 static void
   506 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   507 {
   508     SDL_PixelFormat *df = info->dst;
   509     Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   510     unsigned alpha = info->src->alpha;
   511 
   512     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   513         /* only call a128 version when R,G,B occupy lower bits */
   514         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   515     } else {
   516         int width = info->d_width;
   517         int height = info->d_height;
   518         Uint32 *srcp = (Uint32 *) info->s_pixels;
   519         int srcskip = info->s_skip >> 2;
   520         Uint32 *dstp = (Uint32 *) info->d_pixels;
   521         int dstskip = info->d_skip >> 2;
   522         Uint32 dalpha = df->Amask;
   523         Uint32 amult;
   524 
   525         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   526 
   527         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   528         /* form the alpha mult */
   529         amult = alpha | (alpha << 8);
   530         amult = amult | (amult << 16);
   531         chanmask =
    532             (0xff << df->Rshift) | (0xff << df->Gshift) |
    533             (0xff << df->Bshift);
   534         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   535         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   536         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   537         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   538 
   539         while (height--) {
   540             int n = width;
   541             if (n & 1) {
   542                 /* One Pixel Blend */
   543                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   544                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   545 
   546                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   547                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   548 
    549                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst1 -> src2 */
   550                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   551                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   552                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   553 
   554                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   555                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   556                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   557 
   558                 ++srcp;
   559                 ++dstp;
   560 
   561                 n--;
   562             }
   563 
   564             for (n >>= 1; n > 0; --n) {
   565                 /* Two Pixels Blend */
   566                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   567                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   568                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   569                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   570 
   571                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   572                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   573                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   574                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   575 
   576                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   577                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   578                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   579                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   580 
   581                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   582                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   583                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   584                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   585 
   586                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   587                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   588 
   589                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   590 
   591                 srcp += 2;
   592                 dstp += 2;
   593             }
   594             srcp += srcskip;
   595             dstp += dstskip;
   596         }
   597         _mm_empty();
   598     }
   599 }
   600 
   601 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   602 static void
   603 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   604 {
   605     int width = info->d_width;
   606     int height = info->d_height;
   607     Uint32 *srcp = (Uint32 *) info->s_pixels;
   608     int srcskip = info->s_skip >> 2;
   609     Uint32 *dstp = (Uint32 *) info->d_pixels;
   610     int dstskip = info->d_skip >> 2;
   611     SDL_PixelFormat *sf = info->src;
   612     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   613     Uint32 amask = sf->Amask;
   614     Uint32 ashift = sf->Ashift;
   615     Uint64 multmask;
   616 
   617     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   618 
   619     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   620 	/* *INDENT-OFF* */
   621 	multmask = ~(0xFFFFI64 << (ashift * 2));
   622 	/* *INDENT-ON* */
   623     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
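            /*
             * multmask has all bits set except the 16-bit lane that holds alpha
             * once a pixel is unpacked to 0A0R0G0B (ashift * 2 turns the byte
             * shift into a 16-bit-lane shift).  ANDing the alpha multiplier with
             * it zeroes that lane, so the destination alpha passes through the
             * blend unchanged.
             */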
   624 
   625     while (height--) {
   626 		/* *INDENT-OFF* */
   627 		DUFFS_LOOP4({
   628 		Uint32 alpha = *srcp & amask;
   629 		if (alpha == 0) {
   630 			/* do nothing */
   631 		} else if (alpha == amask) {
   632 			/* opaque alpha -- copy RGB, keep dst alpha */
   633 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   634 		} else {
   635 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   636 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   637 
   638 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   639 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   640 
   641 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   642 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   643 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   644 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   645 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   646 
   647 			/* blend */		    
   648 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   649 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   650 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   651 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   652 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   653 			
   654 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   655 		}
   656 		++srcp;
   657 		++dstp;
   658 	    }, width);
   659 		/* *INDENT-ON* */
   660         srcp += srcskip;
   661         dstp += dstskip;
   662     }
   663     _mm_empty();
   664 }
   665 
   666 /* End MSVC_ASMBLIT */
   667 
   668 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   669 
   670 #if SDL_ALTIVEC_BLITTERS
   671 #if __MWERKS__
   672 #pragma altivec_model on
   673 #endif
   674 #if HAVE_ALTIVEC_H
   675 #include <altivec.h>
   676 #endif
   677 #include <assert.h>
   678 
   679 #if (defined(__MACOSX__) && (__GNUC__ < 4))
   680 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   681         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
   682 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   683         (vector unsigned short) ( a,b,c,d,e,f,g,h )
   684 #else
   685 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   686         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
   687 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   688         (vector unsigned short) { a,b,c,d,e,f,g,h }
   689 #endif
   690 
   691 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
   692 #define VECPRINT(msg, v) do { \
   693     vector unsigned int tmpvec = (vector unsigned int)(v); \
   694     unsigned int *vp = (unsigned int *)&tmpvec; \
   695     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   696 } while (0)
   697 
    698 /* the permutation vector that takes the high bytes out of all the appropriate shorts
   699     (vector unsigned char)(
   700         0x00, 0x10, 0x02, 0x12,
   701         0x04, 0x14, 0x06, 0x16,
   702         0x08, 0x18, 0x0A, 0x1A,
   703         0x0C, 0x1C, 0x0E, 0x1E );
   704 */
   705 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
   706 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
   707 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
   708 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
   709     ? vec_lvsl(0, src) \
   710     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
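        /*
         * These helpers build constant vectors without vector literals (the
         * splat intrinsics only accept small immediates):
         *   VEC_U32_24()        - 24 in every 32-bit element (12 + 12)
         *   VEC_ALPHA_MASK()    - 0xFF000000 in every element (all-ones << 24),
         *                         i.e. just the alpha byte of each ARGB pixel
         *   VEC_MERGE_PERMUTE() - the high-byte merge permute shown above
         *   VEC_ALIGNER(src)    - a vec_perm control, paired with the double
         *                         vec_ld() pattern in the loops below, for
         *                         reading from a possibly unaligned src pointer
         */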
   711 
   712 
   713 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
   714     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
   715     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
   716     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
   717     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
   718     /* valpha2 is 255-alpha */ \
   719     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
   720     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
   721     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
   722     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
   723     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
   724     /* add source and dest */ \
   725     vtemp1 = vec_add(vtemp1, vtemp3); \
   726     vtemp2 = vec_add(vtemp2, vtemp4); \
   727     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
   728     vtemp1 = vec_add(vtemp1, v1_16); \
   729     vtemp3 = vec_sr(vtemp1, v8_16); \
   730     vtemp1 = vec_add(vtemp1, vtemp3); \
   731     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
   732     vtemp2 = vec_add(vtemp2, v1_16); \
   733     vtemp4 = vec_sr(vtemp2, v8_16); \
   734     vtemp2 = vec_add(vtemp2, vtemp4); \
   735     /* (>>8) and get ARGBARGBARGBARGB */ \
   736     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   737 } while (0)
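        /*
         * VEC_MULTIPLY_ALPHA computes, per channel,
         *     d = (s * alpha + d * (255 - alpha)) / 255
         * The division uses the integer identity
         *     x / 255 == (x + 1 + ((x + 1) >> 8)) >> 8
         * for the 16-bit sums that occur here, with the trailing >>8 done by
         * mergePermute selecting the high byte of each 16-bit lane.  This makes
         * the vector path exact for alpha == 0 and alpha == 255.
         */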
   738 
   739 /* Calculate the permute vector used for 32->32 swizzling */
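        /*
         * The swizzle works per 32-bit pixel: RESHIFT() turns a channel's bit
         * shift (0/8/16/24) into its big-endian byte index (3/2/1/0), each index
         * is placed in the destination channel's byte position, and the 'plus'
         * vector adds 0/4/8/12 so the same pattern repeats for all four pixels
         * in the vector.  Index 0x10 is used for a missing alpha byte, which
         * makes vec_perm take that byte from its second (filler) operand.
         */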
   740 static vector unsigned char
   741 calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
   742 {
   743     /*
    744      * We have to assume that the bits that aren't used by the other
    745      *  colors are alpha, and that alpha is one complete byte, since some
    746      *  formats leave alpha with a zero mask, but we should still swizzle the bits.
   747      */
   748     /* ARGB */
   749     const static struct SDL_PixelFormat default_pixel_format = {
   750         NULL, 0, 0,
   751         0, 0, 0, 0,
   752         16, 8, 0, 24,
   753         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
   754         0, 0
   755     };
   756     if (!srcfmt) {
   757         srcfmt = &default_pixel_format;
   758     }
   759     if (!dstfmt) {
   760         dstfmt = &default_pixel_format;
   761     }
   762     const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
   763                                                        0x04, 0x04, 0x04, 0x04,
   764                                                        0x08, 0x08, 0x08, 0x08,
   765                                                        0x0C, 0x0C, 0x0C,
   766                                                        0x0C);
   767     vector unsigned char vswiz;
   768     vector unsigned int srcvec;
   769 #define RESHIFT(X) (3 - ((X) >> 3))
   770     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   771     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   772     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   773     Uint32 amask;
   774     /* Use zero for alpha if either surface doesn't have alpha */
   775     if (dstfmt->Amask) {
   776         amask =
    777             ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10)
    778                 << (dstfmt->Ashift);
   779     } else {
   780         amask =
   781             0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
   782                           0xFFFFFFFF);
   783     }
   784 #undef RESHIFT
   785     ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
   786     vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
   787     return (vswiz);
   788 }
   789 
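        /*
         * ARGB8888 -> RGB565 blit with per-pixel alpha (AltiVec).  Pixels before
         * the first 16-byte-aligned destination address and the (width % 8) tail
         * go through the scalar ONE_PIXEL_BLEND path; the aligned middle is done
         * 8 pixels at a time by expanding the 565 destination to 8888, blending
         * with VEC_MULTIPLY_ALPHA, and repacking to 565.
         */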
   790 static void
   791 Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
   792 {
   793     int height = info->d_height;
   794     Uint8 *src = (Uint8 *) info->s_pixels;
   795     int srcskip = info->s_skip;
   796     Uint8 *dst = (Uint8 *) info->d_pixels;
   797     int dstskip = info->d_skip;
   798     SDL_PixelFormat *srcfmt = info->src;
   799 
   800     vector unsigned char v0 = vec_splat_u8(0);
   801     vector unsigned short v8_16 = vec_splat_u16(8);
   802     vector unsigned short v1_16 = vec_splat_u16(1);
   803     vector unsigned short v2_16 = vec_splat_u16(2);
   804     vector unsigned short v3_16 = vec_splat_u16(3);
   805     vector unsigned int v8_32 = vec_splat_u32(8);
   806     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
   807     vector unsigned short v3f =
   808         VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
   809                           0x003f, 0x003f, 0x003f, 0x003f);
   810     vector unsigned short vfc =
   811         VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
   812                           0x00fc, 0x00fc, 0x00fc, 0x00fc);
   813 
   814     /* 
   815        0x10 - 0x1f is the alpha
   816        0x00 - 0x0e evens are the red
   817        0x01 - 0x0f odds are zero
   818      */
   819     vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
   820                                                        0x10, 0x02, 0x01, 0x01,
   821                                                        0x10, 0x04, 0x01, 0x01,
   822                                                        0x10, 0x06, 0x01,
   823                                                        0x01);
   824     vector unsigned char vredalpha2 =
   825         (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
   826                                         vec_sl(v8_32, v16_32))
   827         );
   828     /*
   829        0x00 - 0x0f is ARxx ARxx ARxx ARxx
    830        0x11 - 0x1f odds are blue
   831      */
   832     vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
   833                                                    0x04, 0x05, 0x06, 0x13,
   834                                                    0x08, 0x09, 0x0a, 0x15,
   835                                                    0x0c, 0x0d, 0x0e, 0x17);
   836     vector unsigned char vblue2 =
   837         (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
   838         );
   839     /*
   840        0x00 - 0x0f is ARxB ARxB ARxB ARxB
    841        0x10 - 0x1e evens are green
   842      */
   843     vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
   844                                                     0x04, 0x05, 0x12, 0x07,
   845                                                     0x08, 0x09, 0x14, 0x0b,
   846                                                     0x0c, 0x0d, 0x16, 0x0f);
   847     vector unsigned char vgreen2 =
    848         (vector unsigned char) (vec_add((vector unsigned int) vgreen1,
    849                                         vec_sl(v8_32, v8_32))
    850         );
   851     vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
   852                                                     0x00, 0x0a, 0x00, 0x0e,
   853                                                     0x00, 0x12, 0x00, 0x16,
   854                                                     0x00, 0x1a, 0x00, 0x1e);
   855     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
   856     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
   857     vector unsigned char valphaPermute =
   858         vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   859 
   860     vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
   861     vf800 = vec_sl(vf800, vec_splat_u16(8));
   862 
   863     while (height--) {
   864         int extrawidth;
   865         vector unsigned char valigner;
   866         vector unsigned char vsrc;
   867         vector unsigned char voverflow;
   868         int width = info->d_width;
   869 
   870 #define ONE_PIXEL_BLEND(condition, widthvar) \
   871         while (condition) { \
   872             Uint32 Pixel; \
   873             unsigned sR, sG, sB, dR, dG, dB, sA; \
   874             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   875             if(sA) { \
   876                 unsigned short dstpixel = *((unsigned short *)dst); \
   877                 dR = (dstpixel >> 8) & 0xf8; \
   878                 dG = (dstpixel >> 3) & 0xfc; \
   879                 dB = (dstpixel << 3) & 0xf8; \
   880                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   881                 *((unsigned short *)dst) = ( \
   882                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
   883                 ); \
   884             } \
   885             src += 4; \
   886             dst += 2; \
   887             widthvar--; \
   888         }
   889         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
   890         extrawidth = (width % 8);
   891         valigner = VEC_ALIGNER(src);
   892         vsrc = (vector unsigned char) vec_ld(0, src);
   893         width -= extrawidth;
   894         while (width) {
   895             vector unsigned char valpha;
   896             vector unsigned char vsrc1, vsrc2;
   897             vector unsigned char vdst1, vdst2;
   898             vector unsigned short vR, vG, vB;
   899             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   900 
   901             /* Load 8 pixels from src as ARGB */
   902             voverflow = (vector unsigned char) vec_ld(15, src);
   903             vsrc = vec_perm(vsrc, voverflow, valigner);
   904             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
   905             src += 16;
   906             vsrc = (vector unsigned char) vec_ld(15, src);
   907             voverflow = vec_perm(voverflow, vsrc, valigner);
   908             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
   909             src += 16;
   910 
   911             /* Load 8 pixels from dst as XRGB */
   912             voverflow = vec_ld(0, dst);
   913             vR = vec_and((vector unsigned short) voverflow, vf800);
   914             vB = vec_sl((vector unsigned short) voverflow, v3_16);
   915             vG = vec_sl(vB, v2_16);
   916             vdst1 =
   917                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   918                                                 (vector unsigned char) vR,
   919                                                 vredalpha1);
   920             vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
   921             vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
   922             vdst2 =
   923                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   924                                                 (vector unsigned char) vR,
   925                                                 vredalpha2);
   926             vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
   927             vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
   928 
   929             /* Alpha blend 8 pixels as ARGB */
   930             valpha = vec_perm(vsrc1, v0, valphaPermute);
   931             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
   932                                v8_16);
   933             valpha = vec_perm(vsrc2, v0, valphaPermute);
   934             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
   935                                v8_16);
   936 
   937             /* Convert 8 pixels to 565 */
   938             vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
   939                                                         vdst1,
   940                                                         (vector unsigned int)
   941                                                         vdst2);
   942             vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
   943             vgpixel = vec_and(vgpixel, vfc);
   944             vgpixel = vec_sl(vgpixel, v3_16);
   945             vrpixel = vec_sl(vpixel, v1_16);
   946             vrpixel = vec_and(vrpixel, vf800);
   947             vbpixel = vec_and(vpixel, v3f);
   948             vdst1 =
   949                 vec_or((vector unsigned char) vrpixel,
   950                        (vector unsigned char) vgpixel);
   951             vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);
   952 
   953             /* Store 8 pixels */
   954             vec_st(vdst1, 0, dst);
   955 
   956             width -= 8;
   957             dst += 16;
   958         }
   959         ONE_PIXEL_BLEND((extrawidth), extrawidth);
   960 #undef ONE_PIXEL_BLEND
   961         src += srcskip;
   962         dst += dstskip;
   963     }
   964 }
   965 
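        /*
         * N->N 32 bpp blit with per-surface alpha and a source colorkey
         * (AltiVec).  Key matches are detected with vec_cmpeq on the RGB-masked
         * source, and the blended result for those lanes is discarded via
         * vec_sel, leaving the destination pixels untouched.
         */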
   966 static void
   967 Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
   968 {
   969     unsigned alpha = info->src->alpha;
   970     int height = info->d_height;
   971     Uint32 *srcp = (Uint32 *) info->s_pixels;
   972     int srcskip = info->s_skip >> 2;
   973     Uint32 *dstp = (Uint32 *) info->d_pixels;
   974     int dstskip = info->d_skip >> 2;
   975     SDL_PixelFormat *srcfmt = info->src;
   976     SDL_PixelFormat *dstfmt = info->dst;
   977     unsigned sA = srcfmt->alpha;
   978     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   979     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   980     Uint32 ckey = info->src->colorkey;
   981     vector unsigned char mergePermute;
   982     vector unsigned char vsrcPermute;
   983     vector unsigned char vdstPermute;
   984     vector unsigned char vsdstPermute;
   985     vector unsigned char valpha;
   986     vector unsigned char valphamask;
   987     vector unsigned char vbits;
   988     vector unsigned char v0;
   989     vector unsigned short v1;
   990     vector unsigned short v8;
   991     vector unsigned int vckey;
   992     vector unsigned int vrgbmask;
   993 
   994     mergePermute = VEC_MERGE_PERMUTE();
   995     v0 = vec_splat_u8(0);
   996     v1 = vec_splat_u16(1);
   997     v8 = vec_splat_u16(8);
   998 
   999     /* set the alpha to 255 on the destination surf */
  1000     valphamask = VEC_ALPHA_MASK();
  1001 
  1002     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1003     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1004     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1005 
  1006     /* set a vector full of alpha and 255-alpha */
  1007     ((unsigned char *) &valpha)[0] = alpha;
  1008     valpha = vec_splat(valpha, 0);
  1009     vbits = (vector unsigned char) vec_splat_s8(-1);
  1010 
  1011     ckey &= rgbmask;
  1012     ((unsigned int *) (char *) &vckey)[0] = ckey;
  1013     vckey = vec_splat(vckey, 0);
  1014     ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
  1015     vrgbmask = vec_splat(vrgbmask, 0);
  1016 
  1017     while (height--) {
  1018         int width = info->d_width;
  1019 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1020         while (condition) { \
  1021             Uint32 Pixel; \
  1022             unsigned sR, sG, sB, dR, dG, dB; \
  1023             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
  1024             if(sA && Pixel != ckey) { \
  1025                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
  1026                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1027                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1028                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1029             } \
  1030             dstp++; \
  1031             srcp++; \
  1032             widthvar--; \
  1033         }
  1034         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1035         if (width > 0) {
  1036             int extrawidth = (width % 4);
  1037             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1038             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1039             width -= extrawidth;
  1040             while (width) {
  1041                 vector unsigned char vsel;
  1042                 vector unsigned char voverflow;
  1043                 vector unsigned char vd;
  1044                 vector unsigned char vd_orig;
  1045 
  1046                 /* s = *srcp */
  1047                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1048                 vs = vec_perm(vs, voverflow, valigner);
  1049 
  1050                 /* vsel is set for items that match the key */
  1051                 vsel =
  1052                     (vector unsigned char) vec_and((vector unsigned int) vs,
  1053                                                    vrgbmask);
  1054                 vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
  1055                                                         vsel, vckey);
  1056 
  1057                 /* permute to source format */
  1058                 vs = vec_perm(vs, valpha, vsrcPermute);
  1059 
  1060                 /* d = *dstp */
  1061                 vd = (vector unsigned char) vec_ld(0, dstp);
  1062                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
  1063 
  1064                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1065 
  1066                 /* set the alpha channel to full on */
  1067                 vd = vec_or(vd, valphamask);
  1068 
  1069                 /* mask out color key */
  1070                 vd = vec_sel(vd, vd_orig, vsel);
  1071 
  1072                 /* permute to dest format */
  1073                 vd = vec_perm(vd, vbits, vdstPermute);
  1074 
  1075                 /* *dstp = res */
  1076                 vec_st((vector unsigned int) vd, 0, dstp);
  1077 
  1078                 srcp += 4;
  1079                 dstp += 4;
  1080                 width -= 4;
  1081                 vs = voverflow;
  1082             }
  1083             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1084         }
  1085 #undef ONE_PIXEL_BLEND
  1086 
  1087         srcp += srcskip;
  1088         dstp += dstskip;
  1089     }
  1090 }
  1091 
  1092 
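        /*
         * N->N 32 bpp blit with per-pixel alpha (AltiVec), for 32-bit formats
         * that need channel swizzling (calc_swizzle32).  The destination's own
         * alpha byte is saved in vdstalpha and restored after the blend.
         */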
  1093 static void
  1094 Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)
  1095 {
  1096     int width = info->d_width;
  1097     int height = info->d_height;
  1098     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1099     int srcskip = info->s_skip >> 2;
  1100     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1101     int dstskip = info->d_skip >> 2;
  1102     SDL_PixelFormat *srcfmt = info->src;
  1103     SDL_PixelFormat *dstfmt = info->dst;
  1104     vector unsigned char mergePermute;
  1105     vector unsigned char valphaPermute;
  1106     vector unsigned char vsrcPermute;
  1107     vector unsigned char vdstPermute;
  1108     vector unsigned char vsdstPermute;
  1109     vector unsigned char valphamask;
  1110     vector unsigned char vpixelmask;
  1111     vector unsigned char v0;
  1112     vector unsigned short v1;
  1113     vector unsigned short v8;
  1114 
  1115     v0 = vec_splat_u8(0);
  1116     v1 = vec_splat_u16(1);
  1117     v8 = vec_splat_u16(8);
  1118     mergePermute = VEC_MERGE_PERMUTE();
  1119     valphamask = VEC_ALPHA_MASK();
  1120     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  1121     vpixelmask = vec_nor(valphamask, v0);
  1122     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1123     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1124     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1125 
  1126     while (height--) {
  1127         width = info->d_width;
  1128 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1129             Uint32 Pixel; \
  1130             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
  1131             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
  1132             if(sA) { \
  1133               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
  1134               ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1135               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
  1136             } \
  1137             ++srcp; \
  1138             ++dstp; \
  1139             widthvar--; \
  1140         }
  1141         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1142         if (width > 0) {
  1143             /* vsrcPermute */
  1144             /* vdstPermute */
  1145             int extrawidth = (width % 4);
  1146             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1147             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1148             width -= extrawidth;
  1149             while (width) {
  1150                 vector unsigned char voverflow;
  1151                 vector unsigned char vd;
  1152                 vector unsigned char valpha;
  1153                 vector unsigned char vdstalpha;
  1154                 /* s = *srcp */
  1155                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1156                 vs = vec_perm(vs, voverflow, valigner);
  1157                 vs = vec_perm(vs, v0, vsrcPermute);
  1158 
  1159                 valpha = vec_perm(vs, v0, valphaPermute);
  1160 
  1161                 /* d = *dstp */
  1162                 vd = (vector unsigned char) vec_ld(0, dstp);
  1163                 vd = vec_perm(vd, v0, vsdstPermute);
  1164                 vdstalpha = vec_and(vd, valphamask);
  1165 
  1166                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1167 
  1168                 /* set the alpha to the dest alpha */
  1169                 vd = vec_and(vd, vpixelmask);
  1170                 vd = vec_or(vd, vdstalpha);
  1171                 vd = vec_perm(vd, v0, vdstPermute);
  1172 
  1173                 /* *dstp = res */
  1174                 vec_st((vector unsigned int) vd, 0, dstp);
  1175 
  1176                 srcp += 4;
  1177                 dstp += 4;
  1178                 width -= 4;
  1179                 vs = voverflow;
  1180 
  1181             }
  1182             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1183         }
  1184         srcp += srcskip;
  1185         dstp += dstskip;
  1186 #undef ONE_PIXEL_BLEND
  1187     }
  1188 }
  1189 
  1190 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1191 static void
  1192 BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
  1193 {
  1194     int width = info->d_width;
  1195     int height = info->d_height;
  1196     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1197     int srcskip = info->s_skip >> 2;
  1198     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1199     int dstskip = info->d_skip >> 2;
  1200     vector unsigned char mergePermute;
  1201     vector unsigned char valphaPermute;
  1202     vector unsigned char valphamask;
  1203     vector unsigned char vpixelmask;
  1204     vector unsigned char v0;
  1205     vector unsigned short v1;
  1206     vector unsigned short v8;
  1207     v0 = vec_splat_u8(0);
  1208     v1 = vec_splat_u16(1);
  1209     v8 = vec_splat_u16(8);
  1210     mergePermute = VEC_MERGE_PERMUTE();
  1211     valphamask = VEC_ALPHA_MASK();
  1212     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  1213 
  1214 
  1215     vpixelmask = vec_nor(valphamask, v0);
  1216     while (height--) {
  1217         width = info->d_width;
  1218 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1219         while ((condition)) { \
  1220             Uint32 dalpha; \
  1221             Uint32 d; \
  1222             Uint32 s1; \
  1223             Uint32 d1; \
  1224             Uint32 s = *srcp; \
  1225             Uint32 alpha = s >> 24; \
  1226             if(alpha) { \
  1227               if(alpha == SDL_ALPHA_OPAQUE) { \
  1228                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
  1229               } else { \
  1230                 d = *dstp; \
  1231                 dalpha = d & 0xff000000; \
  1232                 s1 = s & 0xff00ff; \
  1233                 d1 = d & 0xff00ff; \
  1234                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
  1235                 s &= 0xff00; \
  1236                 d &= 0xff00; \
  1237                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1238                 *dstp = d1 | d | dalpha; \
  1239               } \
  1240             } \
  1241             ++srcp; \
  1242             ++dstp; \
  1243             widthvar--; \
  1244 	    }
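                /*
                 * The scalar path above blends red and blue together in one
                 * 32-bit multiply by keeping them in the 0x00FF00FF lanes, and
                 * green separately in 0x0000FF00, using the same >>8
                 * approximation of /255 as the MMX code; the destination alpha
                 * byte is preserved.
                 */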
  1245         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1246         if (width > 0) {
  1247             int extrawidth = (width % 4);
  1248             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1249             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1250             width -= extrawidth;
  1251             while (width) {
  1252                 vector unsigned char voverflow;
  1253                 vector unsigned char vd;
  1254                 vector unsigned char valpha;
  1255                 vector unsigned char vdstalpha;
  1256                 /* s = *srcp */
  1257                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1258                 vs = vec_perm(vs, voverflow, valigner);
  1259 
  1260                 valpha = vec_perm(vs, v0, valphaPermute);
  1261 
  1262                 /* d = *dstp */
  1263                 vd = (vector unsigned char) vec_ld(0, dstp);
  1264                 vdstalpha = vec_and(vd, valphamask);
  1265 
  1266                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1267 
  1268                 /* set the alpha to the dest alpha */
  1269                 vd = vec_and(vd, vpixelmask);
  1270                 vd = vec_or(vd, vdstalpha);
  1271 
  1272                 /* *dstp = res */
  1273                 vec_st((vector unsigned int) vd, 0, dstp);
  1274 
  1275                 srcp += 4;
  1276                 dstp += 4;
  1277                 width -= 4;
  1278                 vs = voverflow;
  1279             }
  1280             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1281         }
  1282         srcp += srcskip;
  1283         dstp += dstskip;
  1284     }
  1285 #undef ONE_PIXEL_BLEND
  1286 }
  1287 
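        /*
         * N->N 32 bpp blit with per-surface alpha (AltiVec), swizzling between
         * source and destination formats as needed; the destination alpha is
         * forced to opaque via VEC_ALPHA_MASK().
         */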
  1288 static void
  1289 Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info)
  1290 {
  1291     /* XXX : 6 */
  1292     unsigned alpha = info->src->alpha;
  1293     int height = info->d_height;
  1294     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1295     int srcskip = info->s_skip >> 2;
  1296     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1297     int dstskip = info->d_skip >> 2;
  1298     SDL_PixelFormat *srcfmt = info->src;
  1299     SDL_PixelFormat *dstfmt = info->dst;
  1300     unsigned sA = srcfmt->alpha;
  1301     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1302     vector unsigned char mergePermute;
  1303     vector unsigned char vsrcPermute;
  1304     vector unsigned char vdstPermute;
  1305     vector unsigned char vsdstPermute;
  1306     vector unsigned char valpha;
  1307     vector unsigned char valphamask;
  1308     vector unsigned char vbits;
  1309     vector unsigned short v1;
  1310     vector unsigned short v8;
  1311 
  1312     mergePermute = VEC_MERGE_PERMUTE();
  1313     v1 = vec_splat_u16(1);
  1314     v8 = vec_splat_u16(8);
  1315 
  1316     /* set the alpha to 255 on the destination surf */
  1317     valphamask = VEC_ALPHA_MASK();
  1318 
  1319     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1320     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1321     vsdstPermute = calc_swizzle32(dstfmt, NULL);
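    /* The three permute vectors shuffle the source and destination channel
       orders into a common layout before VEC_MULTIPLY_ALPHA and back into
       the destination order afterwards, so this routine can blend 32-bit
       formats whose channel layouts differ. */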
  1322 
  1323     /* set a vector with every byte equal to the surface alpha */
  1324     ((unsigned char *) &valpha)[0] = alpha;
  1325     valpha = vec_splat(valpha, 0);
  1326     vbits = (vector unsigned char) vec_splat_s8(-1);
  1327 
  1328     while (height--) {
  1329         int width = info->d_width;
  1330 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1331             Uint32 Pixel; \
  1332             unsigned sR, sG, sB, dR, dG, dB; \
  1333             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  1334             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1335             ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1336             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1337             ++srcp; \
  1338             ++dstp; \
  1339             widthvar--; \
  1340         }
  1341         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1342         if (width > 0) {
  1343             int extrawidth = (width % 4);
  1344             vector unsigned char valigner = vec_lvsl(0, srcp);
  1345             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1346             width -= extrawidth;
  1347             while (width) {
  1348                 vector unsigned char voverflow;
  1349                 vector unsigned char vd;
  1350 
  1351                 /* s = *srcp */
  1352                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1353                 vs = vec_perm(vs, voverflow, valigner);
  1354                 vs = vec_perm(vs, valpha, vsrcPermute);
  1355 
  1356                 /* d = *dstp */
  1357                 vd = (vector unsigned char) vec_ld(0, dstp);
  1358                 vd = vec_perm(vd, vd, vsdstPermute);
  1359 
  1360                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1361 
  1362                 /* set the alpha channel to full on */
  1363                 vd = vec_or(vd, valphamask);
  1364                 vd = vec_perm(vd, vbits, vdstPermute);
  1365 
  1366                 /* *dstp = res */
  1367                 vec_st((vector unsigned int) vd, 0, dstp);
  1368 
  1369                 srcp += 4;
  1370                 dstp += 4;
  1371                 width -= 4;
  1372                 vs = voverflow;
  1373             }
  1374             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1375         }
  1376 #undef ONE_PIXEL_BLEND
  1377 
  1378         srcp += srcskip;
  1379         dstp += dstskip;
  1380     }
  1381 
  1382 }
  1383 
  1384 
  1385 /* fast RGB888->(A)RGB888 blending with surface alpha (AltiVec) */
  1386 static void
  1387 BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
  1388 {
  1389     unsigned alpha = info->src->alpha;
  1390     int height = info->d_height;
  1391     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1392     int srcskip = info->s_skip >> 2;
  1393     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1394     int dstskip = info->d_skip >> 2;
  1395     vector unsigned char mergePermute;
  1396     vector unsigned char valpha;
  1397     vector unsigned char valphamask;
  1398     vector unsigned short v1;
  1399     vector unsigned short v8;
  1400 
  1401     mergePermute = VEC_MERGE_PERMUTE();
  1402     v1 = vec_splat_u16(1);
  1403     v8 = vec_splat_u16(8);
  1404 
  1405     /* mask used below to force the destination alpha channel to opaque */
  1406     valphamask = VEC_ALPHA_MASK();
  1407 
  1408     /* set a vector with every byte equal to the surface alpha */
  1409     ((unsigned char *) &valpha)[0] = alpha;
  1410     valpha = vec_splat(valpha, 0);
  1411 
  1412     while (height--) {
  1413         int width = info->d_width;
  1414 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1415             Uint32 s = *srcp; \
  1416             Uint32 d = *dstp; \
  1417             Uint32 s1 = s & 0xff00ff; \
  1418             Uint32 d1 = d & 0xff00ff; \
  1419             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  1420                  & 0xff00ff; \
  1421             s &= 0xff00; \
  1422             d &= 0xff00; \
  1423             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1424             *dstp = d1 | d | 0xff000000; \
  1425             ++srcp; \
  1426             ++dstp; \
  1427             widthvar--; \
  1428         }
  1429         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1430         if (width > 0) {
  1431             int extrawidth = (width % 4);
  1432             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1433             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1434             width -= extrawidth;
  1435             while (width) {
  1436                 vector unsigned char voverflow;
  1437                 vector unsigned char vd;
  1438 
  1439                 /* s = *srcp */
  1440                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1441                 vs = vec_perm(vs, voverflow, valigner);
  1442 
  1443                 /* d = *dstp */
  1444                 vd = (vector unsigned char) vec_ld(0, dstp);
  1445 
  1446                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1447 
  1448                 /* set the alpha channel to full on */
  1449                 vd = vec_or(vd, valphamask);
  1450 
  1451                 /* *dstp = res */
  1452                 vec_st((vector unsigned int) vd, 0, dstp);
  1453 
  1454                 srcp += 4;
  1455                 dstp += 4;
  1456                 width -= 4;
  1457                 vs = voverflow;
  1458             }
  1459             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1460         }
  1461 #undef ONE_PIXEL_BLEND
  1462 
  1463         srcp += srcskip;
  1464         dstp += dstskip;
  1465     }
  1466 }
  1467 
  1468 #if __MWERKS__
  1469 #pragma altivec_model off
  1470 #endif
  1471 #endif /* SDL_ALTIVEC_BLITTERS */
  1472 
  1473 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1474 static void
  1475 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
  1476 {
  1477     int width = info->d_width;
  1478     int height = info->d_height;
  1479     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1480     int srcskip = info->s_skip >> 2;
  1481     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1482     int dstskip = info->d_skip >> 2;
  1483 
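    /*
     * The expression below averages the two pixels one byte-sized channel
     * at a time without letting the sums carry across channels:
     *   (s + d) / 2  ==  ((s & ~1) + (d & ~1)) / 2  +  (s & d & 1)
     * Masking with 0x00fefefe clears the low bit of every channel, so each
     * per-channel carry lands in a cleared bit position; the final
     * (s & d & 0x00010101) term restores the rounding lost by the masking,
     * and | 0xff000000 forces the destination alpha to opaque.
     */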
  1484     while (height--) {
  1485 	    /* *INDENT-OFF* */
  1486 	    DUFFS_LOOP4({
  1487 		    Uint32 s = *srcp++;
  1488 		    Uint32 d = *dstp;
  1489 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1490 			       + (s & d & 0x00010101)) | 0xff000000;
  1491 	    }, width);
  1492 	    /* *INDENT-ON* */
  1493         srcp += srcskip;
  1494         dstp += dstskip;
  1495     }
  1496 }
  1497 
  1498 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1499 static void
  1500 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
  1501 {
  1502     unsigned alpha = info->src->alpha;
  1503     if (alpha == 128) {
  1504         BlitRGBtoRGBSurfaceAlpha128(info);
  1505     } else {
  1506         int width = info->d_width;
  1507         int height = info->d_height;
  1508         Uint32 *srcp = (Uint32 *) info->s_pixels;
  1509         int srcskip = info->s_skip >> 2;
  1510         Uint32 *dstp = (Uint32 *) info->d_pixels;
  1511         int dstskip = info->d_skip >> 2;
  1512         Uint32 s;
  1513         Uint32 d;
  1514         Uint32 s1;
  1515         Uint32 d1;
  1516 
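        /*
         * Both loop bodies below use the same split-component trick: red and
         * blue are isolated in one word (the 0x00ff00ff positions) and
         * blended with a single multiply, while green is blended separately
         * in the 0x0000ff00 position; the empty byte between the fields
         * absorbs the intermediate bits, which the final masks strip again.
         * The two-pixel variant additionally packs the green components of
         * two adjacent pixels into one word so they share one multiply.
         * Note that >> 8 is used instead of / 255, so the blend is a close
         * approximation rather than exact.
         */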
  1517         while (height--) {
  1518 			/* *INDENT-OFF* */
  1519 			DUFFS_LOOP_DOUBLE2({
  1520 				/* One Pixel Blend */
  1521 				s = *srcp;
  1522 				d = *dstp;
  1523 				s1 = s & 0xff00ff;
  1524 				d1 = d & 0xff00ff;
  1525 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
  1526 				     & 0xff00ff;
  1527 				s &= 0xff00;
  1528 				d &= 0xff00;
  1529 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1530 				*dstp = d1 | d | 0xff000000;
  1531 				++srcp;
  1532 				++dstp;
  1533 			},{
  1534 			        /* Two Pixels Blend */
  1535 				s = *srcp;
  1536 				d = *dstp;
  1537 				s1 = s & 0xff00ff;
  1538 				d1 = d & 0xff00ff;
  1539 				d1 += (s1 - d1) * alpha >> 8;
  1540 				d1 &= 0xff00ff;
  1541 				     
  1542 				s = ((s & 0xff00) >> 8) | 
  1543 					((srcp[1] & 0xff00) << 8);
  1544 				d = ((d & 0xff00) >> 8) |
  1545 					((dstp[1] & 0xff00) << 8);
  1546 				d += (s - d) * alpha >> 8;
  1547 				d &= 0x00ff00ff;
  1548 				
  1549 				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
  1550 				++srcp;
  1551 				
  1552 			        s1 = *srcp;
  1553 				d1 = *dstp;
  1554 				s1 &= 0xff00ff;
  1555 				d1 &= 0xff00ff;
  1556 				d1 += (s1 - d1) * alpha >> 8;
  1557 				d1 &= 0xff00ff;
  1558 				
  1559 				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
  1560 				++srcp;
  1561 				++dstp;
  1562 			}, width);
  1563 			/* *INDENT-ON* */
  1564             srcp += srcskip;
  1565             dstp += dstskip;
  1566         }
  1567     }
  1568 }
  1569 
  1570 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1571 static void
  1572 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
  1573 {
  1574     int width = info->d_width;
  1575     int height = info->d_height;
  1576     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1577     int srcskip = info->s_skip >> 2;
  1578     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1579     int dstskip = info->d_skip >> 2;
  1580 
  1581     while (height--) {
  1582 	    /* *INDENT-OFF* */
  1583 	    DUFFS_LOOP4({
  1584 		Uint32 dalpha;
  1585 		Uint32 d;
  1586 		Uint32 s1;
  1587 		Uint32 d1;
  1588 		Uint32 s = *srcp;
  1589 		Uint32 alpha = s >> 24;
  1590 		/* FIXME: Here we special-case opaque alpha since the
  1591 		   compositing used (>>8 instead of /255) doesn't handle
  1592 		   it correctly. Also special-case alpha=0 for speed?
  1593 		   Benchmark this! */
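		/* For example, with alpha = 255, s = 0xff and d = 0x00 the
		   expression d + ((s - d) * alpha >> 8) yields
		   (255 * 255) >> 8 = 254 rather than 255, which is why the
		   opaque case is copied directly below. */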
  1594 		if(alpha) {   
  1595 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1596 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1597 		  } else {
  1598 		    /*
  1599 		     * take out the middle component (green), and process
  1600 		     * the other two in parallel. One multiply less.
  1601 		     */
  1602 		    d = *dstp;
  1603 		    dalpha = d & 0xff000000;
  1604 		    s1 = s & 0xff00ff;
  1605 		    d1 = d & 0xff00ff;
  1606 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1607 		    s &= 0xff00;
  1608 		    d &= 0xff00;
  1609 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1610 		    *dstp = d1 | d | dalpha;
  1611 		  }
  1612 		}
  1613 		++srcp;
  1614 		++dstp;
  1615 	    }, width);
  1616 	    /* *INDENT-ON* */
  1617         srcp += srcskip;
  1618         dstp += dstskip;
  1619     }
  1620 }
  1621 
  1622 #if GCC_ASMBLIT
  1623 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1624 static void
  1625 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1626 {
  1627     int width = info->d_width;
  1628     int height = info->d_height;
  1629     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1630     int srcskip = info->s_skip >> 2;
  1631     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1632     int dstskip = info->d_skip >> 2;
  1633     SDL_PixelFormat *sf = info->src;
  1634     Uint32 amask = sf->Amask;
  1635     Uint32 ashift = sf->Ashift;
  1636 
  1637     __asm__(
  1638                /* make mm6 all zeros. */
  1639                "pxor       %%mm6, %%mm6\n"
  1640                /* Make a mask to preserve the alpha. */
  1641                "movd      %0, %%mm7\n\t"        /* 0000F000 -> mm7 */
  1642                "punpcklbw %%mm7, %%mm7\n\t"     /* FF000000 -> mm7 */
  1643                "pcmpeqb   %%mm4, %%mm4\n\t"     /* FFFFFFFF -> mm4 */
  1644                "movq      %%mm4, %%mm3\n\t"     /* FFFFFFFF -> mm3 (for later) */
  1645                "pxor      %%mm4, %%mm7\n\t"     /* 00FFFFFF -> mm7 (mult mask) */
  1646                /* form channel masks */
  1647                "movq      %%mm7, %%mm4\n\t"     /* 00FFFFFF -> mm4 */
  1648                "packsswb  %%mm6, %%mm4\n\t"     /* 00000FFF -> mm4 (channel mask) */
  1649                "packsswb  %%mm6, %%mm3\n\t"     /* 0000FFFF -> mm3 */
  1650                "pxor      %%mm4, %%mm3\n\t"     /* 0000F000 -> mm3 (~channel mask) */
  1651                /* get alpha channel shift */
  1652                "movd      %1, %%mm5\n\t"        /* Ashift -> mm5 */
  1653   : /* nothing */ :            "m"(amask), "m"(ashift));
  1654 
  1655     while (height--) {
  1656 
  1657 	    /* *INDENT-OFF* */
  1658 	    DUFFS_LOOP4({
  1659 		Uint32 alpha;
  1660 
  1661 		__asm__ (
  1662 		"prefetch 64(%0)\n"
  1663 		"prefetch 64(%1)\n"
  1664 			: : "r" (srcp), "r" (dstp) );
  1665 
  1666 		alpha = *srcp & amask;
  1667 		/* FIXME: Here we special-case opaque alpha since the
  1668 		   compositing used (>>8 instead of /255) doesn't handle
  1669 		   it correctly. Also special-case alpha=0 for speed?
  1670 		   Benchmark this! */
  1671 		if(alpha == 0) {
  1672 		    /* do nothing */
  1673 		}
  1674 		else if(alpha == amask) {
  1675 			/* opaque alpha -- copy RGB, keep dst alpha */
  1676 		    /* using MMX here to free up regular registers for other things */
  1677 			    __asm__ (
  1678 		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
  1679 		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
  1680 		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
  1681 		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
  1682 		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
  1683 		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
  1684 
  1685 		     : : "r" (srcp), "r" (dstp) );
  1686 		} 
  1687 
  1688 		else {
  1689 			    __asm__ (
  1690 		    /* load in the source, and dst. */
  1691 		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
  1692 		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
  1693 
  1694 		    /* Move the src alpha into mm2 */
  1695 
  1696 		    /* if supporting pshufw */
  1697 		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
  1698 		    /*"psrlw     $8, %%mm2\n" */
  1699 		    
  1700 		    /* else: */
  1701 		    "movd       %2,    %%mm2\n"
  1702 		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
  1703 		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
  1704 		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
  1705 		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
  1706 
  1707 		    /* move the colors into words. */
  1708 		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
  1709 		    "punpcklbw %%mm6, %%mm1\n"              /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
  1710 
  1711 		    /* src - dst */
  1712 		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
  1713 
  1714 		    /* A * (src-dst) */
  1715 		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0 As*(Rs-Rd) | As*(Gs-Gd)  As*(Bs-Bd) */
  1716 		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
  1717 		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
  1718 
  1719 		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
  1720 		    
  1721 		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
  1722 
  1723 		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
  1724 
  1725 		}
  1726 		++srcp;
  1727 		++dstp;
  1728 	    }, width);
  1729 	    /* *INDENT-ON* */
  1730         srcp += srcskip;
  1731         dstp += dstskip;
  1732     }
  1733 
  1734   __asm__("emms\n":);
  1735 }
  1736 
  1737 /* End GCC_ASMBLIT*/
  1738 
  1739 #elif MSVC_ASMBLIT
  1740 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1741 static void
  1742 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1743 {
  1744     int width = info->d_width;
  1745     int height = info->d_height;
  1746     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1747     int srcskip = info->s_skip >> 2;
  1748     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1749     int dstskip = info->d_skip >> 2;
  1750     SDL_PixelFormat *sf = info->src;
  1751     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  1752     Uint32 amask = sf->Amask;
  1753     Uint32 ashift = sf->Ashift;
  1754     Uint64 multmask;
  1755 
  1756     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1757 
  1758     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
  1759 	/* *INDENT-OFF* */
  1760     multmask = ~(0xFFFFI64 << (ashift * 2));
  1761 	/* *INDENT-ON* */
  1762     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
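    /* After the bytes are unpacked to 16-bit words, each channel occupies
       two bytes, so the alpha field starts at bit (ashift * 2) and multmask
       has zeros only in that word.  ANDing the replicated alpha with dmask
       keeps the alpha word of the (src - dst) * alpha product at zero, so
       the blended result retains the destination alpha when dst1 is added
       back in. */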
  1763 
  1764     while (height--) {
  1765 	    /* *INDENT-OFF* */
  1766 	    DUFFS_LOOP4({
  1767 		Uint32 alpha;
  1768 
  1769 		_m_prefetch(srcp + 16);
  1770 		_m_prefetch(dstp + 16);
  1771 
  1772 		alpha = *srcp & amask;
  1773 		if (alpha == 0) {
  1774 			/* do nothing */
  1775 		} else if (alpha == amask) {
  1776 			/* copy RGB, keep dst alpha */
  1777 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  1778 		} else {
  1779 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  1780 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  1781 
  1782 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  1783 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  1784 
  1785 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  1786 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  1787 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  1788 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  1789 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  1790 
  1791 			/* blend */		    
  1792 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  1793 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  1794 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  1795 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  1796 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  1797 			
  1798 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  1799 		}
  1800 		++srcp;
  1801 		++dstp;
  1802 	    }, width);
  1803 	    /* *INDENT-ON* */
  1804         srcp += srcskip;
  1805         dstp += dstskip;
  1806     }
  1807     _mm_empty();
  1808 }
  1809 
  1810 /* End MSVC_ASMBLIT */
  1811 
  1812 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  1813 
  1814 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  1815 
  1816 /* blend a single 16 bit pixel at 50% */
  1817 #define BLEND16_50(d, s, mask)						\
  1818 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
  1819 
  1820 /* blend two 16 bit pixels at 50% */
  1821 #define BLEND2x16_50(d, s, mask)					     \
  1822 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1823 	 + (s & d & (~(mask | mask << 16))))
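/*
 * Both macros rely on the per-field identity
 *   (s + d) / 2  ==  ((s & ~low) + (d & ~low)) / 2  +  (s & d & low)
 * where `low' is the lowest bit of each colour field.  The mask passed in
 * (e.g. 0xf7de for 565, 0xfbde for 555) clears those low bits, so the
 * per-field sums can only carry into bit positions that were cleared; the
 * shift then halves all fields at once and the final term restores the
 * rounding lost by the masking.
 */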
  1824 
  1825 static void
  1826 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
  1827 {
  1828     int width = info->d_width;
  1829     int height = info->d_height;
  1830     Uint16 *srcp = (Uint16 *) info->s_pixels;
  1831     int srcskip = info->s_skip >> 1;
  1832     Uint16 *dstp = (Uint16 *) info->d_pixels;
  1833     int dstskip = info->d_skip >> 1;
  1834 
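    /* ((uintptr_t) srcp ^ (uintptr_t) dstp) & 2 tests whether source and
       destination have the same 16-bit phase within a 32-bit word.  If they
       do, both can be read and written two pixels at a time directly;
       otherwise the source halfwords are re-joined across iterations through
       prev_sw so the destination can still be written 32 bits at a time. */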
  1835     while (height--) {
  1836         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
  1837             /*
  1838              * Source and destination not aligned, pipeline it.
  1839              * This is mostly a win for big blits but no loss for
  1840              * small ones
  1841              */
  1842             Uint32 prev_sw;
  1843             int w = width;
  1844 
  1845             /* handle odd destination */
  1846             if ((uintptr_t) dstp & 2) {
  1847                 Uint16 d = *dstp, s = *srcp;
  1848                 *dstp = BLEND16_50(d, s, mask);
  1849                 dstp++;
  1850                 srcp++;
  1851                 w--;
  1852             }
  1853             srcp++;             /* srcp is now 32-bit aligned */
  1854 
  1855             /* bootstrap pipeline with first halfword */
  1856             prev_sw = ((Uint32 *) srcp)[-1];
  1857 
  1858             while (w > 1) {
  1859                 Uint32 sw, dw, s;
  1860                 sw = *(Uint32 *) srcp;
  1861                 dw = *(Uint32 *) dstp;
  1862 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1863                 s = (prev_sw << 16) + (sw >> 16);
  1864 #else
  1865                 s = (prev_sw >> 16) + (sw << 16);
  1866 #endif
  1867                 prev_sw = sw;
  1868                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
  1869                 dstp += 2;
  1870                 srcp += 2;
  1871                 w -= 2;
  1872             }
  1873 
  1874             /* final pixel if any */
  1875             if (w) {
  1876                 Uint16 d = *dstp, s;
  1877 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1878                 s = (Uint16) prev_sw;
  1879 #else
  1880                 s = (Uint16) (prev_sw >> 16);
  1881 #endif
  1882                 *dstp = BLEND16_50(d, s, mask);
  1883                 srcp++;
  1884                 dstp++;
  1885             }
  1886             srcp += srcskip - 1;
  1887             dstp += dstskip;
  1888         } else {
  1889             /* source and destination are aligned */
  1890             int w = width;
  1891 
  1892             /* first odd pixel? */
  1893             if ((uintptr_t) srcp & 2) {
  1894                 Uint16 d = *dstp, s = *srcp;
  1895                 *dstp = BLEND16_50(d, s, mask);
  1896                 srcp++;
  1897                 dstp++;
  1898                 w--;
  1899             }
  1900             /* srcp and dstp are now 32-bit aligned */
  1901 
  1902             while (w > 1) {
  1903                 Uint32 sw = *(Uint32 *) srcp;
  1904                 Uint32 dw = *(Uint32 *) dstp;
  1905                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
  1906                 srcp += 2;
  1907                 dstp += 2;
  1908                 w -= 2;
  1909             }
  1910 
  1911             /* last odd pixel? */
  1912             if (w) {
  1913                 Uint16 d = *dstp, s = *srcp;
  1914                 *dstp = BLEND16_50(d, s, mask);
  1915                 srcp++;
  1916                 dstp++;
  1917             }
  1918             srcp += srcskip;
  1919             dstp += dstskip;
  1920         }
  1921     }
  1922 }
  1923 
  1924 #if GCC_ASMBLIT
  1925 /* fast RGB565->RGB565 blending with surface alpha */
  1926 static void
  1927 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  1928 {
  1929     unsigned alpha = info->src->alpha;
  1930     if (alpha == 128) {
  1931         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1932     } else {
  1933         int width = info->d_width;
  1934         int height = info->d_height;
  1935         Uint16 *srcp = (Uint16 *) info->s_pixels;
  1936         int srcskip = info->s_skip >> 1;
  1937         Uint16 *dstp = (Uint16 *) info->d_pixels;
  1938         int dstskip = info->d_skip >> 1;
  1939         Uint32 s, d;
  1940         Uint8 load[8];
  1941 
  1942         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the MMX loop and the scalar edge pixels blend identically */
  1943         *(Uint64 *) load = alpha;
  1944         alpha >>= 3;            /* downscale alpha to 5 bits */
  1945 
  1946         movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
  1947         punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
  1948         punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
  1949         /* position alpha to allow for mullo and mulhi on diff channels
  1950            to reduce the number of operations */
  1951         psllq_i2r(3, mm0);
  1952 
  1953         /* Setup the 565 color channel masks */
  1954         *(Uint64 *) load = 0x07E007E007E007E0ULL;
  1955         movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
  1956         *(Uint64 *) load = 0x001F001F001F001FULL;
  1957         movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
  1958         while (height--) {
  1959 			/* *INDENT-OFF* */
  1960 			DUFFS_LOOP_QUATRO2(
  1961 			{
  1962 				s = *srcp++;
  1963 				d = *dstp;
  1964 				/*
  1965 				 * shift out the middle component (green) to
  1966 				 * the high 16 bits, and process all three RGB
  1967 				 * components at the same time.
  1968 				 */
  1969 				s = (s | s << 16) & 0x07e0f81f;
  1970 				d = (d | d << 16) & 0x07e0f81f;
  1971 				d += (s - d) * alpha >> 5;
  1972 				d &= 0x07e0f81f;
  1973 				*dstp++ = d | d >> 16;
  1974 			},{
  1975 				s = *srcp++;
  1976 				d = *dstp;
  1977 				/*
  1978 				 * shift out the middle component (green) to
  1979 				 * the high 16 bits, and process all three RGB
  1980 				 * components at the same time.
  1981 				 */
  1982 				s = (s | s << 16) & 0x07e0f81f;
  1983 				d = (d | d << 16) & 0x07e0f81f;
  1984 				d += (s - d) * alpha >> 5;
  1985 				d &= 0x07e0f81f;
  1986 				*dstp++ = d | d >> 16;
  1987 				s = *srcp++;
  1988 				d = *dstp;
  1989 				/*
  1990 				 * shift out the middle component (green) to
  1991 				 * the high 16 bits, and process all three RGB
  1992 				 * components at the same time.
  1993 				 */
  1994 				s = (s | s << 16) & 0x07e0f81f;
  1995 				d = (d | d << 16) & 0x07e0f81f;
  1996 				d += (s - d) * alpha >> 5;
  1997 				d &= 0x07e0f81f;
  1998 				*dstp++ = d | d >> 16;
  1999 			},{
  2000 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2001 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2002 
  2003 				/* red -- does not need a mask since the right shift clears
  2004 				   the uninteresting bits */
  2005 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2006 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2007 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
  2008 				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
  2009 
  2010 				/* blend */
  2011 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2012 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2013 				/* alpha used is actually 11 bits
  2014 				   11 + 5 = 16 bits, so the sign bits are lost */
  2015 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2016 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2017 				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
  2018 
  2019 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2020 
  2021 				/* green -- process the bits in place */
  2022 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2023 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2024 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2025 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2026 
  2027 				/* blend */
  2028 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2029 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2030 				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
  2031 				   bits are gone and the sign bits present */
  2032 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2033 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2034 
  2035 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2036 
  2037 				/* blue */
  2038 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2039 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2040 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2041 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2042 
  2043 				/* blend */
  2044 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2045 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2046 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2047 				   the interesting bits will need to be MASKed */
  2048 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2049 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2050 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2051 
  2052 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2053 
  2054 				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
  2055 
  2056 				srcp += 4;
  2057 				dstp += 4;
  2058 			}, width);			
  2059 			/* *INDENT-ON* */
  2060             srcp += srcskip;
  2061             dstp += dstskip;
  2062         }
  2063         emms();
  2064     }
  2065 }
  2066 
  2067 /* fast RGB555->RGB555 blending with surface alpha */
  2068 static void
  2069 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  2070 {
  2071     unsigned alpha = info->src->alpha;
  2072     if (alpha == 128) {
  2073         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2074     } else {
  2075         int width = info->d_width;
  2076         int height = info->d_height;
  2077         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2078         int srcskip = info->s_skip >> 1;
  2079         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2080         int dstskip = info->d_skip >> 1;
  2081         Uint32 s, d;
  2082         Uint8 load[8];
  2083 
  2084         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the MMX loop and the scalar edge pixels blend identically */
  2085         *(Uint64 *) load = alpha;
  2086         alpha >>= 3;            /* downscale alpha to 5 bits */
  2087 
  2088         movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
  2089         punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
  2090         punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
  2091         /* position alpha to allow for mullo and mulhi on diff channels
  2092            to reduce the number of operations */
  2093         psllq_i2r(3, mm0);
  2094 
  2095         /* Setup the 555 color channel masks */
  2096         *(Uint64 *) load = 0x03E003E003E003E0ULL;
  2097         movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
  2098         *(Uint64 *) load = 0x001F001F001F001FULL;
  2099         movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
  2100         while (height--) {
  2101 			/* *INDENT-OFF* */
  2102 			DUFFS_LOOP_QUATRO2(
  2103 			{
  2104 				s = *srcp++;
  2105 				d = *dstp;
  2106 				/*
  2107 				 * shift out the middle component (green) to
  2108 				 * the high 16 bits, and process all three RGB
  2109 				 * components at the same time.
  2110 				 */
  2111 				s = (s | s << 16) & 0x03e07c1f;
  2112 				d = (d | d << 16) & 0x03e07c1f;
  2113 				d += (s - d) * alpha >> 5;
  2114 				d &= 0x03e07c1f;
  2115 				*dstp++ = d | d >> 16;
  2116 			},{
  2117 				s = *srcp++;
  2118 				d = *dstp;
  2119 				/*
  2120 				 * shift out the middle component (green) to
  2121 				 * the high 16 bits, and process all three RGB
  2122 				 * components at the same time.
  2123 				 */
  2124 				s = (s | s << 16) & 0x03e07c1f;
  2125 				d = (d | d << 16) & 0x03e07c1f;
  2126 				d += (s - d) * alpha >> 5;
  2127 				d &= 0x03e07c1f;
  2128 				*dstp++ = d | d >> 16;
  2129 			        s = *srcp++;
  2130 				d = *dstp;
  2131 				/*
  2132 				 * shift out the middle component (green) to
  2133 				 * the high 16 bits, and process all three RGB
  2134 				 * components at the same time.
  2135 				 */
  2136 				s = (s | s << 16) & 0x03e07c1f;
  2137 				d = (d | d << 16) & 0x03e07c1f;
  2138 				d += (s - d) * alpha >> 5;
  2139 				d &= 0x03e07c1f;
  2140 				*dstp++ = d | d >> 16;
  2141 			},{
  2142 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2143 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2144 
  2145 				/* red -- process the bits in place */
  2146 				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
  2147 					/* by reusing the GREEN mask we free up another mmx
  2148 					   register to accumulate the result */
  2149 
  2150 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2151 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2152 				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
  2153 				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
  2154 
  2155 				/* blend */
  2156 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2157 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2158 				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
  2159 				   cleared by a MASK below */
  2160 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2161 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2162 				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
  2163 
  2164 				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
  2165 
  2166 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2167 
  2168 				/* green -- process the bits in place */
  2169 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2170 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2171 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2172 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2173 
  2174 				/* blend */
  2175 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2176 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2177 				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
  2178 				   bits are gone and the sign bits present */
  2179 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2180 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2181 
  2182 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2183 
  2184 				/* blue */
  2185 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2186 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2187 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2188 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2189 
  2190 				/* blend */
  2191 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2192 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2193 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2194 				   the interesting bits will need to be MASKed */
  2195 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2196 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2197 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2198 
  2199 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2200 
  2201 				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
  2202 
  2203 				srcp += 4;
  2204 				dstp += 4;
  2205 			}, width);
  2206 			/* *INDENT-ON* */
  2207             srcp += srcskip;
  2208             dstp += dstskip;
  2209         }
  2210         emms();
  2211     }
  2212 }
  2213 
  2214 /* End GCC_ASMBLIT */
  2215 
  2216 #elif MSVC_ASMBLIT
  2217 /* fast RGB565->RGB565 blending with surface alpha */
  2218 static void
  2219 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  2220 {
  2221     unsigned alpha = info->src->alpha;
  2222     if (alpha == 128) {
  2223         Blit16to16SurfaceAlpha128(info, 0xf7de);
  2224     } else {
  2225         int width = info->d_width;
  2226         int height = info->d_height;
  2227         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2228         int srcskip = info->s_skip >> 1;
  2229         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2230         int dstskip = info->d_skip >> 1;
  2231         Uint32 s, d;
  2232 
  2233         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  2234 
  2235         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the MMX loop and the scalar edge pixels blend identically */
  2236         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  2237         alpha >>= 3;            /* downscale alpha to 5 bits */
  2238 
  2239         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  2240         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  2241         /* position alpha to allow for mullo and mulhi on diff channels
  2242            to reduce the number of operations */
  2243         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2244 
  2245         /* Setup the 565 color channel masks */
  2246         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
  2247         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  2248 
  2249         while (height--) {
  2250 			/* *INDENT-OFF* */
  2251 			DUFFS_LOOP_QUATRO2(
  2252 			{
  2253 				s = *srcp++;
  2254 				d = *dstp;
  2255 				/*
  2256 				 * shift out the middle component (green) to
  2257 				 * the high 16 bits, and process all three RGB
  2258 				 * components at the same time.
  2259 				 */
  2260 				s = (s | s << 16) & 0x07e0f81f;
  2261 				d = (d | d << 16) & 0x07e0f81f;
  2262 				d += (s - d) * alpha >> 5;
  2263 				d &= 0x07e0f81f;
  2264 				*dstp++ = (Uint16)(d | d >> 16);
  2265 			},{
  2266 				s = *srcp++;
  2267 				d = *dstp;
  2268 				/*
  2269 				 * shift out the middle component (green) to
  2270 				 * the high 16 bits, and process all three RGB
  2271 				 * components at the same time.
  2272 				 */
  2273 				s = (s | s << 16) & 0x07e0f81f;
  2274 				d = (d | d << 16) & 0x07e0f81f;
  2275 				d += (s - d) * alpha >> 5;
  2276 				d &= 0x07e0f81f;
  2277 				*dstp++ = (Uint16)(d | d >> 16);
  2278 				s = *srcp++;
  2279 				d = *dstp;
  2280 				/*
  2281 				 * shift out the middle component (green) to
  2282 				 * the high 16 bits, and process all three RGB
  2283 				 * components at the same time.
  2284 				 */
  2285 				s = (s | s << 16) & 0x07e0f81f;
  2286 				d = (d | d << 16) & 0x07e0f81f;
  2287 				d += (s - d) * alpha >> 5;
  2288 				d &= 0x07e0f81f;
  2289 				*dstp++ = (Uint16)(d | d >> 16);
  2290 			},{
  2291 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2292 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2293 
  2294 				/* red */
  2295 				src2 = src1;
  2296 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
  2297 
  2298 				dst2 = dst1;
  2299 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
  2300 
  2301 				/* blend */
  2302 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2303 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2304 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2305 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2306 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
  2307 
  2308 				mm_res = dst2; /* RED -> mm_res */
  2309 
  2310 				/* green -- process the bits in place */
  2311 				src2 = src1;
  2312 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2313 
  2314 				dst2 = dst1;
  2315 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2316 
  2317 				/* blend */
  2318 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2319 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2320 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2321 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2322 
  2323 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2324 
  2325 				/* blue */
  2326 				src2 = src1;
  2327 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2328 
  2329 				dst2 = dst1;
  2330 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2331 
  2332 				/* blend */
  2333 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2334 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2335 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2336 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2337 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2338 
  2339 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2340 
  2341 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2342 
  2343 				srcp += 4;
  2344 				dstp += 4;
  2345 			}, width);
  2346 			/* *INDENT-ON* */
  2347             srcp += srcskip;
  2348             dstp += dstskip;
  2349         }
  2350         _mm_empty();
  2351     }
  2352 }
  2353 
  2354 /* fast RGB555->RGB555 blending with surface alpha */
  2355 static void
  2356 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  2357 {
  2358     unsigned alpha = info->src->alpha;
  2359     if (alpha == 128) {
  2360         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2361     } else {
  2362         int width = info->d_width;
  2363         int height = info->d_height;
  2364         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2365         int srcskip = info->s_skip >> 1;
  2366         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2367         int dstskip = info->d_skip >> 1;
  2368         Uint32 s, d;
  2369 
  2370         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  2371 
  2372         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the MMX loop and the scalar edge pixels blend identically */
  2373         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  2374         alpha >>= 3;            /* downscale alpha to 5 bits */
  2375 
  2376         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  2377         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  2378         /* position alpha to allow for mullo and mulhi on diff channels
  2379            to reduce the number of operations */
  2380         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2381 
  2382         /* Setup the 555 color channel masks */
  2383         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
  2384         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
  2385         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  2386 
  2387         while (height--) {
  2388 			/* *INDENT-OFF* */
  2389 			DUFFS_LOOP_QUATRO2(
  2390 			{
  2391 				s = *srcp++;
  2392 				d = *dstp;
  2393 				/*
  2394 				 * shift out the middle component (green) to
  2395 				 * the high 16 bits, and process all three RGB
  2396 				 * components at the same time.
  2397 				 */
  2398 				s = (s | s << 16) & 0x03e07c1f;
  2399 				d = (d | d << 16) & 0x03e07c1f;
  2400 				d += (s - d) * alpha >> 5;
  2401 				d &= 0x03e07c1f;
  2402 				*dstp++ = (Uint16)(d | d >> 16);
  2403 			},{
  2404 				s = *srcp++;
  2405 				d = *dstp;
  2406 				/*
  2407 				 * shift out the middle component (green) to
  2408 				 * the high 16 bits, and process all three RGB
  2409 				 * components at the same time.
  2410 				 */
  2411 				s = (s | s << 16) & 0x03e07c1f;
  2412 				d = (d | d << 16) & 0x03e07c1f;
  2413 				d += (s - d) * alpha >> 5;
  2414 				d &= 0x03e07c1f;
  2415 				*dstp++ = (Uint16)(d | d >> 16);
  2416 			        s = *srcp++;
  2417 				d = *dstp;
  2418 				/*
  2419 				 * shift out the middle component (green) to
  2420 				 * the high 16 bits, and process all three RGB
  2421 				 * components at the same time.
  2422 				 */
  2423 				s = (s | s << 16) & 0x03e07c1f;
  2424 				d = (d | d << 16) & 0x03e07c1f;
  2425 				d += (s - d) * alpha >> 5;
  2426 				d &= 0x03e07c1f;
  2427 				*dstp++ = (Uint16)(d | d >> 16);
  2428 			},{
  2429 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2430 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2431 
  2432 				/* red -- process the bits in place */
  2433 				src2 = src1;
  2434 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
  2435 
  2436 				dst2 = dst1;
  2437 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
  2438 
  2439 				/* blend */
  2440 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2441 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2442 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2443 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2444 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
  2445 
  2446 				mm_res = dst2; /* RED -> mm_res */
  2447 				
  2448 				/* green -- process the bits in place */
  2449 				src2 = src1;
  2450 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2451 
  2452 				dst2 = dst1;
  2453 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2454 
  2455 				/* blend */
  2456 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2457 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2458 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2459 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2460 
  2461 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2462 
  2463 				/* blue */
  2464 				src2 = src1; /* src -> src2 */
  2465 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2466 
  2467 				dst2 = dst1; /* dst -> dst2 */
  2468 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2469 
  2470 				/* blend */
  2471 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2472 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2473 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2474 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2475 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2476 
  2477 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2478 
  2479 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2480 
  2481 				srcp += 4;
  2482 				dstp += 4;
  2483 			}, width);
  2484 			/* *INDENT-ON* */
  2485             srcp += srcskip;
  2486             dstp += dstskip;
  2487         }
  2488         _mm_empty();
  2489     }
  2490 }
  2491 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  2492 
  2493 /* fast RGB565->RGB565 blending with surface alpha */
  2494 static void
  2495 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
  2496 {
  2497     unsigned alpha = info->src->alpha;
  2498     if (alpha == 128) {
  2499         Blit16to16SurfaceAlpha128(info, 0xf7de);
  2500     } else {
  2501         int width = info->d_width;
  2502         int height = info->d_height;
  2503         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2504         int srcskip = info->s_skip >> 1;
  2505         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2506         int dstskip = info->d_skip >> 1;
  2507         alpha >>= 3;            /* downscale alpha to 5 bits */
  2508 
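        /* (x | x << 16) & 0x07e0f81f spreads a 565 pixel out so that green
           sits in the high halfword while red and blue stay in the low one,
           with several empty bits between the fields.  A single multiply by
           the 5-bit alpha then blends all three components at once; the
           empty bits absorb the intermediate results, the mask strips them
           again, and d | d >> 16 folds the pixel back into 16 bits. */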
  2509         while (height--) {
  2510 			/* *INDENT-OFF* */
  2511 			DUFFS_LOOP4({
  2512 				Uint32 s = *srcp++;
  2513 				Uint32 d = *dstp;
  2514 				/*
  2515 				 * shift out the middle component (green) to
  2516 				 * the high 16 bits, and process all three RGB
  2517 				 * components at the same time.
  2518 				 */
  2519 				s = (s | s << 16) & 0x07e0f81f;
  2520 				d = (d | d << 16) & 0x07e0f81f;
  2521 				d += (s - d) * alpha >> 5;
  2522 				d &= 0x07e0f81f;
  2523 				*dstp++ = (Uint16)(d | d >> 16);
  2524 			}, width);
  2525 			/* *INDENT-ON* */
  2526             srcp += srcskip;
  2527             dstp += dstskip;
  2528         }
  2529     }
  2530 }
  2531 
  2532 /* fast RGB555->RGB555 blending with surface alpha */
  2533 static void
  2534 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  2535 {
  2536     unsigned alpha = info->src->alpha;
  2537     if (alpha == 128) {
  2538         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2539     } else {
  2540         int width = info->d_width;
  2541         int height = info->d_height;
  2542         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2543         int srcskip = info->s_skip >> 1;
  2544         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2545         int dstskip = info->d_skip >> 1;
  2546         alpha >>= 3;            /* downscale alpha to 5 bits */
  2547 
  2548         while (height--) {
  2549 			/* *INDENT-OFF* */
  2550 			DUFFS_LOOP4({
  2551 				Uint32 s = *srcp++;
  2552 				Uint32 d = *dstp;
  2553 				/*
  2554 				 * shift out the middle component (green) to
  2555 				 * the high 16 bits, and process all three RGB
  2556 				 * components at the same time.
  2557 				 */
  2558 				s = (s | s << 16) & 0x03e07c1f;
  2559 				d = (d | d << 16) & 0x03e07c1f;
  2560 				d += (s - d) * alpha >> 5;
  2561 				d &= 0x03e07c1f;
  2562 				*dstp++ = (Uint16)(d | d >> 16);
  2563 			}, width);
  2564 			/* *INDENT-ON* */
  2565             srcp += srcskip;
  2566             dstp += dstskip;
  2567         }
  2568     }
  2569 }
  2570 
  2571 /* fast ARGB8888->RGB565 blending with pixel alpha */
  2572 static void
  2573 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  2574 {
  2575     int width = info->d_width;
  2576     int height = info->d_height;
  2577     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2578     int srcskip = info->s_skip >> 2;
  2579     Uint16 *dstp = (Uint16 *) info->d_pixels;
  2580     int dstskip = info->d_skip >> 1;
  2581 
  2582     while (height--) {
  2583 	    /* *INDENT-OFF* */
  2584 	    DUFFS_LOOP4({
  2585 		Uint32 s = *srcp;
  2586 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  2587 		/* FIXME: Here we special-case opaque alpha since the
  2588 		   compositing used (>>8 instead of /255) doesn't handle
  2589 		   it correctly. Also special-case alpha=0 for speed?
  2590 		   Benchmark this! */
  2591 		if(alpha) {   
  2592 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2593 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  2594 		  } else {
  2595 		    Uint32 d = *dstp;
  2596 		    /*
  2597 		     * convert source and destination to the split 0x07e0f81f layout
  2598 		     * (green in the high halfword) and blend all three components at once
  2599 		     */
  2600 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  2601 		      + (s >> 3 & 0x1f);
  2602 		    d = (d | d << 16) & 0x07e0f81f;
  2603 		    d += (s - d) * alpha >> 5;
  2604 		    d &= 0x07e0f81f;
  2605 		    *dstp = (Uint16)(d | d >> 16);
  2606 		  }
  2607 		}
  2608 		srcp++;
  2609 		dstp++;
  2610 	    }, width);
  2611 	    /* *INDENT-ON* */
  2612         srcp += srcskip;
  2613         dstp += dstskip;
  2614     }
  2615 }
  2616 
  2617 /* fast ARGB8888->RGB555 blending with pixel alpha */
  2618 static void
  2619 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  2620 {
  2621     int width = info->d_width;
  2622     int height = info->d_height;
  2623     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2624     int srcskip = info->s_skip >> 2;
  2625     Uint16 *dstp = (Uint16 *) info->d_pixels;
  2626     int dstskip = info->d_skip >> 1;
  2627 
  2628     while (height--) {
  2629 	    /* *INDENT-OFF* */
  2630 	    DUFFS_LOOP4({
  2631 		unsigned alpha;
  2632 		Uint32 s = *srcp;
  2633 		alpha = s >> 27; /* downscale alpha to 5 bits */
  2634 		/* FIXME: Here we special-case opaque alpha since the
  2635 		   compositing used (>>8 instead of /255) doesn't handle
  2636 		   it correctly. Also special-case alpha=0 for speed?
  2637 		   Benchmark this! */
  2638 		if(alpha) {   
  2639 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2640 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  2641 		  } else {
  2642 		    Uint32 d = *dstp;
  2643 		    /*
  2644 		     * convert source and destination to the split 0x03e07c1f layout
  2645 		     * (green in the high halfword) and blend all three components at once
  2646 		     */
  2647 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  2648 		      + (s >> 3 & 0x1f);
  2649 		    d = (d | d << 16) & 0x03e07c1f;
  2650 		    d += (s - d) * alpha >> 5;
  2651 		    d &= 0x03e07c1f;
  2652 		    *dstp = (Uint16)(d | d >> 16);
  2653 		  }
  2654 		}
  2655 		srcp++;
  2656 		dstp++;
  2657 	    }, width);
  2658 	    /* *INDENT-ON* */
  2659         srcp += srcskip;
  2660         dstp += dstskip;
  2661     }
  2662 }
  2663 
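/*
 * The generic blitters below decode and re-encode every pixel through the
 * per-format macros (DISEMBLE_RGB/RGBA, ALPHA_BLEND, ASSEMBLE_RGBA), so they
 * handle any supported byte-per-pixel combination at the cost of speed;
 * SDL_CalculateAlphaBlit only falls back to them when no fast path matches
 * the surface formats.
 */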
  2664 /* General (slow) N->N blending with per-surface alpha */
  2665 static void
  2666 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  2667 {
  2668     int width = info->d_width;
  2669     int height = info->d_height;
  2670     Uint8 *src = info->s_pixels;
  2671     int srcskip = info->s_skip;
  2672     Uint8 *dst = info->d_pixels;
  2673     int dstskip = info->d_skip;
  2674     SDL_PixelFormat *srcfmt = info->src;
  2675     SDL_PixelFormat *dstfmt = info->dst;
  2676     int srcbpp = srcfmt->BytesPerPixel;
  2677     int dstbpp = dstfmt->BytesPerPixel;
  2678     unsigned sA = srcfmt->alpha;
  2679     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2680 
  2681     if (sA) {
  2682         while (height--) {
  2683 	    /* *INDENT-OFF* */
  2684 	    DUFFS_LOOP4(
  2685 	    {
  2686 		Uint32 Pixel;
  2687 		unsigned sR;
  2688 		unsigned sG;
  2689 		unsigned sB;
  2690 		unsigned dR;
  2691 		unsigned dG;
  2692 		unsigned dB;
  2693 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2694 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2695 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2696 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2697 		src += srcbpp;
  2698 		dst += dstbpp;
  2699 	    },
  2700 	    width);
  2701 	    /* *INDENT-ON* */
  2702             src += srcskip;
  2703             dst += dstskip;
  2704         }
  2705     }
  2706 }
  2707 
  2708 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2709 static void
  2710 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  2711 {
  2712     int width = info->d_width;
  2713     int height = info->d_height;
  2714     Uint8 *src = info->s_pixels;
  2715     int srcskip = info->s_skip;
  2716     Uint8 *dst = info->d_pixels;
  2717     int dstskip = info->d_skip;
  2718     SDL_PixelFormat *srcfmt = info->src;
  2719     SDL_PixelFormat *dstfmt = info->dst;
  2720     Uint32 ckey = srcfmt->colorkey;
  2721     int srcbpp = srcfmt->BytesPerPixel;
  2722     int dstbpp = dstfmt->BytesPerPixel;
  2723     unsigned sA = srcfmt->alpha;
  2724     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2725 
  2726     while (height--) {
  2727 	    /* *INDENT-OFF* */
  2728 	    DUFFS_LOOP4(
  2729 	    {
  2730 		Uint32 Pixel;
  2731 		unsigned sR;
  2732 		unsigned sG;
  2733 		unsigned sB;
  2734 		unsigned dR;
  2735 		unsigned dG;
  2736 		unsigned dB;
  2737 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2738 		if(sA && Pixel != ckey) {
  2739 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2740 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2741 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2742 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2743 		}
  2744 		src += srcbpp;
  2745 		dst += dstbpp;
  2746 	    },
  2747 	    width);
  2748 	    /* *INDENT-ON* */
  2749         src += srcskip;
  2750         dst += dstskip;
  2751     }
  2752 }
  2753 
  2754 /* General (slow) N->N blending with pixel alpha */
  2755 static void
  2756 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  2757 {
  2758     int width = info->d_width;
  2759     int height = info->d_height;
  2760     Uint8 *src = info->s_pixels;
  2761     int srcskip = info->s_skip;
  2762     Uint8 *dst = info->d_pixels;
  2763     int dstskip = info->d_skip;
  2764     SDL_PixelFormat *srcfmt = info->src;
  2765     SDL_PixelFormat *dstfmt = info->dst;
  2766 
  2767     int srcbpp;
  2768     int dstbpp;
  2769 
  2770     /* Set up some basic variables */
  2771     srcbpp = srcfmt->BytesPerPixel;
  2772     dstbpp = dstfmt->BytesPerPixel;
  2773 
  2774     /* FIXME: for 8bpp source alpha, this doesn't get opaque values
  2775        quite right. for <8bpp source alpha, it gets them very wrong
  2776        (check all macros!)
  2777        It is unclear whether there is a good general solution that doesn't
  2778        need a branch (or a divide). */
  2779     while (height--) {
  2780 	    /* *INDENT-OFF* */
  2781 	    DUFFS_LOOP4(
  2782 	    {
  2783 		Uint32 Pixel;
  2784 		unsigned sR;
  2785 		unsigned sG;
  2786 		unsigned sB;
  2787 		unsigned dR;
  2788 		unsigned dG;
  2789 		unsigned dB;
  2790 		unsigned sA;
  2791 		unsigned dA;
  2792 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  2793 		if(sA) {
  2794 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  2795 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2796 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2797 		}
  2798 		src += srcbpp;
  2799 		dst += dstbpp;
  2800 	    },
  2801 	    width);
  2802 	    /* *INDENT-ON* */
  2803         src += srcskip;
  2804         dst += dstskip;
  2805     }
  2806 }
  2807 
  2808 
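/* Choose the most specialized alpha blitter available for this source /
   destination pair.  The decision is driven by whether the source format
   has a per-pixel alpha channel (sf->Amask), whether SDL_SRCCOLORKEY is
   set, the destination depth and channel masks, and which CPU features
   (MMX, 3DNow!, AltiVec) were detected at runtime; the generic BlitNtoN*
   routines above serve as the fallback. */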
  2809 SDL_loblit
  2810 SDL_CalculateAlphaBlit(SDL_Surface * surface, int blit_index)
  2811 {
  2812     SDL_PixelFormat *sf = surface->format;
  2813     SDL_PixelFormat *df = surface->map->dst->format;
  2814 
  2815     if (sf->Amask == 0) {
  2816         if ((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
  2817             if (df->BytesPerPixel == 1)
  2818                 return BlitNto1SurfaceAlphaKey;
  2819             else
  2820 #if SDL_ALTIVEC_BLITTERS
  2821                 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2822                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2823                     && SDL_HasAltiVec())
  2824                 return Blit32to32SurfaceAlphaKeyAltivec;
  2825             else
  2826 #endif
  2827                 return BlitNtoNSurfaceAlphaKey;
  2828         } else {
  2829             /* Per-surface alpha blits */
  2830             switch (df->BytesPerPixel) {
  2831             case 1:
  2832                 return BlitNto1SurfaceAlpha;
  2833 
  2834             case 2:
  2835                 if (surface->map->identity) {
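                    /* Gmask 0x7e0 identifies a 16-bit 565 layout and 0x3e0
                       a 555 layout; identity-mapped surfaces can use the
                       matching same-format blitters. */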
  2836                     if (df->Gmask == 0x7e0) {
  2837 #if MMX_ASMBLIT
  2838                         if (SDL_HasMMX())
  2839                             return Blit565to565SurfaceAlphaMMX;
  2840                         else
  2841 #endif
  2842                             return Blit565to565SurfaceAlpha;
  2843                     } else if (df->Gmask == 0x3e0) {
  2844 #if MMX_ASMBLIT
  2845                         if (SDL_HasMMX())
  2846                             return Blit555to555SurfaceAlphaMMX;
  2847                         else
  2848 #endif
  2849                             return Blit555to555SurfaceAlpha;
  2850                     }
  2851                 }
  2852                 return BlitNtoNSurfaceAlpha;
  2853 
  2854             case 4:
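                /* 32-bit destinations: when source and destination share
                   the same RGB masks, the RGBtoRGB fast paths apply; the
                   MMX variant additionally needs each channel on a byte
                   boundary. */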
  2855                 if (sf->Rmask == df->Rmask
  2856                     && sf->Gmask == df->Gmask
  2857                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2858 #if MMX_ASMBLIT
  2859                     if (sf->Rshift % 8 == 0
  2860                         && sf->Gshift % 8 == 0
  2861                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  2862                         return BlitRGBtoRGBSurfaceAlphaMMX;
  2863 #endif
  2864                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  2865 #if SDL_ALTIVEC_BLITTERS
  2866                         if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2867                             && SDL_HasAltiVec())
  2868                             return BlitRGBtoRGBSurfaceAlphaAltivec;
  2869 #endif
  2870                         return BlitRGBtoRGBSurfaceAlpha;
  2871                     }
  2872                 }
  2873 #if SDL_ALTIVEC_BLITTERS
  2874                 if ((sf->BytesPerPixel == 4) &&
  2875                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2876                     && SDL_HasAltiVec())
  2877                     return Blit32to32SurfaceAlphaAltivec;
  2878                 else
  2879 #endif
  2880                     return BlitNtoNSurfaceAlpha;
  2881 
  2882             case 3:
  2883             default:
  2884                 return BlitNtoNSurfaceAlpha;
  2885             }
  2886         }
  2887     } else {
  2888         /* Per-pixel alpha blits */
  2889         switch (df->BytesPerPixel) {
  2890         case 1:
  2891             return BlitNto1PixelAlpha;
  2892 
  2893         case 2:
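            /* 16-bit destinations: an ARGB8888/ABGR8888 source headed for a
               565 or 555 destination gets a dedicated routine (the AltiVec
               32->565 path first, when available); everything else falls
               back to BlitNtoNPixelAlpha. */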
  2894 #if SDL_ALTIVEC_BLITTERS
  2895             if (sf->BytesPerPixel == 4
  2896                 && !(surface->map->dst->flags & SDL_HWSURFACE)
  2897                 && df->Gmask == 0x7e0 && df->Bmask == 0x1f
  2898                 && SDL_HasAltiVec())
  2899                 return Blit32to565PixelAlphaAltivec;
  2900             else
  2901 #endif
  2902                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2903                     && sf->Gmask == 0xff00
  2904                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2905                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  2906                 if (df->Gmask == 0x7e0)
  2907                     return BlitARGBto565PixelAlpha;
  2908                 else if (df->Gmask == 0x3e0)
  2909                     return BlitARGBto555PixelAlpha;
  2910             }
  2911             return BlitNtoNPixelAlpha;
  2912 
  2913         case 4:
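            /* 32-bit destinations: matching RGB masks allow the RGBtoRGB
               per-pixel alpha fast paths (MMX/3DNow! when every channel
               sits on a byte boundary, AltiVec or the plain C routine when
               the alpha mask is 0xff000000), with BlitNtoNPixelAlpha as
               the general fallback. */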
  2914             if (sf->Rmask == df->Rmask
  2915                 && sf->Gmask == df->Gmask
  2916                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2917 #if MMX_ASMBLIT
  2918                 if (sf->Rshift % 8 == 0
  2919                     && sf->Gshift % 8 == 0
  2920                     && sf->Bshift % 8 == 0
  2921                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  2922                     if (SDL_Has3DNow())
  2923                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2924                     if (SDL_HasMMX())
  2925                         return BlitRGBtoRGBPixelAlphaMMX;
  2926                 }
  2927 #endif
  2928                 if (sf->Amask == 0xff000000) {
  2929 #if SDL_ALTIVEC_BLITTERS
  2930                     if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2931                         && SDL_HasAltiVec())
  2932                         return BlitRGBtoRGBPixelAlphaAltivec;
  2933 #endif
  2934                     return BlitRGBtoRGBPixelAlpha;
  2935                 }
  2936             }
  2937 #if SDL_ALTIVEC_BLITTERS
  2938             if (sf->Amask && sf->BytesPerPixel == 4 &&
  2939                 !(surface->map->dst->flags & SDL_HWSURFACE)
  2940                 && SDL_HasAltiVec())
  2941                 return Blit32to32PixelAlphaAltivec;
  2942             else
  2943 #endif
  2944                 return BlitNtoNPixelAlpha;
  2945 
  2946         case 3:
  2947         default:
  2948             return BlitNtoNPixelAlpha;
  2949         }
  2950     }
  2951 }
  2952 
  2953 /* vi: set ts=4 sw=4 expandtab: */