src/video/SDL_blit_A.c
author Sam Lantinga
date Thu, 14 Jun 2007 13:21:29 +0000
changeset 2120 2c835d58faad
parent 2101 c4e0afbcf1f6
child 2132 46648dc418ec
permissions -rw-r--r--
description: make indent
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2006 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 #if SDL_ASSEMBLY_ROUTINES
    28 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    29 #define MMX_ASMBLIT 1
    30 #define GCC_ASMBLIT 1
    31 #elif defined(_MSC_VER) && (_MSC_VER >= 1200) && defined(_M_IX86)
    32 #define MMX_ASMBLIT 1
    33 #define MSVC_ASMBLIT 1
    34 #endif
    35 #endif /* SDL_ASSEMBLY_ROUTINES */
    36 
    37 /* Function to check the CPU flags */
    38 #include "SDL_cpuinfo.h"
    39 #if GCC_ASMBLIT
    40 #include "mmx.h"
    41 #elif MSVC_ASMBLIT
    42 #include <mmintrin.h>
    43 #include <mm3dnow.h>
    44 #endif
    45 
    46 /* Functions to perform alpha blended blitting */
    47 
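        /*
         * Note: the C blitters below lean on the DUFFS_LOOP unrolling macros
         * and the ALPHA_BLEND / ACCURATE_ALPHA_BLEND macros from SDL_blit.h.
         * As a rough sketch (see SDL_blit.h for the exact definitions), one
         * channel is blended as
         *
         *     d = d + (((s - d) * A) >> 8)     with A in 0..255,
         *
         * and the MMX/AltiVec paths further down implement the same idea
         * with packed 16-bit arithmetic.
         */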
    48 /* N->1 blending with per-surface alpha */
    49 static void
    50 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    51 {
    52     int width = info->d_width;
    53     int height = info->d_height;
    54     Uint8 *src = info->s_pixels;
    55     int srcskip = info->s_skip;
    56     Uint8 *dst = info->d_pixels;
    57     int dstskip = info->d_skip;
    58     Uint8 *palmap = info->table;
    59     SDL_PixelFormat *srcfmt = info->src;
    60     SDL_PixelFormat *dstfmt = info->dst;
    61     int srcbpp = srcfmt->BytesPerPixel;
    62 
    63     const unsigned A = srcfmt->alpha;
    64 
    65     while (height--) {
    66 	    /* *INDENT-OFF* */
    67 	    DUFFS_LOOP4(
    68 	    {
    69 		Uint32 Pixel;
    70 		unsigned sR;
    71 		unsigned sG;
    72 		unsigned sB;
    73 		unsigned dR;
    74 		unsigned dG;
    75 		unsigned dB;
    76 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    77 		dR = dstfmt->palette->colors[*dst].r;
    78 		dG = dstfmt->palette->colors[*dst].g;
    79 		dB = dstfmt->palette->colors[*dst].b;
    80 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    81 		dR &= 0xff;
    82 		dG &= 0xff;
    83 		dB &= 0xff;
    84 		/* Pack RGB into 8bit pixel */
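		/* (Assuming an 8-bit target: dR>>5 keeps the top 3 red bits,
		   dG>>5 the top 3 green bits and dB>>6 the top 2 blue bits,
		   forming an RRRGGGBB index; palmap, when present, remaps that
		   3-3-2 index to the destination palette.) */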
    85 		if ( palmap == NULL ) {
    86 		    *dst =((dR>>5)<<(3+2))|
    87 			  ((dG>>5)<<(2))|
    88 			  ((dB>>6)<<(0));
    89 		} else {
    90 		    *dst = palmap[((dR>>5)<<(3+2))|
    91 				  ((dG>>5)<<(2))  |
    92 				  ((dB>>6)<<(0))];
    93 		}
    94 		dst++;
    95 		src += srcbpp;
    96 	    },
    97 	    width);
    98 	    /* *INDENT-ON* */
    99         src += srcskip;
   100         dst += dstskip;
   101     }
   102 }
   103 
   104 /* N->1 blending with pixel alpha */
   105 static void
   106 BlitNto1PixelAlpha(SDL_BlitInfo * info)
   107 {
   108     int width = info->d_width;
   109     int height = info->d_height;
   110     Uint8 *src = info->s_pixels;
   111     int srcskip = info->s_skip;
   112     Uint8 *dst = info->d_pixels;
   113     int dstskip = info->d_skip;
   114     Uint8 *palmap = info->table;
   115     SDL_PixelFormat *srcfmt = info->src;
   116     SDL_PixelFormat *dstfmt = info->dst;
   117     int srcbpp = srcfmt->BytesPerPixel;
   118 
   119     /* FIXME: fix alpha bit field expansion here too? */
   120     while (height--) {
   121 	    /* *INDENT-OFF* */
   122 	    DUFFS_LOOP4(
   123 	    {
   124 		Uint32 Pixel;
   125 		unsigned sR;
   126 		unsigned sG;
   127 		unsigned sB;
   128 		unsigned sA;
   129 		unsigned dR;
   130 		unsigned dG;
   131 		unsigned dB;
   132 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   133 		dR = dstfmt->palette->colors[*dst].r;
   134 		dG = dstfmt->palette->colors[*dst].g;
   135 		dB = dstfmt->palette->colors[*dst].b;
   136 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   137 		dR &= 0xff;
   138 		dG &= 0xff;
   139 		dB &= 0xff;
   140 		/* Pack RGB into 8bit pixel */
   141 		if ( palmap == NULL ) {
   142 		    *dst =((dR>>5)<<(3+2))|
   143 			  ((dG>>5)<<(2))|
   144 			  ((dB>>6)<<(0));
   145 		} else {
   146 		    *dst = palmap[((dR>>5)<<(3+2))|
   147 				  ((dG>>5)<<(2))  |
   148 				  ((dB>>6)<<(0))  ];
   149 		}
   150 		dst++;
   151 		src += srcbpp;
   152 	    },
   153 	    width);
   154 	    /* *INDENT-ON* */
   155         src += srcskip;
   156         dst += dstskip;
   157     }
   158 }
   159 
   160 /* colorkeyed N->1 blending with per-surface alpha */
   161 static void
   162 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   163 {
   164     int width = info->d_width;
   165     int height = info->d_height;
   166     Uint8 *src = info->s_pixels;
   167     int srcskip = info->s_skip;
   168     Uint8 *dst = info->d_pixels;
   169     int dstskip = info->d_skip;
   170     Uint8 *palmap = info->table;
   171     SDL_PixelFormat *srcfmt = info->src;
   172     SDL_PixelFormat *dstfmt = info->dst;
   173     int srcbpp = srcfmt->BytesPerPixel;
   174     Uint32 ckey = srcfmt->colorkey;
   175 
   176     const int A = srcfmt->alpha;
   177 
   178     while (height--) {
   179 	    /* *INDENT-OFF* */
   180 	    DUFFS_LOOP(
   181 	    {
   182 		Uint32 Pixel;
   183 		unsigned sR;
   184 		unsigned sG;
   185 		unsigned sB;
   186 		unsigned dR;
   187 		unsigned dG;
   188 		unsigned dB;
   189 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   190 		if ( Pixel != ckey ) {
   191 		    dR = dstfmt->palette->colors[*dst].r;
   192 		    dG = dstfmt->palette->colors[*dst].g;
   193 		    dB = dstfmt->palette->colors[*dst].b;
   194 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   195 		    dR &= 0xff;
   196 		    dG &= 0xff;
   197 		    dB &= 0xff;
   198 		    /* Pack RGB into 8bit pixel */
   199 		    if ( palmap == NULL ) {
   200 			*dst =((dR>>5)<<(3+2))|
   201 			      ((dG>>5)<<(2)) |
   202 			      ((dB>>6)<<(0));
   203 		    } else {
   204 			*dst = palmap[((dR>>5)<<(3+2))|
   205 				      ((dG>>5)<<(2))  |
   206 				      ((dB>>6)<<(0))  ];
   207 		    }
   208 		}
   209 		dst++;
   210 		src += srcbpp;
   211 	    },
   212 	    width);
   213 	    /* *INDENT-ON* */
   214         src += srcskip;
   215         dst += dstskip;
   216     }
   217 }
   218 
   219 #if GCC_ASMBLIT
   220 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
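        /*
         * For alpha == 128 the blend is a plain 50/50 average of src and dst.
         * Both the scalar and the MMX code below average all three channels
         * of a 32-bit pixel at once without letting carries cross channels:
         *
         *     ((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1    per-channel (s+d)/2
         *                                                   with low bits dropped,
         *       + (s & d & 0x00010101)                      put back the carry the
         *                                                   dropped bits produce,
         *
         * and the result is OR'd with the destination alpha mask.
         */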
   221 static void
   222 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   223 {
   224     int width = info->d_width;
   225     int height = info->d_height;
   226     Uint32 *srcp = (Uint32 *) info->s_pixels;
   227     int srcskip = info->s_skip >> 2;
   228     Uint32 *dstp = (Uint32 *) info->d_pixels;
   229     int dstskip = info->d_skip >> 2;
   230     Uint32 dalpha = info->dst->Amask;
   231     Uint8 load[8];
   232 
   233     *(Uint64 *) load = 0x00fefefe00fefefeULL;   /* alpha128 mask */
   234     movq_m2r(*load, mm4);       /* alpha128 mask -> mm4 */
   235     *(Uint64 *) load = 0x0001010100010101ULL;   /* !alpha128 mask */
   236     movq_m2r(*load, mm3);       /* !alpha128 mask -> mm3 */
   237     movd_m2r(dalpha, mm7);      /* dst alpha mask */
   238     punpckldq_r2r(mm7, mm7);    /* dst alpha mask | dst alpha mask -> mm7 */
   239     while (height--) {
   240 		/* *INDENT-OFF* */
   241 		DUFFS_LOOP_DOUBLE2(
   242 		{
   243 			Uint32 s = *srcp++;
   244 			Uint32 d = *dstp;
   245 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   246 				   + (s & d & 0x00010101)) | dalpha;
   247 		},{
   248 			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   249 			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   250 
   251 			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
   252 			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
   253 
   254 			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
   255 			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
   256 			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
   257 			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
   258 			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
   259 			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
   260 			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
   261 			
   262 			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   263 			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
   264 			dstp += 2;
   265 			srcp += 2;
   266 		}, width);
   267 		/* *INDENT-ON* */
   268         srcp += srcskip;
   269         dstp += dstskip;
   270     }
   271     emms();
   272 }
   273 
   274 /* fast RGB888->(A)RGB888 blending with surface alpha */
   275 static void
   276 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   277 {
   278     SDL_PixelFormat *df = info->dst;
   279     unsigned alpha = info->src->alpha;
   280 
   281     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   282         /* only call a128 version when R,G,B occupy lower bits */
   283         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   284     } else {
   285         int width = info->d_width;
   286         int height = info->d_height;
   287         Uint32 *srcp = (Uint32 *) info->s_pixels;
   288         int srcskip = info->s_skip >> 2;
   289         Uint32 *dstp = (Uint32 *) info->d_pixels;
   290         int dstskip = info->d_skip >> 2;
   291 
   292         pxor_r2r(mm5, mm5);     /* 0 -> mm5 */
   293         /* form the alpha mult */
   294         movd_m2r(alpha, mm4);   /* 0000000A -> mm4 */
   295         punpcklwd_r2r(mm4, mm4);        /* 00000A0A -> mm4 */
   296         punpckldq_r2r(mm4, mm4);        /* 0A0A0A0A -> mm4 */
    297         alpha = (0xff << df->Rshift)
    298             | (0xff << df->Gshift)
    299             | (0xff << df->Bshift);
   300         movd_m2r(alpha, mm0);   /* 00000FFF -> mm0 */
   301         punpcklbw_r2r(mm0, mm0);        /* 00FFFFFF -> mm0 */
   302         pand_r2r(mm0, mm4);     /* 0A0A0A0A -> mm4, minus 1 chan */
   303         /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
   304         movd_m2r(df->Amask, mm7);       /* dst alpha mask */
   305         punpckldq_r2r(mm7, mm7);        /* dst alpha mask | dst alpha mask -> mm7 */
   306 
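                /*
                 * Each pass below computes dst + (((src - dst) * alpha) >> 8)
                 * on 16-bit lanes.  mm4 carries alpha only in the RGB lanes
                 * (the channel mask cleared the rest), so the destination
                 * alpha lane rides through the add untouched before mm7
                 * forces the result alpha to fully opaque (when the
                 * destination actually has an alpha mask).
                 */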
   307         while (height--) {
   308 			/* *INDENT-OFF* */
   309 			DUFFS_LOOP_DOUBLE2({
   310 				/* One Pixel Blend */
   311 				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   312 				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   313 				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
   314 				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
   315 
   316 				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   317 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   318 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   319 				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   320 
   321 				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
   322 				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   323 				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
   324 				++srcp;
   325 				++dstp;
   326 			},{
   327 				/* Two Pixels Blend */
   328 				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
   329 				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   330 				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
   331 				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   332 
   333 				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
   334 				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
   335 				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
   336 				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
   337 
   338 				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
   339 				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
    340 				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
   341 				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
   342 
   343 				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
   344 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   345 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   346 				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
   347 
   348 				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
   349 				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
   350 				
   351 				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
   352 
   353   				srcp += 2;
   354   				dstp += 2;
   355   			}, width);
   356 			/* *INDENT-ON* */
   357             srcp += srcskip;
   358             dstp += dstskip;
   359         }
   360         emms();
   361     }
   362 }
   363 
   364 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   365 static void
   366 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   367 {
   368     int width = info->d_width;
   369     int height = info->d_height;
   370     Uint32 *srcp = (Uint32 *) info->s_pixels;
   371     int srcskip = info->s_skip >> 2;
   372     Uint32 *dstp = (Uint32 *) info->d_pixels;
   373     int dstskip = info->d_skip >> 2;
   374     SDL_PixelFormat *sf = info->src;
   375     Uint32 amask = sf->Amask;
   376 
   377     pxor_r2r(mm6, mm6);         /* 0 -> mm6 */
   378     /* form multiplication mask */
   379     movd_m2r(sf->Amask, mm7);   /* 0000F000 -> mm7 */
   380     punpcklbw_r2r(mm7, mm7);    /* FF000000 -> mm7 */
   381     pcmpeqb_r2r(mm0, mm0);      /* FFFFFFFF -> mm0 */
   382     movq_r2r(mm0, mm3);         /* FFFFFFFF -> mm3 (for later) */
   383     pxor_r2r(mm0, mm7);         /* 00FFFFFF -> mm7 (mult mask) */
   384     /* form channel masks */
   385     movq_r2r(mm7, mm0);         /* 00FFFFFF -> mm0 */
   386     packsswb_r2r(mm6, mm0);     /* 00000FFF -> mm0 (channel mask) */
   387     packsswb_r2r(mm6, mm3);     /* 0000FFFF -> mm3 */
   388     pxor_r2r(mm0, mm3);         /* 0000F000 -> mm3 (~channel mask) */
   389     /* get alpha channel shift */
   390     /* *INDENT-OFF* */
   391     __asm__ __volatile__ (
   392         "movd %0, %%mm5"
   393         : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
   394     /* *INDENT-ON* */
   395 
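            /*
             * mm5 holds Ashift so that, inside the loop, the per-pixel alpha
             * can be moved down to the low byte with a variable shift
             * (psrld by mm5) without tying up a general-purpose register.
             */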
   396     while (height--) {
   397 	    /* *INDENT-OFF* */
   398 	    DUFFS_LOOP4({
   399 		Uint32 alpha = *srcp & amask;
   400 		/* FIXME: Here we special-case opaque alpha since the
    401 			compositing used (>>8 instead of /255) doesn't handle
   402 			it correctly. Also special-case alpha=0 for speed?
   403 			Benchmark this! */
   404 		if(alpha == 0) {
   405 			/* do nothing */
   406 		} else if(alpha == amask) {
   407 			/* opaque alpha -- copy RGB, keep dst alpha */
   408 			/* using MMX here to free up regular registers for other things */
   409 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   410 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   411 			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
   412 			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
   413 			por_r2r(mm1, mm2); /* src | dst -> mm2 */
   414 			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
   415 		} else {
   416 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   417 			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
   418 
   419 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   420 			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
   421 
   422 			__asm__ __volatile__ (
   423 				"movd %0, %%mm4"
   424 				: : "r" (alpha) ); /* 0000A000 -> mm4 */
   425 			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
   426 			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   427 			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
   428 			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
   429 
   430 			/* blend */		    
   431 			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   432 			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   433 			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
   434 			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   435 			
   436 			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
   437 			movd_r2m(mm2, *dstp);/* mm2 -> dst */
   438 		}
   439 		++srcp;
   440 		++dstp;
   441 	    }, width);
   442 	    /* *INDENT-ON* */
   443         srcp += srcskip;
   444         dstp += dstskip;
   445     }
   446     emms();
   447 }
   448 
   449 /* End GCC_ASMBLIT */
   450 
   451 #elif MSVC_ASMBLIT
   452 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   453 static void
   454 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   455 {
   456     int width = info->d_width;
   457     int height = info->d_height;
   458     Uint32 *srcp = (Uint32 *) info->s_pixels;
   459     int srcskip = info->s_skip >> 2;
   460     Uint32 *dstp = (Uint32 *) info->d_pixels;
   461     int dstskip = info->d_skip >> 2;
   462     Uint32 dalpha = info->dst->Amask;
   463 
   464     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   465 
   466     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   467     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   468     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   469 
   470     while (height--) {
   471         int n = width;
   472         if (n & 1) {
   473             Uint32 s = *srcp++;
   474             Uint32 d = *dstp;
   475             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   476                        + (s & d & 0x00010101)) | dalpha;
   477             n--;
   478         }
   479 
   480         for (n >>= 1; n > 0; --n) {
   481             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   482             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   483 
   484             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   485             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   486 
   487             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   488             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   489             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   490             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   491 
   492             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   493             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   494             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   495             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   496 
   497             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   498             dstp += 2;
   499             srcp += 2;
   500         }
   501 
   502         srcp += srcskip;
   503         dstp += dstskip;
   504     }
   505     _mm_empty();
   506 }
   507 
   508 /* fast RGB888->(A)RGB888 blending with surface alpha */
   509 static void
   510 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   511 {
   512     SDL_PixelFormat *df = info->dst;
   513     Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   514     unsigned alpha = info->src->alpha;
   515 
   516     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   517         /* only call a128 version when R,G,B occupy lower bits */
   518         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   519     } else {
   520         int width = info->d_width;
   521         int height = info->d_height;
   522         Uint32 *srcp = (Uint32 *) info->s_pixels;
   523         int srcskip = info->s_skip >> 2;
   524         Uint32 *dstp = (Uint32 *) info->d_pixels;
   525         int dstskip = info->d_skip >> 2;
   526         Uint32 dalpha = df->Amask;
   527         Uint32 amult;
   528 
   529         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   530 
   531         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   532         /* form the alpha mult */
   533         amult = alpha | (alpha << 8);
   534         amult = amult | (amult << 16);
    535         chanmask = (0xff << df->Rshift)
    536             | (0xff << df->Gshift)
    537             | (0xff << df->Bshift);
   538         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   539         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   540         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   541         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   542 
   543         while (height--) {
   544             int n = width;
   545             if (n & 1) {
   546                 /* One Pixel Blend */
   547                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   548                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   549 
   550                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   551                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   552 
    553                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst1 -> src2 */
   554                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   555                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   556                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   557 
   558                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   559                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   560                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   561 
   562                 ++srcp;
   563                 ++dstp;
   564 
   565                 n--;
   566             }
   567 
   568             for (n >>= 1; n > 0; --n) {
   569                 /* Two Pixels Blend */
   570                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   571                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   572                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   573                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   574 
   575                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   576                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   577                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   578                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   579 
   580                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   581                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   582                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   583                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   584 
   585                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   586                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   587                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   588                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   589 
   590                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   591                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   592 
   593                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   594 
   595                 srcp += 2;
   596                 dstp += 2;
   597             }
   598             srcp += srcskip;
   599             dstp += dstskip;
   600         }
   601         _mm_empty();
   602     }
   603 }
   604 
   605 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   606 static void
   607 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   608 {
   609     int width = info->d_width;
   610     int height = info->d_height;
   611     Uint32 *srcp = (Uint32 *) info->s_pixels;
   612     int srcskip = info->s_skip >> 2;
   613     Uint32 *dstp = (Uint32 *) info->d_pixels;
   614     int dstskip = info->d_skip >> 2;
   615     SDL_PixelFormat *sf = info->src;
   616     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   617     Uint32 amask = sf->Amask;
   618     Uint32 ashift = sf->Ashift;
   619     Uint64 multmask;
   620 
   621     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   622 
   623     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   624 	/* *INDENT-OFF* */
   625 	multmask = ~(0xFFFFI64 << (ashift * 2));
   626 	/* *INDENT-ON* */
   627     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   628 
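            /*
             * multmask is 0xFFFF in every 16-bit lane except the alpha lane
             * (ashift * 2 maps the channel's bit position in the pixel to its
             * position among the 16-bit lanes), so ANDing the replicated
             * alpha with dmask zeroes the alpha multiplier and the
             * destination alpha survives the blend unchanged.
             */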
   629     while (height--) {
   630 		/* *INDENT-OFF* */
   631 		DUFFS_LOOP4({
   632 		Uint32 alpha = *srcp & amask;
   633 		if (alpha == 0) {
   634 			/* do nothing */
   635 		} else if (alpha == amask) {
   636 			/* opaque alpha -- copy RGB, keep dst alpha */
   637 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   638 		} else {
   639 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   640 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   641 
   642 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   643 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   644 
   645 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   646 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   647 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   648 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   649 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   650 
   651 			/* blend */		    
   652 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   653 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   654 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   655 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   656 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   657 			
   658 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   659 		}
   660 		++srcp;
   661 		++dstp;
   662 	    }, width);
   663 		/* *INDENT-ON* */
   664         srcp += srcskip;
   665         dstp += dstskip;
   666     }
   667     _mm_empty();
   668 }
   669 
   670 /* End MSVC_ASMBLIT */
   671 
   672 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   673 
   674 #if SDL_ALTIVEC_BLITTERS
   675 #if __MWERKS__
   676 #pragma altivec_model on
   677 #endif
   678 #if HAVE_ALTIVEC_H
   679 #include <altivec.h>
   680 #endif
   681 #include <assert.h>
   682 
   683 #if (defined(__MACOSX__) && (__GNUC__ < 4))
   684 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   685         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
   686 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   687         (vector unsigned short) ( a,b,c,d,e,f,g,h )
   688 #else
   689 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   690         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
   691 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   692         (vector unsigned short) { a,b,c,d,e,f,g,h }
   693 #endif
   694 
   695 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
   696 #define VECPRINT(msg, v) do { \
   697     vector unsigned int tmpvec = (vector unsigned int)(v); \
   698     unsigned int *vp = (unsigned int *)&tmpvec; \
   699     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   700 } while (0)
   701 
    702 /* the permutation vector that takes the high bytes out of all the appropriate shorts
   703     (vector unsigned char)(
   704         0x00, 0x10, 0x02, 0x12,
   705         0x04, 0x14, 0x06, 0x16,
   706         0x08, 0x18, 0x0A, 0x1A,
   707         0x0C, 0x1C, 0x0E, 0x1E );
   708 */
   709 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
   710 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
   711 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
   712 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
   713     ? vec_lvsl(0, src) \
   714     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
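        /*
         * The AltiVec blitters below share one pattern: blend single pixels
         * with ONE_PIXEL_BLEND until dstp reaches 16-byte alignment, run the
         * vector loop on 4 (or 8) pixels per iteration with aligned loads and
         * stores of dst, then finish the remainder scalar again.  The source
         * may still be misaligned, so each iteration loads two chunks
         * (vec_ld(0, src) and vec_ld(15, src)) and uses vec_perm with the
         * VEC_ALIGNER permute to pull out the 16 bytes that actually belong
         * to the current pixels.
         */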
   715 
   716 
   717 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
   718     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
   719     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
   720     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
   721     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
   722     /* valpha2 is 255-alpha */ \
   723     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
   724     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
   725     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
   726     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
   727     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
   728     /* add source and dest */ \
   729     vtemp1 = vec_add(vtemp1, vtemp3); \
   730     vtemp2 = vec_add(vtemp2, vtemp4); \
   731     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
   732     vtemp1 = vec_add(vtemp1, v1_16); \
   733     vtemp3 = vec_sr(vtemp1, v8_16); \
   734     vtemp1 = vec_add(vtemp1, vtemp3); \
   735     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
   736     vtemp2 = vec_add(vtemp2, v1_16); \
   737     vtemp4 = vec_sr(vtemp2, v8_16); \
   738     vtemp2 = vec_add(vtemp2, vtemp4); \
   739     /* (>>8) and get ARGBARGBARGBARGB */ \
   740     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   741 } while (0)
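        /*
         * The "+1, >>8, add" sequence above makes the final high-byte pick a
         * close approximation of dividing by 255: for v = s*a + d*(255-a),
         * (v + 1 + ((v + 1) >> 8)) >> 8 stays within one step of v / 255.
         * mergePermute then takes exactly those high bytes, interleaving the
         * even-byte (A,G) and odd-byte (R,B) products back into ARGB order.
         */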
   742 
   743 /* Calculate the permute vector used for 32->32 swizzling */
   744 static vector unsigned char
   745 calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
   746 {
   747     /*
   748      * We have to assume that the bits that aren't used by other
    749      *  colors are alpha, and that it's one complete byte, since some formats
   750      *  leave alpha with a zero mask, but we should still swizzle the bits.
   751      */
   752     /* ARGB */
   753     const static struct SDL_PixelFormat default_pixel_format = {
   754         NULL, 0, 0,
   755         0, 0, 0, 0,
   756         16, 8, 0, 24,
   757         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
   758         0, 0
   759     };
   760     if (!srcfmt) {
   761         srcfmt = &default_pixel_format;
   762     }
   763     if (!dstfmt) {
   764         dstfmt = &default_pixel_format;
   765     }
   766     const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
   767                                                        0x04, 0x04, 0x04, 0x04,
   768                                                        0x08, 0x08, 0x08, 0x08,
   769                                                        0x0C, 0x0C, 0x0C,
   770                                                        0x0C);
   771     vector unsigned char vswiz;
   772     vector unsigned int srcvec;
   773 #define RESHIFT(X) (3 - ((X) >> 3))
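            /* RESHIFT maps a channel's bit shift (0/8/16/24) to its byte index
               (3/2/1/0) in a big-endian 32-bit pixel; dropping that index at
               the destination channel's byte position builds one word that,
               splatted and added to 'plus', becomes the vec_perm selector. */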
   774     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   775     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   776     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   777     Uint32 amask;
   778     /* Use zero for alpha if either surface doesn't have alpha */
   779     if (dstfmt->Amask) {
    780         amask =
    781             ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10)
    782             << (dstfmt->Ashift);
   783     } else {
   784         amask =
   785             0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
   786                           0xFFFFFFFF);
   787     }
   788 #undef RESHIFT
   789     ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
   790     vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
   791     return (vswiz);
   792 }
   793 
   794 static void
   795 Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
   796 {
   797     int height = info->d_height;
   798     Uint8 *src = (Uint8 *) info->s_pixels;
   799     int srcskip = info->s_skip;
   800     Uint8 *dst = (Uint8 *) info->d_pixels;
   801     int dstskip = info->d_skip;
   802     SDL_PixelFormat *srcfmt = info->src;
   803 
   804     vector unsigned char v0 = vec_splat_u8(0);
   805     vector unsigned short v8_16 = vec_splat_u16(8);
   806     vector unsigned short v1_16 = vec_splat_u16(1);
   807     vector unsigned short v2_16 = vec_splat_u16(2);
   808     vector unsigned short v3_16 = vec_splat_u16(3);
   809     vector unsigned int v8_32 = vec_splat_u32(8);
   810     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
   811     vector unsigned short v3f =
   812         VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
   813                           0x003f, 0x003f, 0x003f, 0x003f);
   814     vector unsigned short vfc =
   815         VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
   816                           0x00fc, 0x00fc, 0x00fc, 0x00fc);
   817 
   818     /* 
   819        0x10 - 0x1f is the alpha
   820        0x00 - 0x0e evens are the red
   821        0x01 - 0x0f odds are zero
   822      */
   823     vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
   824                                                        0x10, 0x02, 0x01, 0x01,
   825                                                        0x10, 0x04, 0x01, 0x01,
   826                                                        0x10, 0x06, 0x01,
   827                                                        0x01);
   828     vector unsigned char vredalpha2 =
   829         (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
   830                                         vec_sl(v8_32, v16_32))
   831         );
   832     /*
   833        0x00 - 0x0f is ARxx ARxx ARxx ARxx
    834        0x11 - 0x1f odds are blue
   835      */
   836     vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
   837                                                    0x04, 0x05, 0x06, 0x13,
   838                                                    0x08, 0x09, 0x0a, 0x15,
   839                                                    0x0c, 0x0d, 0x0e, 0x17);
   840     vector unsigned char vblue2 =
   841         (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
   842         );
   843     /*
   844        0x00 - 0x0f is ARxB ARxB ARxB ARxB
    845        0x10 - 0x1e evens are green
   846      */
   847     vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
   848                                                     0x04, 0x05, 0x12, 0x07,
   849                                                     0x08, 0x09, 0x14, 0x0b,
   850                                                     0x0c, 0x0d, 0x16, 0x0f);
   851     vector unsigned char vgreen2 =
    852         (vector unsigned char) (vec_add((vector unsigned int) vgreen1,
    853                                         vec_sl(v8_32, v8_32))
    854         );
   855     vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
   856                                                     0x00, 0x0a, 0x00, 0x0e,
   857                                                     0x00, 0x12, 0x00, 0x16,
   858                                                     0x00, 0x1a, 0x00, 0x1e);
   859     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
   860     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
   861     vector unsigned char valphaPermute =
   862         vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   863 
   864     vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
   865     vf800 = vec_sl(vf800, vec_splat_u16(8));
   866 
   867     while (height--) {
   868         int extrawidth;
   869         vector unsigned char valigner;
   870         vector unsigned char vsrc;
   871         vector unsigned char voverflow;
   872         int width = info->d_width;
   873 
   874 #define ONE_PIXEL_BLEND(condition, widthvar) \
   875         while (condition) { \
   876             Uint32 Pixel; \
   877             unsigned sR, sG, sB, dR, dG, dB, sA; \
   878             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   879             if(sA) { \
   880                 unsigned short dstpixel = *((unsigned short *)dst); \
   881                 dR = (dstpixel >> 8) & 0xf8; \
   882                 dG = (dstpixel >> 3) & 0xfc; \
   883                 dB = (dstpixel << 3) & 0xf8; \
   884                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   885                 *((unsigned short *)dst) = ( \
   886                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
   887                 ); \
   888             } \
   889             src += 4; \
   890             dst += 2; \
   891             widthvar--; \
   892         }
   893         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
   894         extrawidth = (width % 8);
   895         valigner = VEC_ALIGNER(src);
   896         vsrc = (vector unsigned char) vec_ld(0, src);
   897         width -= extrawidth;
   898         while (width) {
   899             vector unsigned char valpha;
   900             vector unsigned char vsrc1, vsrc2;
   901             vector unsigned char vdst1, vdst2;
   902             vector unsigned short vR, vG, vB;
   903             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   904 
   905             /* Load 8 pixels from src as ARGB */
   906             voverflow = (vector unsigned char) vec_ld(15, src);
   907             vsrc = vec_perm(vsrc, voverflow, valigner);
   908             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
   909             src += 16;
   910             vsrc = (vector unsigned char) vec_ld(15, src);
   911             voverflow = vec_perm(voverflow, vsrc, valigner);
   912             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
   913             src += 16;
   914 
   915             /* Load 8 pixels from dst as XRGB */
   916             voverflow = vec_ld(0, dst);
   917             vR = vec_and((vector unsigned short) voverflow, vf800);
   918             vB = vec_sl((vector unsigned short) voverflow, v3_16);
   919             vG = vec_sl(vB, v2_16);
   920             vdst1 =
   921                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   922                                                 (vector unsigned char) vR,
   923                                                 vredalpha1);
   924             vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
   925             vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
   926             vdst2 =
   927                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   928                                                 (vector unsigned char) vR,
   929                                                 vredalpha2);
   930             vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
   931             vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
   932 
   933             /* Alpha blend 8 pixels as ARGB */
   934             valpha = vec_perm(vsrc1, v0, valphaPermute);
   935             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
   936                                v8_16);
   937             valpha = vec_perm(vsrc2, v0, valphaPermute);
   938             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
   939                                v8_16);
   940 
   941             /* Convert 8 pixels to 565 */
   942             vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
   943                                                         vdst1,
   944                                                         (vector unsigned int)
   945                                                         vdst2);
   946             vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
   947             vgpixel = vec_and(vgpixel, vfc);
   948             vgpixel = vec_sl(vgpixel, v3_16);
   949             vrpixel = vec_sl(vpixel, v1_16);
   950             vrpixel = vec_and(vrpixel, vf800);
   951             vbpixel = vec_and(vpixel, v3f);
   952             vdst1 =
   953                 vec_or((vector unsigned char) vrpixel,
   954                        (vector unsigned char) vgpixel);
   955             vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);
   956 
   957             /* Store 8 pixels */
   958             vec_st(vdst1, 0, dst);
   959 
   960             width -= 8;
   961             dst += 16;
   962         }
   963         ONE_PIXEL_BLEND((extrawidth), extrawidth);
   964 #undef ONE_PIXEL_BLEND
   965         src += srcskip;
   966         dst += dstskip;
   967     }
   968 }
   969 
   970 static void
   971 Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
   972 {
   973     unsigned alpha = info->src->alpha;
   974     int height = info->d_height;
   975     Uint32 *srcp = (Uint32 *) info->s_pixels;
   976     int srcskip = info->s_skip >> 2;
   977     Uint32 *dstp = (Uint32 *) info->d_pixels;
   978     int dstskip = info->d_skip >> 2;
   979     SDL_PixelFormat *srcfmt = info->src;
   980     SDL_PixelFormat *dstfmt = info->dst;
   981     unsigned sA = srcfmt->alpha;
   982     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   983     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   984     Uint32 ckey = info->src->colorkey;
   985     vector unsigned char mergePermute;
   986     vector unsigned char vsrcPermute;
   987     vector unsigned char vdstPermute;
   988     vector unsigned char vsdstPermute;
   989     vector unsigned char valpha;
   990     vector unsigned char valphamask;
   991     vector unsigned char vbits;
   992     vector unsigned char v0;
   993     vector unsigned short v1;
   994     vector unsigned short v8;
   995     vector unsigned int vckey;
   996     vector unsigned int vrgbmask;
   997 
   998     mergePermute = VEC_MERGE_PERMUTE();
   999     v0 = vec_splat_u8(0);
  1000     v1 = vec_splat_u16(1);
  1001     v8 = vec_splat_u16(8);
  1002 
  1003     /* set the alpha to 255 on the destination surf */
  1004     valphamask = VEC_ALPHA_MASK();
  1005 
  1006     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1007     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1008     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1009 
  1010     /* set a vector full of alpha and 255-alpha */
  1011     ((unsigned char *) &valpha)[0] = alpha;
  1012     valpha = vec_splat(valpha, 0);
  1013     vbits = (vector unsigned char) vec_splat_s8(-1);
  1014 
  1015     ckey &= rgbmask;
  1016     ((unsigned int *) (char *) &vckey)[0] = ckey;
  1017     vckey = vec_splat(vckey, 0);
  1018     ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
  1019     vrgbmask = vec_splat(vrgbmask, 0);
  1020 
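            /*
             * In the vector loop below, vsel marks the lanes whose RGB equals
             * the colorkey; the blend runs unconditionally and vec_sel then
             * puts the original destination pixels back in the keyed lanes.
             */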
  1021     while (height--) {
  1022         int width = info->d_width;
  1023 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1024         while (condition) { \
  1025             Uint32 Pixel; \
  1026             unsigned sR, sG, sB, dR, dG, dB; \
  1027             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
  1028             if(sA && Pixel != ckey) { \
  1029                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
  1030                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1031                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1032                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1033             } \
  1034             dstp++; \
  1035             srcp++; \
  1036             widthvar--; \
  1037         }
  1038         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1039         if (width > 0) {
  1040             int extrawidth = (width % 4);
  1041             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1042             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1043             width -= extrawidth;
  1044             while (width) {
  1045                 vector unsigned char vsel;
  1046                 vector unsigned char voverflow;
  1047                 vector unsigned char vd;
  1048                 vector unsigned char vd_orig;
  1049 
  1050                 /* s = *srcp */
  1051                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1052                 vs = vec_perm(vs, voverflow, valigner);
  1053 
  1054                 /* vsel is set for items that match the key */
  1055                 vsel =
  1056                     (vector unsigned char) vec_and((vector unsigned int) vs,
  1057                                                    vrgbmask);
  1058                 vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
  1059                                                         vsel, vckey);
  1060 
  1061                 /* permute to source format */
  1062                 vs = vec_perm(vs, valpha, vsrcPermute);
  1063 
  1064                 /* d = *dstp */
  1065                 vd = (vector unsigned char) vec_ld(0, dstp);
  1066                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
  1067 
  1068                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1069 
  1070                 /* set the alpha channel to full on */
  1071                 vd = vec_or(vd, valphamask);
  1072 
  1073                 /* mask out color key */
  1074                 vd = vec_sel(vd, vd_orig, vsel);
  1075 
  1076                 /* permute to dest format */
  1077                 vd = vec_perm(vd, vbits, vdstPermute);
  1078 
  1079                 /* *dstp = res */
  1080                 vec_st((vector unsigned int) vd, 0, dstp);
  1081 
  1082                 srcp += 4;
  1083                 dstp += 4;
  1084                 width -= 4;
  1085                 vs = voverflow;
  1086             }
  1087             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1088         }
  1089 #undef ONE_PIXEL_BLEND
  1090 
  1091         srcp += srcskip;
  1092         dstp += dstskip;
  1093     }
  1094 }
  1095 
  1096 
  1097 static void
  1098 Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)
  1099 {
  1100     int width = info->d_width;
  1101     int height = info->d_height;
  1102     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1103     int srcskip = info->s_skip >> 2;
  1104     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1105     int dstskip = info->d_skip >> 2;
  1106     SDL_PixelFormat *srcfmt = info->src;
  1107     SDL_PixelFormat *dstfmt = info->dst;
  1108     vector unsigned char mergePermute;
  1109     vector unsigned char valphaPermute;
  1110     vector unsigned char vsrcPermute;
  1111     vector unsigned char vdstPermute;
  1112     vector unsigned char vsdstPermute;
  1113     vector unsigned char valphamask;
  1114     vector unsigned char vpixelmask;
  1115     vector unsigned char v0;
  1116     vector unsigned short v1;
  1117     vector unsigned short v8;
  1118 
  1119     v0 = vec_splat_u8(0);
  1120     v1 = vec_splat_u16(1);
  1121     v8 = vec_splat_u16(8);
  1122     mergePermute = VEC_MERGE_PERMUTE();
  1123     valphamask = VEC_ALPHA_MASK();
  1124     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  1125     vpixelmask = vec_nor(valphamask, v0);
  1126     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1127     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1128     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1129 
  1130     while (height--) {
  1131         width = info->d_width;
  1132 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1133             Uint32 Pixel; \
  1134             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
  1135             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
  1136             if(sA) { \
  1137               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
  1138               ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1139               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
  1140             } \
  1141             ++srcp; \
  1142             ++dstp; \
  1143             widthvar--; \
  1144         }
  1145         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1146         if (width > 0) {
  1147             /* vsrcPermute */
  1148             /* vdstPermute */
  1149             int extrawidth = (width % 4);
  1150             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1151             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1152             width -= extrawidth;
  1153             while (width) {
  1154                 vector unsigned char voverflow;
  1155                 vector unsigned char vd;
  1156                 vector unsigned char valpha;
  1157                 vector unsigned char vdstalpha;
  1158                 /* s = *srcp */
  1159                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1160                 vs = vec_perm(vs, voverflow, valigner);
  1161                 vs = vec_perm(vs, v0, vsrcPermute);
  1162 
  1163                 valpha = vec_perm(vs, v0, valphaPermute);
  1164 
  1165                 /* d = *dstp */
  1166                 vd = (vector unsigned char) vec_ld(0, dstp);
  1167                 vd = vec_perm(vd, v0, vsdstPermute);
  1168                 vdstalpha = vec_and(vd, valphamask);
  1169 
  1170                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1171 
  1172                 /* set the alpha to the dest alpha */
  1173                 vd = vec_and(vd, vpixelmask);
  1174                 vd = vec_or(vd, vdstalpha);
  1175                 vd = vec_perm(vd, v0, vdstPermute);
  1176 
  1177                 /* *dstp = res */
  1178                 vec_st((vector unsigned int) vd, 0, dstp);
  1179 
  1180                 srcp += 4;
  1181                 dstp += 4;
  1182                 width -= 4;
  1183                 vs = voverflow;
  1184 
  1185             }
  1186             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1187         }
  1188         srcp += srcskip;
  1189         dstp += dstskip;
  1190 #undef ONE_PIXEL_BLEND
  1191     }
  1192 }
  1193 
  1194 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1195 static void
  1196 BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
  1197 {
  1198     int width = info->d_width;
  1199     int height = info->d_height;
  1200     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1201     int srcskip = info->s_skip >> 2;
  1202     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1203     int dstskip = info->d_skip >> 2;
  1204     vector unsigned char mergePermute;
  1205     vector unsigned char valphaPermute;
  1206     vector unsigned char valphamask;
  1207     vector unsigned char vpixelmask;
  1208     vector unsigned char v0;
  1209     vector unsigned short v1;
  1210     vector unsigned short v8;
  1211     v0 = vec_splat_u8(0);
  1212     v1 = vec_splat_u16(1);
  1213     v8 = vec_splat_u16(8);
  1214     mergePermute = VEC_MERGE_PERMUTE();
  1215     valphamask = VEC_ALPHA_MASK();
  1216     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  1217 
  1218 
  1219     vpixelmask = vec_nor(valphamask, v0);
  1220     while (height--) {
  1221         width = info->d_width;
  1222 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1223         while ((condition)) { \
  1224             Uint32 dalpha; \
  1225             Uint32 d; \
  1226             Uint32 s1; \
  1227             Uint32 d1; \
  1228             Uint32 s = *srcp; \
  1229             Uint32 alpha = s >> 24; \
  1230             if(alpha) { \
  1231               if(alpha == SDL_ALPHA_OPAQUE) { \
  1232                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
  1233               } else { \
  1234                 d = *dstp; \
  1235                 dalpha = d & 0xff000000; \
  1236                 s1 = s & 0xff00ff; \
  1237                 d1 = d & 0xff00ff; \
  1238                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
  1239                 s &= 0xff00; \
  1240                 d &= 0xff00; \
  1241                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1242                 *dstp = d1 | d | dalpha; \
  1243               } \
  1244             } \
  1245             ++srcp; \
  1246             ++dstp; \
  1247             widthvar--; \
  1248 	    }
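        /*
         * The scalar path above blends R and B together in one 32-bit
         * operation by masking them into alternating bytes (0x00ff00ff) and
         * handling G separately (0x0000ff00); the closing masks discard any
         * bits that spill between channels.
         */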
  1249         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1250         if (width > 0) {
  1251             int extrawidth = (width % 4);
  1252             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1253             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1254             width -= extrawidth;
  1255             while (width) {
  1256                 vector unsigned char voverflow;
  1257                 vector unsigned char vd;
  1258                 vector unsigned char valpha;
  1259                 vector unsigned char vdstalpha;
  1260                 /* s = *srcp */
  1261                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1262                 vs = vec_perm(vs, voverflow, valigner);
  1263 
  1264                 valpha = vec_perm(vs, v0, valphaPermute);
  1265 
  1266                 /* d = *dstp */
  1267                 vd = (vector unsigned char) vec_ld(0, dstp);
  1268                 vdstalpha = vec_and(vd, valphamask);
  1269 
  1270                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1271 
  1272                 /* set the alpha to the dest alpha */
  1273                 vd = vec_and(vd, vpixelmask);
  1274                 vd = vec_or(vd, vdstalpha);
  1275 
  1276                 /* *dstp = res */
  1277                 vec_st((vector unsigned int) vd, 0, dstp);
  1278 
  1279                 srcp += 4;
  1280                 dstp += 4;
  1281                 width -= 4;
  1282                 vs = voverflow;
  1283             }
  1284             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1285         }
  1286         srcp += srcskip;
  1287         dstp += dstskip;
  1288     }
  1289 #undef ONE_PIXEL_BLEND
  1290 }
  1291 
  1292 static void
  1293 Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info)
  1294 {
  1295     /* XXX : 6 */
  1296     unsigned alpha = info->src->alpha;
  1297     int height = info->d_height;
  1298     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1299     int srcskip = info->s_skip >> 2;
  1300     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1301     int dstskip = info->d_skip >> 2;
  1302     SDL_PixelFormat *srcfmt = info->src;
  1303     SDL_PixelFormat *dstfmt = info->dst;
  1304     unsigned sA = srcfmt->alpha;
  1305     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1306     vector unsigned char mergePermute;
  1307     vector unsigned char vsrcPermute;
  1308     vector unsigned char vdstPermute;
  1309     vector unsigned char vsdstPermute;
  1310     vector unsigned char valpha;
  1311     vector unsigned char valphamask;
  1312     vector unsigned char vbits;
  1313     vector unsigned short v1;
  1314     vector unsigned short v8;
  1315 
  1316     mergePermute = VEC_MERGE_PERMUTE();
  1317     v1 = vec_splat_u16(1);
  1318     v8 = vec_splat_u16(8);
  1319 
  1320     /* set the alpha to 255 on the destination surf */
  1321     valphamask = VEC_ALPHA_MASK();
  1322 
  1323     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1324     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1325     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1326 
  1327     /* set a vector full of the per-surface alpha */
  1328     ((unsigned char *) &valpha)[0] = alpha;
  1329     valpha = vec_splat(valpha, 0);
  1330     vbits = (vector unsigned char) vec_splat_s8(-1);
  1331 
  1332     while (height--) {
  1333         int width = info->d_width;
  1334 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1335             Uint32 Pixel; \
  1336             unsigned sR, sG, sB, dR, dG, dB; \
  1337             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  1338             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1339             ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1340             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1341             ++srcp; \
  1342             ++dstp; \
  1343             widthvar--; \
  1344         }
  1345         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1346         if (width > 0) {
  1347             int extrawidth = (width % 4);
  1348             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1349             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1350             width -= extrawidth;
  1351             while (width) {
  1352                 vector unsigned char voverflow;
  1353                 vector unsigned char vd;
  1354 
  1355                 /* s = *srcp */
  1356                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1357                 vs = vec_perm(vs, voverflow, valigner);
  1358                 vs = vec_perm(vs, valpha, vsrcPermute);
  1359 
  1360                 /* d = *dstp */
  1361                 vd = (vector unsigned char) vec_ld(0, dstp);
  1362                 vd = vec_perm(vd, vd, vsdstPermute);
  1363 
  1364                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1365 
  1366                 /* set the alpha channel to full on */
  1367                 vd = vec_or(vd, valphamask);
  1368                 vd = vec_perm(vd, vbits, vdstPermute);
  1369 
  1370                 /* *dstp = res */
  1371                 vec_st((vector unsigned int) vd, 0, dstp);
  1372 
  1373                 srcp += 4;
  1374                 dstp += 4;
  1375                 width -= 4;
  1376                 vs = voverflow;
  1377             }
  1378             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1379         }
  1380 #undef ONE_PIXEL_BLEND
  1381 
  1382         srcp += srcskip;
  1383         dstp += dstskip;
  1384     }
  1385 
  1386 }
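
/*
 * Note (illustrative, not original SDL commentary): the function above
 * services any pair of 4-byte RGB orderings with one vector loop.  The
 * source is permuted into a common working layout (vsrcPermute, with
 * the per-surface alpha merged in), the destination is permuted into
 * the same layout (vsdstPermute), VEC_MULTIPLY_ALPHA blends the
 * channels, the alpha lane is forced to opaque, and vdstPermute
 * converts the result back to the destination's native byte order
 * before the store.  (The exact canonical layout is an assumption
 * based on how calc_swizzle32 is called here.)
 */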
  1387 
  1388 
  1389 /* fast RGB888->(A)RGB888 blending */
  1390 static void
  1391 BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
  1392 {
  1393     unsigned alpha = info->src->alpha;
  1394     int height = info->d_height;
  1395     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1396     int srcskip = info->s_skip >> 2;
  1397     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1398     int dstskip = info->d_skip >> 2;
  1399     vector unsigned char mergePermute;
  1400     vector unsigned char valpha;
  1401     vector unsigned char valphamask;
  1402     vector unsigned short v1;
  1403     vector unsigned short v8;
  1404 
  1405     mergePermute = VEC_MERGE_PERMUTE();
  1406     v1 = vec_splat_u16(1);
  1407     v8 = vec_splat_u16(8);
  1408 
  1409     /* set the alpha to 255 on the destination surf */
  1410     valphamask = VEC_ALPHA_MASK();
  1411 
  1412     /* set a vector full of the per-surface alpha */
  1413     ((unsigned char *) &valpha)[0] = alpha;
  1414     valpha = vec_splat(valpha, 0);
  1415 
  1416     while (height--) {
  1417         int width = info->d_width;
  1418 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1419             Uint32 s = *srcp; \
  1420             Uint32 d = *dstp; \
  1421             Uint32 s1 = s & 0xff00ff; \
  1422             Uint32 d1 = d & 0xff00ff; \
  1423             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  1424                  & 0xff00ff; \
  1425             s &= 0xff00; \
  1426             d &= 0xff00; \
  1427             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1428             *dstp = d1 | d | 0xff000000; \
  1429             ++srcp; \
  1430             ++dstp; \
  1431             widthvar--; \
  1432         }
  1433         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1434         if (width > 0) {
  1435             int extrawidth = (width % 4);
  1436             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1437             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1438             width -= extrawidth;
  1439             while (width) {
  1440                 vector unsigned char voverflow;
  1441                 vector unsigned char vd;
  1442 
  1443                 /* s = *srcp */
  1444                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1445                 vs = vec_perm(vs, voverflow, valigner);
  1446 
  1447                 /* d = *dstp */
  1448                 vd = (vector unsigned char) vec_ld(0, dstp);
  1449 
  1450                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1451 
  1452                 /* set the alpha channel to full on */
  1453                 vd = vec_or(vd, valphamask);
  1454 
  1455                 /* *dstp = res */
  1456                 vec_st((vector unsigned int) vd, 0, dstp);
  1457 
  1458                 srcp += 4;
  1459                 dstp += 4;
  1460                 width -= 4;
  1461                 vs = voverflow;
  1462             }
  1463             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1464         }
  1465 #undef ONE_PIXEL_BLEND
  1466 
  1467         srcp += srcskip;
  1468         dstp += dstskip;
  1469     }
  1470 }
  1471 
  1472 #if __MWERKS__
  1473 #pragma altivec_model off
  1474 #endif
  1475 #endif /* SDL_ALTIVEC_BLITTERS */
  1476 
  1477 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1478 static void
  1479 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
  1480 {
  1481     int width = info->d_width;
  1482     int height = info->d_height;
  1483     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1484     int srcskip = info->s_skip >> 2;
  1485     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1486     int dstskip = info->d_skip >> 2;
  1487 
  1488     while (height--) {
  1489 	    /* *INDENT-OFF* */
  1490 	    DUFFS_LOOP4({
  1491 		    Uint32 s = *srcp++;
  1492 		    Uint32 d = *dstp;
  1493 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1494 			       + (s & d & 0x00010101)) | 0xff000000;
  1495 	    }, width);
  1496 	    /* *INDENT-ON* */
  1497         srcp += srcskip;
  1498         dstp += dstskip;
  1499     }
  1500 }
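
/*
 * Illustrative sketch (not SDL code) of why the 50% special case above
 * is exact: masking with 0x00fefefe clears the low bit of every channel
 * so the per-channel sums cannot carry into a neighbouring channel, and
 * (s & d & 0x00010101) adds the dropped rounding bit back, so every
 * channel comes out as (sC + dC) / 2, rounded down.  The helper name is
 * hypothetical and never called.
 */
static void
Blend128Example(void)
{
    Uint32 s = 0x00ff8001;      /* R=0xff G=0x80 B=0x01 */
    Uint32 d = 0x00018041;      /* R=0x01 G=0x80 B=0x41 */
    Uint32 avg = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
                  + (s & d & 0x00010101)) | 0xff000000;
    /* avg == 0xff808021: R=(0xff+0x01)/2, G=(0x80+0x80)/2, B=(0x01+0x41)/2 */
    (void) avg;
}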
  1501 
  1502 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1503 static void
  1504 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
  1505 {
  1506     unsigned alpha = info->src->alpha;
  1507     if (alpha == 128) {
  1508         BlitRGBtoRGBSurfaceAlpha128(info);
  1509     } else {
  1510         int width = info->d_width;
  1511         int height = info->d_height;
  1512         Uint32 *srcp = (Uint32 *) info->s_pixels;
  1513         int srcskip = info->s_skip >> 2;
  1514         Uint32 *dstp = (Uint32 *) info->d_pixels;
  1515         int dstskip = info->d_skip >> 2;
  1516         Uint32 s;
  1517         Uint32 d;
  1518         Uint32 s1;
  1519         Uint32 d1;
  1520 
  1521         while (height--) {
  1522 			/* *INDENT-OFF* */
  1523 			DUFFS_LOOP_DOUBLE2({
  1524 				/* One Pixel Blend */
  1525 				s = *srcp;
  1526 				d = *dstp;
  1527 				s1 = s & 0xff00ff;
  1528 				d1 = d & 0xff00ff;
  1529 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
  1530 				     & 0xff00ff;
  1531 				s &= 0xff00;
  1532 				d &= 0xff00;
  1533 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1534 				*dstp = d1 | d | 0xff000000;
  1535 				++srcp;
  1536 				++dstp;
  1537 			},{
  1538 			        /* Two Pixels Blend */
  1539 				s = *srcp;
  1540 				d = *dstp;
  1541 				s1 = s & 0xff00ff;
  1542 				d1 = d & 0xff00ff;
  1543 				d1 += (s1 - d1) * alpha >> 8;
  1544 				d1 &= 0xff00ff;
  1545 				     
  1546 				s = ((s & 0xff00) >> 8) | 
  1547 					((srcp[1] & 0xff00) << 8);
  1548 				d = ((d & 0xff00) >> 8) |
  1549 					((dstp[1] & 0xff00) << 8);
  1550 				d += (s - d) * alpha >> 8;
  1551 				d &= 0x00ff00ff;
  1552 				
  1553 				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
  1554 				++srcp;
  1555 				
  1556 			        s1 = *srcp;
  1557 				d1 = *dstp;
  1558 				s1 &= 0xff00ff;
  1559 				d1 &= 0xff00ff;
  1560 				d1 += (s1 - d1) * alpha >> 8;
  1561 				d1 &= 0xff00ff;
  1562 				
  1563 				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
  1564 				++srcp;
  1565 				++dstp;
  1566 			}, width);
  1567 			/* *INDENT-ON* */
  1568             srcp += srcskip;
  1569             dstp += dstskip;
  1570         }
  1571     }
  1572 }
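
/*
 * Note (illustrative, not original SDL commentary): the "Two Pixels
 * Blend" branch above blends red and blue of each pixel with the usual
 * 0x00ff00ff trick, but pairs the green channels of two adjacent
 * pixels -- pixel 0's green in the low half of a 32-bit word, pixel 1's
 * green in the high half -- so both greens share a single multiply; the
 * two results are then shifted back into bits 8..15 as each destination
 * pixel is written out.
 */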
  1573 
  1574 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1575 static void
  1576 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
  1577 {
  1578     int width = info->d_width;
  1579     int height = info->d_height;
  1580     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1581     int srcskip = info->s_skip >> 2;
  1582     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1583     int dstskip = info->d_skip >> 2;
  1584 
  1585     while (height--) {
  1586 	    /* *INDENT-OFF* */
  1587 	    DUFFS_LOOP4({
  1588 		Uint32 dalpha;
  1589 		Uint32 d;
  1590 		Uint32 s1;
  1591 		Uint32 d1;
  1592 		Uint32 s = *srcp;
  1593 		Uint32 alpha = s >> 24;
  1594 		/* FIXME: Here we special-case opaque alpha since the
  1595 		   compositing used (>>8 instead of /255) doesn't handle
  1596 		   it correctly. Also special-case alpha=0 for speed?
  1597 		   Benchmark this! */
  1598 		if(alpha) {   
  1599 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1600 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1601 		  } else {
  1602 		    /*
  1603 		     * take out the middle component (green), and process
  1604 		     * the other two in parallel. One multiply less.
  1605 		     */
  1606 		    d = *dstp;
  1607 		    dalpha = d & 0xff000000;
  1608 		    s1 = s & 0xff00ff;
  1609 		    d1 = d & 0xff00ff;
  1610 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1611 		    s &= 0xff00;
  1612 		    d &= 0xff00;
  1613 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1614 		    *dstp = d1 | d | dalpha;
  1615 		  }
  1616 		}
  1617 		++srcp;
  1618 		++dstp;
  1619 	    }, width);
  1620 	    /* *INDENT-ON* */
  1621         srcp += srcskip;
  1622         dstp += dstskip;
  1623     }
  1624 }
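
/*
 * Illustrative sketch (not SDL code) restating the packed blend above
 * as a standalone helper: R and B are blended together in one multiply
 * via the 0x00ff00ff mask, G in a second one via 0x0000ff00, and each
 * channel uses the same d + ((s - d) * a >> 8) approximation of
 * d + (s - d) * a / 255.  The helper name is hypothetical.
 */
static Uint32
BlendARGBReference(Uint32 s, Uint32 d, unsigned a)
{
    Uint32 s1 = s & 0x00ff00ff;         /* source red and blue */
    Uint32 d1 = d & 0x00ff00ff;         /* dest red and blue */
    Uint32 sg = s & 0x0000ff00;         /* source green */
    Uint32 dg = d & 0x0000ff00;         /* dest green */

    d1 = (d1 + ((s1 - d1) * a >> 8)) & 0x00ff00ff;
    dg = (dg + ((sg - dg) * a >> 8)) & 0x0000ff00;
    return d1 | dg | (d & 0xff000000);  /* keep the destination alpha */
}
/* e.g. BlendARGBReference(0x80ff0000, 0xff0000ff, 0x80) == 0xff7f007f */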
  1625 
  1626 #if GCC_ASMBLIT
  1627 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1628 static void
  1629 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1630 {
  1631     int width = info->d_width;
  1632     int height = info->d_height;
  1633     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1634     int srcskip = info->s_skip >> 2;
  1635     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1636     int dstskip = info->d_skip >> 2;
  1637     SDL_PixelFormat *sf = info->src;
  1638     Uint32 amask = sf->Amask;
  1639 
  1640     __asm__(
  1641                /* make mm6 all zeros. */
  1642                "pxor       %%mm6, %%mm6\n"
  1643                /* Make a mask to preserve the alpha. */
  1644                "movd      %0, %%mm7\n\t"        /* 0000F000 -> mm7 */
  1645                "punpcklbw %%mm7, %%mm7\n\t"     /* FF000000 -> mm7 */
  1646                "pcmpeqb   %%mm4, %%mm4\n\t"     /* FFFFFFFF -> mm4 */
  1647                "movq      %%mm4, %%mm3\n\t"     /* FFFFFFFF -> mm3 (for later) */
  1648                "pxor      %%mm4, %%mm7\n\t"     /* 00FFFFFF -> mm7 (mult mask) */
  1649                /* form channel masks */
  1650                "movq      %%mm7, %%mm4\n\t"     /* 00FFFFFF -> mm4 */
  1651                "packsswb  %%mm6, %%mm4\n\t"     /* 00000FFF -> mm4 (channel mask) */
  1652                "packsswb  %%mm6, %%mm3\n\t"     /* 0000FFFF -> mm3 */
  1653                "pxor      %%mm4, %%mm3\n\t"     /* 0000F000 -> mm3 (~channel mask) */
  1654                /* get alpha channel shift */
  1655                "movd      %1, %%mm5\n\t"        /* Ashift -> mm5 */
  1656   : /* nothing */ :            "rm"(amask), "rm"((Uint32) sf->Ashift));
  1657 
  1658     while (height--) {
  1659 
  1660 	    /* *INDENT-OFF* */
  1661 	    DUFFS_LOOP4({
  1662 		Uint32 alpha;
  1663 
  1664 		__asm__ (
  1665 		"prefetch 64(%0)\n"
  1666 		"prefetch 64(%1)\n"
  1667 			: : "r" (srcp), "r" (dstp) );
  1668 
  1669 		alpha = *srcp & amask;
  1670 		/* FIXME: Here we special-case opaque alpha since the
  1671 		   compositing used (>>8 instead of /255) doesn't handle
  1672 		   it correctly. Also special-case alpha=0 for speed?
  1673 		   Benchmark this! */
  1674 		if(alpha == 0) {
  1675 		    /* do nothing */
  1676 		}
  1677 		else if(alpha == amask) {
  1678 			/* opaque alpha -- copy RGB, keep dst alpha */
  1679 		    /* using MMX here to free up regular registers for other things */
  1680 			    __asm__ (
  1681 		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
  1682 		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
  1683 		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
  1684 		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
  1685 		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
  1686 		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
  1687 
  1688 		     : : "r" (srcp), "r" (dstp) );
  1689 		} 
  1690 
  1691 		else {
  1692 			    __asm__ (
  1693 		    /* load in the source, and dst. */
  1694 		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
  1695 		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
  1696 
  1697 		    /* Move the src alpha into mm2 */
  1698 
  1699 		    /* if supporting pshufw */
  1700 		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
  1701 		    /*"psrlw     $8, %%mm2\n" */
  1702 		    
  1703 		    /* else: */
  1704 		    "movd       %2,    %%mm2\n"
  1705 		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
  1706 		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
  1707 		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
  1708 		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
  1709 
  1710 		    /* move the colors into words. */
  1711 		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
  1712 		    "punpcklbw %%mm6, %%mm1\n"              /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
  1713 
  1714 		    /* src - dst */
  1715 		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
  1716 
  1717 		    /* A * (src-dst) */
  1718 		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
  1719 		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
  1720 		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
  1721 
  1722 		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
  1723 		    
  1724 		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
  1725 
  1726 		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
  1727 
  1728 		}
  1729 		++srcp;
  1730 		++dstp;
  1731 	    }, width);
  1732 	    /* *INDENT-ON* */
  1733         srcp += srcskip;
  1734         dstp += dstskip;
  1735     }
  1736 
  1737   __asm__("emms\n":);
  1738 }
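
/*
 * Note (illustrative, not original SDL commentary): the MMX block above
 * is the same per-pixel blend as the C version, done one pixel (four
 * channels) at a time.  mm2 holds the source alpha replicated into the
 * R, G and B words with the alpha word masked to zero (via mm7), so the
 * pmullw / psrlw / paddb sequence computes d + ((s - d) * a >> 8) for
 * the colour channels while leaving the destination alpha byte
 * untouched.
 */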
  1739 
  1740 /* End GCC_ASMBLIT */
  1741 
  1742 #elif MSVC_ASMBLIT
  1743 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1744 static void
  1745 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1746 {
  1747     int width = info->d_width;
  1748     int height = info->d_height;
  1749     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1750     int srcskip = info->s_skip >> 2;
  1751     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1752     int dstskip = info->d_skip >> 2;
  1753     SDL_PixelFormat *sf = info->src;
  1754     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  1755     Uint32 amask = sf->Amask;
  1756     Uint32 ashift = sf->Ashift;
  1757     Uint64 multmask;
  1758 
  1759     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1760 
  1761     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
  1762 	/* *INDENT-OFF* */
  1763     multmask = ~(0xFFFFI64 << (ashift * 2));
  1764 	/* *INDENT-ON* */
  1765     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
  1766 
  1767     while (height--) {
  1768 	    /* *INDENT-OFF* */
  1769 	    DUFFS_LOOP4({
  1770 		Uint32 alpha;
  1771 
  1772 		_m_prefetch(srcp + 16);
  1773 		_m_prefetch(dstp + 16);
  1774 
  1775 		alpha = *srcp & amask;
  1776 		if (alpha == 0) {
  1777 			/* do nothing */
  1778 		} else if (alpha == amask) {
  1779 			/* copy RGB, keep dst alpha */
  1780 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  1781 		} else {
  1782 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  1783 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  1784 
  1785 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  1786 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  1787 
  1788 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  1789 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  1790 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  1791 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  1792 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  1793 
  1794 			/* blend */		    
  1795 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  1796 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  1797 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  1798 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  1799 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  1800 			
  1801 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  1802 		}
  1803 		++srcp;
  1804 		++dstp;
  1805 	    }, width);
  1806 	    /* *INDENT-ON* */
  1807         srcp += srcskip;
  1808         dstp += dstskip;
  1809     }
  1810     _mm_empty();
  1811 }
  1812 
  1813 /* End MSVC_ASMBLIT */
  1814 
  1815 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  1816 
  1817 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  1818 
  1819 /* blend a single 16 bit pixel at 50% */
  1820 #define BLEND16_50(d, s, mask)						\
  1821 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
  1822 
  1823 /* blend two 16 bit pixels at 50% */
  1824 #define BLEND2x16_50(d, s, mask)					     \
  1825 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1826 	 + (s & d & (~(mask | mask << 16))))
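
/*
 * Illustrative sketch (not SDL code): the mask passed to these macros
 * is the pixel format with the low bit of every field cleared -- 0xf7de
 * for RGB565 and 0xfbde for RGB555, as used at the call sites below --
 * so the halved sum cannot carry between fields, and s & d & ~mask puts
 * the dropped rounding bit back, exactly like the 32-bit 0x00fefefe
 * trick above.  The helper name is hypothetical.
 */
static Uint16
Blend565At50Example(Uint16 s, Uint16 d)
{
    /* e.g. s = 0xffff (white), d = 0x0000 (black) gives 0x7bef,
       i.e. (15, 31, 15) -- each field halved and rounded down */
    return (Uint16) BLEND16_50(d, s, 0xf7de);
}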
  1827 
  1828 static void
  1829 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
  1830 {
  1831     int width = info->d_width;
  1832     int height = info->d_height;
  1833     Uint16 *srcp = (Uint16 *) info->s_pixels;
  1834     int srcskip = info->s_skip >> 1;
  1835     Uint16 *dstp = (Uint16 *) info->d_pixels;
  1836     int dstskip = info->d_skip >> 1;
  1837 
  1838     while (height--) {
  1839         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
  1840             /*
  1841              * Source and destination not aligned, pipeline it.
  1842              * This is mostly a win for big blits but no loss for
  1843              * small ones
  1844              */
  1845             Uint32 prev_sw;
  1846             int w = width;
  1847 
  1848             /* handle odd destination */
  1849             if ((uintptr_t) dstp & 2) {
  1850                 Uint16 d = *dstp, s = *srcp;
  1851                 *dstp = BLEND16_50(d, s, mask);
  1852                 dstp++;
  1853                 srcp++;
  1854                 w--;
  1855             }
  1856             srcp++;             /* srcp is now 32-bit aligned */
  1857 
  1858             /* bootstrap pipeline with first halfword */
  1859             prev_sw = ((Uint32 *) srcp)[-1];
  1860 
  1861             while (w > 1) {
  1862                 Uint32 sw, dw, s;
  1863                 sw = *(Uint32 *) srcp;
  1864                 dw = *(Uint32 *) dstp;
  1865 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1866                 s = (prev_sw << 16) + (sw >> 16);
  1867 #else
  1868                 s = (prev_sw >> 16) + (sw << 16);
  1869 #endif
  1870                 prev_sw = sw;
  1871                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
  1872                 dstp += 2;
  1873                 srcp += 2;
  1874                 w -= 2;
  1875             }
  1876 
  1877             /* final pixel if any */
  1878             if (w) {
  1879                 Uint16 d = *dstp, s;
  1880 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1881                 s = (Uint16) prev_sw;
  1882 #else
  1883                 s = (Uint16) (prev_sw >> 16);
  1884 #endif
  1885                 *dstp = BLEND16_50(d, s, mask);
  1886                 srcp++;
  1887                 dstp++;
  1888             }
  1889             srcp += srcskip - 1;
  1890             dstp += dstskip;
  1891         } else {
  1892             /* source and destination are aligned */
  1893             int w = width;
  1894 
  1895             /* first odd pixel? */
  1896             if ((uintptr_t) srcp & 2) {
  1897                 Uint16 d = *dstp, s = *srcp;
  1898                 *dstp = BLEND16_50(d, s, mask);
  1899                 srcp++;
  1900                 dstp++;
  1901                 w--;
  1902             }
  1903             /* srcp and dstp are now 32-bit aligned */
  1904 
  1905             while (w > 1) {
  1906                 Uint32 sw = *(Uint32 *) srcp;
  1907                 Uint32 dw = *(Uint32 *) dstp;
  1908                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
  1909                 srcp += 2;
  1910                 dstp += 2;
  1911                 w -= 2;
  1912             }
  1913 
  1914             /* last odd pixel? */
  1915             if (w) {
  1916                 Uint16 d = *dstp, s = *srcp;
  1917                 *dstp = BLEND16_50(d, s, mask);
  1918                 srcp++;
  1919                 dstp++;
  1920             }
  1921             srcp += srcskip;
  1922             dstp += dstskip;
  1923         }
  1924     }
  1925 }
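
/*
 * Note (illustrative, not original SDL commentary): when source and
 * destination differ in 16-bit alignment above, the loop keeps the
 * previously loaded 32-bit source word in prev_sw and stitches each
 * destination word together from one half of prev_sw and one half of
 * the newly loaded word (which halves depends on byte order), so the
 * 32-bit loads, stores and BLEND2x16_50 stay aligned even though the
 * source pixels straddle word boundaries.
 */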
  1926 
  1927 #if GCC_ASMBLIT
  1928 /* fast RGB565->RGB565 blending with surface alpha */
  1929 static void
  1930 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  1931 {
  1932     unsigned alpha = info->src->alpha;
  1933     if (alpha == 128) {
  1934         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1935     } else {
  1936         int width = info->d_width;
  1937         int height = info->d_height;
  1938         Uint16 *srcp = (Uint16 *) info->s_pixels;
  1939         int srcskip = info->s_skip >> 1;
  1940         Uint16 *dstp = (Uint16 *) info->d_pixels;
  1941         int dstskip = info->d_skip >> 1;
  1942         Uint32 s, d;
  1943         Uint8 load[8];
  1944 
  1945         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the MMX and scalar paths use the same 5-bit alpha */
  1946         *(Uint64 *) load = alpha;
  1947         alpha >>= 3;            /* downscale alpha to 5 bits */
  1948 
  1949         movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
  1950         punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
  1951         punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
  1952         /* position alpha to allow for mullo and mulhi on diff channels
  1953            to reduce the number of operations */
  1954         psllq_i2r(3, mm0);
  1955 
  1956         /* Setup the 565 color channel masks */
  1957         *(Uint64 *) load = 0x07E007E007E007E0ULL;
  1958         movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
  1959         *(Uint64 *) load = 0x001F001F001F001FULL;
  1960         movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
  1961         while (height--) {
  1962 			/* *INDENT-OFF* */
  1963 			DUFFS_LOOP_QUATRO2(
  1964 			{
  1965 				s = *srcp++;
  1966 				d = *dstp;
  1967 				/*
  1968 				 * shift out the middle component (green) to
  1969 				 * the high 16 bits, and process all three RGB
  1970 				 * components at the same time.
  1971 				 */
  1972 				s = (s | s << 16) & 0x07e0f81f;
  1973 				d = (d | d << 16) & 0x07e0f81f;
  1974 				d += (s - d) * alpha >> 5;
  1975 				d &= 0x07e0f81f;
  1976 				*dstp++ = d | d >> 16;
  1977 			},{
  1978 				s = *srcp++;
  1979 				d = *dstp;
  1980 				/*
  1981 				 * shift out the middle component (green) to
  1982 				 * the high 16 bits, and process all three RGB
  1983 				 * components at the same time.
  1984 				 */
  1985 				s = (s | s << 16) & 0x07e0f81f;
  1986 				d = (d | d << 16) & 0x07e0f81f;
  1987 				d += (s - d) * alpha >> 5;
  1988 				d &= 0x07e0f81f;
  1989 				*dstp++ = d | d >> 16;
  1990 				s = *srcp++;
  1991 				d = *dstp;
  1992 				/*
  1993 				 * shift out the middle component (green) to
  1994 				 * the high 16 bits, and process all three RGB
  1995 				 * components at the same time.
  1996 				 */
  1997 				s = (s | s << 16) & 0x07e0f81f;
  1998 				d = (d | d << 16) & 0x07e0f81f;
  1999 				d += (s - d) * alpha >> 5;
  2000 				d &= 0x07e0f81f;
  2001 				*dstp++ = d | d >> 16;
  2002 			},{
  2003 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2004 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2005 
  2006 				/* red -- does not need a mask since the right shift clears
  2007 				   the uninteresting bits */
  2008 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2009 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2010 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
  2011 				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
  2012 
  2013 				/* blend */
  2014 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2015 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2016 				/* alpha used is actually 11 bits
  2017 				   11 + 5 = 16 bits, so the sign bits are lost */
  2018 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2019 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2020 				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
  2021 
  2022 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2023 
  2024 				/* green -- process the bits in place */
  2025 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2026 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2027 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2028 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2029 
  2030 				/* blend */
  2031 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2032 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2033 				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
  2034 				   bits are gone and the sign bits present */
  2035 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2036 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2037 
  2038 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2039 
  2040 				/* blue */
  2041 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2042 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2043 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2044 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2045 
  2046 				/* blend */
  2047 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2048 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2049 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2050 				   the interesting bits will need to be MASKed */
  2051 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2052 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2053 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2054 
  2055 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2056 
  2057 				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
  2058 
  2059 				srcp += 4;
  2060 				dstp += 4;
  2061 			}, width);			
  2062 			/* *INDENT-ON* */
  2063             srcp += srcskip;
  2064             dstp += dstskip;
  2065         }
  2066         emms();
  2067     }
  2068 }
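
/*
 * Note (illustrative, not original SDL commentary): in the MMX loop
 * above the 5-bit alpha ends up pre-shifted into bits 6..10 of every
 * word (alpha has its low 3 bits cleared, then the quadword is shifted
 * left by 3).  That one constant serves both multiplies: pmullw
 * followed by psrlw(11) yields (diff * alpha5) >> 5 for the 5-bit red
 * and blue fields, while pmulhw followed by psllw(5) performs the same
 * blend on the green field without moving it out of place.
 */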
  2069 
  2070 /* fast RGB555->RGB555 blending with surface alpha */
  2071 static void
  2072 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  2073 {
  2074     unsigned alpha = info->src->alpha;
  2075     if (alpha == 128) {
  2076         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2077     } else {
  2078         int width = info->d_width;
  2079         int height = info->d_height;
  2080         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2081         int srcskip = info->s_skip >> 1;
  2082         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2083         int dstskip = info->d_skip >> 1;
  2084         Uint32 s, d;
  2085         Uint8 load[8];
  2086 
  2087         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the MMX and scalar paths use the same 5-bit alpha */
  2088         *(Uint64 *) load = alpha;
  2089         alpha >>= 3;            /* downscale alpha to 5 bits */
  2090 
  2091         movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
  2092         punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
  2093         punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
  2094         /* position alpha to allow for mullo and mulhi on diff channels
  2095            to reduce the number of operations */
  2096         psllq_i2r(3, mm0);
  2097 
  2098         /* Setup the 555 color channel masks */
  2099         *(Uint64 *) load = 0x03E003E003E003E0ULL;
  2100         movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
  2101         *(Uint64 *) load = 0x001F001F001F001FULL;
  2102         movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
  2103         while (height--) {
  2104 			/* *INDENT-OFF* */
  2105 			DUFFS_LOOP_QUATRO2(
  2106 			{
  2107 				s = *srcp++;
  2108 				d = *dstp;
  2109 				/*
  2110 				 * shift out the middle component (green) to
  2111 				 * the high 16 bits, and process all three RGB
  2112 				 * components at the same time.
  2113 				 */
  2114 				s = (s | s << 16) & 0x03e07c1f;
  2115 				d = (d | d << 16) & 0x03e07c1f;
  2116 				d += (s - d) * alpha >> 5;
  2117 				d &= 0x03e07c1f;
  2118 				*dstp++ = d | d >> 16;
  2119 			},{
  2120 				s = *srcp++;
  2121 				d = *dstp;
  2122 				/*
  2123 				 * shift out the middle component (green) to
  2124 				 * the high 16 bits, and process all three RGB
  2125 				 * components at the same time.
  2126 				 */
  2127 				s = (s | s << 16) & 0x03e07c1f;
  2128 				d = (d | d << 16) & 0x03e07c1f;
  2129 				d += (s - d) * alpha >> 5;
  2130 				d &= 0x03e07c1f;
  2131 				*dstp++ = d | d >> 16;
  2132 			        s = *srcp++;
  2133 				d = *dstp;
  2134 				/*
  2135 				 * shift out the middle component (green) to
  2136 				 * the high 16 bits, and process all three RGB
  2137 				 * components at the same time.
  2138 				 */
  2139 				s = (s | s << 16) & 0x03e07c1f;
  2140 				d = (d | d << 16) & 0x03e07c1f;
  2141 				d += (s - d) * alpha >> 5;
  2142 				d &= 0x03e07c1f;
  2143 				*dstp++ = d | d >> 16;
  2144 			},{
  2145 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2146 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2147 
  2148 				/* red -- process the bits in place */
  2149 				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
  2150 					/* by reusing the GREEN mask we free up another mmx
  2151 					   register to accumulate the result */
  2152 
  2153 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2154 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2155 				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
  2156 				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
  2157 
  2158 				/* blend */
  2159 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2160 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2161 				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
  2162 				   cleared by a MASK below */
  2163 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2164 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2165 				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
  2166 
  2167 				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
  2168 
  2169 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2170 
  2171 				/* green -- process the bits in place */
  2172 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2173 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2174 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2175 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2176 
  2177 				/* blend */
  2178 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2179 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2180 				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
  2181 				   bits are gone and the sign bits present */
  2182 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2183 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2184 
  2185 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2186 
  2187 				/* blue */
  2188 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2189 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2190 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2191 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2192 
  2193 				/* blend */
  2194 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2195 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2196 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2197 				   the interesting bits will need to be MASKed */
  2198 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2199 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2200 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2201 
  2202 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2203 
  2204 				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
  2205 
  2206 				srcp += 4;
  2207 				dstp += 4;
  2208 			}, width);
  2209 			/* *INDENT-ON* */
  2210             srcp += srcskip;
  2211             dstp += dstskip;
  2212         }
  2213         emms();
  2214     }
  2215 }
  2216 
  2217 /* End GCC_ASMBLIT */
  2218 
  2219 #elif MSVC_ASMBLIT
  2220 /* fast RGB565->RGB565 blending with surface alpha */
  2221 static void
  2222 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  2223 {
  2224     unsigned alpha = info->src->alpha;
  2225     if (alpha == 128) {
  2226         Blit16to16SurfaceAlpha128(info, 0xf7de);
  2227     } else {
  2228         int width = info->d_width;
  2229         int height = info->d_height;
  2230         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2231         int srcskip = info->s_skip >> 1;
  2232         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2233         int dstskip = info->d_skip >> 1;
  2234         Uint32 s, d;
  2235 
  2236         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  2237 
  2238         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the MMX and scalar paths use the same 5-bit alpha */
  2239         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  2240         alpha >>= 3;            /* downscale alpha to 5 bits */
  2241 
  2242         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  2243         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  2244         /* position alpha to allow for mullo and mulhi on diff channels
  2245            to reduce the number of operations */
  2246         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2247 
  2248         /* Setup the 565 color channel masks */
  2249         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
  2250         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  2251 
  2252         while (height--) {
  2253 			/* *INDENT-OFF* */
  2254 			DUFFS_LOOP_QUATRO2(
  2255 			{
  2256 				s = *srcp++;
  2257 				d = *dstp;
  2258 				/*
  2259 				 * shift out the middle component (green) to
  2260 				 * the high 16 bits, and process all three RGB
  2261 				 * components at the same time.
  2262 				 */
  2263 				s = (s | s << 16) & 0x07e0f81f;
  2264 				d = (d | d << 16) & 0x07e0f81f;
  2265 				d += (s - d) * alpha >> 5;
  2266 				d &= 0x07e0f81f;
  2267 				*dstp++ = (Uint16)(d | d >> 16);
  2268 			},{
  2269 				s = *srcp++;
  2270 				d = *dstp;
  2271 				/*
  2272 				 * shift out the middle component (green) to
  2273 				 * the high 16 bits, and process all three RGB
  2274 				 * components at the same time.
  2275 				 */
  2276 				s = (s | s << 16) & 0x07e0f81f;
  2277 				d = (d | d << 16) & 0x07e0f81f;
  2278 				d += (s - d) * alpha >> 5;
  2279 				d &= 0x07e0f81f;
  2280 				*dstp++ = (Uint16)(d | d >> 16);
  2281 				s = *srcp++;
  2282 				d = *dstp;
  2283 				/*
  2284 				 * shift out the middle component (green) to
  2285 				 * the high 16 bits, and process all three RGB
  2286 				 * components at the same time.
  2287 				 */
  2288 				s = (s | s << 16) & 0x07e0f81f;
  2289 				d = (d | d << 16) & 0x07e0f81f;
  2290 				d += (s - d) * alpha >> 5;
  2291 				d &= 0x07e0f81f;
  2292 				*dstp++ = (Uint16)(d | d >> 16);
  2293 			},{
  2294 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2295 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2296 
  2297 				/* red */
  2298 				src2 = src1;
  2299 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
  2300 
  2301 				dst2 = dst1;
  2302 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
  2303 
  2304 				/* blend */
  2305 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2306 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2307 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2308 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2309 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
  2310 
  2311 				mm_res = dst2; /* RED -> mm_res */
  2312 
  2313 				/* green -- process the bits in place */
  2314 				src2 = src1;
  2315 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2316 
  2317 				dst2 = dst1;
  2318 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2319 
  2320 				/* blend */
  2321 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2322 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2323 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2324 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2325 
  2326 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2327 
  2328 				/* blue */
  2329 				src2 = src1;
  2330 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2331 
  2332 				dst2 = dst1;
  2333 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2334 
  2335 				/* blend */
  2336 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2337 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2338 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2339 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2340 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2341 
  2342 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2343 
  2344 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2345 
  2346 				srcp += 4;
  2347 				dstp += 4;
  2348 			}, width);
  2349 			/* *INDENT-ON* */
  2350             srcp += srcskip;
  2351             dstp += dstskip;
  2352         }
  2353         _mm_empty();
  2354     }
  2355 }
  2356 
  2357 /* fast RGB555->RGB555 blending with surface alpha */
  2358 static void
  2359 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  2360 {
  2361     unsigned alpha = info->src->alpha;
  2362     if (alpha == 128) {
  2363         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2364     } else {
  2365         int width = info->d_width;
  2366         int height = info->d_height;
  2367         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2368         int srcskip = info->s_skip >> 1;
  2369         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2370         int dstskip = info->d_skip >> 1;
  2371         Uint32 s, d;
  2372 
  2373         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  2374 
  2375         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the MMX and scalar paths use the same 5-bit alpha */
  2376         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  2377         alpha >>= 3;            /* downscale alpha to 5 bits */
  2378 
  2379         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  2380         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  2381         /* position alpha to allow for mullo and mulhi on diff channels
  2382            to reduce the number of operations */
  2383         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2384 
  2385         /* Setup the 555 color channel masks */
  2386         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
  2387         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
  2388         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  2389 
  2390         while (height--) {
  2391 			/* *INDENT-OFF* */
  2392 			DUFFS_LOOP_QUATRO2(
  2393 			{
  2394 				s = *srcp++;
  2395 				d = *dstp;
  2396 				/*
  2397 				 * shift out the middle component (green) to
  2398 				 * the high 16 bits, and process all three RGB
  2399 				 * components at the same time.
  2400 				 */
  2401 				s = (s | s << 16) & 0x03e07c1f;
  2402 				d = (d | d << 16) & 0x03e07c1f;
  2403 				d += (s - d) * alpha >> 5;
  2404 				d &= 0x03e07c1f;
  2405 				*dstp++ = (Uint16)(d | d >> 16);
  2406 			},{
  2407 				s = *srcp++;
  2408 				d = *dstp;
  2409 				/*
  2410 				 * shift out the middle component (green) to
  2411 				 * the high 16 bits, and process all three RGB
  2412 				 * components at the same time.
  2413 				 */
  2414 				s = (s | s << 16) & 0x03e07c1f;
  2415 				d = (d | d << 16) & 0x03e07c1f;
  2416 				d += (s - d) * alpha >> 5;
  2417 				d &= 0x03e07c1f;
  2418 				*dstp++ = (Uint16)(d | d >> 16);
  2419 			        s = *srcp++;
  2420 				d = *dstp;
  2421 				/*
  2422 				 * shift out the middle component (green) to
  2423 				 * the high 16 bits, and process all three RGB
  2424 				 * components at the same time.
  2425 				 */
  2426 				s = (s | s << 16) & 0x03e07c1f;
  2427 				d = (d | d << 16) & 0x03e07c1f;
  2428 				d += (s - d) * alpha >> 5;
  2429 				d &= 0x03e07c1f;
  2430 				*dstp++ = (Uint16)(d | d >> 16);
  2431 			},{
  2432 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2433 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2434 
  2435 				/* red -- process the bits in place */
  2436 				src2 = src1;
  2437 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
  2438 
  2439 				dst2 = dst1;
  2440 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
  2441 
  2442 				/* blend */
  2443 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2444 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2445 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2446 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2447 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
  2448 
  2449 				mm_res = dst2; /* RED -> mm_res */
  2450 				
  2451 				/* green -- process the bits in place */
  2452 				src2 = src1;
  2453 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2454 
  2455 				dst2 = dst1;
  2456 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2457 
  2458 				/* blend */
  2459 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2460 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2461 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2462 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2463 
  2464 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2465 
  2466 				/* blue */
  2467 				src2 = src1; /* src -> src2 */
  2468 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2469 
  2470 				dst2 = dst1; /* dst -> dst2 */
  2471 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2472 
  2473 				/* blend */
  2474 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2475 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2476 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2477 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2478 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2479 
  2480 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2481 
  2482 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2483 
  2484 				srcp += 4;
  2485 				dstp += 4;
  2486 			}, width);
  2487 			/* *INDENT-ON* */
  2488             srcp += srcskip;
  2489             dstp += dstskip;
  2490         }
  2491         _mm_empty();
  2492     }
  2493 }
  2494 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  2495 
  2496 /* fast RGB565->RGB565 blending with surface alpha */
  2497 static void
  2498 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
  2499 {
  2500     unsigned alpha = info->src->alpha;
  2501     if (alpha == 128) {
  2502         Blit16to16SurfaceAlpha128(info, 0xf7de);
  2503     } else {
  2504         int width = info->d_width;
  2505         int height = info->d_height;
  2506         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2507         int srcskip = info->s_skip >> 1;
  2508         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2509         int dstskip = info->d_skip >> 1;
  2510         alpha >>= 3;            /* downscale alpha to 5 bits */
  2511 
  2512         while (height--) {
  2513 			/* *INDENT-OFF* */
  2514 			DUFFS_LOOP4({
  2515 				Uint32 s = *srcp++;
  2516 				Uint32 d = *dstp;
  2517 				/*
  2518 				 * shift out the middle component (green) to
  2519 				 * the high 16 bits, and process all three RGB
  2520 				 * components at the same time.
  2521 				 */
  2522 				s = (s | s << 16) & 0x07e0f81f;
  2523 				d = (d | d << 16) & 0x07e0f81f;
  2524 				d += (s - d) * alpha >> 5;
  2525 				d &= 0x07e0f81f;
  2526 				*dstp++ = (Uint16)(d | d >> 16);
  2527 			}, width);
  2528 			/* *INDENT-ON* */
  2529             srcp += srcskip;
  2530             dstp += dstskip;
  2531         }
  2532     }
  2533 }
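
/*
 * Illustrative sketch (not SDL code) of the 0x07e0f81f trick used by
 * the 16-bit blitters above: the pixel is duplicated into both halves
 * of a 32-bit word, green is kept in the high half and red/blue in the
 * low half, and the zero gaps between the fields absorb the
 * intermediate carries, so a single multiply blends all three channels
 * at once.  The helper name is hypothetical; alpha5 is the 5-bit alpha
 * (0..31).
 */
static Uint16
Blend565Reference(Uint16 src, Uint16 dst, unsigned alpha5)
{
    Uint32 s = src, d = dst;

    s = (s | s << 16) & 0x07e0f81f;     /* 00000GGGGGG00000 RRRRR000000BBBBB */
    d = (d | d << 16) & 0x07e0f81f;
    d += (s - d) * alpha5 >> 5;
    d &= 0x07e0f81f;
    return (Uint16) (d | d >> 16);      /* fold the halves back together */
}
/* e.g. Blend565Reference(0xffff, 0x0000, 16) == 0x7bef, the same
   mid-grey as the 50% example further up */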
  2534 
  2535 /* fast RGB555->RGB555 blending with surface alpha */
  2536 static void
  2537 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  2538 {
  2539     unsigned alpha = info->src->alpha;
  2540     if (alpha == 128) {
  2541         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2542     } else {
  2543         int width = info->d_width;
  2544         int height = info->d_height;
  2545         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2546         int srcskip = info->s_skip >> 1;
  2547         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2548         int dstskip = info->d_skip >> 1;
  2549         alpha >>= 3;            /* downscale alpha to 5 bits */
  2550 
  2551         while (height--) {
  2552 			/* *INDENT-OFF* */
  2553 			DUFFS_LOOP4({
  2554 				Uint32 s = *srcp++;
  2555 				Uint32 d = *dstp;
  2556 				/*
  2557 				 * shift out the middle component (green) to
  2558 				 * the high 16 bits, and process all three RGB
  2559 				 * components at the same time.
  2560 				 */
  2561 				s = (s | s << 16) & 0x03e07c1f;
  2562 				d = (d | d << 16) & 0x03e07c1f;
  2563 				d += (s - d) * alpha >> 5;
  2564 				d &= 0x03e07c1f;
  2565 				*dstp++ = (Uint16)(d | d >> 16);
  2566 			}, width);
  2567 			/* *INDENT-ON* */
  2568             srcp += srcskip;
  2569             dstp += dstskip;
  2570         }
  2571     }
  2572 }
  2573 
  2574 /* fast ARGB8888->RGB565 blending with pixel alpha */
  2575 static void
  2576 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  2577 {
  2578     int width = info->d_width;
  2579     int height = info->d_height;
  2580     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2581     int srcskip = info->s_skip >> 2;
  2582     Uint16 *dstp = (Uint16 *) info->d_pixels;
  2583     int dstskip = info->d_skip >> 1;
  2584 
  2585     while (height--) {
  2586 	    /* *INDENT-OFF* */
  2587 	    DUFFS_LOOP4({
  2588 		Uint32 s = *srcp;
  2589 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  2590 		/* FIXME: Here we special-case opaque alpha since the
  2591 		   compositing used (>>8 instead of /255) doesn't handle
  2592 		   it correctly. Also special-case alpha=0 for speed?
  2593 		   Benchmark this! */
  2594 		if(alpha) {   
  2595 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2596 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  2597 		  } else {
  2598 		    Uint32 d = *dstp;
  2599 		    /*
  2600 		     * convert source and destination to G0RAB65565
  2601 		     * and blend all components at the same time
  2602 		     */
  2603 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  2604 		      + (s >> 3 & 0x1f);
  2605 		    d = (d | d << 16) & 0x07e0f81f;
  2606 		    d += (s - d) * alpha >> 5;
  2607 		    d &= 0x07e0f81f;
  2608 		    *dstp = (Uint16)(d | d >> 16);
  2609 		  }
  2610 		}
  2611 		srcp++;
  2612 		dstp++;
  2613 	    }, width);
  2614 	    /* *INDENT-ON* */
  2615         srcp += srcskip;
  2616         dstp += dstskip;
  2617     }
  2618 }
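
/*
 * Note (illustrative, not original SDL commentary): the conversion
 * above builds the same interleaved layout directly from ARGB8888:
 * (s & 0xfc00) << 11 lifts the top 6 bits of green into the high half,
 * s >> 8 & 0xf800 drops the top 5 bits of red into bits 11..15, and
 * s >> 3 & 0x1f drops the top 5 bits of blue into bits 0..4, after
 * which the usual 0x07e0f81f blend and fold apply.  The RGB555 variant
 * below does the same with the 0x03e07c1f layout.
 */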
  2619 
  2620 /* fast ARGB8888->RGB555 blending with pixel alpha */
  2621 static void
  2622 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  2623 {
  2624     int width = info->d_width;
  2625     int height = info->d_height;
  2626     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2627     int srcskip = info->s_skip >> 2;
  2628     Uint16 *dstp = (Uint16 *) info->d_pixels;
  2629     int dstskip = info->d_skip >> 1;
  2630 
  2631     while (height--) {
  2632 	    /* *INDENT-OFF* */
  2633 	    DUFFS_LOOP4({
  2634 		unsigned alpha;
  2635 		Uint32 s = *srcp;
  2636 		alpha = s >> 27; /* downscale alpha to 5 bits */
  2637 		/* FIXME: Here we special-case opaque alpha since the
  2638 		   compositing used (>>8 instead of /255) doesn't handle
  2639 		   it correctly. Also special-case alpha=0 for speed?
  2640 		   Benchmark this! */
  2641 		if(alpha) {   
  2642 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2643 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  2644 		  } else {
  2645 		    Uint32 d = *dstp;
  2646 		    /*
  2647 		     * convert source and destination to G0RAB65565
  2648 		     * and blend all components at the same time
  2649 		     */
  2650 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  2651 		      + (s >> 3 & 0x1f);
  2652 		    d = (d | d << 16) & 0x03e07c1f;
  2653 		    d += (s - d) * alpha >> 5;
  2654 		    d &= 0x03e07c1f;
  2655 		    *dstp = (Uint16)(d | d >> 16);
  2656 		  }
  2657 		}
  2658 		srcp++;
  2659 		dstp++;
  2660 	    }, width);
  2661 	    /* *INDENT-ON* */
  2662         srcp += srcskip;
  2663         dstp += dstskip;
  2664     }
  2665 }
  2666 
  2667 /* General (slow) N->N blending with per-surface alpha */
  2668 static void
  2669 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  2670 {
  2671     int width = info->d_width;
  2672     int height = info->d_height;
  2673     Uint8 *src = info->s_pixels;
  2674     int srcskip = info->s_skip;
  2675     Uint8 *dst = info->d_pixels;
  2676     int dstskip = info->d_skip;
  2677     SDL_PixelFormat *srcfmt = info->src;
  2678     SDL_PixelFormat *dstfmt = info->dst;
  2679     int srcbpp = srcfmt->BytesPerPixel;
  2680     int dstbpp = dstfmt->BytesPerPixel;
  2681     unsigned sA = srcfmt->alpha;
  2682     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2683 
  2684     if (sA) {
  2685         while (height--) {
  2686 	    /* *INDENT-OFF* */
  2687 	    DUFFS_LOOP4(
  2688 	    {
  2689 		Uint32 Pixel;
  2690 		unsigned sR;
  2691 		unsigned sG;
  2692 		unsigned sB;
  2693 		unsigned dR;
  2694 		unsigned dG;
  2695 		unsigned dB;
  2696 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2697 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2698 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2699 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2700 		src += srcbpp;
  2701 		dst += dstbpp;
  2702 	    },
  2703 	    width);
  2704 	    /* *INDENT-ON* */
  2705             src += srcskip;
  2706             dst += dstskip;
  2707         }
  2708     }
  2709 }
  2710 
  2711 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2712 static void
  2713 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  2714 {
  2715     int width = info->d_width;
  2716     int height = info->d_height;
  2717     Uint8 *src = info->s_pixels;
  2718     int srcskip = info->s_skip;
  2719     Uint8 *dst = info->d_pixels;
  2720     int dstskip = info->d_skip;
  2721     SDL_PixelFormat *srcfmt = info->src;
  2722     SDL_PixelFormat *dstfmt = info->dst;
  2723     Uint32 ckey = srcfmt->colorkey;
  2724     int srcbpp = srcfmt->BytesPerPixel;
  2725     int dstbpp = dstfmt->BytesPerPixel;
  2726     unsigned sA = srcfmt->alpha;
  2727     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2728 
  2729     while (height--) {
  2730 	    /* *INDENT-OFF* */
  2731 	    DUFFS_LOOP4(
  2732 	    {
  2733 		Uint32 Pixel;
  2734 		unsigned sR;
  2735 		unsigned sG;
  2736 		unsigned sB;
  2737 		unsigned dR;
  2738 		unsigned dG;
  2739 		unsigned dB;
  2740 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2741 		if(sA && Pixel != ckey) {
  2742 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2743 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2744 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2745 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2746 		}
  2747 		src += srcbpp;
  2748 		dst += dstbpp;
  2749 	    },
  2750 	    width);
  2751 	    /* *INDENT-ON* */
  2752         src += srcskip;
  2753         dst += dstskip;
  2754     }
  2755 }
  2756 
  2757 /* General (slow) N->N blending with pixel alpha */
  2758 static void
  2759 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  2760 {
  2761     int width = info->d_width;
  2762     int height = info->d_height;
  2763     Uint8 *src = info->s_pixels;
  2764     int srcskip = info->s_skip;
  2765     Uint8 *dst = info->d_pixels;
  2766     int dstskip = info->d_skip;
  2767     SDL_PixelFormat *srcfmt = info->src;
  2768     SDL_PixelFormat *dstfmt = info->dst;
  2769 
  2770     int srcbpp;
  2771     int dstbpp;
  2772 
  2773     /* Set up some basic variables */
  2774     srcbpp = srcfmt->BytesPerPixel;
  2775     dstbpp = dstfmt->BytesPerPixel;
  2776 
  2777     /* FIXME: for 8bpp source alpha, this doesn't get opaque values
  2778        quite right. for <8bpp source alpha, it gets them very wrong
  2779        (check all macros!)
  2780        It is unclear whether there is a good general solution that doesn't
  2781        need a branch (or a divide). */
  2782     while (height--) {
  2783 	    /* *INDENT-OFF* */
  2784 	    DUFFS_LOOP4(
  2785 	    {
  2786 		Uint32 Pixel;
  2787 		unsigned sR;
  2788 		unsigned sG;
  2789 		unsigned sB;
  2790 		unsigned dR;
  2791 		unsigned dG;
  2792 		unsigned dB;
  2793 		unsigned sA;
  2794 		unsigned dA;
  2795 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  2796 		if(sA) {
  2797 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  2798 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2799 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2800 		}
  2801 		src += srcbpp;
  2802 		dst += dstbpp;
  2803 	    },
  2804 	    width);
  2805 	    /* *INDENT-ON* */
  2806         src += srcskip;
  2807         dst += dstskip;
  2808     }
  2809 }
  2810 
  2811 
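       /*
        * Blitter selection, in outline: when the source format has no alpha
        * channel (sf->Amask == 0) a per-surface-alpha blitter is returned,
        * with a colorkey-aware variant if SDL_SRCCOLORKEY is set; otherwise a
        * per-pixel-alpha blitter is returned.  Within each group the
        * destination depth selects either a specialized fast path or a
        * general BlitNtoN* fallback, and the MMX/3DNow!/AltiVec versions are
        * only chosen when the matching SDL_Has*() test passes (the AltiVec
        * paths also require a non-hardware destination surface).
        *
        * Hypothetical use, for illustration only -- the real caller lives in
        * SDL_blit.c and may store the result differently:
        *
        *     SDL_loblit blit = SDL_CalculateAlphaBlit(surface, blit_index);
        *     if (blit != NULL)
        *         blit(&info);
        *
        * where info stands for an SDL_BlitInfo the caller has already filled
        * in for a single rectangle.
        */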
  2812 SDL_loblit
  2813 SDL_CalculateAlphaBlit(SDL_Surface * surface, int blit_index)
  2814 {
  2815     SDL_PixelFormat *sf = surface->format;
  2816     SDL_PixelFormat *df = surface->map->dst->format;
  2817 
  2818     if (sf->Amask == 0) {
  2819         if ((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
  2820             if (df->BytesPerPixel == 1)
  2821                 return BlitNto1SurfaceAlphaKey;
  2822             else
  2823 #if SDL_ALTIVEC_BLITTERS
  2824                 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2825                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2826                     && SDL_HasAltiVec())
  2827                 return Blit32to32SurfaceAlphaKeyAltivec;
  2828             else
  2829 #endif
  2830                 return BlitNtoNSurfaceAlphaKey;
  2831         } else {
  2832             /* Per-surface alpha blits */
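                   /*
                    * In outline: 8-bit destinations use the palette-mapped
                    * Nto1 blitter; identity-mapped 16-bit destinations get
                    * the dedicated 565/565 or 555/555 blitters (MMX when
                    * present); 32-bit destinations whose RGB masks match a
                    * 32-bit source get the RGB-to-RGB blitters (MMX/AltiVec
                    * when present); other combinations fall back to a
                    * generic AltiVec path or BlitNtoNSurfaceAlpha.
                    */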
  2833             switch (df->BytesPerPixel) {
  2834             case 1:
  2835                 return BlitNto1SurfaceAlpha;
  2836 
  2837             case 2:
  2838                 if (surface->map->identity) {
  2839                     if (df->Gmask == 0x7e0) {
  2840 #if MMX_ASMBLIT
  2841                         if (SDL_HasMMX())
  2842                             return Blit565to565SurfaceAlphaMMX;
  2843                         else
  2844 #endif
  2845                             return Blit565to565SurfaceAlpha;
  2846                     } else if (df->Gmask == 0x3e0) {
  2847 #if MMX_ASMBLIT
  2848                         if (SDL_HasMMX())
  2849                             return Blit555to555SurfaceAlphaMMX;
  2850                         else
  2851 #endif
  2852                             return Blit555to555SurfaceAlpha;
  2853                     }
  2854                 }
  2855                 return BlitNtoNSurfaceAlpha;
  2856 
  2857             case 4:
  2858                 if (sf->Rmask == df->Rmask
  2859                     && sf->Gmask == df->Gmask
  2860                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2861 #if MMX_ASMBLIT
  2862                     if (sf->Rshift % 8 == 0
  2863                         && sf->Gshift % 8 == 0
  2864                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  2865                         return BlitRGBtoRGBSurfaceAlphaMMX;
  2866 #endif
  2867                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  2868 #if SDL_ALTIVEC_BLITTERS
  2869                         if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2870                             && SDL_HasAltiVec())
  2871                             return BlitRGBtoRGBSurfaceAlphaAltivec;
  2872 #endif
  2873                         return BlitRGBtoRGBSurfaceAlpha;
  2874                     }
  2875                 }
  2876 #if SDL_ALTIVEC_BLITTERS
  2877                 if ((sf->BytesPerPixel == 4) &&
  2878                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2879                     && SDL_HasAltiVec())
  2880                     return Blit32to32SurfaceAlphaAltivec;
  2881                 else
  2882 #endif
  2883                     return BlitNtoNSurfaceAlpha;
  2884 
  2885             case 3:
  2886             default:
  2887                 return BlitNtoNSurfaceAlpha;
  2888             }
  2889         }
  2890     } else {
  2891         /* Per-pixel alpha blits */
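               /*
                * In outline: 8-bit destinations use BlitNto1PixelAlpha;
                * 565/555 destinations have fast paths for 32-bit sources
                * whose alpha sits in the top byte and whose R/B order
                * matches the target (plus an AltiVec 32->565 path); 32-bit
                * destinations with RGB masks matching the source can use
                * the MMX, 3DNow! or AltiVec ARGB blitters; everything else
                * ends up in a generic AltiVec path or BlitNtoNPixelAlpha.
                */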
  2892         switch (df->BytesPerPixel) {
  2893         case 1:
  2894             return BlitNto1PixelAlpha;
  2895 
  2896         case 2:
  2897 #if SDL_ALTIVEC_BLITTERS
  2898             if (sf->BytesPerPixel == 4
  2899                 && !(surface->map->dst->flags & SDL_HWSURFACE)
  2900                 && df->Gmask == 0x7e0 && df->Bmask == 0x1f
  2901                 && SDL_HasAltiVec())
  2902                 return Blit32to565PixelAlphaAltivec;
  2903             else
  2904 #endif
  2905                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2906                     && sf->Gmask == 0xff00
  2907                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2908                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  2909                 if (df->Gmask == 0x7e0)
  2910                     return BlitARGBto565PixelAlpha;
  2911                 else if (df->Gmask == 0x3e0)
  2912                     return BlitARGBto555PixelAlpha;
  2913             }
  2914             return BlitNtoNPixelAlpha;
  2915 
  2916         case 4:
  2917             if (sf->Rmask == df->Rmask
  2918                 && sf->Gmask == df->Gmask
  2919                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2920 #if MMX_ASMBLIT
  2921                 if (sf->Rshift % 8 == 0
  2922                     && sf->Gshift % 8 == 0
  2923                     && sf->Bshift % 8 == 0
  2924                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  2925                     if (SDL_Has3DNow())
  2926                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2927                     if (SDL_HasMMX())
  2928                         return BlitRGBtoRGBPixelAlphaMMX;
  2929                 }
  2930 #endif
  2931                 if (sf->Amask == 0xff000000) {
  2932 #if SDL_ALTIVEC_BLITTERS
  2933                     if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2934                         && SDL_HasAltiVec())
  2935                         return BlitRGBtoRGBPixelAlphaAltivec;
  2936 #endif
  2937                     return BlitRGBtoRGBPixelAlpha;
  2938                 }
  2939             }
  2940 #if SDL_ALTIVEC_BLITTERS
  2941             if (sf->Amask && sf->BytesPerPixel == 4 &&
  2942                 !(surface->map->dst->flags & SDL_HWSURFACE)
  2943                 && SDL_HasAltiVec())
  2944                 return Blit32to32PixelAlphaAltivec;
  2945             else
  2946 #endif
  2947                 return BlitNtoNPixelAlpha;
  2948 
  2949         case 3:
  2950         default:
  2951             return BlitNtoNPixelAlpha;
  2952         }
  2953     }
  2954 }
  2955 
  2956 /* vi: set ts=4 sw=4 expandtab: */