src/video/SDL_blit_A.c
author Ryan C. Gordon
Wed, 04 Apr 2007 09:36:25 +0000
changeset 2101 c4e0afbcf1f6
parent 2086 fffea8d6bf92
child 2120 2c835d58faad
permissions -rw-r--r--
Merge r3005:3006 from branches/SDL-1.2: Alpha blending MMX/3DNow register bug.
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2006 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 #if SDL_ASSEMBLY_ROUTINES
    28 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    29 #define MMX_ASMBLIT 1
    30 #define GCC_ASMBLIT 1
    31 #elif defined(_MSC_VER) && (_MSC_VER >= 1200) && defined(_M_IX86)
    32 #define MMX_ASMBLIT 1
    33 #define MSVC_ASMBLIT 1
    34 #endif
    35 #endif /* SDL_ASSEMBLY_ROUTINES */
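
/*
 * Blitter selection summary: MMX_ASMBLIT enables the MMX code paths below,
 * while GCC_ASMBLIT vs. MSVC_ASMBLIT picks between the gcc inline-assembly
 * flavour (the mmx.h macros) and the MSVC intrinsics flavour
 * (<mmintrin.h>/<mm3dnow.h>).  Without SDL_ASSEMBLY_ROUTINES only the
 * portable C blitters in this file are compiled.
 */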
    36 
    37 /* Function to check the CPU flags */
    38 #include "SDL_cpuinfo.h"
    39 #if GCC_ASMBLIT
    40 #include "mmx.h"
    41 #elif MSVC_ASMBLIT
    42 #include <mmintrin.h>
    43 #include <mm3dnow.h>
    44 #endif
    45 
    46 /* Functions to perform alpha blended blitting */
    47 
    48 /* N->1 blending with per-surface alpha */
    49 static void
    50 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    51 {
    52     int width = info->d_width;
    53     int height = info->d_height;
    54     Uint8 *src = info->s_pixels;
    55     int srcskip = info->s_skip;
    56     Uint8 *dst = info->d_pixels;
    57     int dstskip = info->d_skip;
    58     Uint8 *palmap = info->table;
    59     SDL_PixelFormat *srcfmt = info->src;
    60     SDL_PixelFormat *dstfmt = info->dst;
    61     int srcbpp = srcfmt->BytesPerPixel;
    62 
    63     const unsigned A = srcfmt->alpha;
    64 
    65     while (height--) {
    66 	    /* *INDENT-OFF* */
    67 	    DUFFS_LOOP4(
    68 	    {
    69 		Uint32 Pixel;
    70 		unsigned sR;
    71 		unsigned sG;
    72 		unsigned sB;
    73 		unsigned dR;
    74 		unsigned dG;
    75 		unsigned dB;
    76 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    77 		dR = dstfmt->palette->colors[*dst].r;
    78 		dG = dstfmt->palette->colors[*dst].g;
    79 		dB = dstfmt->palette->colors[*dst].b;
    80 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    81 		dR &= 0xff;
    82 		dG &= 0xff;
    83 		dB &= 0xff;
    84 		/* Pack RGB into 8bit pixel */
    85 		if ( palmap == NULL ) {
    86 		    *dst =((dR>>5)<<(3+2))|
    87 			  ((dG>>5)<<(2))|
    88 			  ((dB>>6)<<(0));
    89 		} else {
    90 		    *dst = palmap[((dR>>5)<<(3+2))|
    91 				  ((dG>>5)<<(2))  |
    92 				  ((dB>>6)<<(0))];
    93 		}
    94 		dst++;
    95 		src += srcbpp;
    96 	    },
    97 	    width);
    98 	    /* *INDENT-ON* */
    99         src += srcskip;
   100         dst += dstskip;
   101     }
   102 }
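
/*
 * Note on the 8-bit packing used by this and the other N->1 blitters:
 * the blended result is reduced to a 3-3-2 layout, with the top 3 bits of
 * red in bits 7-5, the top 3 bits of green in bits 4-2 and the top 2 bits
 * of blue in bits 1-0.  For example, a blended (dR,dG,dB) of (0xFF,0x80,0x40)
 * packs to (7<<5)|(4<<2)|(1<<0) = 0xF1, which is then stored directly or
 * run through the palette map.
 */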
   103 
   104 /* N->1 blending with pixel alpha */
   105 static void
   106 BlitNto1PixelAlpha(SDL_BlitInfo * info)
   107 {
   108     int width = info->d_width;
   109     int height = info->d_height;
   110     Uint8 *src = info->s_pixels;
   111     int srcskip = info->s_skip;
   112     Uint8 *dst = info->d_pixels;
   113     int dstskip = info->d_skip;
   114     Uint8 *palmap = info->table;
   115     SDL_PixelFormat *srcfmt = info->src;
   116     SDL_PixelFormat *dstfmt = info->dst;
   117     int srcbpp = srcfmt->BytesPerPixel;
   118 
   119     /* FIXME: fix alpha bit field expansion here too? */
   120     while (height--) {
   121 	    /* *INDENT-OFF* */
   122 	    DUFFS_LOOP4(
   123 	    {
   124 		Uint32 Pixel;
   125 		unsigned sR;
   126 		unsigned sG;
   127 		unsigned sB;
   128 		unsigned sA;
   129 		unsigned dR;
   130 		unsigned dG;
   131 		unsigned dB;
   132 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   133 		dR = dstfmt->palette->colors[*dst].r;
   134 		dG = dstfmt->palette->colors[*dst].g;
   135 		dB = dstfmt->palette->colors[*dst].b;
   136 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   137 		dR &= 0xff;
   138 		dG &= 0xff;
   139 		dB &= 0xff;
   140 		/* Pack RGB into 8bit pixel */
   141 		if ( palmap == NULL ) {
   142 		    *dst =((dR>>5)<<(3+2))|
   143 			  ((dG>>5)<<(2))|
   144 			  ((dB>>6)<<(0));
   145 		} else {
   146 		    *dst = palmap[((dR>>5)<<(3+2))|
   147 				  ((dG>>5)<<(2))  |
   148 				  ((dB>>6)<<(0))  ];
   149 		}
   150 		dst++;
   151 		src += srcbpp;
   152 	    },
   153 	    width);
   154 	    /* *INDENT-ON* */
   155         src += srcskip;
   156         dst += dstskip;
   157     }
   158 }
   159 
   160 /* colorkeyed N->1 blending with per-surface alpha */
   161 static void
   162 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   163 {
   164     int width = info->d_width;
   165     int height = info->d_height;
   166     Uint8 *src = info->s_pixels;
   167     int srcskip = info->s_skip;
   168     Uint8 *dst = info->d_pixels;
   169     int dstskip = info->d_skip;
   170     Uint8 *palmap = info->table;
   171     SDL_PixelFormat *srcfmt = info->src;
   172     SDL_PixelFormat *dstfmt = info->dst;
   173     int srcbpp = srcfmt->BytesPerPixel;
   174     Uint32 ckey = srcfmt->colorkey;
   175 
   176     const int A = srcfmt->alpha;
   177 
   178     while (height--) {
   179 	    /* *INDENT-OFF* */
   180 	    DUFFS_LOOP(
   181 	    {
   182 		Uint32 Pixel;
   183 		unsigned sR;
   184 		unsigned sG;
   185 		unsigned sB;
   186 		unsigned dR;
   187 		unsigned dG;
   188 		unsigned dB;
   189 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   190 		if ( Pixel != ckey ) {
   191 		    dR = dstfmt->palette->colors[*dst].r;
   192 		    dG = dstfmt->palette->colors[*dst].g;
   193 		    dB = dstfmt->palette->colors[*dst].b;
   194 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   195 		    dR &= 0xff;
   196 		    dG &= 0xff;
   197 		    dB &= 0xff;
   198 		    /* Pack RGB into 8bit pixel */
   199 		    if ( palmap == NULL ) {
   200 			*dst =((dR>>5)<<(3+2))|
   201 			      ((dG>>5)<<(2)) |
   202 			      ((dB>>6)<<(0));
   203 		    } else {
   204 			*dst = palmap[((dR>>5)<<(3+2))|
   205 				      ((dG>>5)<<(2))  |
   206 				      ((dB>>6)<<(0))  ];
   207 		    }
   208 		}
   209 		dst++;
   210 		src += srcbpp;
   211 	    },
   212 	    width);
   213 	    /* *INDENT-ON* */
   214         src += srcskip;
   215         dst += dstskip;
   216     }
   217 }
   218 
   219 #if GCC_ASMBLIT
   220 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   221 static void
   222 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   223 {
   224     int width = info->d_width;
   225     int height = info->d_height;
   226     Uint32 *srcp = (Uint32 *) info->s_pixels;
   227     int srcskip = info->s_skip >> 2;
   228     Uint32 *dstp = (Uint32 *) info->d_pixels;
   229     int dstskip = info->d_skip >> 2;
   230     Uint32 dalpha = info->dst->Amask;
   231     Uint8 load[8];
   232 
   233     *(Uint64 *) load = 0x00fefefe00fefefeULL;   /* alpha128 mask */
   234     movq_m2r(*load, mm4);       /* alpha128 mask -> mm4 */
   235     *(Uint64 *) load = 0x0001010100010101ULL;   /* !alpha128 mask */
   236     movq_m2r(*load, mm3);       /* !alpha128 mask -> mm3 */
   237     movd_m2r(dalpha, mm7);      /* dst alpha mask */
   238     punpckldq_r2r(mm7, mm7);    /* dst alpha mask | dst alpha mask -> mm7 */
   239     while (height--) {
   240 		/* *INDENT-OFF* */
   241 		DUFFS_LOOP_DOUBLE2(
   242 		{
   243 			Uint32 s = *srcp++;
   244 			Uint32 d = *dstp;
   245 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   246 				   + (s & d & 0x00010101)) | dalpha;
   247 		},{
   248 			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   249 			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   250 
   251 			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
   252 			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
   253 
   254 			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
   255 			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
   256 			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
   257 			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
   258 			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
   259 			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
   260 			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
   261 			
   262 			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   263 			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
   264 			dstp += 2;
   265 			srcp += 2;
   266 		}, width);
   267 		/* *INDENT-ON* */
   268         srcp += srcskip;
   269         dstp += dstskip;
   270     }
   271     emms();
   272 }
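
/*
 * The alpha == 128 special case above (and its MSVC twin below) is a plain
 * 50/50 average of all three channels at once:
 *
 *     result = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
 *              + (s & d & 0x00010101);
 *
 * Masking with 0xfe first clears the low bit of every byte so the per-byte
 * sums cannot carry into the neighbouring channel, and the (s & d & 0x010101)
 * term restores the rounding bit that was dropped from both inputs.  The MMX
 * loop applies the same formula to two pixels per iteration.
 */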
   273 
   274 /* fast RGB888->(A)RGB888 blending with surface alpha */
   275 static void
   276 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   277 {
   278     SDL_PixelFormat *df = info->dst;
   279     unsigned alpha = info->src->alpha;
   280 
   281     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   282         /* only call a128 version when R,G,B occupy lower bits */
   283         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   284     } else {
   285         int width = info->d_width;
   286         int height = info->d_height;
   287         Uint32 *srcp = (Uint32 *) info->s_pixels;
   288         int srcskip = info->s_skip >> 2;
   289         Uint32 *dstp = (Uint32 *) info->d_pixels;
   290         int dstskip = info->d_skip >> 2;
   291 
   292         pxor_r2r(mm5, mm5);     /* 0 -> mm5 */
   293         /* form the alpha mult */
   294         movd_m2r(alpha, mm4);   /* 0000000A -> mm4 */
   295         punpcklwd_r2r(mm4, mm4);        /* 00000A0A -> mm4 */
   296         punpckldq_r2r(mm4, mm4);        /* 0A0A0A0A -> mm4 */
   297         alpha = (0xff << df->Rshift) |
   298                 (0xff << df->Gshift) |
   299                 (0xff << df->Bshift);
   300         movd_m2r(alpha, mm0);   /* 00000FFF -> mm0 */
   301         punpcklbw_r2r(mm0, mm0);        /* 00FFFFFF -> mm0 */
   302         pand_r2r(mm0, mm4);     /* 0A0A0A0A -> mm4, minus 1 chan */
   303         /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
   304         movd_m2r(df->Amask, mm7);       /* dst alpha mask */
   305         punpckldq_r2r(mm7, mm7);        /* dst alpha mask | dst alpha mask -> mm7 */
   306 
   307         while (height--) {
   308 			/* *INDENT-OFF* */
   309 			DUFFS_LOOP_DOUBLE2({
   310 				/* One Pixel Blend */
   311 				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   312 				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   313 				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
   314 				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
   315 
   316 				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   317 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   318 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   319 				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   320 
   321 				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
   322 				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   323 				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
   324 				++srcp;
   325 				++dstp;
   326 			},{
   327 				/* Two Pixels Blend */
   328 				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
   329 				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   330 				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
   331 				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   332 
   333 				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
   334 				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
   335 				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
   336 				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
   337 
   338 				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
   339 				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
   340 				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
   341 				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
   342 
   343 				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
   344 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   345 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   346 				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
   347 
   348 				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
   349 				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
   350 				
   351 				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
   352 
   353   				srcp += 2;
   354   				dstp += 2;
   355   			}, width);
   356 			/* *INDENT-ON* */
   357             srcp += srcskip;
   358             dstp += dstskip;
   359         }
   360         emms();
   361     }
   362 }
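
/*
 * General per-surface alpha case above (same math in the MSVC version
 * below): each channel is blended as dst + (((src - dst) * alpha) >> 8),
 * i.e. linear interpolation with >>8 standing in for /255.  The alpha
 * multiplier in mm4 is masked so the word holding the destination's alpha
 * channel is zero, which leaves the destination alpha untouched by the add;
 * the final por with the Amask then forces the stored alpha to opaque
 * (a no-op when the destination has no alpha mask).
 */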
   363 
   364 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   365 static void
   366 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   367 {
   368     int width = info->d_width;
   369     int height = info->d_height;
   370     Uint32 *srcp = (Uint32 *) info->s_pixels;
   371     int srcskip = info->s_skip >> 2;
   372     Uint32 *dstp = (Uint32 *) info->d_pixels;
   373     int dstskip = info->d_skip >> 2;
   374     SDL_PixelFormat *sf = info->src;
   375     Uint32 amask = sf->Amask;
   376 
   377     pxor_r2r(mm6, mm6);         /* 0 -> mm6 */
   378     /* form multiplication mask */
   379     movd_m2r(sf->Amask, mm7);   /* 0000F000 -> mm7 */
   380     punpcklbw_r2r(mm7, mm7);    /* FF000000 -> mm7 */
   381     pcmpeqb_r2r(mm0, mm0);      /* FFFFFFFF -> mm0 */
   382     movq_r2r(mm0, mm3);         /* FFFFFFFF -> mm3 (for later) */
   383     pxor_r2r(mm0, mm7);         /* 00FFFFFF -> mm7 (mult mask) */
   384     /* form channel masks */
   385     movq_r2r(mm7, mm0);         /* 00FFFFFF -> mm0 */
   386     packsswb_r2r(mm6, mm0);     /* 00000FFF -> mm0 (channel mask) */
   387     packsswb_r2r(mm6, mm3);     /* 0000FFFF -> mm3 */
   388     pxor_r2r(mm0, mm3);         /* 0000F000 -> mm3 (~channel mask) */
   389     /* get alpha channel shift */
   390     __asm__ __volatile__ (
   391         "movd %0, %%mm5"
   392         : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
   393 
   394     while (height--) {
   395 	    /* *INDENT-OFF* */
   396 	    DUFFS_LOOP4({
   397 		Uint32 alpha = *srcp & amask;
   398 		/* FIXME: Here we special-case opaque alpha since the
   399 			compositing used (>>8 instead of /255) doesn't handle
   400 			it correctly. Also special-case alpha=0 for speed?
   401 			Benchmark this! */
   402 		if(alpha == 0) {
   403 			/* do nothing */
   404 		} else if(alpha == amask) {
   405 			/* opaque alpha -- copy RGB, keep dst alpha */
   406 			/* using MMX here to free up regular registers for other things */
   407 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   408 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   409 			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
   410 			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
   411 			por_r2r(mm1, mm2); /* src | dst -> mm2 */
   412 			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
   413 		} else {
   414 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   415 			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
   416 
   417 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   418 			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
   419 
   420 			__asm__ __volatile__ (
   421 				"movd %0, %%mm4"
   422 				: : "r" (alpha) ); /* 0000A000 -> mm4 */
   423 			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
   424 			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   425 			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
   426 			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
   427 
   428 			/* blend */		    
   429 			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   430 			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   431 			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
   432 			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   433 			
   434 			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
   435 			movd_r2m(mm2, *dstp);/* mm2 -> dst */
   436 		}
   437 		++srcp;
   438 		++dstp;
   439 	    }, width);
   440 	    /* *INDENT-ON* */
   441         srcp += srcskip;
   442         dstp += dstskip;
   443     }
   444     emms();
   445 }
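
/*
 * Per-pixel alpha above (the MSVC version below follows the same pattern):
 * alpha == 0 pixels are skipped entirely and fully opaque pixels are copied
 * with a mask/merge (source RGB, destination alpha), because the >>8
 * interpolation in the general path never quite reaches the source value at
 * alpha == 255.  The general path is the same dst + (((src - dst) * alpha) >> 8)
 * blend as the per-surface version, with the per-pixel alpha expanded in mm4
 * and the destination alpha word masked off so it is preserved.
 */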
   446 
   447 /* End GCC_ASMBLIT */
   448 
   449 #elif MSVC_ASMBLIT
   450 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   451 static void
   452 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   453 {
   454     int width = info->d_width;
   455     int height = info->d_height;
   456     Uint32 *srcp = (Uint32 *) info->s_pixels;
   457     int srcskip = info->s_skip >> 2;
   458     Uint32 *dstp = (Uint32 *) info->d_pixels;
   459     int dstskip = info->d_skip >> 2;
   460     Uint32 dalpha = info->dst->Amask;
   461 
   462     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   463 
   464     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   465     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   466     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   467 
   468     while (height--) {
   469         int n = width;
   470         if (n & 1) {
   471             Uint32 s = *srcp++;
   472             Uint32 d = *dstp;
   473             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   474                        + (s & d & 0x00010101)) | dalpha;
   475             n--;
   476         }
   477 
   478         for (n >>= 1; n > 0; --n) {
   479             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   480             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   481 
   482             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   483             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   484 
   485             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   486             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   487             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   488             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   489 
   490             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   491             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   492             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   493             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   494 
   495             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   496             dstp += 2;
   497             srcp += 2;
   498         }
   499 
   500         srcp += srcskip;
   501         dstp += dstskip;
   502     }
   503     _mm_empty();
   504 }
   505 
   506 /* fast RGB888->(A)RGB888 blending with surface alpha */
   507 static void
   508 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   509 {
   510     SDL_PixelFormat *df = info->dst;
   511     Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   512     unsigned alpha = info->src->alpha;
   513 
   514     if (alpha == 128 && chanmask == 0x00FFFFFF) {
   515         /* only call a128 version when R,G,B occupy lower bits */
   516         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   517     } else {
   518         int width = info->d_width;
   519         int height = info->d_height;
   520         Uint32 *srcp = (Uint32 *) info->s_pixels;
   521         int srcskip = info->s_skip >> 2;
   522         Uint32 *dstp = (Uint32 *) info->d_pixels;
   523         int dstskip = info->d_skip >> 2;
   524         Uint32 dalpha = df->Amask;
   525         Uint32 amult;
   526 
   527         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   528 
   529         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   530         /* form the alpha mult */
   531         amult = alpha | (alpha << 8);
   532         amult = amult | (amult << 16);
   533         chanmask = (0xff << df->Rshift) |
   534                    (0xff << df->Gshift) |
   535                    (0xff << df->Bshift);
   536         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   537         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   538         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   539         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   540 
   541         while (height--) {
   542             int n = width;
   543             if (n & 1) {
   544                 /* One Pixel Blend */
   545                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   546                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   547 
   548                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   549                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   550 
   551                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst1 -> src2 */
   552                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   553                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   554                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   555 
   556                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   557                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   558                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   559 
   560                 ++srcp;
   561                 ++dstp;
   562 
   563                 n--;
   564             }
   565 
   566             for (n >>= 1; n > 0; --n) {
   567                 /* Two Pixels Blend */
   568                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   569                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   570                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   571                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   572 
   573                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   574                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   575                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   576                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   577 
   578                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   579                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   580                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   581                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   582 
   583                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   584                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   585                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   586                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   587 
   588                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   589                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   590 
   591                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   592 
   593                 srcp += 2;
   594                 dstp += 2;
   595             }
   596             srcp += srcskip;
   597             dstp += dstskip;
   598         }
   599         _mm_empty();
   600     }
   601 }
   602 
   603 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   604 static void
   605 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   606 {
   607     int width = info->d_width;
   608     int height = info->d_height;
   609     Uint32 *srcp = (Uint32 *) info->s_pixels;
   610     int srcskip = info->s_skip >> 2;
   611     Uint32 *dstp = (Uint32 *) info->d_pixels;
   612     int dstskip = info->d_skip >> 2;
   613     SDL_PixelFormat *sf = info->src;
   614     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   615     Uint32 amask = sf->Amask;
   616     Uint32 ashift = sf->Ashift;
   617     Uint64 multmask;
   618 
   619     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   620 
   621     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   622 	/* *INDENT-OFF* */
   623 	multmask = ~(0xFFFFI64 << (ashift * 2));
   624 	/* *INDENT-ON* */
   625     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   626 
   627     while (height--) {
   628 		/* *INDENT-OFF* */
   629 		DUFFS_LOOP4({
   630 		Uint32 alpha = *srcp & amask;
   631 		if (alpha == 0) {
   632 			/* do nothing */
   633 		} else if (alpha == amask) {
   634 			/* opaque alpha -- copy RGB, keep dst alpha */
   635 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   636 		} else {
   637 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   638 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   639 
   640 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   641 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   642 
   643 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   644 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   645 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   646 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   647 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   648 
   649 			/* blend */		    
   650 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   651 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   652 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   653 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   654 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   655 			
   656 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   657 		}
   658 		++srcp;
   659 		++dstp;
   660 	    }, width);
   661 		/* *INDENT-ON* */
   662         srcp += srcskip;
   663         dstp += dstskip;
   664     }
   665     _mm_empty();
   666 }
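
/*
 * About the multmask above: after _mm_unpacklo_pi8 each 8-bit channel
 * occupies a 16-bit lane, so the alpha lane starts at bit ashift * 2 and
 * ~(0xFFFF << (ashift * 2)) clears exactly that lane.  ANDing the replicated
 * alpha with this mask zeroes the alpha multiplier, which is what keeps the
 * destination alpha unchanged by the blend.
 */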
   667 
   668 /* End MSVC_ASMBLIT */
   669 
   670 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   671 
   672 #if SDL_ALTIVEC_BLITTERS
   673 #if __MWERKS__
   674 #pragma altivec_model on
   675 #endif
   676 #if HAVE_ALTIVEC_H
   677 #include <altivec.h>
   678 #endif
   679 #include <assert.h>
   680 
   681 #if (defined(__MACOSX__) && (__GNUC__ < 4))
   682 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   683         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
   684 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   685         (vector unsigned short) ( a,b,c,d,e,f,g,h )
   686 #else
   687 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   688         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
   689 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   690         (vector unsigned short) { a,b,c,d,e,f,g,h }
   691 #endif
   692 
   693 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
   694 #define VECPRINT(msg, v) do { \
   695     vector unsigned int tmpvec = (vector unsigned int)(v); \
   696     unsigned int *vp = (unsigned int *)&tmpvec; \
   697     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   698 } while (0)
   699 
   700 /* the permutation vector that takes the high bytes out of all the appropriate shorts
   701     (vector unsigned char)(
   702         0x00, 0x10, 0x02, 0x12,
   703         0x04, 0x14, 0x06, 0x16,
   704         0x08, 0x18, 0x0A, 0x1A,
   705         0x0C, 0x1C, 0x0E, 0x1E );
   706 */
   707 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
   708 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
   709 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
   710 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
   711     ? vec_lvsl(0, src) \
   712     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
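
/*
 * The helper macros above build their constants from splat instructions
 * because vec_splat_u8/u16/u32 only take 5-bit immediates (-16..15):
 * VEC_U32_24() forms the constant 24 as 12+12, and VEC_ALPHA_MASK() shifts
 * an all-ones vector left by 24 to get 0xFF000000 in every word, i.e. the
 * alpha byte of ARGB pixels.  VEC_ALIGNER() returns the vec_perm control
 * used to merge two 16-byte loads into one correctly aligned vector of
 * source pixels.
 */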
   713 
   714 
   715 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
   716     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
   717     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
   718     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
   719     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
   720     /* valpha2 is 255-alpha */ \
   721     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
   722     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
   723     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
   724     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
   725     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
   726     /* add source and dest */ \
   727     vtemp1 = vec_add(vtemp1, vtemp3); \
   728     vtemp2 = vec_add(vtemp2, vtemp4); \
   729     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
   730     vtemp1 = vec_add(vtemp1, v1_16); \
   731     vtemp3 = vec_sr(vtemp1, v8_16); \
   732     vtemp1 = vec_add(vtemp1, vtemp3); \
   733     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
   734     vtemp2 = vec_add(vtemp2, v1_16); \
   735     vtemp4 = vec_sr(vtemp2, v8_16); \
   736     vtemp2 = vec_add(vtemp2, vtemp4); \
   737     /* (>>8) and get ARGBARGBARGBARGB */ \
   738     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   739 } while (0)
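
/*
 * VEC_MULTIPLY_ALPHA computes, per byte, the "accurate" blend
 * (src * alpha + dst * (255 - alpha)) / 255.  vec_mule/vec_mulo give the
 * 16-bit products of the even (A,G) and odd (R,B) bytes, the summed
 * products are divided by 255 with the usual
 * x/255 ~= (x + 1 + ((x + 1) >> 8)) >> 8 approximation, and mergePermute
 * (see VEC_MERGE_PERMUTE above) performs the final >>8 by keeping only the
 * high byte of each 16-bit result while re-interleaving the AG and RB
 * halves back into ARGB order.
 */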
   740 
   741 /* Calculate the permute vector used for 32->32 swizzling */
   742 static vector unsigned char
   743 calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
   744 {
   745     /*
   746      * We have to assume that the bits that aren't used by other
   747      *  colors are alpha, and it's one complete byte, since some formats
   748      *  leave alpha with a zero mask, but we should still swizzle the bits.
   749      */
   750     /* ARGB */
   751     const static struct SDL_PixelFormat default_pixel_format = {
   752         NULL, 0, 0,
   753         0, 0, 0, 0,
   754         16, 8, 0, 24,
   755         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
   756         0, 0
   757     };
   758     if (!srcfmt) {
   759         srcfmt = &default_pixel_format;
   760     }
   761     if (!dstfmt) {
   762         dstfmt = &default_pixel_format;
   763     }
   764     const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
   765                                                        0x04, 0x04, 0x04, 0x04,
   766                                                        0x08, 0x08, 0x08, 0x08,
   767                                                        0x0C, 0x0C, 0x0C, 0x0C);
   769     vector unsigned char vswiz;
   770     vector unsigned int srcvec;
   771 #define RESHIFT(X) (3 - ((X) >> 3))
   772     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   773     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   774     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   775     Uint32 amask;
   776     /* Use zero for alpha if either surface doesn't have alpha */
   777     if (dstfmt->Amask) {
   778         amask =
   779             ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10)
   780                 << (dstfmt->Ashift);
   781     } else {
   782         amask =
   783             0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
   784                           0xFFFFFFFF);
   785     }
   786 #undef RESHIFT
   787     ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
   788     vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
   789     return (vswiz);
   790 }
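
/*
 * calc_swizzle32() builds a vec_perm control vector that converts four
 * pixels from srcfmt byte order to dstfmt byte order in one shot.
 * RESHIFT() turns a channel shift (0, 8, 16 or 24) into that channel's
 * byte index within the big-endian 32-bit word, those indices are placed
 * at the destination channels' byte positions to form one pixel-sized
 * pattern, and vec_splat plus the "plus" vector (0,0,0,0, 4,4,4,4, ...)
 * replicates the pattern for all four pixels.  When the source has no
 * alpha, the alpha lane is pointed at the second vec_perm operand
 * (indices 0x10 and up), which is how the callers splice in a constant
 * alpha vector.
 */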
   791 
   792 static void
   793 Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
   794 {
   795     int height = info->d_height;
   796     Uint8 *src = (Uint8 *) info->s_pixels;
   797     int srcskip = info->s_skip;
   798     Uint8 *dst = (Uint8 *) info->d_pixels;
   799     int dstskip = info->d_skip;
   800     SDL_PixelFormat *srcfmt = info->src;
   801 
   802     vector unsigned char v0 = vec_splat_u8(0);
   803     vector unsigned short v8_16 = vec_splat_u16(8);
   804     vector unsigned short v1_16 = vec_splat_u16(1);
   805     vector unsigned short v2_16 = vec_splat_u16(2);
   806     vector unsigned short v3_16 = vec_splat_u16(3);
   807     vector unsigned int v8_32 = vec_splat_u32(8);
   808     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
   809     vector unsigned short v3f =
   810         VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
   811                           0x003f, 0x003f, 0x003f, 0x003f);
   812     vector unsigned short vfc =
   813         VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
   814                           0x00fc, 0x00fc, 0x00fc, 0x00fc);
   815 
   816     /* 
   817        0x10 - 0x1f is the alpha
   818        0x00 - 0x0e evens are the red
   819        0x01 - 0x0f odds are zero
   820      */
   821     vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
   822                                                        0x10, 0x02, 0x01, 0x01,
   823                                                        0x10, 0x04, 0x01, 0x01,
   824                                                        0x10, 0x06, 0x01, 0x01);
   826     vector unsigned char vredalpha2 =
   827         (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
   828                                         vec_sl(v8_32, v16_32))
   829         );
   830     /*
   831        0x00 - 0x0f is ARxx ARxx ARxx ARxx
   832        0x11 - 0x0f odds are blue
   833      */
   834     vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
   835                                                    0x04, 0x05, 0x06, 0x13,
   836                                                    0x08, 0x09, 0x0a, 0x15,
   837                                                    0x0c, 0x0d, 0x0e, 0x17);
   838     vector unsigned char vblue2 =
   839         (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
   840         );
   841     /*
   842        0x00 - 0x0f is ARxB ARxB ARxB ARxB
   843        0x10 - 0x1e evens are green
   844      */
   845     vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
   846                                                     0x04, 0x05, 0x12, 0x07,
   847                                                     0x08, 0x09, 0x14, 0x0b,
   848                                                     0x0c, 0x0d, 0x16, 0x0f);
   849     vector unsigned char vgreen2 =
   850         (vector unsigned char) (vec_add((vector unsigned int) vgreen1,
   851                                         vec_sl(v8_32, v8_32))
   852         );
   853     vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
   854                                                     0x00, 0x0a, 0x00, 0x0e,
   855                                                     0x00, 0x12, 0x00, 0x16,
   856                                                     0x00, 0x1a, 0x00, 0x1e);
   857     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
   858     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
   859     vector unsigned char valphaPermute =
   860         vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   861 
   862     vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
   863     vf800 = vec_sl(vf800, vec_splat_u16(8));
   864 
   865     while (height--) {
   866         int extrawidth;
   867         vector unsigned char valigner;
   868         vector unsigned char vsrc;
   869         vector unsigned char voverflow;
   870         int width = info->d_width;
   871 
   872 #define ONE_PIXEL_BLEND(condition, widthvar) \
   873         while (condition) { \
   874             Uint32 Pixel; \
   875             unsigned sR, sG, sB, dR, dG, dB, sA; \
   876             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   877             if(sA) { \
   878                 unsigned short dstpixel = *((unsigned short *)dst); \
   879                 dR = (dstpixel >> 8) & 0xf8; \
   880                 dG = (dstpixel >> 3) & 0xfc; \
   881                 dB = (dstpixel << 3) & 0xf8; \
   882                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   883                 *((unsigned short *)dst) = ( \
   884                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
   885                 ); \
   886             } \
   887             src += 4; \
   888             dst += 2; \
   889             widthvar--; \
   890         }
   891         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
   892         extrawidth = (width % 8);
   893         valigner = VEC_ALIGNER(src);
   894         vsrc = (vector unsigned char) vec_ld(0, src);
   895         width -= extrawidth;
   896         while (width) {
   897             vector unsigned char valpha;
   898             vector unsigned char vsrc1, vsrc2;
   899             vector unsigned char vdst1, vdst2;
   900             vector unsigned short vR, vG, vB;
   901             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   902 
   903             /* Load 8 pixels from src as ARGB */
   904             voverflow = (vector unsigned char) vec_ld(15, src);
   905             vsrc = vec_perm(vsrc, voverflow, valigner);
   906             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
   907             src += 16;
   908             vsrc = (vector unsigned char) vec_ld(15, src);
   909             voverflow = vec_perm(voverflow, vsrc, valigner);
   910             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
   911             src += 16;
   912 
   913             /* Load 8 pixels from dst as XRGB */
   914             voverflow = vec_ld(0, dst);
   915             vR = vec_and((vector unsigned short) voverflow, vf800);
   916             vB = vec_sl((vector unsigned short) voverflow, v3_16);
   917             vG = vec_sl(vB, v2_16);
   918             vdst1 =
   919                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   920                                                 (vector unsigned char) vR,
   921                                                 vredalpha1);
   922             vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
   923             vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
   924             vdst2 =
   925                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   926                                                 (vector unsigned char) vR,
   927                                                 vredalpha2);
   928             vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
   929             vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
   930 
   931             /* Alpha blend 8 pixels as ARGB */
   932             valpha = vec_perm(vsrc1, v0, valphaPermute);
   933             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
   934                                v8_16);
   935             valpha = vec_perm(vsrc2, v0, valphaPermute);
   936             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
   937                                v8_16);
   938 
   939             /* Convert 8 pixels to 565 */
   940             vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
   941                                                         vdst1,
   942                                                         (vector unsigned int)
   943                                                         vdst2);
   944             vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
   945             vgpixel = vec_and(vgpixel, vfc);
   946             vgpixel = vec_sl(vgpixel, v3_16);
   947             vrpixel = vec_sl(vpixel, v1_16);
   948             vrpixel = vec_and(vrpixel, vf800);
   949             vbpixel = vec_and(vpixel, v3f);
   950             vdst1 =
   951                 vec_or((vector unsigned char) vrpixel,
   952                        (vector unsigned char) vgpixel);
   953             vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);
   954 
   955             /* Store 8 pixels */
   956             vec_st(vdst1, 0, dst);
   957 
   958             width -= 8;
   959             dst += 16;
   960         }
   961         ONE_PIXEL_BLEND((extrawidth), extrawidth);
   962 #undef ONE_PIXEL_BLEND
   963         src += srcskip;
   964         dst += dstskip;
   965     }
   966 }
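
/*
 * Structure shared by the AltiVec blitters: a scalar ONE_PIXEL_BLEND loop
 * runs until the destination pointer is 16-byte aligned, the vector loop
 * then handles the aligned middle of each row, and a final scalar pass
 * takes the leftover pixels.  In the 32->565 case above, the destination
 * 565 pixels are expanded to ARGB8888 with the vredalpha/vblue/vgreen
 * permutes, blended, and repacked with vec_packpx; since vec_packpx only
 * produces 1-5-5-5 output, red is realigned with a shift and the full
 * 6 bits of green are merged back in separately via vgmerge.
 */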
   967 
   968 static void
   969 Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
   970 {
   971     unsigned alpha = info->src->alpha;
   972     int height = info->d_height;
   973     Uint32 *srcp = (Uint32 *) info->s_pixels;
   974     int srcskip = info->s_skip >> 2;
   975     Uint32 *dstp = (Uint32 *) info->d_pixels;
   976     int dstskip = info->d_skip >> 2;
   977     SDL_PixelFormat *srcfmt = info->src;
   978     SDL_PixelFormat *dstfmt = info->dst;
   979     unsigned sA = srcfmt->alpha;
   980     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   981     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   982     Uint32 ckey = info->src->colorkey;
   983     vector unsigned char mergePermute;
   984     vector unsigned char vsrcPermute;
   985     vector unsigned char vdstPermute;
   986     vector unsigned char vsdstPermute;
   987     vector unsigned char valpha;
   988     vector unsigned char valphamask;
   989     vector unsigned char vbits;
   990     vector unsigned char v0;
   991     vector unsigned short v1;
   992     vector unsigned short v8;
   993     vector unsigned int vckey;
   994     vector unsigned int vrgbmask;
   995 
   996     mergePermute = VEC_MERGE_PERMUTE();
   997     v0 = vec_splat_u8(0);
   998     v1 = vec_splat_u16(1);
   999     v8 = vec_splat_u16(8);
  1000 
  1001     /* set the alpha to 255 on the destination surf */
  1002     valphamask = VEC_ALPHA_MASK();
  1003 
  1004     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1005     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1006     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1007 
  1008     /* set a vector full of alpha and 255-alpha */
  1009     ((unsigned char *) &valpha)[0] = alpha;
  1010     valpha = vec_splat(valpha, 0);
  1011     vbits = (vector unsigned char) vec_splat_s8(-1);
  1012 
  1013     ckey &= rgbmask;
  1014     ((unsigned int *) (char *) &vckey)[0] = ckey;
  1015     vckey = vec_splat(vckey, 0);
  1016     ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
  1017     vrgbmask = vec_splat(vrgbmask, 0);
  1018 
  1019     while (height--) {
  1020         int width = info->d_width;
  1021 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1022         while (condition) { \
  1023             Uint32 Pixel; \
  1024             unsigned sR, sG, sB, dR, dG, dB; \
  1025             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
  1026             if(sA && Pixel != ckey) { \
  1027                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
  1028                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1029                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1030                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1031             } \
  1032             dstp++; \
  1033             srcp++; \
  1034             widthvar--; \
  1035         }
  1036         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1037         if (width > 0) {
  1038             int extrawidth = (width % 4);
  1039             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1040             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1041             width -= extrawidth;
  1042             while (width) {
  1043                 vector unsigned char vsel;
  1044                 vector unsigned char voverflow;
  1045                 vector unsigned char vd;
  1046                 vector unsigned char vd_orig;
  1047 
  1048                 /* s = *srcp */
  1049                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1050                 vs = vec_perm(vs, voverflow, valigner);
  1051 
  1052                 /* vsel is set for items that match the key */
  1053                 vsel =
  1054                     (vector unsigned char) vec_and((vector unsigned int) vs,
  1055                                                    vrgbmask);
  1056                 vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
  1057                                                         vsel, vckey);
  1058 
  1059                 /* permute to source format */
  1060                 vs = vec_perm(vs, valpha, vsrcPermute);
  1061 
  1062                 /* d = *dstp */
  1063                 vd = (vector unsigned char) vec_ld(0, dstp);
  1064                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
  1065 
  1066                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1067 
  1068                 /* set the alpha channel to full on */
  1069                 vd = vec_or(vd, valphamask);
  1070 
  1071                 /* mask out color key */
  1072                 vd = vec_sel(vd, vd_orig, vsel);
  1073 
  1074                 /* permute to dest format */
  1075                 vd = vec_perm(vd, vbits, vdstPermute);
  1076 
  1077                 /* *dstp = res */
  1078                 vec_st((vector unsigned int) vd, 0, dstp);
  1079 
  1080                 srcp += 4;
  1081                 dstp += 4;
  1082                 width -= 4;
  1083                 vs = voverflow;
  1084             }
  1085             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1086         }
  1087 #undef ONE_PIXEL_BLEND
  1088 
  1089         srcp += srcskip;
  1090         dstp += dstskip;
  1091     }
  1092 }
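
/*
 * Colorkey handling in the vector loop above: vsel is a per-lane mask built
 * by vec_cmpeq of the RGB bits against the replicated colorkey, and the
 * final vec_sel(vd, vd_orig, vsel) puts the original destination pixels
 * back into every lane whose source matched the key, so keyed pixels stay
 * untouched without branching inside the SIMD loop.
 */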
  1093 
  1094 
  1095 static void
  1096 Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)
  1097 {
  1098     int width = info->d_width;
  1099     int height = info->d_height;
  1100     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1101     int srcskip = info->s_skip >> 2;
  1102     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1103     int dstskip = info->d_skip >> 2;
  1104     SDL_PixelFormat *srcfmt = info->src;
  1105     SDL_PixelFormat *dstfmt = info->dst;
  1106     vector unsigned char mergePermute;
  1107     vector unsigned char valphaPermute;
  1108     vector unsigned char vsrcPermute;
  1109     vector unsigned char vdstPermute;
  1110     vector unsigned char vsdstPermute;
  1111     vector unsigned char valphamask;
  1112     vector unsigned char vpixelmask;
  1113     vector unsigned char v0;
  1114     vector unsigned short v1;
  1115     vector unsigned short v8;
  1116 
  1117     v0 = vec_splat_u8(0);
  1118     v1 = vec_splat_u16(1);
  1119     v8 = vec_splat_u16(8);
  1120     mergePermute = VEC_MERGE_PERMUTE();
  1121     valphamask = VEC_ALPHA_MASK();
  1122     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  1123     vpixelmask = vec_nor(valphamask, v0);
  1124     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1125     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1126     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1127 
  1128     while (height--) {
  1129         width = info->d_width;
  1130 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1131             Uint32 Pixel; \
  1132             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
  1133             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
  1134             if(sA) { \
  1135               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
  1136               ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1137               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
  1138             } \
  1139             ++srcp; \
  1140             ++dstp; \
  1141             widthvar--; \
  1142         }
  1143         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1144         if (width > 0) {
  1145             /* vsrcPermute */
  1146             /* vdstPermute */
  1147             int extrawidth = (width % 4);
  1148             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1149             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1150             width -= extrawidth;
  1151             while (width) {
  1152                 vector unsigned char voverflow;
  1153                 vector unsigned char vd;
  1154                 vector unsigned char valpha;
  1155                 vector unsigned char vdstalpha;
  1156                 /* s = *srcp */
  1157                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1158                 vs = vec_perm(vs, voverflow, valigner);
  1159                 vs = vec_perm(vs, v0, vsrcPermute);
  1160 
  1161                 valpha = vec_perm(vs, v0, valphaPermute);
  1162 
  1163                 /* d = *dstp */
  1164                 vd = (vector unsigned char) vec_ld(0, dstp);
  1165                 vd = vec_perm(vd, v0, vsdstPermute);
  1166                 vdstalpha = vec_and(vd, valphamask);
  1167 
  1168                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1169 
  1170                 /* set the alpha to the dest alpha */
  1171                 vd = vec_and(vd, vpixelmask);
  1172                 vd = vec_or(vd, vdstalpha);
  1173                 vd = vec_perm(vd, v0, vdstPermute);
  1174 
  1175                 /* *dstp = res */
  1176                 vec_st((vector unsigned int) vd, 0, dstp);
  1177 
  1178                 srcp += 4;
  1179                 dstp += 4;
  1180                 width -= 4;
  1181                 vs = voverflow;
  1182 
  1183             }
  1184             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1185         }
  1186         srcp += srcskip;
  1187         dstp += dstskip;
  1188 #undef ONE_PIXEL_BLEND
  1189     }
  1190 }
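
/*
 * Blit32to32PixelAlphaAltivec above first swizzles source and destination
 * into a common ARGB layout (vsrcPermute/vsdstPermute), saves the original
 * destination alpha (vdstalpha), blends with VEC_MULTIPLY_ALPHA, masks the
 * blended alpha out with vpixelmask and ORs the saved destination alpha
 * back in before permuting to the destination byte order for the store.
 */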
  1191 
  1192 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1193 static void
  1194 BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
  1195 {
  1196     int width = info->d_width;
  1197     int height = info->d_height;
  1198     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1199     int srcskip = info->s_skip >> 2;
  1200     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1201     int dstskip = info->d_skip >> 2;
  1202     vector unsigned char mergePermute;
  1203     vector unsigned char valphaPermute;
  1204     vector unsigned char valphamask;
  1205     vector unsigned char vpixelmask;
  1206     vector unsigned char v0;
  1207     vector unsigned short v1;
  1208     vector unsigned short v8;
  1209     v0 = vec_splat_u8(0);
  1210     v1 = vec_splat_u16(1);
  1211     v8 = vec_splat_u16(8);
  1212     mergePermute = VEC_MERGE_PERMUTE();
  1213     valphamask = VEC_ALPHA_MASK();
  1214     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  1215 
  1216 
  1217     vpixelmask = vec_nor(valphamask, v0);
  1218     while (height--) {
  1219         width = info->d_width;
  1220 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1221         while ((condition)) { \
  1222             Uint32 dalpha; \
  1223             Uint32 d; \
  1224             Uint32 s1; \
  1225             Uint32 d1; \
  1226             Uint32 s = *srcp; \
  1227             Uint32 alpha = s >> 24; \
  1228             if(alpha) { \
  1229               if(alpha == SDL_ALPHA_OPAQUE) { \
  1230                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
  1231               } else { \
  1232                 d = *dstp; \
  1233                 dalpha = d & 0xff000000; \
  1234                 s1 = s & 0xff00ff; \
  1235                 d1 = d & 0xff00ff; \
  1236                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
  1237                 s &= 0xff00; \
  1238                 d &= 0xff00; \
  1239                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1240                 *dstp = d1 | d | dalpha; \
  1241               } \
  1242             } \
  1243             ++srcp; \
  1244             ++dstp; \
  1245             widthvar--; \
  1246 	    }
  1247         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1248         if (width > 0) {
  1249             int extrawidth = (width % 4);
  1250             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1251             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1252             width -= extrawidth;
  1253             while (width) {
  1254                 vector unsigned char voverflow;
  1255                 vector unsigned char vd;
  1256                 vector unsigned char valpha;
  1257                 vector unsigned char vdstalpha;
  1258                 /* s = *srcp */
  1259                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1260                 vs = vec_perm(vs, voverflow, valigner);
  1261 
  1262                 valpha = vec_perm(vs, v0, valphaPermute);
  1263 
  1264                 /* d = *dstp */
  1265                 vd = (vector unsigned char) vec_ld(0, dstp);
  1266                 vdstalpha = vec_and(vd, valphamask);
  1267 
  1268                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1269 
  1270                 /* set the alpha to the dest alpha */
  1271                 vd = vec_and(vd, vpixelmask);
  1272                 vd = vec_or(vd, vdstalpha);
  1273 
  1274                 /* *dstp = res */
  1275                 vec_st((vector unsigned int) vd, 0, dstp);
  1276 
  1277                 srcp += 4;
  1278                 dstp += 4;
  1279                 width -= 4;
  1280                 vs = voverflow;
  1281             }
  1282             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1283         }
  1284         srcp += srcskip;
  1285         dstp += dstskip;
  1286     }
  1287 #undef ONE_PIXEL_BLEND
  1288 }
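
/*
 * The scalar ONE_PIXEL_BLEND used above is the classic two-channels-per-
 * multiply trick: s & 0x00ff00ff holds red and blue side by side, so a
 * single (d1 + ((s1 - d1) * alpha >> 8)) & 0x00ff00ff interpolates both
 * fields at once, while green is blended the same way on its own in the
 * 0x0000ff00 field.  The destination alpha byte is saved up front and OR'd
 * back into the stored result.
 */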
  1289 
  1290 static void
  1291 Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info)
  1292 {
  1293     /* XXX : 6 */
  1294     unsigned alpha = info->src->alpha;
  1295     int height = info->d_height;
  1296     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1297     int srcskip = info->s_skip >> 2;
  1298     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1299     int dstskip = info->d_skip >> 2;
  1300     SDL_PixelFormat *srcfmt = info->src;
  1301     SDL_PixelFormat *dstfmt = info->dst;
  1302     unsigned sA = srcfmt->alpha;
  1303     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1304     vector unsigned char mergePermute;
  1305     vector unsigned char vsrcPermute;
  1306     vector unsigned char vdstPermute;
  1307     vector unsigned char vsdstPermute;
  1308     vector unsigned char valpha;
  1309     vector unsigned char valphamask;
  1310     vector unsigned char vbits;
  1311     vector unsigned short v1;
  1312     vector unsigned short v8;
  1313 
  1314     mergePermute = VEC_MERGE_PERMUTE();
  1315     v1 = vec_splat_u16(1);
  1316     v8 = vec_splat_u16(8);
  1317 
  1318     /* set the alpha to 255 on the destination surf */
  1319     valphamask = VEC_ALPHA_MASK();
  1320 
  1321     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1322     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1323     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1324 
  1325     /* set a vector full of alpha */
  1326     ((unsigned char *) &valpha)[0] = alpha;
  1327     valpha = vec_splat(valpha, 0);
  1328     vbits = (vector unsigned char) vec_splat_s8(-1);
  1329 
  1330     while (height--) {
  1331         int width = info->d_width;
  1332 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1333             Uint32 Pixel; \
  1334             unsigned sR, sG, sB, dR, dG, dB; \
  1335             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  1336             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1337             ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1338             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1339             ++srcp; \
  1340             ++dstp; \
  1341             widthvar--; \
  1342         }
  1343         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1344         if (width > 0) {
  1345             int extrawidth = (width % 4);
  1346             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1347             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1348             width -= extrawidth;
  1349             while (width) {
  1350                 vector unsigned char voverflow;
  1351                 vector unsigned char vd;
  1352 
  1353                 /* s = *srcp */
  1354                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1355                 vs = vec_perm(vs, voverflow, valigner);
  1356                 vs = vec_perm(vs, valpha, vsrcPermute);
  1357 
  1358                 /* d = *dstp */
  1359                 vd = (vector unsigned char) vec_ld(0, dstp);
  1360                 vd = vec_perm(vd, vd, vsdstPermute);
  1361 
  1362                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1363 
  1364                 /* set the alpha channel to full on */
  1365                 vd = vec_or(vd, valphamask);
  1366                 vd = vec_perm(vd, vbits, vdstPermute);
  1367 
  1368                 /* *dstp = res */
  1369                 vec_st((vector unsigned int) vd, 0, dstp);
  1370 
  1371                 srcp += 4;
  1372                 dstp += 4;
  1373                 width -= 4;
  1374                 vs = voverflow;
  1375             }
  1376             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1377         }
  1378 #undef ONE_PIXEL_BLEND
  1379 
  1380         srcp += srcskip;
  1381         dstp += dstskip;
  1382     }
  1383 
  1384 }
  1385 
  1386 
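/*
 * The 0xff00ff-style scalar loops in this file (e.g. the ONE_PIXEL_BLEND
 * fallbacks above and in BlitRGBtoRGBSurfaceAlphaAltivec below) are packed
 * forms of one simple per-channel step.  A standalone sketch of that step,
 * for reference (illustration only, not used by the blitters):
 */
#if 0
static unsigned
blend_one_channel(unsigned s, unsigned d, unsigned alpha)
{
    /* d + alpha * (s - d) / 256, using the same >>8 approximation of /255
       as the loops in this file; the final mask keeps only the 8 result
       bits, which is what makes the unsigned wrap-around harmless */
    return (d + ((s - d) * alpha >> 8)) & 0xff;
}
#endif
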
  1387 /* fast RGB888->(A)RGB888 blending */
  1388 static void
  1389 BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
  1390 {
  1391     unsigned alpha = info->src->alpha;
  1392     int height = info->d_height;
  1393     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1394     int srcskip = info->s_skip >> 2;
  1395     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1396     int dstskip = info->d_skip >> 2;
  1397     vector unsigned char mergePermute;
  1398     vector unsigned char valpha;
  1399     vector unsigned char valphamask;
  1400     vector unsigned short v1;
  1401     vector unsigned short v8;
  1402 
  1403     mergePermute = VEC_MERGE_PERMUTE();
  1404     v1 = vec_splat_u16(1);
  1405     v8 = vec_splat_u16(8);
  1406 
  1407     /* set the alpha to 255 on the destination surf */
  1408     valphamask = VEC_ALPHA_MASK();
  1409 
  1410     /* set a vector full of alpha */
  1411     ((unsigned char *) &valpha)[0] = alpha;
  1412     valpha = vec_splat(valpha, 0);
  1413 
  1414     while (height--) {
  1415         int width = info->d_width;
  1416 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1417             Uint32 s = *srcp; \
  1418             Uint32 d = *dstp; \
  1419             Uint32 s1 = s & 0xff00ff; \
  1420             Uint32 d1 = d & 0xff00ff; \
  1421             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  1422                  & 0xff00ff; \
  1423             s &= 0xff00; \
  1424             d &= 0xff00; \
  1425             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1426             *dstp = d1 | d | 0xff000000; \
  1427             ++srcp; \
  1428             ++dstp; \
  1429             widthvar--; \
  1430         }
  1431         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1432         if (width > 0) {
  1433             int extrawidth = (width % 4);
  1434             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1435             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1436             width -= extrawidth;
  1437             while (width) {
  1438                 vector unsigned char voverflow;
  1439                 vector unsigned char vd;
  1440 
  1441                 /* s = *srcp */
  1442                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1443                 vs = vec_perm(vs, voverflow, valigner);
  1444 
  1445                 /* d = *dstp */
  1446                 vd = (vector unsigned char) vec_ld(0, dstp);
  1447 
  1448                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1449 
  1450                 /* set the alpha channel to full on */
  1451                 vd = vec_or(vd, valphamask);
  1452 
  1453                 /* *dstp = res */
  1454                 vec_st((vector unsigned int) vd, 0, dstp);
  1455 
  1456                 srcp += 4;
  1457                 dstp += 4;
  1458                 width -= 4;
  1459                 vs = voverflow;
  1460             }
  1461             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1462         }
  1463 #undef ONE_PIXEL_BLEND
  1464 
  1465         srcp += srcskip;
  1466         dstp += dstskip;
  1467     }
  1468 }
  1469 
  1470 #if __MWERKS__
  1471 #pragma altivec_model off
  1472 #endif
  1473 #endif /* SDL_ALTIVEC_BLITTERS */
  1474 
  1475 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1476 static void
  1477 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
  1478 {
  1479     int width = info->d_width;
  1480     int height = info->d_height;
  1481     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1482     int srcskip = info->s_skip >> 2;
  1483     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1484     int dstskip = info->d_skip >> 2;
  1485 
  1486     while (height--) {
  1487 	    /* *INDENT-OFF* */
  1488 	    DUFFS_LOOP4({
  1489 		    Uint32 s = *srcp++;
  1490 		    Uint32 d = *dstp;
  1491 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1492 			       + (s & d & 0x00010101)) | 0xff000000;
  1493 	    }, width);
  1494 	    /* *INDENT-ON* */
  1495         srcp += srcskip;
  1496         dstp += dstskip;
  1497     }
  1498 }
  1499 
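/*
 * The 50% case above averages all three channels of two ARGB8888 pixels at
 * once: masking with 0x00fefefe clears the low bit of every byte so the add
 * cannot carry across channel boundaries, the >>1 then halves every channel
 * in parallel, and (s & d & 0x00010101) restores the 1 the truncation loses
 * when both low bits were set.  For a single byte, e.g. s = 0x0f, d = 0x0d:
 *
 *     ((0x0e + 0x0c) >> 1) + (0x0f & 0x0d & 0x01) = 0x0d + 1 = 0x0e
 *
 * which is exactly (0x0f + 0x0d) / 2.
 */
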
  1500 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1501 static void
  1502 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
  1503 {
  1504     unsigned alpha = info->src->alpha;
  1505     if (alpha == 128) {
  1506         BlitRGBtoRGBSurfaceAlpha128(info);
  1507     } else {
  1508         int width = info->d_width;
  1509         int height = info->d_height;
  1510         Uint32 *srcp = (Uint32 *) info->s_pixels;
  1511         int srcskip = info->s_skip >> 2;
  1512         Uint32 *dstp = (Uint32 *) info->d_pixels;
  1513         int dstskip = info->d_skip >> 2;
  1514         Uint32 s;
  1515         Uint32 d;
  1516         Uint32 s1;
  1517         Uint32 d1;
  1518 
  1519         while (height--) {
  1520 			/* *INDENT-OFF* */
  1521 			DUFFS_LOOP_DOUBLE2({
  1522 				/* One Pixel Blend */
  1523 				s = *srcp;
  1524 				d = *dstp;
  1525 				s1 = s & 0xff00ff;
  1526 				d1 = d & 0xff00ff;
  1527 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
  1528 				     & 0xff00ff;
  1529 				s &= 0xff00;
  1530 				d &= 0xff00;
  1531 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1532 				*dstp = d1 | d | 0xff000000;
  1533 				++srcp;
  1534 				++dstp;
  1535 			},{
  1536 			        /* Two Pixels Blend */
  1537 				s = *srcp;
  1538 				d = *dstp;
  1539 				s1 = s & 0xff00ff;
  1540 				d1 = d & 0xff00ff;
  1541 				d1 += (s1 - d1) * alpha >> 8;
  1542 				d1 &= 0xff00ff;
  1543 				     
  1544 				s = ((s & 0xff00) >> 8) | 
  1545 					((srcp[1] & 0xff00) << 8);
  1546 				d = ((d & 0xff00) >> 8) |
  1547 					((dstp[1] & 0xff00) << 8);
  1548 				d += (s - d) * alpha >> 8;
  1549 				d &= 0x00ff00ff;
  1550 				
  1551 				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
  1552 				++srcp;
  1553 				
  1554 			        s1 = *srcp;
  1555 				d1 = *dstp;
  1556 				s1 &= 0xff00ff;
  1557 				d1 &= 0xff00ff;
  1558 				d1 += (s1 - d1) * alpha >> 8;
  1559 				d1 &= 0xff00ff;
  1560 				
  1561 				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
  1562 				++srcp;
  1563 				++dstp;
  1564 			}, width);
  1565 			/* *INDENT-ON* */
  1566             srcp += srcskip;
  1567             dstp += dstskip;
  1568         }
  1569     }
  1570 }
  1571 
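/*
 * The 0xff00ff / 0xff00 split used above (and in several blitters below)
 * blends red and blue with a single multiply: s & 0xff00ff keeps R in bits
 * 16..23 and B in bits 0..7 with a zero byte between them, a byte difference
 * times an 8-bit alpha stays below bit 16, and whatever it leaves in bits
 * 8..15 is discarded by the final & 0xff00ff.  Green gets the same treatment
 * in its own 0xff00 field, and the two-pixel branch goes one step further by
 * packing the greens of two neighbouring pixels into bits 0..7 and 16..23 of
 * one word so that their blend also costs a single multiply.  Pulled out of
 * the packed loop, the red/blue step looks like this (illustration only):
 */
#if 0
static Uint32
blend_red_blue_packed(Uint32 s, Uint32 d, unsigned alpha)
{
    Uint32 s1 = s & 0x00ff00ff;
    Uint32 d1 = d & 0x00ff00ff;
    /* same expression as the loops above: both fields in one multiply */
    return (d1 + ((s1 - d1) * alpha >> 8)) & 0x00ff00ff;
}
#endif
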
  1572 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1573 static void
  1574 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
  1575 {
  1576     int width = info->d_width;
  1577     int height = info->d_height;
  1578     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1579     int srcskip = info->s_skip >> 2;
  1580     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1581     int dstskip = info->d_skip >> 2;
  1582 
  1583     while (height--) {
  1584 	    /* *INDENT-OFF* */
  1585 	    DUFFS_LOOP4({
  1586 		Uint32 dalpha;
  1587 		Uint32 d;
  1588 		Uint32 s1;
  1589 		Uint32 d1;
  1590 		Uint32 s = *srcp;
  1591 		Uint32 alpha = s >> 24;
  1592 		/* FIXME: Here we special-case opaque alpha since the
  1593 		   compositing used (>>8 instead of /255) doesn't handle
  1594 		   it correctly. Also special-case alpha=0 for speed?
  1595 		   Benchmark this! */
  1596 		if(alpha) {   
  1597 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1598 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1599 		  } else {
  1600 		    /*
  1601 		     * take out the middle component (green), and process
  1602 		     * the other two in parallel. One multiply less.
  1603 		     */
  1604 		    d = *dstp;
  1605 		    dalpha = d & 0xff000000;
  1606 		    s1 = s & 0xff00ff;
  1607 		    d1 = d & 0xff00ff;
  1608 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1609 		    s &= 0xff00;
  1610 		    d &= 0xff00;
  1611 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1612 		    *dstp = d1 | d | dalpha;
  1613 		  }
  1614 		}
  1615 		++srcp;
  1616 		++dstp;
  1617 	    }, width);
  1618 	    /* *INDENT-ON* */
  1619         srcp += srcskip;
  1620         dstp += dstskip;
  1621     }
  1622 }
  1623 
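/*
 * Why the pixel-alpha loop above special-cases SDL_ALPHA_OPAQUE: with the
 * cheap >>8 compositing an opaque source would still be attenuated, e.g.
 *
 *     s = 255, d = 0, alpha = 255:  d + ((s - d) * alpha >> 8) = 254
 *
 * so fully opaque pixels are copied outright instead.  The alpha == 0 test
 * is purely a speed win; the formula already leaves the destination alone
 * in that case.
 */
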
  1624 #if GCC_ASMBLIT
  1625 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1626 static void
  1627 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1628 {
  1629     int width = info->d_width;
  1630     int height = info->d_height;
  1631     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1632     int srcskip = info->s_skip >> 2;
  1633     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1634     int dstskip = info->d_skip >> 2;
  1635     SDL_PixelFormat *sf = info->src;
  1636     Uint32 amask = sf->Amask;
  1637 
  1638     __asm__(
  1639                /* make mm6 all zeros. */
  1640                "pxor       %%mm6, %%mm6\n"
  1641                /* Make a mask to preserve the alpha. */
  1642                "movd      %0, %%mm7\n\t"        /* 0000F000 -> mm7 */
  1643                "punpcklbw %%mm7, %%mm7\n\t"     /* FF000000 -> mm7 */
  1644                "pcmpeqb   %%mm4, %%mm4\n\t"     /* FFFFFFFF -> mm4 */
  1645                "movq      %%mm4, %%mm3\n\t"     /* FFFFFFFF -> mm3 (for later) */
  1646                "pxor      %%mm4, %%mm7\n\t"     /* 00FFFFFF -> mm7 (mult mask) */
  1647                /* form channel masks */
  1648                "movq      %%mm7, %%mm4\n\t"     /* 00FFFFFF -> mm4 */
  1649                "packsswb  %%mm6, %%mm4\n\t"     /* 00000FFF -> mm4 (channel mask) */
  1650                "packsswb  %%mm6, %%mm3\n\t"     /* 0000FFFF -> mm3 */
  1651                "pxor      %%mm4, %%mm3\n\t"     /* 0000F000 -> mm3 (~channel mask) */
  1652                /* get alpha channel shift */
  1653                "movd      %1, %%mm5\n\t"        /* Ashift -> mm5 */
  1654   : /* nothing */ :            "rm"(amask), "rm"((Uint32) sf->Ashift));
  1655 
  1656     while (height--) {
  1657 
  1658 	    /* *INDENT-OFF* */
  1659 	    DUFFS_LOOP4({
  1660 		Uint32 alpha;
  1661 
  1662 		__asm__ (
  1663 		"prefetch 64(%0)\n"
  1664 		"prefetch 64(%1)\n"
  1665 			: : "r" (srcp), "r" (dstp) );
  1666 
  1667 		alpha = *srcp & amask;
  1668 		/* FIXME: Here we special-case opaque alpha since the
  1669 		   compositing used (>>8 instead of /255) doesn't handle
  1670 		   it correctly. Also special-case alpha=0 for speed?
  1671 		   Benchmark this! */
  1672 		if(alpha == 0) {
  1673 		    /* do nothing */
  1674 		}
  1675 		else if(alpha == amask) {
  1676 			/* opaque alpha -- copy RGB, keep dst alpha */
  1677 		    /* using MMX here to free up regular registers for other things */
  1678 			    __asm__ (
  1679 		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
  1680 		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
  1681 		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
  1682 		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
  1683 		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
  1684 		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
  1685 
  1686 		     : : "r" (srcp), "r" (dstp) );
  1687 		} 
  1688 
  1689 		else {
  1690 			    __asm__ (
  1691 		    /* load in the source, and dst. */
  1692 		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
  1693 		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
  1694 
  1695 		    /* Move the src alpha into mm2 */
  1696 
  1697 		    /* if supporting pshufw */
  1698 		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
  1699 		    /*"psrlw     $8, %%mm2\n" */
  1700 		    
  1701 		    /* else: */
  1702 		    "movd       %2,    %%mm2\n"
  1703 		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
  1704 		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
  1705 		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
  1706 		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
  1707 
  1708 		    /* move the colors into words. */
  1709 		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
  1710 		    "punpcklbw %%mm6, %%mm1\n"              /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
  1711 
  1712 		    /* src - dst */
  1713 		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
  1714 
  1715 		    /* A * (src-dst) */
  1716 		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
  1717 		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
  1718 		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
  1719 
  1720 		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
  1721 		    
  1722 		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
  1723 
  1724 		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
  1725 
  1726 		}
  1727 		++srcp;
  1728 		++dstp;
  1729 	    }, width);
  1730 	    /* *INDENT-ON* */
  1731         srcp += srcskip;
  1732         dstp += dstskip;
  1733     }
  1734 
  1735   __asm__("emms\n":);
  1736 }
  1737 
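/*
 * How the inline asm above preserves the destination alpha: the replicated
 * source alpha in mm2 is ANDed with the "mult mask" (which zeroes its alpha
 * word), so for that word the blend collapses to
 *
 *     Ad + ((As - Ad) * 0 >> 8) = Ad
 *
 * while the three colour words get the usual d + ((s - d) * As >> 8).  The
 * MSVC intrinsic version below does the same thing with its dmask constant.
 */
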
  1738 /* End GCC_ASMBLIT */
  1739 
  1740 #elif MSVC_ASMBLIT
  1741 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1742 static void
  1743 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1744 {
  1745     int width = info->d_width;
  1746     int height = info->d_height;
  1747     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1748     int srcskip = info->s_skip >> 2;
  1749     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1750     int dstskip = info->d_skip >> 2;
  1751     SDL_PixelFormat *sf = info->src;
  1752     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  1753     Uint32 amask = sf->Amask;
  1754     Uint32 ashift = sf->Ashift;
  1755     Uint64 multmask;
  1756 
  1757     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1758 
  1759     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
  1760 	/* *INDENT-OFF* */
  1761     multmask = ~(0xFFFFI64 << (ashift * 2));
  1762 	/* *INDENT-ON* */
  1763     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
  1764 
  1765     while (height--) {
  1766 	    /* *INDENT-OFF* */
  1767 	    DUFFS_LOOP4({
  1768 		Uint32 alpha;
  1769 
  1770 		_m_prefetch(srcp + 16);
  1771 		_m_prefetch(dstp + 16);
  1772 
  1773 		alpha = *srcp & amask;
  1774 		if (alpha == 0) {
  1775 			/* do nothing */
  1776 		} else if (alpha == amask) {
  1777 			/* copy RGB, keep dst alpha */
  1778 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  1779 		} else {
  1780 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  1781 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  1782 
  1783 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  1784 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  1785 
  1786 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  1787 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  1788 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  1789 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  1790 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  1791 
  1792 			/* blend */		    
  1793 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  1794 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  1795 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  1796 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  1797 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  1798 			
  1799 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  1800 		}
  1801 		++srcp;
  1802 		++dstp;
  1803 	    }, width);
  1804 	    /* *INDENT-ON* */
  1805         srcp += srcskip;
  1806         dstp += dstskip;
  1807     }
  1808     _mm_empty();
  1809 }
  1810 
  1811 /* End MSVC_ASMBLIT */
  1812 
  1813 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  1814 
  1815 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  1816 
  1817 /* blend a single 16 bit pixel at 50% */
  1818 #define BLEND16_50(d, s, mask)						\
  1819 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
  1820 
  1821 /* blend two 16 bit pixels at 50% */
  1822 #define BLEND2x16_50(d, s, mask)					     \
  1823 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1824 	 + (s & d & (~(mask | mask << 16))))
  1825 
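/*
 * The masks passed to these macros clear the lowest bit of every colour
 * field so the add in BLEND16_50 cannot carry from one field into the next:
 * 0xf7de for 5-6-5 (drops bits 0, 5 and 11), 0xfbde for 5-5-5 (drops bits
 * 0, 5 and 10).  The (s & d & ~mask) term then restores the rounding bit
 * each field lost to the >>1, just as the 0x00fefefe / 0x00010101 pair does
 * for the 32-bit case above.
 */
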
  1826 static void
  1827 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
  1828 {
  1829     int width = info->d_width;
  1830     int height = info->d_height;
  1831     Uint16 *srcp = (Uint16 *) info->s_pixels;
  1832     int srcskip = info->s_skip >> 1;
  1833     Uint16 *dstp = (Uint16 *) info->d_pixels;
  1834     int dstskip = info->d_skip >> 1;
  1835 
  1836     while (height--) {
  1837         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
  1838             /*
  1839              * Source and destination not aligned, pipeline it.
  1840              * This is mostly a win for big blits but no loss for
  1841              * small ones
  1842              */
  1843             Uint32 prev_sw;
  1844             int w = width;
  1845 
  1846             /* handle odd destination */
  1847             if ((uintptr_t) dstp & 2) {
  1848                 Uint16 d = *dstp, s = *srcp;
  1849                 *dstp = BLEND16_50(d, s, mask);
  1850                 dstp++;
  1851                 srcp++;
  1852                 w--;
  1853             }
  1854             srcp++;             /* srcp is now 32-bit aligned */
  1855 
  1856             /* bootstrap pipeline with first halfword */
  1857             prev_sw = ((Uint32 *) srcp)[-1];
  1858 
  1859             while (w > 1) {
  1860                 Uint32 sw, dw, s;
  1861                 sw = *(Uint32 *) srcp;
  1862                 dw = *(Uint32 *) dstp;
  1863 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1864                 s = (prev_sw << 16) + (sw >> 16);
  1865 #else
  1866                 s = (prev_sw >> 16) + (sw << 16);
  1867 #endif
  1868                 prev_sw = sw;
  1869                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
  1870                 dstp += 2;
  1871                 srcp += 2;
  1872                 w -= 2;
  1873             }
  1874 
  1875             /* final pixel if any */
  1876             if (w) {
  1877                 Uint16 d = *dstp, s;
  1878 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1879                 s = (Uint16) prev_sw;
  1880 #else
  1881                 s = (Uint16) (prev_sw >> 16);
  1882 #endif
  1883                 *dstp = BLEND16_50(d, s, mask);
  1884                 srcp++;
  1885                 dstp++;
  1886             }
  1887             srcp += srcskip - 1;
  1888             dstp += dstskip;
  1889         } else {
  1890             /* source and destination are aligned */
  1891             int w = width;
  1892 
  1893             /* first odd pixel? */
  1894             if ((uintptr_t) srcp & 2) {
  1895                 Uint16 d = *dstp, s = *srcp;
  1896                 *dstp = BLEND16_50(d, s, mask);
  1897                 srcp++;
  1898                 dstp++;
  1899                 w--;
  1900             }
  1901             /* srcp and dstp are now 32-bit aligned */
  1902 
  1903             while (w > 1) {
  1904                 Uint32 sw = *(Uint32 *) srcp;
  1905                 Uint32 dw = *(Uint32 *) dstp;
  1906                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
  1907                 srcp += 2;
  1908                 dstp += 2;
  1909                 w -= 2;
  1910             }
  1911 
  1912             /* last odd pixel? */
  1913             if (w) {
  1914                 Uint16 d = *dstp, s = *srcp;
  1915                 *dstp = BLEND16_50(d, s, mask);
  1916                 srcp++;
  1917                 dstp++;
  1918             }
  1919             srcp += srcskip;
  1920             dstp += dstskip;
  1921         }
  1922     }
  1923 }
  1924 
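/*
 * The "pipeline" in the unaligned branch above: when srcp and dstp differ
 * by two bytes, the pair of source pixels belonging to one aligned
 * destination word always straddles two aligned source words.  prev_sw
 * keeps the leftover halfword of the previous load, so each iteration needs
 * only one new aligned 32-bit load, a shift/or to reassemble the pair, and
 * one aligned store, instead of four separate 16-bit accesses.
 */
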
  1925 #if GCC_ASMBLIT
  1926 /* fast RGB565->RGB565 blending with surface alpha */
  1927 static void
  1928 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  1929 {
  1930     unsigned alpha = info->src->alpha;
  1931     if (alpha == 128) {
  1932         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1933     } else {
  1934         int width = info->d_width;
  1935         int height = info->d_height;
  1936         Uint16 *srcp = (Uint16 *) info->s_pixels;
  1937         int srcskip = info->s_skip >> 1;
  1938         Uint16 *dstp = (Uint16 *) info->d_pixels;
  1939         int dstskip = info->d_skip >> 1;
  1940         Uint32 s, d;
  1941         Uint8 load[8];
  1942 
  1943         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the MMX path matches the 5-bit (alpha >> 3) scalar blend exactly */
  1944         *(Uint64 *) load = alpha;
  1945         alpha >>= 3;            /* downscale alpha to 5 bits */
  1946 
  1947         movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
  1948         punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
  1949         punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
  1950         /* position alpha to allow for mullo and mulhi on diff channels
  1951            to reduce the number of operations */
  1952         psllq_i2r(3, mm0);
  1953 
  1954         /* Setup the 565 color channel masks */
  1955         *(Uint64 *) load = 0x07E007E007E007E0ULL;
  1956         movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
  1957         *(Uint64 *) load = 0x001F001F001F001FULL;
  1958         movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
  1959         while (height--) {
  1960 			/* *INDENT-OFF* */
  1961 			DUFFS_LOOP_QUATRO2(
  1962 			{
  1963 				s = *srcp++;
  1964 				d = *dstp;
  1965 				/*
  1966 				 * shift out the middle component (green) to
  1967 				 * the high 16 bits, and process all three RGB
  1968 				 * components at the same time.
  1969 				 */
  1970 				s = (s | s << 16) & 0x07e0f81f;
  1971 				d = (d | d << 16) & 0x07e0f81f;
  1972 				d += (s - d) * alpha >> 5;
  1973 				d &= 0x07e0f81f;
  1974 				*dstp++ = d | d >> 16;
  1975 			},{
  1976 				s = *srcp++;
  1977 				d = *dstp;
  1978 				/*
  1979 				 * shift out the middle component (green) to
  1980 				 * the high 16 bits, and process all three RGB
  1981 				 * components at the same time.
  1982 				 */
  1983 				s = (s | s << 16) & 0x07e0f81f;
  1984 				d = (d | d << 16) & 0x07e0f81f;
  1985 				d += (s - d) * alpha >> 5;
  1986 				d &= 0x07e0f81f;
  1987 				*dstp++ = d | d >> 16;
  1988 				s = *srcp++;
  1989 				d = *dstp;
  1990 				/*
  1991 				 * shift out the middle component (green) to
  1992 				 * the high 16 bits, and process all three RGB
  1993 				 * components at the same time.
  1994 				 */
  1995 				s = (s | s << 16) & 0x07e0f81f;
  1996 				d = (d | d << 16) & 0x07e0f81f;
  1997 				d += (s - d) * alpha >> 5;
  1998 				d &= 0x07e0f81f;
  1999 				*dstp++ = d | d >> 16;
  2000 			},{
  2001 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2002 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2003 
  2004 				/* red -- does not need a mask since the right shift clears
  2005 				   the uninteresting bits */
  2006 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2007 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2008 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
  2009 				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
  2010 
  2011 				/* blend */
  2012 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2013 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2014 				/* the alpha used is actually 11 bits;
  2015 				   11 + 5 = 16 bits, so the sign bits are lost */
  2016 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2017 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2018 				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
  2019 
  2020 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2021 
  2022 				/* green -- process the bits in place */
  2023 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2024 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2025 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2026 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2027 
  2028 				/* blend */
  2029 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2030 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2031 				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
  2032 				   bits are gone and the sign bits are still present */
  2033 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2034 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2035 
  2036 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2037 
  2038 				/* blue */
  2039 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2040 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2041 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2042 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2043 
  2044 				/* blend */
  2045 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2046 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2047 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2048 				   the interesting bits will need to be MASKed */
  2049 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2050 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2051 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2052 
  2053 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2054 
  2055 				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
  2056 
  2057 				srcp += 4;
  2058 				dstp += 4;
  2059 			}, width);			
  2060 			/* *INDENT-ON* */
  2061             srcp += srcskip;
  2062             dstp += dstskip;
  2063         }
  2064         emms();
  2065     }
  2066 }
  2067 
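/*
 * Why the alpha register is shifted left by 3 in the setup above: with the
 * low three bits already cleared, every 16-bit lane of mm0 holds the 5-bit
 * alpha scaled by 64.  Fields handled in the low bits (red after its >>11,
 * and blue) can then be blended with pmullw followed by a logical >>11,
 * while green is blended in place with pmulhw (the high 16 bits of the
 * product) followed by <<5.  Both variants come out to the same
 * (s - d) * alpha >> 5 per-field scaling as the plain C blitters, with the
 * trailing shift or mask discarding whatever falls outside the field.  The
 * 555 and MSVC versions below use the same arrangement.
 */
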
  2068 /* fast RGB555->RGB555 blending with surface alpha */
  2069 static void
  2070 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  2071 {
  2072     unsigned alpha = info->src->alpha;
  2073     if (alpha == 128) {
  2074         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2075     } else {
  2076         int width = info->d_width;
  2077         int height = info->d_height;
  2078         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2079         int srcskip = info->s_skip >> 1;
  2080         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2081         int dstskip = info->d_skip >> 1;
  2082         Uint32 s, d;
  2083         Uint8 load[8];
  2084 
  2085         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the MMX path matches the 5-bit (alpha >> 3) scalar blend exactly */
  2086         *(Uint64 *) load = alpha;
  2087         alpha >>= 3;            /* downscale alpha to 5 bits */
  2088 
  2089         movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
  2090         punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
  2091         punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
  2092         /* position alpha to allow for mullo and mulhi on diff channels
  2093            to reduce the number of operations */
  2094         psllq_i2r(3, mm0);
  2095 
  2096         /* Setup the 555 color channel masks */
  2097         *(Uint64 *) load = 0x03E003E003E003E0ULL;
  2098         movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
  2099         *(Uint64 *) load = 0x001F001F001F001FULL;
  2100         movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
  2101         while (height--) {
  2102 			/* *INDENT-OFF* */
  2103 			DUFFS_LOOP_QUATRO2(
  2104 			{
  2105 				s = *srcp++;
  2106 				d = *dstp;
  2107 				/*
  2108 				 * shift out the middle component (green) to
  2109 				 * the high 16 bits, and process all three RGB
  2110 				 * components at the same time.
  2111 				 */
  2112 				s = (s | s << 16) & 0x03e07c1f;
  2113 				d = (d | d << 16) & 0x03e07c1f;
  2114 				d += (s - d) * alpha >> 5;
  2115 				d &= 0x03e07c1f;
  2116 				*dstp++ = d | d >> 16;
  2117 			},{
  2118 				s = *srcp++;
  2119 				d = *dstp;
  2120 				/*
  2121 				 * shift out the middle component (green) to
  2122 				 * the high 16 bits, and process all three RGB
  2123 				 * components at the same time.
  2124 				 */
  2125 				s = (s | s << 16) & 0x03e07c1f;
  2126 				d = (d | d << 16) & 0x03e07c1f;
  2127 				d += (s - d) * alpha >> 5;
  2128 				d &= 0x03e07c1f;
  2129 				*dstp++ = d | d >> 16;
  2130 			        s = *srcp++;
  2131 				d = *dstp;
  2132 				/*
  2133 				 * shift out the middle component (green) to
  2134 				 * the high 16 bits, and process all three RGB
  2135 				 * components at the same time.
  2136 				 */
  2137 				s = (s | s << 16) & 0x03e07c1f;
  2138 				d = (d | d << 16) & 0x03e07c1f;
  2139 				d += (s - d) * alpha >> 5;
  2140 				d &= 0x03e07c1f;
  2141 				*dstp++ = d | d >> 16;
  2142 			},{
  2143 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2144 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2145 
  2146 				/* red -- process the bits in place */
  2147 				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
  2148 					/* by reusing the GREEN mask we free up another mmx
  2149 					   register to accumulate the result */
  2150 
  2151 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2152 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2153 				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
  2154 				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
  2155 
  2156 				/* blend */
  2157 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2158 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2159 				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
  2160 				   cleared by a MASK below */
  2161 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2162 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2163 				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
  2164 
  2165 				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
  2166 
  2167 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2168 
  2169 				/* green -- process the bits in place */
  2170 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2171 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2172 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2173 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2174 
  2175 				/* blend */
  2176 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2177 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2178 				/* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
  2179 				   bits are gone and the sign bits are still present */
  2180 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2181 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2182 
  2183 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2184 
  2185 				/* blue */
  2186 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2187 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2188 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2189 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2190 
  2191 				/* blend */
  2192 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2193 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2194 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2195 				   the interesting bits will need to be MASKed */
  2196 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2197 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2198 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2199 
  2200 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2201 
  2202 				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
  2203 
  2204 				srcp += 4;
  2205 				dstp += 4;
  2206 			}, width);
  2207 			/* *INDENT-ON* */
  2208             srcp += srcskip;
  2209             dstp += dstskip;
  2210         }
  2211         emms();
  2212     }
  2213 }
  2214 
  2215 /* End GCC_ASMBLIT */
  2216 
  2217 #elif MSVC_ASMBLIT
  2218 /* fast RGB565->RGB565 blending with surface alpha */
  2219 static void
  2220 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  2221 {
  2222     unsigned alpha = info->src->alpha;
  2223     if (alpha == 128) {
  2224         Blit16to16SurfaceAlpha128(info, 0xf7de);
  2225     } else {
  2226         int width = info->d_width;
  2227         int height = info->d_height;
  2228         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2229         int srcskip = info->s_skip >> 1;
  2230         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2231         int dstskip = info->d_skip >> 1;
  2232         Uint32 s, d;
  2233 
  2234         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  2235 
  2236         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the MMX path matches the 5-bit (alpha >> 3) scalar blend exactly */
  2237         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  2238         alpha >>= 3;            /* downscale alpha to 5 bits */
  2239 
  2240         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  2241         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  2242         /* position alpha to allow for mullo and mulhi on diff channels
  2243            to reduce the number of operations */
  2244         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2245 
  2246         /* Setup the 565 color channel masks */
  2247         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
  2248         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  2249 
  2250         while (height--) {
  2251 			/* *INDENT-OFF* */
  2252 			DUFFS_LOOP_QUATRO2(
  2253 			{
  2254 				s = *srcp++;
  2255 				d = *dstp;
  2256 				/*
  2257 				 * shift out the middle component (green) to
  2258 				 * the high 16 bits, and process all three RGB
  2259 				 * components at the same time.
  2260 				 */
  2261 				s = (s | s << 16) & 0x07e0f81f;
  2262 				d = (d | d << 16) & 0x07e0f81f;
  2263 				d += (s - d) * alpha >> 5;
  2264 				d &= 0x07e0f81f;
  2265 				*dstp++ = (Uint16)(d | d >> 16);
  2266 			},{
  2267 				s = *srcp++;
  2268 				d = *dstp;
  2269 				/*
  2270 				 * shift out the middle component (green) to
  2271 				 * the high 16 bits, and process all three RGB
  2272 				 * components at the same time.
  2273 				 */
  2274 				s = (s | s << 16) & 0x07e0f81f;
  2275 				d = (d | d << 16) & 0x07e0f81f;
  2276 				d += (s - d) * alpha >> 5;
  2277 				d &= 0x07e0f81f;
  2278 				*dstp++ = (Uint16)(d | d >> 16);
  2279 				s = *srcp++;
  2280 				d = *dstp;
  2281 				/*
  2282 				 * shift out the middle component (green) to
  2283 				 * the high 16 bits, and process all three RGB
  2284 				 * components at the same time.
  2285 				 */
  2286 				s = (s | s << 16) & 0x07e0f81f;
  2287 				d = (d | d << 16) & 0x07e0f81f;
  2288 				d += (s - d) * alpha >> 5;
  2289 				d &= 0x07e0f81f;
  2290 				*dstp++ = (Uint16)(d | d >> 16);
  2291 			},{
  2292 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2293 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2294 
  2295 				/* red */
  2296 				src2 = src1;
  2297 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
  2298 
  2299 				dst2 = dst1;
  2300 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
  2301 
  2302 				/* blend */
  2303 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2304 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2305 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2306 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2307 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
  2308 
  2309 				mm_res = dst2; /* RED -> mm_res */
  2310 
  2311 				/* green -- process the bits in place */
  2312 				src2 = src1;
  2313 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2314 
  2315 				dst2 = dst1;
  2316 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2317 
  2318 				/* blend */
  2319 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2320 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2321 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2322 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2323 
  2324 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2325 
  2326 				/* blue */
  2327 				src2 = src1;
  2328 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2329 
  2330 				dst2 = dst1;
  2331 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2332 
  2333 				/* blend */
  2334 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2335 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2336 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2337 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2338 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2339 
  2340 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2341 
  2342 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2343 
  2344 				srcp += 4;
  2345 				dstp += 4;
  2346 			}, width);
  2347 			/* *INDENT-ON* */
  2348             srcp += srcskip;
  2349             dstp += dstskip;
  2350         }
  2351         _mm_empty();
  2352     }
  2353 }
  2354 
  2355 /* fast RGB555->RGB555 blending with surface alpha */
  2356 static void
  2357 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  2358 {
  2359     unsigned alpha = info->src->alpha;
  2360     if (alpha == 128) {
  2361         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2362     } else {
  2363         int width = info->d_width;
  2364         int height = info->d_height;
  2365         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2366         int srcskip = info->s_skip >> 1;
  2367         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2368         int dstskip = info->d_skip >> 1;
  2369         Uint32 s, d;
  2370 
  2371         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  2372 
  2373         alpha &= ~(1 + 2 + 4);  /* clear the low 3 bits so the MMX path matches the 5-bit (alpha >> 3) scalar blend exactly */
  2374         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  2375         alpha >>= 3;            /* downscale alpha to 5 bits */
  2376 
  2377         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  2378         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  2379         /* position alpha to allow for mullo and mulhi on diff channels
  2380            to reduce the number of operations */
  2381         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2382 
  2383         /* Setup the 555 color channel masks */
  2384         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
  2385         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
  2386         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  2387 
  2388         while (height--) {
  2389 			/* *INDENT-OFF* */
  2390 			DUFFS_LOOP_QUATRO2(
  2391 			{
  2392 				s = *srcp++;
  2393 				d = *dstp;
  2394 				/*
  2395 				 * shift out the middle component (green) to
  2396 				 * the high 16 bits, and process all three RGB
  2397 				 * components at the same time.
  2398 				 */
  2399 				s = (s | s << 16) & 0x03e07c1f;
  2400 				d = (d | d << 16) & 0x03e07c1f;
  2401 				d += (s - d) * alpha >> 5;
  2402 				d &= 0x03e07c1f;
  2403 				*dstp++ = (Uint16)(d | d >> 16);
  2404 			},{
  2405 				s = *srcp++;
  2406 				d = *dstp;
  2407 				/*
  2408 				 * shift out the middle component (green) to
  2409 				 * the high 16 bits, and process all three RGB
  2410 				 * components at the same time.
  2411 				 */
  2412 				s = (s | s << 16) & 0x03e07c1f;
  2413 				d = (d | d << 16) & 0x03e07c1f;
  2414 				d += (s - d) * alpha >> 5;
  2415 				d &= 0x03e07c1f;
  2416 				*dstp++ = (Uint16)(d | d >> 16);
  2417 			        s = *srcp++;
  2418 				d = *dstp;
  2419 				/*
  2420 				 * shift out the middle component (green) to
  2421 				 * the high 16 bits, and process all three RGB
  2422 				 * components at the same time.
  2423 				 */
  2424 				s = (s | s << 16) & 0x03e07c1f;
  2425 				d = (d | d << 16) & 0x03e07c1f;
  2426 				d += (s - d) * alpha >> 5;
  2427 				d &= 0x03e07c1f;
  2428 				*dstp++ = (Uint16)(d | d >> 16);
  2429 			},{
  2430 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2431 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2432 
  2433 				/* red -- process the bits in place */
  2434 				src2 = src1;
  2435 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
  2436 
  2437 				dst2 = dst1;
  2438 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
  2439 
  2440 				/* blend */
  2441 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2442 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2443 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2444 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2445 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
  2446 
  2447 				mm_res = dst2; /* RED -> mm_res */
  2448 				
  2449 				/* green -- process the bits in place */
  2450 				src2 = src1;
  2451 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2452 
  2453 				dst2 = dst1;
  2454 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2455 
  2456 				/* blend */
  2457 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2458 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2459 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2460 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2461 
  2462 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2463 
  2464 				/* blue */
  2465 				src2 = src1; /* src -> src2 */
  2466 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2467 
  2468 				dst2 = dst1; /* dst -> dst2 */
  2469 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2470 
  2471 				/* blend */
  2472 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2473 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2474 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2475 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2476 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2477 
  2478 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2479 
  2480 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2481 
  2482 				srcp += 4;
  2483 				dstp += 4;
  2484 			}, width);
  2485 			/* *INDENT-ON* */
  2486             srcp += srcskip;
  2487             dstp += dstskip;
  2488         }
  2489         _mm_empty();
  2490     }
  2491 }
  2492 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  2493 
  2494 /* fast RGB565->RGB565 blending with surface alpha */
  2495 static void
  2496 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
  2497 {
  2498     unsigned alpha = info->src->alpha;
  2499     if (alpha == 128) {
  2500         Blit16to16SurfaceAlpha128(info, 0xf7de);
  2501     } else {
  2502         int width = info->d_width;
  2503         int height = info->d_height;
  2504         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2505         int srcskip = info->s_skip >> 1;
  2506         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2507         int dstskip = info->d_skip >> 1;
  2508         alpha >>= 3;            /* downscale alpha to 5 bits */
  2509 
  2510         while (height--) {
  2511 			/* *INDENT-OFF* */
  2512 			DUFFS_LOOP4({
  2513 				Uint32 s = *srcp++;
  2514 				Uint32 d = *dstp;
  2515 				/*
  2516 				 * shift out the middle component (green) to
  2517 				 * the high 16 bits, and process all three RGB
  2518 				 * components at the same time.
  2519 				 */
  2520 				s = (s | s << 16) & 0x07e0f81f;
  2521 				d = (d | d << 16) & 0x07e0f81f;
  2522 				d += (s - d) * alpha >> 5;
  2523 				d &= 0x07e0f81f;
  2524 				*dstp++ = (Uint16)(d | d >> 16);
  2525 			}, width);
  2526 			/* *INDENT-ON* */
  2527             srcp += srcskip;
  2528             dstp += dstskip;
  2529         }
  2530     }
  2531 }
  2532 
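/*
 * The (s | s << 16) & 0x07e0f81f trick used above (and in the 555 variant
 * below with 0x03e07c1f) spreads one 16-bit pixel across a 32-bit word so
 * that all three fields are separated by zero padding: blue stays in bits
 * 0..4, red in bits 11..15, and green moves up to bits 21..26.  With a
 * 5-bit alpha, (s - d) * alpha >> 5 can then blend all three fields with a
 * single multiply without one field's product spilling into the next, and
 * d | d >> 16 folds the result back into a 16-bit pixel.  Pulled out into a
 * standalone helper it would look like this (illustration only):
 */
#if 0
static Uint16
blend565_packed(Uint16 sp, Uint16 dp, unsigned alpha5)
{
    Uint32 s = (sp | (Uint32) sp << 16) & 0x07e0f81f;
    Uint32 d = (dp | (Uint32) dp << 16) & 0x07e0f81f;
    d += (s - d) * alpha5 >> 5;         /* alpha5 is the 5-bit alpha */
    d &= 0x07e0f81f;
    return (Uint16) (d | d >> 16);
}
#endif
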
  2533 /* fast RGB555->RGB555 blending with surface alpha */
  2534 static void
  2535 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  2536 {
  2537     unsigned alpha = info->src->alpha;
  2538     if (alpha == 128) {
  2539         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2540     } else {
  2541         int width = info->d_width;
  2542         int height = info->d_height;
  2543         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2544         int srcskip = info->s_skip >> 1;
  2545         Uint16 *dstp = (Uint16 *) info->d_pixels;
  2546         int dstskip = info->d_skip >> 1;
  2547         alpha >>= 3;            /* downscale alpha to 5 bits */
  2548 
  2549         while (height--) {
  2550 			/* *INDENT-OFF* */
  2551 			DUFFS_LOOP4({
  2552 				Uint32 s = *srcp++;
  2553 				Uint32 d = *dstp;
  2554 				/*
  2555 				 * shift out the middle component (green) to
  2556 				 * the high 16 bits, and process all three RGB
  2557 				 * components at the same time.
  2558 				 */
  2559 				s = (s | s << 16) & 0x03e07c1f;
  2560 				d = (d | d << 16) & 0x03e07c1f;
  2561 				d += (s - d) * alpha >> 5;
  2562 				d &= 0x03e07c1f;
  2563 				*dstp++ = (Uint16)(d | d >> 16);
  2564 			}, width);
  2565 			/* *INDENT-ON* */
  2566             srcp += srcskip;
  2567             dstp += dstskip;
  2568         }
  2569     }
  2570 }
  2571 
  2572 /* fast ARGB8888->RGB565 blending with pixel alpha */
  2573 static void
  2574 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  2575 {
  2576     int width = info->d_width;
  2577     int height = info->d_height;
  2578     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2579     int srcskip = info->s_skip >> 2;
  2580     Uint16 *dstp = (Uint16 *) info->d_pixels;
  2581     int dstskip = info->d_skip >> 1;
  2582 
  2583     while (height--) {
  2584 	    /* *INDENT-OFF* */
  2585 	    DUFFS_LOOP4({
  2586 		Uint32 s = *srcp;
  2587 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  2588 		/* FIXME: Here we special-case opaque alpha since the
  2589 		   compositing used (>>8 instead of /255) doesn't handle
  2590 		   it correctly. Also special-case alpha=0 for speed?
  2591 		   Benchmark this! */
  2592 		if(alpha) {   
  2593 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2594 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  2595 		  } else {
  2596 		    Uint32 d = *dstp;
  2597 		    /*
  2598 		     * convert source and destination to G0RAB65565
  2599 		     * and blend all components at the same time
  2600 		     */
  2601 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  2602 		      + (s >> 3 & 0x1f);
  2603 		    d = (d | d << 16) & 0x07e0f81f;
  2604 		    d += (s - d) * alpha >> 5;
  2605 		    d &= 0x07e0f81f;
  2606 		    *dstp = (Uint16)(d | d >> 16);
  2607 		  }
  2608 		}
  2609 		srcp++;
  2610 		dstp++;
  2611 	    }, width);
  2612 	    /* *INDENT-ON* */
  2613         srcp += srcskip;
  2614         dstp += dstskip;
  2615     }
  2616 }
  2617 
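/*
 * The source conversion above goes straight from ARGB8888 to the spread
 * 0x07e0f81f layout used by the 16-bit blend: (s & 0xfc00) << 11 moves the
 * top six green bits to bits 21..26, (s >> 8) & 0xf800 drops the top five
 * red bits into bits 11..15, and (s >> 3) & 0x1f does the same for blue in
 * bits 0..4, so no intermediate RGB565 pixel ever has to be assembled.  The
 * 555 blitter below does the equivalent for its 0x03e07c1f layout.
 */
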
  2618 /* fast ARGB8888->RGB555 blending with pixel alpha */
  2619 static void
  2620 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  2621 {
  2622     int width = info->d_width;
  2623     int height = info->d_height;
  2624     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2625     int srcskip = info->s_skip >> 2;
  2626     Uint16 *dstp = (Uint16 *) info->d_pixels;
  2627     int dstskip = info->d_skip >> 1;
  2628 
  2629     while (height--) {
  2630 	    /* *INDENT-OFF* */
  2631 	    DUFFS_LOOP4({
  2632 		unsigned alpha;
  2633 		Uint32 s = *srcp;
  2634 		alpha = s >> 27; /* downscale alpha to 5 bits */
  2635 		/* FIXME: Here we special-case opaque alpha since the
  2636 		   compositing used (>>8 instead of /255) doesn't handle
  2637 		   it correctly. Also special-case alpha=0 for speed?
  2638 		   Benchmark this! */
  2639 		if(alpha) {   
  2640 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2641 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  2642 		  } else {
  2643 		    Uint32 d = *dstp;
  2644 		    /*
  2645 		     * convert source and destination to G0RAB65565
  2646 		     * and blend all components at the same time
  2647 		     */
  2648 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  2649 		      + (s >> 3 & 0x1f);
  2650 		    d = (d | d << 16) & 0x03e07c1f;
  2651 		    d += (s - d) * alpha >> 5;
  2652 		    d &= 0x03e07c1f;
  2653 		    *dstp = (Uint16)(d | d >> 16);
  2654 		  }
  2655 		}
  2656 		srcp++;
  2657 		dstp++;
  2658 	    }, width);
  2659 	    /* *INDENT-ON* */
  2660         srcp += srcskip;
  2661         dstp += dstskip;
  2662     }
  2663 }
  2664 
  2665 /* General (slow) N->N blending with per-surface alpha */
  2666 static void
  2667 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  2668 {
  2669     int width = info->d_width;
  2670     int height = info->d_height;
  2671     Uint8 *src = info->s_pixels;
  2672     int srcskip = info->s_skip;
  2673     Uint8 *dst = info->d_pixels;
  2674     int dstskip = info->d_skip;
  2675     SDL_PixelFormat *srcfmt = info->src;
  2676     SDL_PixelFormat *dstfmt = info->dst;
  2677     int srcbpp = srcfmt->BytesPerPixel;
  2678     int dstbpp = dstfmt->BytesPerPixel;
  2679     unsigned sA = srcfmt->alpha;
  2680     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2681 
  2682     if (sA) {
  2683         while (height--) {
  2684 	    /* *INDENT-OFF* */
  2685 	    DUFFS_LOOP4(
  2686 	    {
  2687 		Uint32 Pixel;
  2688 		unsigned sR;
  2689 		unsigned sG;
  2690 		unsigned sB;
  2691 		unsigned dR;
  2692 		unsigned dG;
  2693 		unsigned dB;
  2694 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2695 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2696 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2697 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2698 		src += srcbpp;
  2699 		dst += dstbpp;
  2700 	    },
  2701 	    width);
  2702 	    /* *INDENT-ON* */
  2703             src += srcskip;
  2704             dst += dstskip;
  2705         }
  2706     }
  2707 }
  2708 
  2709 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2710 static void
  2711 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  2712 {
  2713     int width = info->d_width;
  2714     int height = info->d_height;
  2715     Uint8 *src = info->s_pixels;
  2716     int srcskip = info->s_skip;
  2717     Uint8 *dst = info->d_pixels;
  2718     int dstskip = info->d_skip;
  2719     SDL_PixelFormat *srcfmt = info->src;
  2720     SDL_PixelFormat *dstfmt = info->dst;
  2721     Uint32 ckey = srcfmt->colorkey;
  2722     int srcbpp = srcfmt->BytesPerPixel;
  2723     int dstbpp = dstfmt->BytesPerPixel;
  2724     unsigned sA = srcfmt->alpha;
  2725     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2726 
  2727     while (height--) {
  2728 	    /* *INDENT-OFF* */
  2729 	    DUFFS_LOOP4(
  2730 	    {
  2731 		Uint32 Pixel;
  2732 		unsigned sR;
  2733 		unsigned sG;
  2734 		unsigned sB;
  2735 		unsigned dR;
  2736 		unsigned dG;
  2737 		unsigned dB;
  2738 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2739 		if(sA && Pixel != ckey) {
  2740 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2741 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2742 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2743 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2744 		}
  2745 		src += srcbpp;
  2746 		dst += dstbpp;
  2747 	    },
  2748 	    width);
  2749 	    /* *INDENT-ON* */
  2750         src += srcskip;
  2751         dst += dstskip;
  2752     }
  2753 }
  2754 
  2755 /* General (slow) N->N blending with pixel alpha */
  2756 static void
  2757 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  2758 {
  2759     int width = info->d_width;
  2760     int height = info->d_height;
  2761     Uint8 *src = info->s_pixels;
  2762     int srcskip = info->s_skip;
  2763     Uint8 *dst = info->d_pixels;
  2764     int dstskip = info->d_skip;
  2765     SDL_PixelFormat *srcfmt = info->src;
  2766     SDL_PixelFormat *dstfmt = info->dst;
  2767 
  2768     int srcbpp;
  2769     int dstbpp;
  2770 
  2771     /* Set up some basic variables */
  2772     srcbpp = srcfmt->BytesPerPixel;
  2773     dstbpp = dstfmt->BytesPerPixel;
  2774 
  2775     /* FIXME: for 8bpp source alpha, this doesn't get opaque values
  2776        quite right. for <8bpp source alpha, it gets them very wrong
  2777        (check all macros!)
  2778        It is unclear whether there is a good general solution that doesn't
  2779        need a branch (or a divide). */
  2780     while (height--) {
  2781 	    /* *INDENT-OFF* */
  2782 	    DUFFS_LOOP4(
  2783 	    {
  2784 		Uint32 Pixel;
  2785 		unsigned sR;
  2786 		unsigned sG;
  2787 		unsigned sB;
  2788 		unsigned dR;
  2789 		unsigned dG;
  2790 		unsigned dB;
  2791 		unsigned sA;
  2792 		unsigned dA;
  2793 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  2794 		if(sA) {
  2795 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  2796 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2797 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2798 		}
  2799 		src += srcbpp;
  2800 		dst += dstbpp;
  2801 	    },
  2802 	    width);
  2803 	    /* *INDENT-ON* */
  2804         src += srcskip;
  2805         dst += dstskip;
  2806     }
  2807 }
  2808 
  2809 
  2810 SDL_loblit
  2811 SDL_CalculateAlphaBlit(SDL_Surface * surface, int blit_index)
  2812 {
  2813     SDL_PixelFormat *sf = surface->format;
  2814     SDL_PixelFormat *df = surface->map->dst->format;
  2815 
  2816     if (sf->Amask == 0) {
  2817         if ((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
  2818             if (df->BytesPerPixel == 1)
  2819                 return BlitNto1SurfaceAlphaKey;
  2820             else
  2821 #if SDL_ALTIVEC_BLITTERS
  2822                 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2823                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2824                     && SDL_HasAltiVec())
  2825                 return Blit32to32SurfaceAlphaKeyAltivec;
  2826             else
  2827 #endif
  2828                 return BlitNtoNSurfaceAlphaKey;
  2829         } else {
  2830             /* Per-surface alpha blits */
  2831             switch (df->BytesPerPixel) {
  2832             case 1:
  2833                 return BlitNto1SurfaceAlpha;
  2834 
  2835             case 2:
  2836                 if (surface->map->identity) {
  2837                     if (df->Gmask == 0x7e0) {
  2838 #if MMX_ASMBLIT
  2839                         if (SDL_HasMMX())
  2840                             return Blit565to565SurfaceAlphaMMX;
  2841                         else
  2842 #endif
  2843                             return Blit565to565SurfaceAlpha;
  2844                     } else if (df->Gmask == 0x3e0) {
  2845 #if MMX_ASMBLIT
  2846                         if (SDL_HasMMX())
  2847                             return Blit555to555SurfaceAlphaMMX;
  2848                         else
  2849 #endif
  2850                             return Blit555to555SurfaceAlpha;
  2851                     }
  2852                 }
  2853                 return BlitNtoNSurfaceAlpha;
  2854 
  2855             case 4:
  2856                 if (sf->Rmask == df->Rmask
  2857                     && sf->Gmask == df->Gmask
  2858                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2859 #if MMX_ASMBLIT
  2860                     if (sf->Rshift % 8 == 0
  2861                         && sf->Gshift % 8 == 0
  2862                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  2863                         return BlitRGBtoRGBSurfaceAlphaMMX;
  2864 #endif
  2865                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  2866 #if SDL_ALTIVEC_BLITTERS
  2867                         if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2868                             && SDL_HasAltiVec())
  2869                             return BlitRGBtoRGBSurfaceAlphaAltivec;
  2870 #endif
  2871                         return BlitRGBtoRGBSurfaceAlpha;
  2872                     }
  2873                 }
  2874 #if SDL_ALTIVEC_BLITTERS
  2875                 if ((sf->BytesPerPixel == 4) &&
  2876                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2877                     && SDL_HasAltiVec())
  2878                     return Blit32to32SurfaceAlphaAltivec;
  2879                 else
  2880 #endif
  2881                     return BlitNtoNSurfaceAlpha;
  2882 
  2883             case 3:
  2884             default:
  2885                 return BlitNtoNSurfaceAlpha;
  2886             }
  2887         }
  2888     } else {
  2889         /* Per-pixel alpha blits */
  2890         switch (df->BytesPerPixel) {
  2891         case 1:
  2892             return BlitNto1PixelAlpha;
  2893 
  2894         case 2:
  2895 #if SDL_ALTIVEC_BLITTERS
  2896             if (sf->BytesPerPixel == 4
  2897                 && !(surface->map->dst->flags & SDL_HWSURFACE)
  2898                 && df->Gmask == 0x7e0 && df->Bmask == 0x1f
  2899                 && SDL_HasAltiVec())
  2900                 return Blit32to565PixelAlphaAltivec;
  2901             else
  2902 #endif
  2903                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2904                     && sf->Gmask == 0xff00
  2905                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2906                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  2907                 if (df->Gmask == 0x7e0)
  2908                     return BlitARGBto565PixelAlpha;
  2909                 else if (df->Gmask == 0x3e0)
  2910                     return BlitARGBto555PixelAlpha;
  2911             }
  2912             return BlitNtoNPixelAlpha;
  2913 
  2914         case 4:
  2915             if (sf->Rmask == df->Rmask
  2916                 && sf->Gmask == df->Gmask
  2917                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2918 #if MMX_ASMBLIT
  2919                 if (sf->Rshift % 8 == 0
  2920                     && sf->Gshift % 8 == 0
  2921                     && sf->Bshift % 8 == 0
  2922                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  2923                     if (SDL_Has3DNow())
  2924                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2925                     if (SDL_HasMMX())
  2926                         return BlitRGBtoRGBPixelAlphaMMX;
  2927                 }
  2928 #endif
  2929                 if (sf->Amask == 0xff000000) {
  2930 #if SDL_ALTIVEC_BLITTERS
  2931                     if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2932                         && SDL_HasAltiVec())
  2933                         return BlitRGBtoRGBPixelAlphaAltivec;
  2934 #endif
  2935                     return BlitRGBtoRGBPixelAlpha;
  2936                 }
  2937             }
  2938 #if SDL_ALTIVEC_BLITTERS
  2939             if (sf->Amask && sf->BytesPerPixel == 4 &&
  2940                 !(surface->map->dst->flags & SDL_HWSURFACE)
  2941                 && SDL_HasAltiVec())
  2942                 return Blit32to32PixelAlphaAltivec;
  2943             else
  2944 #endif
  2945                 return BlitNtoNPixelAlpha;
  2946 
  2947         case 3:
  2948         default:
  2949             return BlitNtoNPixelAlpha;
  2950         }
  2951     }
  2952 }
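       /* Usage sketch (hedged; surface names and masks below are only an
          example, not taken from this file): the per-pixel alpha branch
          above is selected when the source format has a nonzero Amask.
          With the classic 1.2-style API that typically comes from an RGBA
          surface such as:

              SDL_Surface *img =
                  SDL_CreateRGBSurface(SDL_SWSURFACE, 64, 64, 32,
                                       0x00ff0000, 0x0000ff00,
                                       0x000000ff, 0xff000000);
              SDL_SetAlpha(img, SDL_SRCALPHA, 255);
              SDL_BlitSurface(img, NULL, screen, NULL);

          When source and destination share the same 32-bit RGB masks this
          resolves to BlitRGBtoRGBPixelAlpha (or its MMX/3DNow/AltiVec
          variants); otherwise one of the more general paths, such as
          BlitNtoNPixelAlpha, is used. */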
  2953 
  2954 /* vi: set ts=4 sw=4 expandtab: */