src/video/SDL_blit_A.c
author Ryan C. Gordon <icculus@icculus.org>
Mon, 12 Feb 2007 10:52:10 +0000
branch SDL-1.2
changeset 3910 af4d584e0edb
parent 3899 081aecdb0911
child 3931 d65b4a73c991
permissions -rw-r--r--
Handle source data alignment correctly in Blit32to32SurfaceAlphaAltivec().

Fixes Bugzilla #279.
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2006 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 #if SDL_ASSEMBLY_ROUTINES
    28 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    29 #define MMX_ASMBLIT 1
    30 #define GCC_ASMBLIT 1
    31 #elif defined(_MSC_VER) && (_MSC_VER >= 1200) && defined(_M_IX86)
    32 #define MMX_ASMBLIT 1
    33 #define MSVC_ASMBLIT 1
    34 #endif
    35 #endif /* SDL_ASSEMBLY_ROUTINES */
    36 
    37 /* Function to check the CPU flags */
    38 #include "SDL_cpuinfo.h"
    39 #if GCC_ASMBLIT
    40 #include "mmx.h"
    41 #elif MSVC_ASMBLIT
    42 #include <mmintrin.h>
    43 #include <mm3dnow.h>
    44 #endif
    45 
    46 /* Functions to perform alpha blended blitting */
    47 
    48 /* N->1 blending with per-surface alpha.
        *
        * Blends a source surface of srcfmt->BytesPerPixel bytes per pixel onto
        * an 8-bit palettized destination, using the single alpha value
        * srcfmt->alpha for every pixel.  Each destination pixel is looked up in
        * dstfmt->palette, blended with the source color through ALPHA_BLEND
        * (defined outside this section) and written back as a 3-3-2 packed RGB
        * byte, optionally translated through info->table.
        *
        * info: blit description; s_pixels/d_pixels are the first source and
        *       destination bytes, d_width/d_height the area in pixels, and
        *       s_skip/d_skip the number of bytes added to each pointer at the
        *       end of every row.
        */
    49 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
    50 {
    51 	int width = info->d_width;
    52 	int height = info->d_height;
    53 	Uint8 *src = info->s_pixels;
    54 	int srcskip = info->s_skip;
    55 	Uint8 *dst = info->d_pixels;
    56 	int dstskip = info->d_skip;
    57 	Uint8 *palmap = info->table;
    58 	SDL_PixelFormat *srcfmt = info->src;
    59 	SDL_PixelFormat *dstfmt = info->dst;
    60 	int srcbpp = srcfmt->BytesPerPixel;
    61 
       	/* one alpha value shared by every pixel of the source surface */
    62 	const unsigned A = srcfmt->alpha;
    63 
    64 	while ( height-- ) {
       	    /* The per-pixel body is a macro argument: each local is declared
       	       on its own line because a top-level comma would split the
       	       argument. */
    65 	    DUFFS_LOOP4(
    66 	    {
    67 		Uint32 Pixel;
    68 		unsigned sR;
    69 		unsigned sG;
    70 		unsigned sB;
    71 		unsigned dR;
    72 		unsigned dG;
    73 		unsigned dB;
    74 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
       		/* current destination color, fetched through the palette */
    75 		dR = dstfmt->palette->colors[*dst].r;
    76 		dG = dstfmt->palette->colors[*dst].g;
    77 		dB = dstfmt->palette->colors[*dst].b;
       		/* the blended result is left in dR, dG, dB */
    78 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
       		/* keep only the low 8 bits of each blended channel */
    79 		dR &= 0xff;
    80 		dG &= 0xff;
    81 		dB &= 0xff;
    82 		/* Pack RGB into 8bit pixel: 3 bits red, 3 bits green, 2 bits blue */
    83 		if ( palmap == NULL ) {
    84 		    *dst =((dR>>5)<<(3+2))|
    85 			  ((dG>>5)<<(2))|
    86 			  ((dB>>6)<<(0));
    87 		} else {
       		    /* translate the 3-3-2 value through the lookup table */
    88 		    *dst = palmap[((dR>>5)<<(3+2))|
    89 				  ((dG>>5)<<(2))  |
    90 				  ((dB>>6)<<(0))];
    91 		}
    92 		dst++;
    93 		src += srcbpp;
    94 	    },
    95 	    width);
       	    /* step over the bytes between the end of this row and the next */
    96 	    src += srcskip;
    97 	    dst += dstskip;
    98 	}
    99 }
   100 
   101 /* N->1 blending with pixel alpha.
        *
        * Same destination handling as the per-surface variant (palette lookup,
        * blend, 3-3-2 repack, optional info->table translation), but the alpha
        * used for each pixel is the sA value that DISEMBLE_RGBA extracts from
        * that source pixel.
        *
        * info: blit description; s_skip/d_skip are byte counts added to the
        *       source/destination pointers at the end of every row.
        */
   102 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
   103 {
   104 	int width = info->d_width;
   105 	int height = info->d_height;
   106 	Uint8 *src = info->s_pixels;
   107 	int srcskip = info->s_skip;
   108 	Uint8 *dst = info->d_pixels;
   109 	int dstskip = info->d_skip;
   110 	Uint8 *palmap = info->table;
   111 	SDL_PixelFormat *srcfmt = info->src;
   112 	SDL_PixelFormat *dstfmt = info->dst;
   113 	int srcbpp = srcfmt->BytesPerPixel;
   114 
   115 	/* FIXME: fix alpha bit field expansion here too? */
   116 	while ( height-- ) {
       	    /* The per-pixel body is a macro argument, hence one declaration
       	       per line (no top-level commas). */
   117 	    DUFFS_LOOP4(
   118 	    {
   119 		Uint32 Pixel;
   120 		unsigned sR;
   121 		unsigned sG;
   122 		unsigned sB;
   123 		unsigned sA;
   124 		unsigned dR;
   125 		unsigned dG;
   126 		unsigned dB;
   127 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
       		/* current destination color, fetched through the palette */
   128 		dR = dstfmt->palette->colors[*dst].r;
   129 		dG = dstfmt->palette->colors[*dst].g;
   130 		dB = dstfmt->palette->colors[*dst].b;
       		/* blend with this pixel's own alpha; result is left in dR, dG, dB */
   131 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
       		/* keep only the low 8 bits of each blended channel */
   132 		dR &= 0xff;
   133 		dG &= 0xff;
   134 		dB &= 0xff;
   135 		/* Pack RGB into 8bit pixel: 3 bits red, 3 bits green, 2 bits blue */
   136 		if ( palmap == NULL ) {
   137 		    *dst =((dR>>5)<<(3+2))|
   138 			  ((dG>>5)<<(2))|
   139 			  ((dB>>6)<<(0));
   140 		} else {
       		    /* translate the 3-3-2 value through the lookup table */
   141 		    *dst = palmap[((dR>>5)<<(3+2))|
   142 				  ((dG>>5)<<(2))  |
   143 				  ((dB>>6)<<(0))  ];
   144 		}
   145 		dst++;
   146 		src += srcbpp;
   147 	    },
   148 	    width);
       	    /* step over the bytes between the end of this row and the next */
   149 	    src += srcskip;
   150 	    dst += dstskip;
   151 	}
   152 }
   153 
   154 /* colorkeyed N->1 blending with per-surface alpha.
        *
        * Like the plain per-surface alpha N->1 blit, except that a source pixel
        * whose raw value equals srcfmt->colorkey is skipped: the matching
        * destination byte is left untouched.  All other pixels are blended with
        * srcfmt->alpha and repacked to 3-3-2, optionally translated through
        * info->table.
        *
        * info: blit description; s_skip/d_skip are byte counts added to the
        *       source/destination pointers at the end of every row.
        */
   155 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
   156 {
   157 	int width = info->d_width;
   158 	int height = info->d_height;
   159 	Uint8 *src = info->s_pixels;
   160 	int srcskip = info->s_skip;
   161 	Uint8 *dst = info->d_pixels;
   162 	int dstskip = info->d_skip;
   163 	Uint8 *palmap = info->table;
   164 	SDL_PixelFormat *srcfmt = info->src;
   165 	SDL_PixelFormat *dstfmt = info->dst;
   166 	int srcbpp = srcfmt->BytesPerPixel;
   167 	Uint32 ckey = srcfmt->colorkey;
   168 
       	/* NOTE(review): declared int here but unsigned in BlitNto1SurfaceAlpha */
   169 	const int A = srcfmt->alpha;
   170 
   171 	while ( height-- ) {
       	    /* The per-pixel body is a macro argument, hence one declaration
       	       per line (no top-level commas). */
   172 	    DUFFS_LOOP(
   173 	    {
   174 		Uint32 Pixel;
   175 		unsigned sR;
   176 		unsigned sG;
   177 		unsigned sB;
   178 		unsigned dR;
   179 		unsigned dG;
   180 		unsigned dB;
   181 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
       		/* pixels equal to the color key are transparent: leave *dst alone */
   182 		if ( Pixel != ckey ) {
   183 		    dR = dstfmt->palette->colors[*dst].r;
   184 		    dG = dstfmt->palette->colors[*dst].g;
   185 		    dB = dstfmt->palette->colors[*dst].b;
   186 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
       		    /* keep only the low 8 bits of each blended channel */
   187 		    dR &= 0xff;
   188 		    dG &= 0xff;
   189 		    dB &= 0xff;
   190 		    /* Pack RGB into 8bit pixel: 3 bits red, 3 bits green, 2 bits blue */
   191 		    if ( palmap == NULL ) {
   192 			*dst =((dR>>5)<<(3+2))|
   193 			      ((dG>>5)<<(2)) |
   194 			      ((dB>>6)<<(0));
   195 		    } else {
       			/* translate the 3-3-2 value through the lookup table */
   196 			*dst = palmap[((dR>>5)<<(3+2))|
   197 				      ((dG>>5)<<(2))  |
   198 				      ((dB>>6)<<(0))  ];
   199 		    }
   200 		}
       		/* both pointers advance even when the pixel was skipped */
   201 		dst++;
   202 		src += srcbpp;
   203 	    },
   204 	    width);
   205 	    src += srcskip;
   206 	    dst += dstskip;
   207 	}
   208 }
   209 
   210 #if GCC_ASMBLIT
   211 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
        *
        * With alpha == 128 the blend is a per-channel average of source and
        * destination, computed without unpacking:
        *     ((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) + (s & d & 0x00010101)
        * The first term averages the upper seven bits of each channel, the
        * second restores the low bit where both inputs have it set.  The
        * destination's alpha mask bits (info->dst->Amask) are OR-ed into every
        * pixel written.
        *
        * DUFFS_LOOP_DOUBLE2 is given two bodies: the first blends one pixel in
        * plain C, the second blends two pixels at once in MMX registers.
        *
        * info: blit description; s_skip/d_skip are byte counts and are
        *       converted to 32-bit pixel counts here.
        */
   212 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
   213 {
   214 	int width = info->d_width;
   215 	int height = info->d_height;
   216 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   217 	int srcskip = info->s_skip >> 2;
   218 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   219 	int dstskip = info->d_skip >> 2;
   220 	Uint32 dalpha = info->dst->Amask;
       	/* A real 64-bit object: the constants are stored without type-punning
       	   a byte array, and movq gets the whole object as its operand. */
   221 	Uint64 load;
   222 
   223 	load = 0x00fefefe00fefefeULL;/* alpha128 mask */
   224 	movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
   225 	load = 0x0001010100010101ULL;/* !alpha128 mask */
   226 	movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
   227 	movd_m2r(dalpha, mm7); /* dst alpha mask */
   228 	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
   229 	while(height--) {
   230 		DUFFS_LOOP_DOUBLE2(
   231 		{
       			/* single pixel: same averaging formula in scalar code */
   232 			Uint32 s = *srcp++;
   233 			Uint32 d = *dstp;
   234 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   235 				   + (s & d & 0x00010101)) | dalpha;
   236 		},{
       			/* two pixels: mm4 = 0xfefefe mask, mm3 = 0x010101 mask,
       			   mm7 = destination alpha mask in both halves */
   237 			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   238 			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   239 
   240 			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
   241 			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
   242 
   243 			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
   244 			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
   245 			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
   246 			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
   247 			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
   248 			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
   249 			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
   250 			
   251 			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   252 			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
   253 			dstp += 2;
   254 			srcp += 2;
   255 		}, width);
   256 		srcp += srcskip;
   257 		dstp += dstskip;
   258 	}
       	/* leave MMX state so the FPU can be used again */
   259 	emms();
   260 }
   261 
   262 /* fast RGB888->(A)RGB888 blending with surface alpha.
        *
        * Blends with the single per-surface alpha info->src->alpha.  When alpha
        * is 128 and the destination's R, G and B masks together cover exactly
        * 0x00FFFFFF the work is handed to BlitRGBtoRGBSurfaceAlpha128MMX.
        * Otherwise every channel is widened to a 16-bit lane and computed as
        *     dst + (((src - dst) * alpha) >> 8)
        * and the destination's alpha mask bits (df->Amask) are OR-ed into every
        * pixel written.
        *
        * DUFFS_LOOP_DOUBLE2 is given a one-pixel body and a two-pixel body.
        *
        * info: blit description; s_skip/d_skip are byte counts and are
        *       converted to 32-bit pixel counts here.
        */
   263 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
   264 {
   265 	SDL_PixelFormat* df = info->dst;
   266 	unsigned alpha = info->src->alpha;
   267 
   268 	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   269 			/* only call a128 version when R,G,B occupy lower bits */
   270 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
   271 	} else {
   272 		int width = info->d_width;
   273 		int height = info->d_height;
   274 		Uint32 *srcp = (Uint32 *)info->s_pixels;
   275 		int srcskip = info->s_skip >> 2;
   276 		Uint32 *dstp = (Uint32 *)info->d_pixels;
   277 		int dstskip = info->d_skip >> 2;
   278 
   279 		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
   280 		/* form the alpha mult */
   281 		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
   282 		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   283 		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
       		/* `alpha` is reused as scratch from here on: it now holds the
       		   destination's RGB channel mask, not the blend factor */
   284 		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
   285 		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
   286 		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
   287 		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
   288 			/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
   289 		movd_m2r(df->Amask, mm7); /* dst alpha mask */
   290 		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
   291 		
   292 		while(height--) {
   293 			DUFFS_LOOP_DOUBLE2({
   294 				/* One Pixel Blend */
   295 				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   296 				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   297 				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
   298 				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
   299 
   300 				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   301 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   302 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
       				/* byte-wise add wraps modulo 256, so a negative
       				   (src < dst) difference still gives the right byte */
   303 				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   304 
   305 				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
   306 				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   307 				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
   308 				++srcp;
   309 				++dstp;
   310 			},{
   311 				/* Two Pixels Blend */
   312 				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
   313 				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   314 				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
   315 				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   316 
   317 				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
   318 				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
   319 				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
   320 				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
   321 
   322 				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
   323 				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
   324 				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
   325 				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
   326 
   327 				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
   328 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   329 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   330 				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
   331 
   332 				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
   333 				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
   334 				
   335 				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
   336 
   337   				srcp += 2;
   338   				dstp += 2;
   339   			}, width);
   340 			srcp += srcskip;
   341 			dstp += dstskip;
   342 		}
       		/* leave MMX state so the FPU can be used again */
   343 		emms();
   344 	}
   345 }
   346 
   347 /* fast ARGB888->(A)RGB888 blending with pixel alpha.
        *
        * Blends each source pixel with its own alpha (the bits selected by
        * sf->Amask, shifted down by sf->Ashift).  Three cases per pixel:
        *   - alpha == 0:      destination is left untouched;
        *   - alpha == amask:  source RGB is copied, destination alpha bits kept;
        *   - otherwise:       each color channel becomes
        *                      dst + (((src - dst) * alpha) >> 8)
        *                      while the alpha lane's multiplier is masked to
        *                      zero so the destination alpha is preserved.
        *
        * Register use set up before the loop: mm6 = 0, mm7 = multiplier mask
        * for the color lanes, mm0 = packed channel mask, mm3 = its complement,
        * mm5 = alpha shift count.
        *
        * info: blit description; s_skip/d_skip are byte counts and are
        *       converted to 32-bit pixel counts here.
        */
   348 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
   349 {
   350 	int width = info->d_width;
   351 	int height = info->d_height;
   352 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   353 	int srcskip = info->s_skip >> 2;
   354 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   355 	int dstskip = info->d_skip >> 2;
   356 	SDL_PixelFormat* sf = info->src;
   357 	Uint32 amask = sf->Amask;
   358 
   359 	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
   360 	/* form multiplication mask */
   361 	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
   362 	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
   363 	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
   364 	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
   365 	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
   366 	/* form channel masks */
   367 	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
   368 	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
   369 	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
   370 	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
   371 	/* get alpha channel shift */
   372 	movd_m2r(sf->Ashift, mm5); /* Ashift -> mm5 */
   373 
   374 	while(height--) {
   375 	    DUFFS_LOOP4({
       		/* alpha is still in its in-pixel bit position here */
   376 		Uint32 alpha = *srcp & amask;
   377 		/* FIXME: Here we special-case opaque alpha since the
   378 			compositioning used (>>8 instead of /255) doesn't handle
   379 			it correctly. Also special-case alpha=0 for speed?
   380 			Benchmark this! */
   381 		if(alpha == 0) {
   382 			/* do nothing */
   383 		} else if(alpha == amask) {
   384 			/* opaque alpha -- copy RGB, keep dst alpha */
   385 			/* using MMX here to free up regular registers for other things */
   386 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   387 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   388 			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
   389 			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
   390 			por_r2r(mm1, mm2); /* src | dst -> mm2 */
   391 			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
   392 		} else {
   393 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   394 			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
   395 
   396 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   397 			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
   398 
       			/* alpha is loaded from a general register ("r"
       			   constraint) instead of through the m2r macro */
   399 			__asm__ __volatile__ (
   400 				"movd %0, %%mm4"
   401 				: : "r" (alpha) ); /* 0000A000 -> mm4 */
   402 			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
   403 			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   404 			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
   405 			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
   406 
   407 			/* blend */		    
   408 			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   409 			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   410 			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
       			/* byte-wise add wraps modulo 256, so a negative
       			   (src < dst) difference still gives the right byte */
   411 			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   412 			
   413 			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
   414 			movd_r2m(mm2, *dstp);/* mm2 -> dst */
   415 		}
   416 		++srcp;
   417 		++dstp;
   418 	    }, width);
   419 	    srcp += srcskip;
   420 	    dstp += dstskip;
   421 	}
       	/* leave MMX state so the FPU can be used again */
   422 	emms();
   423 }
   424 /* End GCC_ASMBLIT */
   425 
   426 #elif MSVC_ASMBLIT
   427 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
        *
        * MSVC intrinsics version.  With alpha == 128 the blend is a per-channel
        * average of source and destination:
        *     (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) + (s & d & 0x00010101)
        * The destination's alpha mask bits (info->dst->Amask) are OR-ed into
        * every pixel written.  In each row an odd leading pixel is blended in
        * scalar code, then the rest is processed two pixels at a time.
        *
        * info: blit description; s_skip/d_skip are byte counts and are
        *       converted to 32-bit pixel counts here.
        */
   428 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
   429 {
   430 	int width = info->d_width;
   431 	int height = info->d_height;
   432 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   433 	int srcskip = info->s_skip >> 2;
   434 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   435 	int dstskip = info->d_skip >> 2;
   436 	Uint32 dalpha = info->dst->Amask;
   437 
   438 	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   439 	
   440 	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
   441 	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
   442 	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
   443 
   444 	while (height--) {
   445 		int n = width;
       		/* odd width: blend one pixel in scalar code so that the
       		   remainder is an even count */
   446 		if ( n & 1 ) {
   447 			Uint32 s = *srcp++;
   448 			Uint32 d = *dstp;
   449 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   450 				   + (s & d & 0x00010101)) | dalpha;
   451 			n--;
   452 		}
   453 		
       		/* n is even here; each iteration blends two pixels */
   454 		for (n >>= 1; n > 0; --n) {
   455 			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
   456 			dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
   457 
   458 			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
   459 			src2 = src1; /* 2 x src -> src2(ARGBARGB) */
   460 
   461 			dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
   462 			src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
   463 			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
   464 			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
   465 
   466 			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
   467 			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
   468 			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
   469 			dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
   470 			
   471 			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
   472 			dstp += 2;
   473 			srcp += 2;
   474 		}
   475 		
   476 		srcp += srcskip;
   477 		dstp += dstskip;
   478 	}
       	/* leave MMX state so the FPU can be used again */
   479 	_mm_empty();
   480 }
   481 
   482 /* fast RGB888->(A)RGB888 blending with surface alpha.
        *
        * MSVC intrinsics version.  Blends with the single per-surface alpha
        * info->src->alpha.  When alpha is 128 and the destination's R, G and B
        * masks together cover exactly 0x00FFFFFF the work is handed to
        * BlitRGBtoRGBSurfaceAlpha128MMX.  Otherwise every channel is widened to
        * a 16-bit lane and computed as
        *     dst + (((src - dst) * alpha) >> 8)
        * and the destination's alpha mask bits (df->Amask) are OR-ed into every
        * pixel written.  In each row an odd leading pixel is blended alone,
        * then the rest is processed two pixels at a time.
        *
        * info: blit description; s_skip/d_skip are byte counts and are
        *       converted to 32-bit pixel counts here.
        */
   483 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
   484 {
   485 	SDL_PixelFormat* df = info->dst;
       	/* NOTE(review): this initial value is never read; the test below
       	   recomputes the OR and chanmask is reassigned before its first use */
   486 	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   487 	unsigned alpha = info->src->alpha;
   488 
   489 	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   490 			/* only call a128 version when R,G,B occupy lower bits */
   491 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
   492 	} else {
   493 		int width = info->d_width;
   494 		int height = info->d_height;
   495 		Uint32 *srcp = (Uint32 *)info->s_pixels;
   496 		int srcskip = info->s_skip >> 2;
   497 		Uint32 *dstp = (Uint32 *)info->d_pixels;
   498 		int dstskip = info->d_skip >> 2;
   499 		Uint32 dalpha = df->Amask;
   500 		Uint32 amult;
   501 
   502 		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   503 		
   504 		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
   505 		/* form the alpha mult */
       		/* replicate the 8-bit alpha into all four bytes of amult */
   506 		amult = alpha | (alpha << 8);
   507 		amult = amult | (amult << 16);
       		/* keep alpha only in the bytes that hold R, G or B */
   508 		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
   509 		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
   510 		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   511 			/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   512 		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
   513 		
   514 		while (height--) {
   515 			int n = width;
   516 			if (n & 1) {
   517 				/* One Pixel Blend */
   518 				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
   519 				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   520 
   521 				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   522 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   523 
   524 				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst1 -> src2 */
   525 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   526 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
       				/* byte-wise add wraps modulo 256, so a negative
       				   (src < dst) difference still gives the right byte */
   527 				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   528 				
   529 				dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   530 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   531 				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   532 
   533 				++srcp;
   534 				++dstp;
   535 				
   536 				n--;
   537 			}
   538 
       			/* n is even here; each iteration blends two pixels */
   539 			for (n >>= 1; n > 0; --n) {
   540 				/* Two Pixels Blend */
   541 				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
   542 				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
   543 				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   544 				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   545 
   546 				dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
   547 				dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
   548 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   549 				dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   550 
   551 				src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   552 				src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
   553 				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
   554 				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   555 
   556 				src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
   557 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   558 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
   559 				dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   560 				
   561 				dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   562 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   563 
   564 				*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
   565 
   566 				srcp += 2;
   567 				dstp += 2;
   568 			}
   569 			srcp += srcskip;
   570 			dstp += dstskip;
   571 		}
       		/* leave MMX state so the FPU can be used again */
   572 		_mm_empty();
   573 	}
   574 }
   575 
   576 /* fast ARGB888->(A)RGB888 blending with pixel alpha.
        *
        * MSVC intrinsics version.  Blends each source pixel with its own alpha
        * (the bits selected by sf->Amask, shifted down by sf->Ashift).  Three
        * cases per pixel:
        *   - alpha == 0:      destination is left untouched;
        *   - alpha == amask:  source RGB is copied, destination alpha bits kept;
        *   - otherwise:       each color channel becomes
        *                      dst + (((src - dst) * alpha) >> 8)
        *                      while the alpha lane's multiplier is masked to
        *                      zero so the destination alpha is preserved.
        *
        * info: blit description; s_skip/d_skip are byte counts and are
        *       converted to 32-bit pixel counts here.
        */
   577 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
   578 {
   579 	int width = info->d_width;
   580 	int height = info->d_height;
   581 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   582 	int srcskip = info->s_skip >> 2;
   583 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   584 	int dstskip = info->d_skip >> 2;
   585 	SDL_PixelFormat* sf = info->src;
   586 	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   587 	Uint32 amask = sf->Amask;
   588 	Uint32 ashift = sf->Ashift;
   589 	Uint64 multmask;
   590 
   591 	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
   592 
   593 	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
       	/* Once bytes are widened to 16-bit lanes, the channel at bit ashift
       	   sits at bit ashift * 2; clear that lane so alpha is not blended. */
   594 	multmask = ~(0xFFFFi64 << (ashift * 2));
   595 	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
   596 
   597 	while(height--) {
   598 		DUFFS_LOOP4({
       		/* alpha is still in its in-pixel bit position here */
   599 		Uint32 alpha = *srcp & amask;
   600 		if (alpha == 0) {
   601 			/* do nothing */
   602 		} else if (alpha == amask) {
   603 			/* opaque alpha -- copy RGB, keep dst alpha */
   604 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   605 		} else {
   606 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   607 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   608 
   609 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   610 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   611 
   612 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   613 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   614 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   615 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   616 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   617 
   618 			/* blend */		    
   619 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   620 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   621 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
       			/* byte-wise add wraps modulo 256, so a negative
       			   (src < dst) difference still gives the right byte */
   622 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   623 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   624 			
   625 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   626 		}
   627 		++srcp;
   628 		++dstp;
   629 	    }, width);
   630 	    srcp += srcskip;
   631 	    dstp += dstskip;
   632 	}
       	/* leave MMX state so the FPU can be used again */
   633 	_mm_empty();
   634 }
   635 /* End MSVC_ASMBLIT */
   636 
   637 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   638 
   639 #if SDL_ALTIVEC_BLITTERS
   640 #if __MWERKS__
   641 #pragma altivec_model on
   642 #endif
   643 #if HAVE_ALTIVEC_H
   644 #include <altivec.h>
   645 #endif
   646 #include <assert.h>
   647 
   648 #if (defined(__MACOSX__) && (__GNUC__ < 4))
       /* Vector literal helpers.  Mac OS X compilers older than GCC 4 get the
          parenthesized literal form; every other compiler gets the brace form. */
   649     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   650         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
   651     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   652         (vector unsigned short) ( a,b,c,d,e,f,g,h )
   653 #else
   654     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   655         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
   656     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   657         (vector unsigned short) { a,b,c,d,e,f,g,h }
   658 #endif
   659 
       /* Low four bits of an address: non-zero when x is not 16-byte aligned.
          NOTE(review): the argument is not parenthesized in the expansion. */
   660 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
       /* Debug helper: prints the four 32-bit words of vector v after msg. */
   661 #define VECPRINT(msg, v) do { \
   662     vector unsigned int tmpvec = (vector unsigned int)(v); \
   663     unsigned int *vp = (unsigned int *)&tmpvec; \
   664     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   665 } while (0)
   666 
   667 /* the permutation vector that takes the high bytes out of all the appropriate shorts 
   668     (vector unsigned char)(
   669         0x00, 0x10, 0x02, 0x12,
   670         0x04, 0x14, 0x06, 0x16,
   671         0x08, 0x18, 0x0A, 0x1A,
   672         0x0C, 0x1C, 0x0E, 0x1E );
   673 */
       /* Built at run time as (0x00..0x0F from vec_lvsl at address 0) plus
          0x0F added to every odd byte, which yields the table above. */
   674 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
       /* 24 in every 32-bit element, formed as 12 + 12. */
   675 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
       /* All-ones shifted left by 24 in every 32-bit element: 0xFF000000 each. */
   676 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
       /* Permute control vector for reading from src.  For an unaligned src it
          is vec_lvsl(0, src); for a 16-byte aligned src it is vec_lvsl(8, src)
          with 8 added to every byte (0x10..0x1F by the lvsl definition).
          NOTE(review): how callers combine it with their loads is not visible
          in this section -- confirm at the use sites. */
   677 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
   678     ? vec_lvsl(0, src) \
   679     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
   680 
   681    
       /* Blend sixteen bytes of vs over vd with per-byte alpha valpha, storing
          the result in vd.  For each byte it forms
              t = vs * valpha + vd * (255 - valpha)
          in 16 bits (even and odd bytes separately), applies
              t = (t + 1) + ((t + 1) >> 8)
          and takes the high byte of t via mergePermute (see
          VEC_MERGE_PERMUTE).  v1_16 and v8_16 must hold 1 and 8 in every
          16-bit element, as they are used for the "+ 1" and ">> 8" steps. */
   682 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
   683     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
   684     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
   685     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
   686     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
   687     /* valpha2 is 255-alpha */ \
   688     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
   689     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
   690     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
   691     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
   692     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
   693     /* add source and dest */ \
   694     vtemp1 = vec_add(vtemp1, vtemp3); \
   695     vtemp2 = vec_add(vtemp2, vtemp4); \
   696     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
   697     vtemp1 = vec_add(vtemp1, v1_16); \
   698     vtemp3 = vec_sr(vtemp1, v8_16); \
   699     vtemp1 = vec_add(vtemp1, vtemp3); \
   700     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
   701     vtemp2 = vec_add(vtemp2, v1_16); \
   702     vtemp4 = vec_sr(vtemp2, v8_16); \
   703     vtemp2 = vec_add(vtemp2, vtemp4); \
   704     /* (>>8) and get ARGBARGBARGBARGB */ \
   705     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   706 } while (0)
   707  
   708 /* Calculate the permute vector used for 32->32 swizzling.
        *
        * Builds a 16-byte permute control vector that reorders the bytes of
        * four 32-bit pixels from srcfmt's channel layout to dstfmt's.
        *
        * srcfmt: source pixel format, or NULL for the built-in ARGB default.
        * dstfmt: destination pixel format, or NULL for the same default.
        * Returns the permute control vector.
        */
   709 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
   710                                   const SDL_PixelFormat *dstfmt)
   711 {
   712     /*
   713      * We have to assume that the bits that aren't used by other
   714      *  colors is alpha, and it's one complete byte, since some formats
   715      *  leave alpha with a zero mask, but we should still swizzle the bits.
   716      */
   717     /* ARGB */
   718     const static struct SDL_PixelFormat default_pixel_format = {
   719         NULL, 0, 0,
   720         0, 0, 0, 0,
   721         16, 8, 0, 24,
   722         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
   723         0, 0};
   724     if (!srcfmt) {
   725         srcfmt = &default_pixel_format;
   726     }
   727     if (!dstfmt) {
   728         dstfmt = &default_pixel_format;
   729     }
           /* byte offset of each of the four pixels inside the vector */
   730     const vector unsigned char plus = VECUINT8_LITERAL
   731                                             ( 0x00, 0x00, 0x00, 0x00,
   732                                               0x04, 0x04, 0x04, 0x04,
   733                                               0x08, 0x08, 0x08, 0x08,
   734                                               0x0C, 0x0C, 0x0C, 0x0C );
   735     vector unsigned char vswiz;
   736     vector unsigned int srcvec;
           /* RESHIFT maps a channel's bit shift (24, 16, 8, 0) to its byte
              position counted from the most significant byte (0, 1, 2, 3) */
   737 #define RESHIFT(X) (3 - ((X) >> 3))
           /* each source byte index is placed in the byte that the
              destination format uses for the same channel */
   738     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   739     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   740     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   741     Uint32 amask;
   742     /* Use zero for alpha if either surface doesn't have alpha */
           /* NOTE(review): index 0x10 presumably selects from the second
              operand of the caller's vec_perm -- confirm at the call sites */
   743     if (dstfmt->Amask) {
   744         amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
   745     } else {
               /* no destination alpha: put 0x10 in every byte not covered by
                  the destination's R, G and B masks */
   746         amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
   747     }
   748 #undef RESHIFT  
           /* only element 0 of srcvec is written; vec_splat below copies it
              to all four elements before the per-pixel offsets are added */
   749     ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
   750     vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
   751     return(vswiz);
   752 }
   753 
   754 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
   755 {
           /*
            * Blend 32-bit source pixels carrying per-pixel alpha onto a 16-bit
            * 5-6-5 destination.  Every row is handled in three phases: scalar
            * blending while UNALIGNED_PTR(dst) holds, an Altivec loop blending
            * eight pixels per iteration, and scalar blending of the remaining
            * (width % 8) pixels.
            *
            * info supplies the source/destination pixel pointers, the per-row
            * skips in bytes, the blit dimensions and the source pixel format.
            */
   756     int height = info->d_height;
   757     Uint8 *src = (Uint8 *)info->s_pixels;
   758     int srcskip = info->s_skip;
   759     Uint8 *dst = (Uint8 *)info->d_pixels;
   760     int dstskip = info->d_skip;
   761     SDL_PixelFormat *srcfmt = info->src;
   762 
   763     vector unsigned char v0 = vec_splat_u8(0);
   764     vector unsigned short v8_16 = vec_splat_u16(8);
   765     vector unsigned short v1_16 = vec_splat_u16(1);
   766     vector unsigned short v2_16 = vec_splat_u16(2);
   767     vector unsigned short v3_16 = vec_splat_u16(3);
   768     vector unsigned int v8_32 = vec_splat_u32(8);
   769     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
           /* 0x001f keeps only the five blue bits of a packed 1-5-5-5 pixel.
              Bit 5 of that pixel is its lowest green bit and must not be
              OR-ed into the 565 green field, which is built separately from
              the full 8-bit green masked with vfc. */
   770     vector unsigned short v1f = VECUINT16_LITERAL(
   771         0x001f, 0x001f, 0x001f, 0x001f,
   772         0x001f, 0x001f, 0x001f, 0x001f);
           /* the six green bits that survive in a 565 pixel */
   773     vector unsigned short vfc = VECUINT16_LITERAL(
   774         0x00fc, 0x00fc, 0x00fc, 0x00fc,
   775         0x00fc, 0x00fc, 0x00fc, 0x00fc);
   776 
   777     /* 
   778         0x10 - 0x1f is the alpha
   779         0x00 - 0x0e evens are the red
   780         0x01 - 0x0f odds are zero
   781     */
   782     vector unsigned char vredalpha1 = VECUINT8_LITERAL(
   783         0x10, 0x00, 0x01, 0x01,
   784         0x10, 0x02, 0x01, 0x01,
   785         0x10, 0x04, 0x01, 0x01,
   786         0x10, 0x06, 0x01, 0x01
   787     );
           /* same table for the second group of four destination pixels:
              adds 8 << 16 to every word, i.e. 8 to each red byte index */
   788     vector unsigned char vredalpha2 = (vector unsigned char)(
   789         vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
   790     );
   791     /*
   792         0x00 - 0x0f is ARxx ARxx ARxx ARxx
   793         0x11 - 0x1f odds are blue
   794     */
   795     vector unsigned char vblue1 = VECUINT8_LITERAL(
   796         0x00, 0x01, 0x02, 0x11,
   797         0x04, 0x05, 0x06, 0x13,
   798         0x08, 0x09, 0x0a, 0x15,
   799         0x0c, 0x0d, 0x0e, 0x17
   800     );
           /* second group of four pixels: adds 8 to each blue byte index */
   801     vector unsigned char vblue2 = (vector unsigned char)(
   802         vec_add((vector unsigned int)vblue1, v8_32)
   803     );
   804     /*
   805         0x00 - 0x0f is ARxB ARxB ARxB ARxB
   806         0x10 - 0x1e evens are green
   807     */
   808     vector unsigned char vgreen1 = VECUINT8_LITERAL(
   809         0x00, 0x01, 0x10, 0x03,
   810         0x04, 0x05, 0x12, 0x07,
   811         0x08, 0x09, 0x14, 0x0b,
   812         0x0c, 0x0d, 0x16, 0x0f
   813     );
           /* second group of four pixels: adds 8 << 8 to every word, i.e. 8 to
              each green byte index */
   814     vector unsigned char vgreen2 = (vector unsigned char)(
   815         vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
   816     );
           /* pairs byte 0 of the first vector with the green byte (offset 2) of
              each of the eight blended pixels; the high byte is masked off
              by vfc afterwards */
   817     vector unsigned char vgmerge = VECUINT8_LITERAL(
   818         0x00, 0x02, 0x00, 0x06,
   819         0x00, 0x0a, 0x00, 0x0e,
   820         0x00, 0x12, 0x00, 0x16,
   821         0x00, 0x1a, 0x00, 0x1e);
   822     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
   823     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
           /* byte indices 0,0,0,0,4,4,4,4,...: replicates the first byte of each
              32-bit pixel (used below as its alpha) across the whole pixel */
   824     vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
   825 
           /* 0xf8 in every byte, shifted up to 0xf800 per 16-bit lane: the red
              field of a 565 pixel, the same mask the scalar path applies
              with 0xf8 */
   826     vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-8);
   827     vf800 = vec_sl(vf800, vec_splat_u16(8));
   828 
   829     while(height--) {
   830         int extrawidth;
   831         vector unsigned char valigner;
   832         vector unsigned char vsrc;
   833         vector unsigned char voverflow;
   834         int width = info->d_width;
   835 
               /* Scalar blend of one pixel per iteration while `condition`
                  holds; source pixels with zero alpha leave dst untouched. */
   836 #define ONE_PIXEL_BLEND(condition, widthvar) \
   837         while (condition) { \
   838             Uint32 Pixel; \
   839             unsigned sR, sG, sB, dR, dG, dB, sA; \
   840             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   841             if(sA) { \
   842                 unsigned short dstpixel = *((unsigned short *)dst); \
   843                 dR = (dstpixel >> 8) & 0xf8; \
   844                 dG = (dstpixel >> 3) & 0xfc; \
   845                 dB = (dstpixel << 3) & 0xf8; \
   846                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   847                 *((unsigned short *)dst) = ( \
   848                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
   849                 ); \
   850             } \
   851             src += 4; \
   852             dst += 2; \
   853             widthvar--; \
   854         }
   855         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
   856         extrawidth = (width % 8);
               /* NOTE(review): unlike the 32-bit Altivec blitters in this file,
                  this initial load is not guarded by a width > 0 check --
                  confirm it cannot touch memory past the source row. */
   857         valigner = VEC_ALIGNER(src);
   858         vsrc = (vector unsigned char)vec_ld(0, src);
   859         width -= extrawidth;
   860         while (width) {
   861             vector unsigned char valpha;
   862             vector unsigned char vsrc1, vsrc2;
   863             vector unsigned char vdst1, vdst2;
   864             vector unsigned short vR, vG, vB;
   865             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   866 
   867             /* Load 8 pixels from src as ARGB */
   868             voverflow = (vector unsigned char)vec_ld(15, src);
   869             vsrc = vec_perm(vsrc, voverflow, valigner);
   870             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
   871             src += 16;
                   /* vsrc now holds the next raw block and is carried into the
                      following loop iteration */
   872             vsrc = (vector unsigned char)vec_ld(15, src);
   873             voverflow = vec_perm(voverflow, vsrc, valigner);
   874             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
   875             src += 16;
   876 
   877             /* Load 8 pixels from dst as XRGB */
   878             voverflow = vec_ld(0, dst);
   879             vR = vec_and((vector unsigned short)voverflow, vf800);
   880             vB = vec_sl((vector unsigned short)voverflow, v3_16);
                   /* NOTE(review): the high byte of vG (pixel << 5) still has the
                      two top blue bits in its low bits, whereas the scalar path
                      masks green with 0xfc -- looks like a tolerated
                      approximation; confirm. */
   881             vG = vec_sl(vB, v2_16);
   882             vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
   883             vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
   884             vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
   885             vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
   886             vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
   887             vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
   888 
   889             /* Alpha blend 8 pixels as ARGB */
   890             valpha = vec_perm(vsrc1, v0, valphaPermute);
   891             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
   892             valpha = vec_perm(vsrc2, v0, valphaPermute);
   893             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
   894 
   895             /* Convert 8 pixels to 565 */
                   /* vec_packpx yields 1-5-5-5; red is shifted up by one bit,
                      green is rebuilt from the full 8-bit value to get its
                      sixth bit, and blue is taken from the low five bits */
   896             vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
   897             vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
   898             vgpixel = vec_and(vgpixel, vfc);
   899             vgpixel = vec_sl(vgpixel, v3_16);
   900             vrpixel = vec_sl(vpixel, v1_16);
   901             vrpixel = vec_and(vrpixel, vf800);
   902             vbpixel = vec_and(vpixel, v1f);
   903             vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
   904             vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
   905             
   906             /* Store 8 pixels */
   907             vec_st(vdst1, 0, dst);
   908 
   909             width -= 8;
   910             dst += 16;
   911         }
   912         ONE_PIXEL_BLEND((extrawidth), extrawidth);
   913 #undef ONE_PIXEL_BLEND
   914         src += srcskip;
   915         dst += dstskip;
   916     }
   917 }
   918 
   919 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
   920 {
           /*
            * Blend a 32-bit source that has both a colorkey and a per-surface
            * alpha onto a 32-bit destination.  Source pixels whose RGB bits
            * equal the colorkey are skipped by the scalar path; in the vector
            * path they are replaced by the original destination value before
            * the final permute to the destination layout.
            *
            * Each row runs a scalar loop while UNALIGNED_PTR(dstp) holds, an
            * Altivec loop over groups of four pixels, then a scalar loop over
            * the remaining (width % 4) pixels.
            */
   921     unsigned alpha = info->src->alpha;
   922     int height = info->d_height;
   923     Uint32 *srcp = (Uint32 *)info->s_pixels;
           /* skips are converted from bytes to 32-bit pixels */
   924     int srcskip = info->s_skip >> 2;
   925     Uint32 *dstp = (Uint32 *)info->d_pixels;
   926     int dstskip = info->d_skip >> 2;
   927     SDL_PixelFormat *srcfmt = info->src;
   928     SDL_PixelFormat *dstfmt = info->dst;
   929     unsigned sA = srcfmt->alpha;
   930     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   931     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   932     Uint32 ckey = info->src->colorkey;
   933     vector unsigned char mergePermute;
   934     vector unsigned char vsrcPermute;
   935     vector unsigned char vdstPermute;
   936     vector unsigned char vsdstPermute;
   937     vector unsigned char valpha;
   938     vector unsigned char valphamask;
   939     vector unsigned char vbits;
   940     vector unsigned char v0;
   941     vector unsigned short v1;
   942     vector unsigned short v8;
   943     vector unsigned int vckey;
   944     vector unsigned int vrgbmask;
   945 
   946     mergePermute = VEC_MERGE_PERMUTE();
   947     v0 = vec_splat_u8(0);
   948     v1 = vec_splat_u16(1);
   949     v8 = vec_splat_u16(8);
   950 
   951     /* set the alpha to 255 on the destination surf */
   952     valphamask = VEC_ALPHA_MASK();
   953 
   954     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   955     vdstPermute = calc_swizzle32(NULL, dstfmt);
   956     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   957 
   958     /* splat the per-surface alpha into every byte of valpha */
   959     ((unsigned char *)&valpha)[0] = alpha;
   960     valpha = vec_splat(valpha, 0);
   961     vbits = (vector unsigned char)vec_splat_s8(-1);
   962 
           /* only the RGB bits take part in the key comparison; the same mask
              is applied to the pixels in both the scalar and vector paths */
   963     ckey &= rgbmask;
   964     ((unsigned int *)(char*)&vckey)[0] = ckey;
   965     vckey = vec_splat(vckey, 0);
   966     ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
   967     vrgbmask = vec_splat(vrgbmask, 0);
   968 
   969     while(height--) {
   970         int width = info->d_width;
               /* Scalar blend of one pixel per iteration while `condition`
                  holds.  The pixel is masked with rgbmask before the key test
                  so that bits outside the RGB channels cannot defeat the
                  colorkey, matching the vector path's vrgbmask compare. */
   971 #define ONE_PIXEL_BLEND(condition, widthvar) \
   972         while (condition) { \
   973             Uint32 Pixel; \
   974             unsigned sR, sG, sB, dR, dG, dB; \
   975             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
   976             if(sA && (Pixel & rgbmask) != ckey) { \
   977                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
   978                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
   979                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   980                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
   981             } \
   982             dstp++; \
   983             srcp++; \
   984             widthvar--; \
   985         }
   986         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   987         if (width > 0) {
   988             int extrawidth = (width % 4);
   989             vector unsigned char valigner = VEC_ALIGNER(srcp);
   990             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   991             width -= extrawidth;
   992             while (width) {
   993                 vector unsigned char vsel;
   994                 vector unsigned char voverflow;
   995                 vector unsigned char vd;
   996                 vector unsigned char vd_orig;
   997 
   998                 /* s = *srcp */
   999                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1000                 vs = vec_perm(vs, voverflow, valigner);
  1001                 
  1002                 /* vsel is set for items that match the key */
  1003                 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
  1004                 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
  1005 
  1006                 /* permute to source format */
  1007                 vs = vec_perm(vs, valpha, vsrcPermute);
  1008 
  1009                 /* d = *dstp */
  1010                 vd = (vector unsigned char)vec_ld(0, dstp);
  1011                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
  1012 
  1013                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1014 
  1015                 /* set the alpha channel to full on */
  1016                 vd = vec_or(vd, valphamask);
  1017 
  1018                 /* mask out color key */
  1019                 vd = vec_sel(vd, vd_orig, vsel);
  1020                 
  1021                 /* permute to dest format */
  1022                 vd = vec_perm(vd, vbits, vdstPermute);
  1023 
  1024                 /* *dstp = res */
  1025                 vec_st((vector unsigned int)vd, 0, dstp);
  1026                 
  1027                 srcp += 4;
  1028                 dstp += 4;
  1029                 width -= 4;
                       /* the raw block just loaded becomes the first operand of
                          the next iteration's alignment permute */
  1030                 vs = voverflow;
  1031             }
  1032             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1033         }
  1034 #undef ONE_PIXEL_BLEND
  1035  
  1036         srcp += srcskip;
  1037         dstp += dstskip;
  1038     }
  1039 }
  1040 
  1041 
  1042 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
  1043 {
           /*
            * Blend 32-bit source pixels with per-pixel alpha onto a 32-bit
            * destination.  Source and destination vectors are rearranged with
            * the calc_swizzle32() permute tables before blending, and the
            * result is permuted back to the destination layout.  The
            * destination's own alpha is kept: the vector path saves it in
            * vdstalpha and ORs it back, the scalar path re-assembles with dA.
            *
            * Each row runs a scalar loop while UNALIGNED_PTR(dstp) holds, an
            * Altivec loop over groups of four pixels, then a scalar loop over
            * the remaining (width % 4) pixels.
            */
  1044     int width = info->d_width;
  1045     int height = info->d_height;
  1046     Uint32 *srcp = (Uint32 *)info->s_pixels;
           /* skips are converted from bytes to 32-bit pixels */
  1047     int srcskip = info->s_skip >> 2;
  1048     Uint32 *dstp = (Uint32 *)info->d_pixels;
  1049     int dstskip = info->d_skip >> 2;
  1050     SDL_PixelFormat *srcfmt = info->src;
  1051     SDL_PixelFormat *dstfmt = info->dst;
  1052     vector unsigned char mergePermute;
  1053     vector unsigned char valphaPermute;
  1054     vector unsigned char vsrcPermute;
  1055     vector unsigned char vdstPermute;
  1056     vector unsigned char vsdstPermute;
  1057     vector unsigned char valphamask;
  1058     vector unsigned char vpixelmask;
  1059     vector unsigned char v0;
  1060     vector unsigned short v1;
  1061     vector unsigned short v8;
  1062 
  1063     v0 = vec_splat_u8(0);
  1064     v1 = vec_splat_u16(1);
  1065     v8 = vec_splat_u16(8);
  1066     mergePermute = VEC_MERGE_PERMUTE();
  1067     valphamask = VEC_ALPHA_MASK();
           /* byte indices 0,0,0,0,4,4,4,4,...: replicates the first byte of each
              32-bit pixel across the whole pixel */
  1068     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
           /* complement of the alpha mask: selects the colour bytes */
  1069     vpixelmask = vec_nor(valphamask, v0);
  1070     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1071     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1072     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1073 
  1074 	while ( height-- ) {
  1075         width = info->d_width;
               /* Scalar blend of one pixel per iteration while `condition`
                  holds; source pixels with zero alpha leave dstp untouched. */
  1076 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1077             Uint32 Pixel; \
  1078             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
  1079             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
  1080             if(sA) { \
  1081               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
  1082               ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1083               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
  1084             } \
  1085             ++srcp; \
  1086             ++dstp; \
  1087             widthvar--; \
  1088         }
  1089         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1090         if (width > 0) {
  1091             /* vsrcPermute */
  1092             /* vdstPermute */
  1093             int extrawidth = (width % 4);
  1094             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1095             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1096             width -= extrawidth;
  1097             while (width) {
  1098                 vector unsigned char voverflow;
  1099                 vector unsigned char vd;
  1100                 vector unsigned char valpha;
  1101                 vector unsigned char vdstalpha;
  1102                 /* s = *srcp */
  1103                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1104                 vs = vec_perm(vs, voverflow, valigner);
  1105                 vs = vec_perm(vs, v0, vsrcPermute);
  1106 
                       /* per-pixel alpha replicated over each pixel's four bytes */
  1107                 valpha = vec_perm(vs, v0, valphaPermute);
  1108                 
  1109                 /* d = *dstp */
  1110                 vd = (vector unsigned char)vec_ld(0, dstp);
  1111                 vd = vec_perm(vd, v0, vsdstPermute);
                       /* remember the destination alpha so it survives the blend */
  1112                 vdstalpha = vec_and(vd, valphamask);
  1113 
  1114                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1115 
  1116                 /* set the alpha to the dest alpha */
  1117                 vd = vec_and(vd, vpixelmask);
  1118                 vd = vec_or(vd, vdstalpha);
  1119                 vd = vec_perm(vd, v0, vdstPermute);
  1120 
  1121                 /* *dstp = res */
  1122                 vec_st((vector unsigned int)vd, 0, dstp);
  1123                 
  1124                 srcp += 4;
  1125                 dstp += 4;
  1126                 width -= 4;
                       /* the raw block just loaded becomes the first operand of
                          the next iteration's alignment permute */
  1127                 vs = voverflow;
  1128 
  1129             }
  1130             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1131         }
  1132 	    srcp += srcskip;
  1133 	    dstp += dstskip;
  1134 #undef ONE_PIXEL_BLEND
  1135 	}
  1136 }
  1137 
  1138 /* fast ARGB888->(A)RGB888 blending with pixel alpha (Altivec version).
          No channel permutation is performed: the scalar path reads alpha
          from bits 24-31 of each source pixel and the vector path blends the
          loaded vectors as they are.  The destination alpha byte is kept.
          Each row runs a scalar loop while UNALIGNED_PTR(dstp) holds, an
          Altivec loop over groups of four pixels, then a scalar loop over the
          remaining (width % 4) pixels. */
  1139 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
  1140 {
  1141 	int width = info->d_width;
  1142 	int height = info->d_height;
  1143 	Uint32 *srcp = (Uint32 *)info->s_pixels;
       	/* skips are converted from bytes to 32-bit pixels */
  1144 	int srcskip = info->s_skip >> 2;
  1145 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1146 	int dstskip = info->d_skip >> 2;
  1147     vector unsigned char mergePermute;
  1148     vector unsigned char valphaPermute;
  1149     vector unsigned char valphamask;
  1150     vector unsigned char vpixelmask;
  1151     vector unsigned char v0;
  1152     vector unsigned short v1;
  1153     vector unsigned short v8;
  1154     v0 = vec_splat_u8(0);
  1155     v1 = vec_splat_u16(1);
  1156     v8 = vec_splat_u16(8);
  1157     mergePermute = VEC_MERGE_PERMUTE();
  1158     valphamask = VEC_ALPHA_MASK();
           /* byte indices 0,0,0,0,4,4,4,4,...: replicates the first byte of each
              32-bit pixel across the whole pixel */
  1159     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
  1160     
  1161  
           /* complement of the alpha mask: selects the colour bytes */
  1162     vpixelmask = vec_nor(valphamask, v0);
  1163 	while(height--) {
  1164         width = info->d_width;
               /* Scalar blend of one pixel per iteration while `condition`
                  holds.  Alpha 0 leaves dstp untouched, opaque alpha copies
                  the source RGB, and anything else blends red/blue together
                  in one multiply and green in a second one using >> 8.  The
                  destination's top byte is preserved in every case. */
  1165 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1166         while ((condition)) { \
  1167             Uint32 dalpha; \
  1168             Uint32 d; \
  1169             Uint32 s1; \
  1170             Uint32 d1; \
  1171             Uint32 s = *srcp; \
  1172             Uint32 alpha = s >> 24; \
  1173             if(alpha) { \
  1174               if(alpha == SDL_ALPHA_OPAQUE) { \
  1175                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
  1176               } else { \
  1177                 d = *dstp; \
  1178                 dalpha = d & 0xff000000; \
  1179                 s1 = s & 0xff00ff; \
  1180                 d1 = d & 0xff00ff; \
  1181                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
  1182                 s &= 0xff00; \
  1183                 d &= 0xff00; \
  1184                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1185                 *dstp = d1 | d | dalpha; \
  1186               } \
  1187             } \
  1188             ++srcp; \
  1189             ++dstp; \
  1190             widthvar--; \
  1191 	    }
  1192         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1193         if (width > 0) {
  1194             int extrawidth = (width % 4);
  1195             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1196             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1197             width -= extrawidth;
  1198             while (width) {
  1199                 vector unsigned char voverflow;
  1200                 vector unsigned char vd;
  1201                 vector unsigned char valpha;
  1202                 vector unsigned char vdstalpha;
  1203                 /* s = *srcp */
  1204                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1205                 vs = vec_perm(vs, voverflow, valigner);
  1206 
                       /* per-pixel alpha replicated over each pixel's four bytes */
  1207                 valpha = vec_perm(vs, v0, valphaPermute);
  1208                 
  1209                 /* d = *dstp */
  1210                 vd = (vector unsigned char)vec_ld(0, dstp);
                       /* remember the destination alpha so it survives the blend */
  1211                 vdstalpha = vec_and(vd, valphamask);
  1212 
  1213                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1214 
  1215                 /* set the alpha to the dest alpha */
  1216                 vd = vec_and(vd, vpixelmask);
  1217                 vd = vec_or(vd, vdstalpha);
  1218 
  1219                 /* *dstp = res */
  1220                 vec_st((vector unsigned int)vd, 0, dstp);
  1221                 
  1222                 srcp += 4;
  1223                 dstp += 4;
  1224                 width -= 4;
                       /* the raw block just loaded becomes the first operand of
                          the next iteration's alignment permute */
  1225                 vs = voverflow;
  1226             }
  1227             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1228         }
  1229 	    srcp += srcskip;
  1230 	    dstp += dstskip;
  1231 	}
  1232 #undef ONE_PIXEL_BLEND
  1233 }
  1234 
  1235 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
  1236 {
           /*
            * Blend a 32-bit source onto a 32-bit destination using a single
            * per-surface alpha value (info->src->alpha) for every pixel.
            * Source and destination vectors are rearranged with the
            * calc_swizzle32() permute tables before blending and permuted back
            * to the destination layout afterwards.
            *
            * Each row runs a scalar loop while UNALIGNED_PTR(dstp) holds, an
            * Altivec loop over groups of four pixels, then a scalar loop over
            * the remaining (width % 4) pixels.
            */
  1237     /* XXX : 6 */
  1238 	unsigned alpha = info->src->alpha;
  1239     int height = info->d_height;
  1240     Uint32 *srcp = (Uint32 *)info->s_pixels;
           /* skips are converted from bytes to 32-bit pixels */
  1241     int srcskip = info->s_skip >> 2;
  1242     Uint32 *dstp = (Uint32 *)info->d_pixels;
  1243     int dstskip = info->d_skip >> 2;
  1244     SDL_PixelFormat *srcfmt = info->src;
  1245     SDL_PixelFormat *dstfmt = info->dst;
       	/* sA holds the same per-surface alpha as `alpha`; it feeds the scalar
       	   path, `alpha` feeds the vector path */
  1246 	unsigned sA = srcfmt->alpha;
  1247 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1248     vector unsigned char mergePermute;
  1249     vector unsigned char vsrcPermute;
  1250     vector unsigned char vdstPermute;
  1251     vector unsigned char vsdstPermute;
  1252     vector unsigned char valpha;
  1253     vector unsigned char valphamask;
  1254     vector unsigned char vbits;
  1255     vector unsigned short v1;
  1256     vector unsigned short v8;
  1257 
  1258     mergePermute = VEC_MERGE_PERMUTE();
  1259     v1 = vec_splat_u16(1);
  1260     v8 = vec_splat_u16(8);
  1261 
  1262     /* set the alpha to 255 on the destination surf */
  1263     valphamask = VEC_ALPHA_MASK();
  1264 
  1265     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1266     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1267     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1268 
  1269     /* splat the per-surface alpha into every byte of valpha */
  1270     ((unsigned char *)&valpha)[0] = alpha;
  1271     valpha = vec_splat(valpha, 0);
           /* all-ones vector, used as the second operand of the final permute */
  1272     vbits = (vector unsigned char)vec_splat_s8(-1);
  1273 
  1274     while(height--) {
  1275         int width = info->d_width;
               /* Scalar blend of one pixel per iteration while `condition`
                  holds; the result is written with alpha dA. */
  1276 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1277             Uint32 Pixel; \
  1278             unsigned sR, sG, sB, dR, dG, dB; \
  1279             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  1280             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1281             ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1282             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1283             ++srcp; \
  1284             ++dstp; \
  1285             widthvar--; \
  1286         }
  1287         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1288         if (width > 0) {
  1289             int extrawidth = (width % 4);
  1290             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1291             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1292             width -= extrawidth;
  1293             while (width) {
  1294                 vector unsigned char voverflow;
  1295                 vector unsigned char vd;
  1296 
  1297                 /* s = *srcp */
  1298                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1299                 vs = vec_perm(vs, voverflow, valigner);
  1300                 vs = vec_perm(vs, valpha, vsrcPermute);
  1301                 
  1302                 /* d = *dstp */
  1303                 vd = (vector unsigned char)vec_ld(0, dstp);
  1304                 vd = vec_perm(vd, vd, vsdstPermute);
  1305 
  1306                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1307 
  1308                 /* set the alpha channel to full on */
  1309                 vd = vec_or(vd, valphamask);
  1310                 vd = vec_perm(vd, vbits, vdstPermute);
  1311 
  1312                 /* *dstp = res */
  1313                 vec_st((vector unsigned int)vd, 0, dstp);
  1314                 
  1315                 srcp += 4;
  1316                 dstp += 4;
  1317                 width -= 4;
                       /* the raw block just loaded becomes the first operand of
                          the next iteration's alignment permute */
  1318                 vs = voverflow;
  1319             }
  1320             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1321         }
  1322 #undef ONE_PIXEL_BLEND
  1323  
  1324         srcp += srcskip;
  1325         dstp += dstskip;
  1326     }
  1327 
  1328 }
  1329 
  1330 
  1331 /* fast RGB888->(A)RGB888 blending (Altivec version).
          Blends every pixel with the single per-surface alpha taken from
          info->src->alpha and forces the top byte of each result to 0xff.
          No channel permutation is performed.  Each row runs a scalar loop
          while UNALIGNED_PTR(dstp) holds, an Altivec loop over groups of four
          pixels, then a scalar loop over the remaining (width % 4) pixels. */
  1332 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
  1333 {
  1334 	unsigned alpha = info->src->alpha;
  1335     int height = info->d_height;
  1336     Uint32 *srcp = (Uint32 *)info->s_pixels;
           /* skips are converted from bytes to 32-bit pixels */
  1337     int srcskip = info->s_skip >> 2;
  1338     Uint32 *dstp = (Uint32 *)info->d_pixels;
  1339     int dstskip = info->d_skip >> 2;
  1340     vector unsigned char mergePermute;
  1341     vector unsigned char valpha;
  1342     vector unsigned char valphamask;
  1343     vector unsigned short v1;
  1344     vector unsigned short v8;
  1345 
  1346     mergePermute = VEC_MERGE_PERMUTE();
  1347     v1 = vec_splat_u16(1);
  1348     v8 = vec_splat_u16(8);
  1349 
  1350     /* set the alpha to 255 on the destination surf */
  1351     valphamask = VEC_ALPHA_MASK();
  1352 
  1353     /* splat the per-surface alpha into every byte of valpha */
  1354     ((unsigned char *)&valpha)[0] = alpha;
  1355     valpha = vec_splat(valpha, 0);
  1356 
  1357     while(height--) {
  1358         int width = info->d_width;
               /* Scalar blend of one pixel per iteration while `condition`
                  holds: red and blue are blended together in one multiply
                  (mask 0xff00ff), green in a second one, both using >> 8. */
  1359 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1360             Uint32 s = *srcp; \
  1361             Uint32 d = *dstp; \
  1362             Uint32 s1 = s & 0xff00ff; \
  1363             Uint32 d1 = d & 0xff00ff; \
  1364             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  1365                  & 0xff00ff; \
  1366             s &= 0xff00; \
  1367             d &= 0xff00; \
  1368             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1369             *dstp = d1 | d | 0xff000000; \
  1370             ++srcp; \
  1371             ++dstp; \
  1372             widthvar--; \
  1373         }
  1374         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1375         if (width > 0) {
  1376             int extrawidth = (width % 4);
  1377             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1378             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1379             width -= extrawidth;
  1380             while (width) {
  1381                 vector unsigned char voverflow;
  1382                 vector unsigned char vd;
  1383 
  1384                 /* s = *srcp */
  1385                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1386                 vs = vec_perm(vs, voverflow, valigner);
  1387                 
  1388                 /* d = *dstp */
  1389                 vd = (vector unsigned char)vec_ld(0, dstp);
  1390 
  1391                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1392 
  1393                 /* set the alpha channel to full on */
  1394                 vd = vec_or(vd, valphamask);
  1395 
  1396                 /* *dstp = res */
  1397                 vec_st((vector unsigned int)vd, 0, dstp);
  1398                 
  1399                 srcp += 4;
  1400                 dstp += 4;
  1401                 width -= 4;
                       /* the raw block just loaded becomes the first operand of
                          the next iteration's alignment permute */
  1402                 vs = voverflow;
  1403             }
  1404             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1405         }
  1406 #undef ONE_PIXEL_BLEND
  1407  
  1408         srcp += srcskip;
  1409         dstp += dstskip;
  1410     }
  1411 }
  1412 #if __MWERKS__
  1413 #pragma altivec_model off
  1414 #endif
  1415 #endif /* SDL_ALTIVEC_BLITTERS */
  1416 
  1417 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
          Every destination pixel becomes the per-channel average of source
          and destination, with the top byte forced to 0xff. */
  1418 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
  1419 {
  1420 	int width = info->d_width;
  1421 	int height = info->d_height;
  1422 	Uint32 *srcp = (Uint32 *)info->s_pixels;
       	/* skips are converted from bytes to 32-bit pixels */
  1423 	int srcskip = info->s_skip >> 2;
  1424 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1425 	int dstskip = info->d_skip >> 2;
  1426 
  1427 	while(height--) {
  1428 	    DUFFS_LOOP4({
       		    /* Clearing each channel's low bit (0x00fefefe) before the
       		       add keeps the >> 1 from leaking a bit into the channel
       		       below; (s & d & 0x00010101) adds back the carry that is
       		       lost when both low bits are set. */
  1429 		    Uint32 s = *srcp++;
  1430 		    Uint32 d = *dstp;
  1431 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1432 			       + (s & d & 0x00010101)) | 0xff000000;
  1433 	    }, width);
  1434 	    srcp += srcskip;
  1435 	    dstp += dstskip;
  1436 	}
  1437 }
  1438 
  1439 /* fast RGB888->(A)RGB888 blending with surface alpha.
          Alpha 128 is delegated to BlitRGBtoRGBSurfaceAlpha128(); any other
          value blends with d + ((s - d) * alpha >> 8) per channel and forces
          the top byte of each result to 0xff. */
  1440 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
  1441 {
  1442 	unsigned alpha = info->src->alpha;
  1443 	if(alpha == 128) {
  1444 		BlitRGBtoRGBSurfaceAlpha128(info);
  1445 	} else {
  1446 		int width = info->d_width;
  1447 		int height = info->d_height;
  1448 		Uint32 *srcp = (Uint32 *)info->s_pixels;
       		/* skips are converted from bytes to 32-bit pixels */
  1449 		int srcskip = info->s_skip >> 2;
  1450 		Uint32 *dstp = (Uint32 *)info->d_pixels;
  1451 		int dstskip = info->d_skip >> 2;
  1452 		Uint32 s;
  1453 		Uint32 d;
  1454 		Uint32 s1;
  1455 		Uint32 d1;
  1456 
  1457 		while(height--) {
  1458 			DUFFS_LOOP_DOUBLE2({
  1459 				/* One Pixel Blend */
       				/* red and blue share one multiply (mask 0xff00ff),
       				   green gets its own */
  1460 				s = *srcp;
  1461 				d = *dstp;
  1462 				s1 = s & 0xff00ff;
  1463 				d1 = d & 0xff00ff;
  1464 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
  1465 				     & 0xff00ff;
  1466 				s &= 0xff00;
  1467 				d &= 0xff00;
  1468 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1469 				*dstp = d1 | d | 0xff000000;
  1470 				++srcp;
  1471 				++dstp;
  1472 			},{
  1473 			        /* Two Pixels Blend */
       				/* red/blue of the first pixel */
  1474 				s = *srcp;
  1475 				d = *dstp;
  1476 				s1 = s & 0xff00ff;
  1477 				d1 = d & 0xff00ff;
  1478 				d1 += (s1 - d1) * alpha >> 8;
  1479 				d1 &= 0xff00ff;
  1480 				     
       				/* pack the greens of both pixels into one word (first
       				   pixel in bits 0-7, second in bits 16-23) so a single
       				   multiply blends them together */
  1481 				s = ((s & 0xff00) >> 8) | 
  1482 					((srcp[1] & 0xff00) << 8);
  1483 				d = ((d & 0xff00) >> 8) |
  1484 					((dstp[1] & 0xff00) << 8);
  1485 				d += (s - d) * alpha >> 8;
  1486 				d &= 0x00ff00ff;
  1487 				
       				/* first pixel: its green comes from the low half of d */
  1488 				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
  1489 				++srcp;
  1490 				
       				/* red/blue of the second pixel */
  1491 			        s1 = *srcp;
  1492 				d1 = *dstp;
  1493 				s1 &= 0xff00ff;
  1494 				d1 &= 0xff00ff;
  1495 				d1 += (s1 - d1) * alpha >> 8;
  1496 				d1 &= 0xff00ff;
  1497 				
       				/* second pixel: its green comes from the high half of d */
  1498 				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
  1499 				++srcp;
  1500 				++dstp;
  1501 			}, width);
  1502 			srcp += srcskip;
  1503 			dstp += dstskip;
  1504 		}
  1505 	}
  1506 }
  1507 
  1508 /* fast ARGB888->(A)RGB888 blending with pixel alpha.
          The alpha is read from bits 24-31 of each source pixel; the top byte
          of the destination pixel is preserved. */
  1509 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
  1510 {
  1511 	int width = info->d_width;
  1512 	int height = info->d_height;
  1513 	Uint32 *srcp = (Uint32 *)info->s_pixels;
       	/* skips are converted from bytes to 32-bit pixels */
  1514 	int srcskip = info->s_skip >> 2;
  1515 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1516 	int dstskip = info->d_skip >> 2;
  1517 
  1518 	while(height--) {
  1519 	    DUFFS_LOOP4({
  1520 		Uint32 dalpha;
  1521 		Uint32 d;
  1522 		Uint32 s1;
  1523 		Uint32 d1;
  1524 		Uint32 s = *srcp;
  1525 		Uint32 alpha = s >> 24;
  1526 		/* FIXME: Here we special-case opaque alpha since the
  1527 		   compositing used (>>8 instead of /255) doesn't handle
  1528 		   it correctly. Also special-case alpha=0 for speed?
  1529 		   Benchmark this! */
  1530 		if(alpha) {   
  1531 		  if(alpha == SDL_ALPHA_OPAQUE) {
       		    /* opaque: copy the source RGB, keep the destination's top byte */
  1532 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1533 		  } else {
  1534 		    /*
  1535 		     * take out the middle component (green), and process
  1536 		     * the other two in parallel. One multiply less.
  1537 		     */
  1538 		    d = *dstp;
  1539 		    dalpha = d & 0xff000000;
  1540 		    s1 = s & 0xff00ff;
  1541 		    d1 = d & 0xff00ff;
  1542 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1543 		    s &= 0xff00;
  1544 		    d &= 0xff00;
  1545 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1546 		    *dstp = d1 | d | dalpha;
  1547 		  }
  1548 		}
  1549 		++srcp;
  1550 		++dstp;
  1551 	    }, width);
  1552 	    srcp += srcskip;
  1553 	    dstp += dstskip;
  1554 	}
  1555 }
  1556 
  1557 #if GCC_ASMBLIT
  1558 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1559 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
  1560 {
  1561 	int width = info->d_width;
  1562 	int height = info->d_height;
  1563 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1564 	int srcskip = info->s_skip >> 2;
  1565 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1566 	int dstskip = info->d_skip >> 2;
  1567 	SDL_PixelFormat* sf = info->src;
  1568 	Uint32 amask = sf->Amask;
  1569 	Uint32 ashift = sf->Ashift;
  1570 
  1571 	__asm__ (
  1572 	/* make mm6 all zeros. */
  1573 	"pxor       %%mm6, %%mm6\n"
  1574 	
  1575 	/* Make a mask to preserve the alpha. */
  1576 	"movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
  1577 	"punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
  1578 	"pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
  1579 	"movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
  1580 	"pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
  1581 
  1582 	/* form channel masks */
  1583 	"movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
  1584 	"packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
  1585 	"packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
  1586 	"pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
  1587 	
  1588 	/* get alpha channel shift */
  1589 	"movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
  1590 
  1591 	  : /* nothing */ : "m" (amask), "m" (ashift) );
  1592 
  1593 	while(height--) {
  1594 
  1595 	    DUFFS_LOOP4({
  1596 		Uint32 alpha;
  1597 
  1598 		__asm__ (
  1599 		"prefetch 64(%0)\n"
  1600 		"prefetch 64(%1)\n"
  1601 			: : "r" (srcp), "r" (dstp) );
  1602 
  1603 		alpha = *srcp & amask;
  1604 		/* FIXME: Here we special-case opaque alpha since the
  1605 		   compositioning used (>>8 instead of /255) doesn't handle
  1606 		   it correctly. Also special-case alpha=0 for speed?
  1607 		   Benchmark this! */
  1608 		if(alpha == 0) {
  1609 		    /* do nothing */
  1610 		}
  1611 		else if(alpha == amask) {
  1612 			/* opaque alpha -- copy RGB, keep dst alpha */
  1613 		    /* using MMX here to free up regular registers for other things */
  1614 			    __asm__ (
  1615 		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
  1616 		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
  1617 		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
  1618 		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
  1619 		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
  1620 		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
  1621 
  1622 		     : : "r" (srcp), "r" (dstp) );
  1623 		} 
  1624 
  1625 		else {
  1626 			    __asm__ (
  1627 		    /* load in the source, and dst. */
  1628 		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
  1629 		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
  1630 
  1631 		    /* Move the src alpha into mm2 */
  1632 
  1633 		    /* if supporting pshufw */
  1634 		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
  1635 		    /*"psrlw     $8, %%mm2\n" */
  1636 		    
  1637 		    /* else: */
  1638 		    "movd       %2,    %%mm2\n"
  1639 		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
  1640 		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
  1641 		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
  1642 		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
  1643 
  1644 		    /* move the colors into words. */
  1645 		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
  1646 		    "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
  1647 
  1648 		    /* src - dst */
  1649 		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
  1650 
  1651 		    /* A * (src-dst) */
  1652 		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
  1653 		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
  1654 		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
  1655 
  1656 		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
  1657 		    
  1658 		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
  1659 
  1660 		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
  1661 
  1662 		}
  1663 		++srcp;
  1664 		++dstp;
  1665 	    }, width);
  1666 	    srcp += srcskip;
  1667 	    dstp += dstskip;
  1668 	}
  1669 
  1670 	__asm__ (
  1671 	"emms\n"
  1672 		:   );
  1673 }
  1674 /* End GCC_ASMBLIT*/
  1675 
  1676 #elif MSVC_ASMBLIT
  1677 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1678 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
  1679 {
  1680 	int width = info->d_width;
  1681 	int height = info->d_height;
  1682 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1683 	int srcskip = info->s_skip >> 2;
  1684 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1685 	int dstskip = info->d_skip >> 2;
  1686 	SDL_PixelFormat* sf = info->src;
  1687 	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  1688 	Uint32 amask = sf->Amask;
  1689 	Uint32 ashift = sf->Ashift;
  1690 	Uint64 multmask;
  1691 	
  1692 	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1693 
  1694 	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
  1695 	multmask = ~(0xFFFFi64 << (ashift * 2));
  1696 	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
  1697 
  1698 	while(height--) {
  1699 	    DUFFS_LOOP4({
  1700 		Uint32 alpha;
  1701 
  1702 		_m_prefetch(srcp + 16);
  1703 		_m_prefetch(dstp + 16);
  1704 
  1705 		alpha = *srcp & amask;
  1706 		if (alpha == 0) {
  1707 			/* do nothing */
  1708 		} else if (alpha == amask) {
  1709 			/* copy RGB, keep dst alpha */
  1710 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  1711 		} else {
  1712 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  1713 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  1714 
  1715 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  1716 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  1717 
  1718 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  1719 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  1720 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  1721 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  1722 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  1723 
  1724 			/* blend */		    
  1725 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  1726 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  1727 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  1728 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  1729 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  1730 			
  1731 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  1732 		}
  1733 		++srcp;
  1734 		++dstp;
  1735 	    }, width);
  1736 	    srcp += srcskip;
  1737 	    dstp += dstskip;
  1738 	}
  1739 	_mm_empty();
  1740 }
  1741 /* End MSVC_ASMBLIT */
  1742 
  1743 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  1744 
  1745 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  1746 
  1747 /* blend a single 16 bit pixel at 50% */
  1748 #define BLEND16_50(d, s, mask)						\
  1749 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
  1750 
  1751 /* blend two 16 bit pixels at 50% */
  1752 #define BLEND2x16_50(d, s, mask)					     \
  1753 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1754 	 + (s & d & (~(mask | mask << 16))))
  1755 
  1756 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
  1757 {
  1758 	int width = info->d_width;
  1759 	int height = info->d_height;
  1760 	Uint16 *srcp = (Uint16 *)info->s_pixels;
  1761 	int srcskip = info->s_skip >> 1;
  1762 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  1763 	int dstskip = info->d_skip >> 1;
  1764 
  1765 	while(height--) {
  1766 		if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
  1767 			/*
  1768 			 * Source and destination not aligned, pipeline it.
  1769 			 * This is mostly a win for big blits but no loss for
  1770 			 * small ones
  1771 			 */
  1772 			Uint32 prev_sw;
  1773 			int w = width;
  1774 
  1775 			/* handle odd destination */
  1776 			if((uintptr_t)dstp & 2) {
  1777 				Uint16 d = *dstp, s = *srcp;
  1778 				*dstp = BLEND16_50(d, s, mask);
  1779 				dstp++;
  1780 				srcp++;
  1781 				w--;
  1782 			}
  1783 			srcp++;	/* srcp is now 32-bit aligned */
  1784 
  1785 			/* bootstrap pipeline with first halfword */
  1786 			prev_sw = ((Uint32 *)srcp)[-1];
  1787 
  1788 			while(w > 1) {
  1789 				Uint32 sw, dw, s;
  1790 				sw = *(Uint32 *)srcp;
  1791 				dw = *(Uint32 *)dstp;
  1792 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1793 				s = (prev_sw << 16) + (sw >> 16);
  1794 #else
  1795 				s = (prev_sw >> 16) + (sw << 16);
  1796 #endif
  1797 				prev_sw = sw;
  1798 				*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
  1799 				dstp += 2;
  1800 				srcp += 2;
  1801 				w -= 2;
  1802 			}
  1803 
  1804 			/* final pixel if any */
  1805 			if(w) {
  1806 				Uint16 d = *dstp, s;
  1807 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1808 				s = (Uint16)prev_sw;
  1809 #else
  1810 				s = (Uint16)(prev_sw >> 16);
  1811 #endif
  1812 				*dstp = BLEND16_50(d, s, mask);
  1813 				srcp++;
  1814 				dstp++;
  1815 			}
  1816 			srcp += srcskip - 1;
  1817 			dstp += dstskip;
  1818 		} else {
  1819 			/* source and destination are aligned */
  1820 			int w = width;
  1821 
  1822 			/* first odd pixel? */
  1823 			if((uintptr_t)srcp & 2) {
  1824 				Uint16 d = *dstp, s = *srcp;
  1825 				*dstp = BLEND16_50(d, s, mask);
  1826 				srcp++;
  1827 				dstp++;
  1828 				w--;
  1829 			}
  1830 			/* srcp and dstp are now 32-bit aligned */
  1831 
  1832 			while(w > 1) {
  1833 				Uint32 sw = *(Uint32 *)srcp;
  1834 				Uint32 dw = *(Uint32 *)dstp;
  1835 				*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
  1836 				srcp += 2;
  1837 				dstp += 2;
  1838 				w -= 2;
  1839 			}
  1840 
  1841 			/* last odd pixel? */
  1842 			if(w) {
  1843 				Uint16 d = *dstp, s = *srcp;
  1844 				*dstp = BLEND16_50(d, s, mask);
  1845 				srcp++;
  1846 				dstp++;
  1847 			}
  1848 			srcp += srcskip;
  1849 			dstp += dstskip;
  1850 		}
  1851 	}
  1852 }
  1853 
  1854 #if GCC_ASMBLIT
  1855 /* fast RGB565->RGB565 blending with surface alpha */
  1856 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
  1857 {
  1858 	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
  1859 	if(alpha == 128) {
  1860 		Blit16to16SurfaceAlpha128(info, 0xf7de);
  1861 	} else {
  1862 		int width = info->d_width;
  1863 		int height = info->d_height;
  1864 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  1865 		int srcskip = info->s_skip >> 1;
  1866 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  1867 		int dstskip = info->d_skip >> 1;
  1868 		Uint32 s, d;
  1869 		Uint8 load[8];
  1870 	  
  1871 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
  1872 		*(Uint64 *)load = alpha;
  1873 		alpha >>= 3;		/* downscale alpha to 5 bits */
  1874 
  1875 		movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
  1876 		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
  1877 		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
  1878 		/* position alpha to allow for mullo and mulhi on diff channels
  1879 		   to reduce the number of operations */
  1880 		psllq_i2r(3, mm0);
  1881 	  
  1882 		/* Setup the 565 color channel masks */
  1883 		*(Uint64 *)load = 0x07E007E007E007E0ULL;
  1884 		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
  1885 		*(Uint64 *)load = 0x001F001F001F001FULL;
  1886 		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
  1887 		while(height--) {
  1888 			DUFFS_LOOP_QUATRO2(
  1889 			{
  1890 				s = *srcp++;
  1891 				d = *dstp;
  1892 				/*
  1893 				 * shift out the middle component (green) to
  1894 				 * the high 16 bits, and process all three RGB
  1895 				 * components at the same time.
  1896 				 */
  1897 				s = (s | s << 16) & 0x07e0f81f;
  1898 				d = (d | d << 16) & 0x07e0f81f;
  1899 				d += (s - d) * alpha >> 5;
  1900 				d &= 0x07e0f81f;
  1901 				*dstp++ = d | d >> 16;
  1902 			},{
  1903 				s = *srcp++;
  1904 				d = *dstp;
  1905 				/*
  1906 				 * shift out the middle component (green) to
  1907 				 * the high 16 bits, and process all three RGB
  1908 				 * components at the same time.
  1909 				 */
  1910 				s = (s | s << 16) & 0x07e0f81f;
  1911 				d = (d | d << 16) & 0x07e0f81f;
  1912 				d += (s - d) * alpha >> 5;
  1913 				d &= 0x07e0f81f;
  1914 				*dstp++ = d | d >> 16;
  1915 				s = *srcp++;
  1916 				d = *dstp;
  1917 				/*
  1918 				 * shift out the middle component (green) to
  1919 				 * the high 16 bits, and process all three RGB
  1920 				 * components at the same time.
  1921 				 */
  1922 				s = (s | s << 16) & 0x07e0f81f;
  1923 				d = (d | d << 16) & 0x07e0f81f;
  1924 				d += (s - d) * alpha >> 5;
  1925 				d &= 0x07e0f81f;
  1926 				*dstp++ = d | d >> 16;
  1927 			},{
  1928 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  1929 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  1930 
  1931 				/* red -- does not need a mask since the right shift clears
  1932 				   the uninteresting bits */
  1933 				movq_r2r(mm2, mm5); /* src -> mm5 */
  1934 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  1935 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
  1936 				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
  1937 
  1938 				/* blend */
  1939 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  1940 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  1941 				/* alpha used is actually 11 bits
  1942 				   11 + 5 = 16 bits, so the sign bits are lost */
  1943 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  1944 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  1945 				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
  1946 
  1947 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  1948 
  1949 				/* green -- process the bits in place */
  1950 				movq_r2r(mm2, mm5); /* src -> mm5 */
  1951 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  1952 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  1953 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  1954 
  1955 				/* blend */
  1956 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  1957 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  1958 				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
  1959 				   bits are gone and the sign bits present */
  1960 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  1961 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  1962 
  1963 				por_r2r(mm6, mm1); /* save new greens in dsts */
  1964 
  1965 				/* blue */
  1966 				movq_r2r(mm2, mm5); /* src -> mm5 */
  1967 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  1968 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  1969 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  1970 
  1971 				/* blend */
  1972 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  1973 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  1974 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  1975 				   the interesting bits will need to be MASKed */
  1976 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  1977 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  1978 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  1979 
  1980 				por_r2r(mm6, mm1); /* save new blues in dsts */
  1981 
  1982 				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
  1983 
  1984 				srcp += 4;
  1985 				dstp += 4;
  1986 			}, width);			
  1987 			srcp += srcskip;
  1988 			dstp += dstskip;
  1989 		}
  1990 		emms();
  1991 	}
  1992 }
  1993 
  1994 /* fast RGB555->RGB555 blending with surface alpha */
  1995 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
  1996 {
  1997 	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
  1998 	if(alpha == 128) {
  1999 		Blit16to16SurfaceAlpha128(info, 0xfbde);
  2000 	} else {
  2001 		int width = info->d_width;
  2002 		int height = info->d_height;
  2003 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2004 		int srcskip = info->s_skip >> 1;
  2005 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2006 		int dstskip = info->d_skip >> 1;
  2007 		Uint32 s, d;
  2008 		Uint8 load[8];
  2009 	  
  2010 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
  2011 		*(Uint64 *)load = alpha;
  2012 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2013 
  2014 		movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
  2015 		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
  2016 		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
  2017 		/* position alpha to allow for mullo and mulhi on diff channels
  2018 		   to reduce the number of operations */
  2019 		psllq_i2r(3, mm0);
  2020 
  2021 		/* Setup the 555 color channel masks */
  2022 		*(Uint64 *)load = 0x03E003E003E003E0ULL;
  2023 		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
  2024 		*(Uint64 *)load = 0x001F001F001F001FULL;
  2025 		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
  2026 		while(height--) {
  2027 			DUFFS_LOOP_QUATRO2(
  2028 			{
  2029 				s = *srcp++;
  2030 				d = *dstp;
  2031 				/*
  2032 				 * shift out the middle component (green) to
  2033 				 * the high 16 bits, and process all three RGB
  2034 				 * components at the same time.
  2035 				 */
  2036 				s = (s | s << 16) & 0x03e07c1f;
  2037 				d = (d | d << 16) & 0x03e07c1f;
  2038 				d += (s - d) * alpha >> 5;
  2039 				d &= 0x03e07c1f;
  2040 				*dstp++ = d | d >> 16;
  2041 			},{
  2042 				s = *srcp++;
  2043 				d = *dstp;
  2044 				/*
  2045 				 * shift out the middle component (green) to
  2046 				 * the high 16 bits, and process all three RGB
  2047 				 * components at the same time.
  2048 				 */
  2049 				s = (s | s << 16) & 0x03e07c1f;
  2050 				d = (d | d << 16) & 0x03e07c1f;
  2051 				d += (s - d) * alpha >> 5;
  2052 				d &= 0x03e07c1f;
  2053 				*dstp++ = d | d >> 16;
  2054 			        s = *srcp++;
  2055 				d = *dstp;
  2056 				/*
  2057 				 * shift out the middle component (green) to
  2058 				 * the high 16 bits, and process all three RGB
  2059 				 * components at the same time.
  2060 				 */
  2061 				s = (s | s << 16) & 0x03e07c1f;
  2062 				d = (d | d << 16) & 0x03e07c1f;
  2063 				d += (s - d) * alpha >> 5;
  2064 				d &= 0x03e07c1f;
  2065 				*dstp++ = d | d >> 16;
  2066 			},{
  2067 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2068 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2069 
  2070 				/* red -- process the bits in place */
  2071 				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
  2072 					/* by reusing the GREEN mask we free up another mmx
  2073 					   register to accumulate the result */
  2074 
  2075 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2076 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2077 				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
  2078 				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
  2079 
  2080 				/* blend */
  2081 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2082 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2083 				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
  2084 				   cleared by a MASK below */
  2085 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2086 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2087 				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
  2088 
  2089 				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
  2090 
  2091 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2092 
  2093 				/* green -- process the bits in place */
  2094 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2095 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2096 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2097 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2098 
  2099 				/* blend */
  2100 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2101 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2102 				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
  2103 				   bits are gone and the sign bits present */
  2104 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2105 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2106 
  2107 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2108 
  2109 				/* blue */
  2110 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2111 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2112 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2113 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2114 
  2115 				/* blend */
  2116 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2117 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2118 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2119 				   the interesting bits will need to be MASKed */
  2120 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2121 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2122 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2123 
  2124 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2125 
  2126 				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
  2127 
  2128 				srcp += 4;
  2129 				dstp += 4;
  2130 			}, width);			
  2131 			srcp += srcskip;
  2132 			dstp += dstskip;
  2133 		}
  2134 		emms();
  2135 	}
  2136 }
  2137 /* End GCC_ASMBLIT */
  2138 
  2139 #elif MSVC_ASMBLIT
  2140 /* fast RGB565->RGB565 blending with surface alpha */
  2141 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
  2142 {
  2143 	unsigned alpha = info->src->alpha;
  2144 	if(alpha == 128) {
  2145 		Blit16to16SurfaceAlpha128(info, 0xf7de);
  2146 	} else {
  2147 		int width = info->d_width;
  2148 		int height = info->d_height;
  2149 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2150 		int srcskip = info->s_skip >> 1;
  2151 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2152 		int dstskip = info->d_skip >> 1;
  2153 		Uint32 s, d;
  2154 	  
  2155 		__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  2156 
  2157 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
  2158 		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
  2159 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2160 
  2161 		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  2162 		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  2163 		/* position alpha to allow for mullo and mulhi on diff channels
  2164 		   to reduce the number of operations */
  2165 		mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2166 	  
  2167 		/* Setup the 565 color channel masks */
  2168 		gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
  2169 		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
  2170 		
  2171 		while(height--) {
  2172 			DUFFS_LOOP_QUATRO2(
  2173 			{
  2174 				s = *srcp++;
  2175 				d = *dstp;
  2176 				/*
  2177 				 * shift out the middle component (green) to
  2178 				 * the high 16 bits, and process all three RGB
  2179 				 * components at the same time.
  2180 				 */
  2181 				s = (s | s << 16) & 0x07e0f81f;
  2182 				d = (d | d << 16) & 0x07e0f81f;
  2183 				d += (s - d) * alpha >> 5;
  2184 				d &= 0x07e0f81f;
  2185 				*dstp++ = (Uint16)(d | d >> 16);
  2186 			},{
  2187 				s = *srcp++;
  2188 				d = *dstp;
  2189 				/*
  2190 				 * shift out the middle component (green) to
  2191 				 * the high 16 bits, and process all three RGB
  2192 				 * components at the same time.
  2193 				 */
  2194 				s = (s | s << 16) & 0x07e0f81f;
  2195 				d = (d | d << 16) & 0x07e0f81f;
  2196 				d += (s - d) * alpha >> 5;
  2197 				d &= 0x07e0f81f;
  2198 				*dstp++ = (Uint16)(d | d >> 16);
  2199 				s = *srcp++;
  2200 				d = *dstp;
  2201 				/*
  2202 				 * shift out the middle component (green) to
  2203 				 * the high 16 bits, and process all three RGB
  2204 				 * components at the same time.
  2205 				 */
  2206 				s = (s | s << 16) & 0x07e0f81f;
  2207 				d = (d | d << 16) & 0x07e0f81f;
  2208 				d += (s - d) * alpha >> 5;
  2209 				d &= 0x07e0f81f;
  2210 				*dstp++ = (Uint16)(d | d >> 16);
  2211 			},{
  2212 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2213 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2214 
  2215 				/* red */
  2216 				src2 = src1;
  2217 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
  2218 
  2219 				dst2 = dst1;
  2220 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
  2221 
  2222 				/* blend */
  2223 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2224 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2225 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2226 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2227 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
  2228 
  2229 				mm_res = dst2; /* RED -> mm_res */
  2230 
  2231 				/* green -- process the bits in place */
  2232 				src2 = src1;
  2233 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2234 
  2235 				dst2 = dst1;
  2236 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2237 
  2238 				/* blend */
  2239 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2240 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2241 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2242 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2243 
  2244 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2245 
  2246 				/* blue */
  2247 				src2 = src1;
  2248 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2249 
  2250 				dst2 = dst1;
  2251 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2252 
  2253 				/* blend */
  2254 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2255 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2256 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2257 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2258 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2259 
  2260 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2261 
  2262 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2263 
  2264 				srcp += 4;
  2265 				dstp += 4;
  2266 			}, width);			
  2267 			srcp += srcskip;
  2268 			dstp += dstskip;
  2269 		}
  2270 		_mm_empty();
  2271 	}
  2272 }
  2273 
  2274 /* fast RGB555->RGB555 blending with surface alpha */
  2275 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
  2276 {
  2277 	unsigned alpha = info->src->alpha;
  2278 	if(alpha == 128) {
  2279 		Blit16to16SurfaceAlpha128(info, 0xfbde);
  2280 	} else {
  2281 		int width = info->d_width;
  2282 		int height = info->d_height;
  2283 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2284 		int srcskip = info->s_skip >> 1;
  2285 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2286 		int dstskip = info->d_skip >> 1;
  2287 		Uint32 s, d;
  2288 	  
  2289 		__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  2290 
  2291 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
  2292 		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
  2293 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2294 
  2295 		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  2296 		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  2297 		/* position alpha to allow for mullo and mulhi on diff channels
  2298 		   to reduce the number of operations */
  2299 		mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2300 	  
  2301 		/* Setup the 555 color channel masks */
  2302 		rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
  2303 		gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
  2304 		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
  2305 
  2306 		while(height--) {
  2307 			DUFFS_LOOP_QUATRO2(
  2308 			{
  2309 				s = *srcp++;
  2310 				d = *dstp;
  2311 				/*
  2312 				 * shift out the middle component (green) to
  2313 				 * the high 16 bits, and process all three RGB
  2314 				 * components at the same time.
  2315 				 */
  2316 				s = (s | s << 16) & 0x03e07c1f;
  2317 				d = (d | d << 16) & 0x03e07c1f;
  2318 				d += (s - d) * alpha >> 5;
  2319 				d &= 0x03e07c1f;
  2320 				*dstp++ = (Uint16)(d | d >> 16);
  2321 			},{
  2322 				s = *srcp++;
  2323 				d = *dstp;
  2324 				/*
  2325 				 * shift out the middle component (green) to
  2326 				 * the high 16 bits, and process all three RGB
  2327 				 * components at the same time.
  2328 				 */
  2329 				s = (s | s << 16) & 0x03e07c1f;
  2330 				d = (d | d << 16) & 0x03e07c1f;
  2331 				d += (s - d) * alpha >> 5;
  2332 				d &= 0x03e07c1f;
  2333 				*dstp++ = (Uint16)(d | d >> 16);
  2334 			        s = *srcp++;
  2335 				d = *dstp;
  2336 				/*
  2337 				 * shift out the middle component (green) to
  2338 				 * the high 16 bits, and process all three RGB
  2339 				 * components at the same time.
  2340 				 */
  2341 				s = (s | s << 16) & 0x03e07c1f;
  2342 				d = (d | d << 16) & 0x03e07c1f;
  2343 				d += (s - d) * alpha >> 5;
  2344 				d &= 0x03e07c1f;
  2345 				*dstp++ = (Uint16)(d | d >> 16);
  2346 			},{
  2347 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2348 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2349 
  2350 				/* red -- process the bits in place */
  2351 				src2 = src1;
  2352 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
  2353 
  2354 				dst2 = dst1;
  2355 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
  2356 
  2357 				/* blend */
  2358 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2359 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2360 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2361 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2362 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
  2363 
  2364 				mm_res = dst2; /* RED -> mm_res */
  2365 				
  2366 				/* green -- process the bits in place */
  2367 				src2 = src1;
  2368 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2369 
  2370 				dst2 = dst1;
  2371 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2372 
  2373 				/* blend */
  2374 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2375 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2376 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2377 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2378 
  2379 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2380 
  2381 				/* blue */
  2382 				src2 = src1; /* src -> src2 */
  2383 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2384 
  2385 				dst2 = dst1; /* dst -> dst2 */
  2386 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2387 
  2388 				/* blend */
  2389 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2390 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2391 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2392 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2393 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2394 
  2395 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2396 
  2397 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2398 
  2399 				srcp += 4;
  2400 				dstp += 4;
  2401 			}, width);			
  2402 			srcp += srcskip;
  2403 			dstp += dstskip;
  2404 		}
  2405 		_mm_empty();
  2406 	}
  2407 }
  2408 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  2409 
  2410 /* fast RGB565->RGB565 blending with surface alpha */
  2411 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
  2412 {
  2413 	unsigned alpha = info->src->alpha;
  2414 	if(alpha == 128) {
  2415 		Blit16to16SurfaceAlpha128(info, 0xf7de);
  2416 	} else {
  2417 		int width = info->d_width;
  2418 		int height = info->d_height;
  2419 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2420 		int srcskip = info->s_skip >> 1;
  2421 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2422 		int dstskip = info->d_skip >> 1;
  2423 		alpha >>= 3;	/* downscale alpha to 5 bits */
  2424 
  2425 		while(height--) {
  2426 			DUFFS_LOOP4({
  2427 				Uint32 s = *srcp++;
  2428 				Uint32 d = *dstp;
  2429 				/*
  2430 				 * shift out the middle component (green) to
  2431 				 * the high 16 bits, and process all three RGB
  2432 				 * components at the same time.
  2433 				 */
  2434 				s = (s | s << 16) & 0x07e0f81f;
  2435 				d = (d | d << 16) & 0x07e0f81f;
  2436 				d += (s - d) * alpha >> 5;
  2437 				d &= 0x07e0f81f;
  2438 				*dstp++ = (Uint16)(d | d >> 16);
  2439 			}, width);
  2440 			srcp += srcskip;
  2441 			dstp += dstskip;
  2442 		}
  2443 	}
  2444 }
  2445 
  2446 /* fast RGB555->RGB555 blending with surface alpha */
  2447 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
  2448 {
  2449 	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
  2450 	if(alpha == 128) {
  2451 		Blit16to16SurfaceAlpha128(info, 0xfbde);
  2452 	} else {
  2453 		int width = info->d_width;
  2454 		int height = info->d_height;
  2455 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2456 		int srcskip = info->s_skip >> 1;
  2457 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2458 		int dstskip = info->d_skip >> 1;
  2459 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2460 
  2461 		while(height--) {
  2462 			DUFFS_LOOP4({
  2463 				Uint32 s = *srcp++;
  2464 				Uint32 d = *dstp;
  2465 				/*
  2466 				 * shift out the middle component (green) to
  2467 				 * the high 16 bits, and process all three RGB
  2468 				 * components at the same time.
  2469 				 */
  2470 				s = (s | s << 16) & 0x03e07c1f;
  2471 				d = (d | d << 16) & 0x03e07c1f;
  2472 				d += (s - d) * alpha >> 5;
  2473 				d &= 0x03e07c1f;
  2474 				*dstp++ = (Uint16)(d | d >> 16);
  2475 			}, width);
  2476 			srcp += srcskip;
  2477 			dstp += dstskip;
  2478 		}
  2479 	}
  2480 }
  2481 
  2482 /* fast ARGB8888->RGB565 blending with pixel alpha */
  2483 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
  2484 {
  2485 	int width = info->d_width;
  2486 	int height = info->d_height;
  2487 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  2488 	int srcskip = info->s_skip >> 2;
  2489 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  2490 	int dstskip = info->d_skip >> 1;
  2491 
  2492 	while(height--) {
  2493 	    DUFFS_LOOP4({
  2494 		Uint32 s = *srcp;
  2495 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  2496 		/* FIXME: Here we special-case opaque alpha since the
  2497 		   compositioning used (>>8 instead of /255) doesn't handle
  2498 		   it correctly. Also special-case alpha=0 for speed?
  2499 		   Benchmark this! */
  2500 		if(alpha) {   
  2501 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2502 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  2503 		  } else {
  2504 		    Uint32 d = *dstp;
  2505 		    /*
  2506 		     * convert source and destination to G0RAB65565
  2507 		     * and blend all components at the same time
  2508 		     */
  2509 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  2510 		      + (s >> 3 & 0x1f);
  2511 		    d = (d | d << 16) & 0x07e0f81f;
  2512 		    d += (s - d) * alpha >> 5;
  2513 		    d &= 0x07e0f81f;
  2514 		    *dstp = (Uint16)(d | d >> 16);
  2515 		  }
  2516 		}
  2517 		srcp++;
  2518 		dstp++;
  2519 	    }, width);
  2520 	    srcp += srcskip;
  2521 	    dstp += dstskip;
  2522 	}
  2523 }
  2524 
  2525 /* fast ARGB8888->RGB555 blending with pixel alpha */
  2526 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
  2527 {
  2528 	int width = info->d_width;
  2529 	int height = info->d_height;
  2530 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  2531 	int srcskip = info->s_skip >> 2;
  2532 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  2533 	int dstskip = info->d_skip >> 1;
  2534 
  2535 	while(height--) {
  2536 	    DUFFS_LOOP4({
  2537 		unsigned alpha;
  2538 		Uint32 s = *srcp;
  2539 		alpha = s >> 27; /* downscale alpha to 5 bits */
  2540 		/* FIXME: Here we special-case opaque alpha since the
  2541 		   compositioning used (>>8 instead of /255) doesn't handle
  2542 		   it correctly. Also special-case alpha=0 for speed?
  2543 		   Benchmark this! */
  2544 		if(alpha) {   
  2545 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2546 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  2547 		  } else {
  2548 		    Uint32 d = *dstp;
  2549 		    /*
  2550 		     * convert source and destination to G0RAB65565
  2551 		     * and blend all components at the same time
  2552 		     */
  2553 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  2554 		      + (s >> 3 & 0x1f);
  2555 		    d = (d | d << 16) & 0x03e07c1f;
  2556 		    d += (s - d) * alpha >> 5;
  2557 		    d &= 0x03e07c1f;
  2558 		    *dstp = (Uint16)(d | d >> 16);
  2559 		  }
  2560 		}
  2561 		srcp++;
  2562 		dstp++;
  2563 	    }, width);
  2564 	    srcp += srcskip;
  2565 	    dstp += dstskip;
  2566 	}
  2567 }
  2568 
  2569 /* General (slow) N->N blending with per-surface alpha */
  2570 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
  2571 {
  2572 	int width = info->d_width;
  2573 	int height = info->d_height;
  2574 	Uint8 *src = info->s_pixels;
  2575 	int srcskip = info->s_skip;
  2576 	Uint8 *dst = info->d_pixels;
  2577 	int dstskip = info->d_skip;
  2578 	SDL_PixelFormat *srcfmt = info->src;
  2579 	SDL_PixelFormat *dstfmt = info->dst;
  2580 	int srcbpp = srcfmt->BytesPerPixel;
  2581 	int dstbpp = dstfmt->BytesPerPixel;
  2582 	unsigned sA = srcfmt->alpha;
  2583 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2584 
  2585 	if(sA) {
  2586 	  while ( height-- ) {
  2587 	    DUFFS_LOOP4(
  2588 	    {
  2589 		Uint32 Pixel;
  2590 		unsigned sR;
  2591 		unsigned sG;
  2592 		unsigned sB;
  2593 		unsigned dR;
  2594 		unsigned dG;
  2595 		unsigned dB;
  2596 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2597 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2598 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2599 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2600 		src += srcbpp;
  2601 		dst += dstbpp;
  2602 	    },
  2603 	    width);
  2604 	    src += srcskip;
  2605 	    dst += dstskip;
  2606 	  }
  2607 	}
  2608 }
  2609 
  2610 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2611 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
  2612 {
  2613 	int width = info->d_width;
  2614 	int height = info->d_height;
  2615 	Uint8 *src = info->s_pixels;
  2616 	int srcskip = info->s_skip;
  2617 	Uint8 *dst = info->d_pixels;
  2618 	int dstskip = info->d_skip;
  2619 	SDL_PixelFormat *srcfmt = info->src;
  2620 	SDL_PixelFormat *dstfmt = info->dst;
  2621 	Uint32 ckey = srcfmt->colorkey;
  2622 	int srcbpp = srcfmt->BytesPerPixel;
  2623 	int dstbpp = dstfmt->BytesPerPixel;
  2624 	unsigned sA = srcfmt->alpha;
  2625 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2626 
  2627 	while ( height-- ) {
  2628 	    DUFFS_LOOP4(
  2629 	    {
  2630 		Uint32 Pixel;
  2631 		unsigned sR;
  2632 		unsigned sG;
  2633 		unsigned sB;
  2634 		unsigned dR;
  2635 		unsigned dG;
  2636 		unsigned dB;
  2637 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2638 		if(sA && Pixel != ckey) {
  2639 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2640 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2641 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2642 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2643 		}
  2644 		src += srcbpp;
  2645 		dst += dstbpp;
  2646 	    },
  2647 	    width);
  2648 	    src += srcskip;
  2649 	    dst += dstskip;
  2650 	}
  2651 }
  2652 
  2653 /* General (slow) N->N blending with pixel alpha */
  2654 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
  2655 {
  2656 	int width = info->d_width;
  2657 	int height = info->d_height;
  2658 	Uint8 *src = info->s_pixels;
  2659 	int srcskip = info->s_skip;
  2660 	Uint8 *dst = info->d_pixels;
  2661 	int dstskip = info->d_skip;
  2662 	SDL_PixelFormat *srcfmt = info->src;
  2663 	SDL_PixelFormat *dstfmt = info->dst;
  2664 
  2665 	int  srcbpp;
  2666 	int  dstbpp;
  2667 
  2668 	/* Set up some basic variables */
  2669 	srcbpp = srcfmt->BytesPerPixel;
  2670 	dstbpp = dstfmt->BytesPerPixel;
  2671 
  2672 	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
  2673 	   quite right. for <8bpp source alpha, it gets them very wrong
  2674 	   (check all macros!)
  2675 	   It is unclear whether there is a good general solution that doesn't
  2676 	   need a branch (or a divide). */
  2677 	while ( height-- ) {
  2678 	    DUFFS_LOOP4(
  2679 	    {
  2680 		Uint32 Pixel;
  2681 		unsigned sR;
  2682 		unsigned sG;
  2683 		unsigned sB;
  2684 		unsigned dR;
  2685 		unsigned dG;
  2686 		unsigned dB;
  2687 		unsigned sA;
  2688 		unsigned dA;
  2689 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  2690 		if(sA) {
  2691 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  2692 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2693 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2694 		}
  2695 		src += srcbpp;
  2696 		dst += dstbpp;
  2697 	    },
  2698 	    width);
  2699 	    src += srcskip;
  2700 	    dst += dstskip;
  2701 	}
  2702 }
  2703 
  2704 
  2705 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
  2706 {
  2707     SDL_PixelFormat *sf = surface->format;
  2708     SDL_PixelFormat *df = surface->map->dst->format;
  2709 
  2710     if(sf->Amask == 0) {
  2711 	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
  2712 	    if(df->BytesPerPixel == 1)
  2713 		return BlitNto1SurfaceAlphaKey;
  2714 	    else
  2715 #if SDL_ALTIVEC_BLITTERS
  2716 	if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2717 	    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
  2718             return Blit32to32SurfaceAlphaKeyAltivec;
  2719         else
  2720 #endif
  2721             return BlitNtoNSurfaceAlphaKey;
  2722 	} else {
  2723 	    /* Per-surface alpha blits */
  2724 	    switch(df->BytesPerPixel) {
  2725 	    case 1:
  2726 		return BlitNto1SurfaceAlpha;
  2727 
  2728 	    case 2:
  2729 		if(surface->map->identity) {
  2730 		    if(df->Gmask == 0x7e0)
  2731 		    {
  2732 #if MMX_ASMBLIT
  2733 		if(SDL_HasMMX())
  2734 			return Blit565to565SurfaceAlphaMMX;
  2735 		else
  2736 #endif
  2737 			return Blit565to565SurfaceAlpha;
  2738 		    }
  2739 		    else if(df->Gmask == 0x3e0)
  2740 		    {
  2741 #if MMX_ASMBLIT
  2742 		if(SDL_HasMMX())
  2743 			return Blit555to555SurfaceAlphaMMX;
  2744 		else
  2745 #endif
  2746 			return Blit555to555SurfaceAlpha;
  2747 		    }
  2748 		}
  2749 		return BlitNtoNSurfaceAlpha;
  2750 
  2751 	    case 4:
  2752 		if(sf->Rmask == df->Rmask
  2753 		   && sf->Gmask == df->Gmask
  2754 		   && sf->Bmask == df->Bmask
  2755 		   && sf->BytesPerPixel == 4)
  2756 		{
  2757 #if MMX_ASMBLIT
  2758 			if(sf->Rshift % 8 == 0
  2759 			   && sf->Gshift % 8 == 0
  2760 			   && sf->Bshift % 8 == 0
  2761 			   && SDL_HasMMX())
  2762 			    return BlitRGBtoRGBSurfaceAlphaMMX;
  2763 #endif
  2764 			if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
  2765 			{
  2766 #if SDL_ALTIVEC_BLITTERS
  2767 				if(!(surface->map->dst->flags & SDL_HWSURFACE)
  2768 					&& SDL_HasAltiVec())
  2769 					return BlitRGBtoRGBSurfaceAlphaAltivec;
  2770 #endif
  2771 				return BlitRGBtoRGBSurfaceAlpha;
  2772 			}
  2773 		}
  2774 #if SDL_ALTIVEC_BLITTERS
  2775 		if((sf->BytesPerPixel == 4) &&
  2776 		   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
  2777 			return Blit32to32SurfaceAlphaAltivec;
  2778 		else
  2779 #endif
  2780 			return BlitNtoNSurfaceAlpha;
  2781 
  2782 	    case 3:
  2783 	    default:
  2784 		return BlitNtoNSurfaceAlpha;
  2785 	    }
  2786 	}
  2787     } else {
  2788 	/* Per-pixel alpha blits */
  2789 	switch(df->BytesPerPixel) {
  2790 	case 1:
  2791 	    return BlitNto1PixelAlpha;
  2792 
  2793 	case 2:
  2794 #if SDL_ALTIVEC_BLITTERS
  2795 	if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
  2796            df->Gmask == 0x7e0 &&
  2797 	   df->Bmask == 0x1f && SDL_HasAltiVec())
  2798             return Blit32to565PixelAlphaAltivec;
  2799         else
  2800 #endif
  2801 	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2802 	       && sf->Gmask == 0xff00
  2803 	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2804 		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  2805 		if(df->Gmask == 0x7e0)
  2806 		    return BlitARGBto565PixelAlpha;
  2807 		else if(df->Gmask == 0x3e0)
  2808 		    return BlitARGBto555PixelAlpha;
  2809 	    }
  2810 	    return BlitNtoNPixelAlpha;
  2811 
  2812 	case 4:
  2813 	    if(sf->Rmask == df->Rmask
  2814 	       && sf->Gmask == df->Gmask
  2815 	       && sf->Bmask == df->Bmask
  2816 	       && sf->BytesPerPixel == 4)
  2817 	    {
  2818 #if MMX_ASMBLIT
  2819 		if(sf->Rshift % 8 == 0
  2820 		   && sf->Gshift % 8 == 0
  2821 		   && sf->Bshift % 8 == 0
  2822 		   && sf->Ashift % 8 == 0
  2823 		   && sf->Aloss == 0)
  2824 		{
  2825 			if(SDL_Has3DNow())
  2826 				return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2827 			if(SDL_HasMMX())
  2828 				return BlitRGBtoRGBPixelAlphaMMX;
  2829 		}
  2830 #endif
  2831 		if(sf->Amask == 0xff000000)
  2832 		{
  2833 #if SDL_ALTIVEC_BLITTERS
  2834 			if(!(surface->map->dst->flags & SDL_HWSURFACE)
  2835 				&& SDL_HasAltiVec())
  2836 				return BlitRGBtoRGBPixelAlphaAltivec;
  2837 #endif
  2838 			return BlitRGBtoRGBPixelAlpha;
  2839 		}
  2840 	    }
  2841 #if SDL_ALTIVEC_BLITTERS
  2842 	    if (sf->Amask && sf->BytesPerPixel == 4 &&
  2843 	        !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
  2844 		return Blit32to32PixelAlphaAltivec;
  2845 	    else
  2846 #endif
  2847 		return BlitNtoNPixelAlpha;
  2848 
  2849 	case 3:
  2850 	default:
  2851 	    return BlitNtoNPixelAlpha;
  2852 	}
  2853     }
  2854 }
  2855