src/video/SDL_blit_A.c
author Sam Lantinga <slouken@libsdl.org>
Sat, 23 Sep 2006 23:15:34 +0000
branchSDL-1.2
changeset 3870 571c75f3d093
parent 1795 398ac0f88e4d
child 3899 081aecdb0911
permissions -rw-r--r--
(none)
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2006 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 #if SDL_ASSEMBLY_ROUTINES
    28 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    29 #define MMX_ASMBLIT 1
    30 #define GCC_ASMBLIT 1
    31 #elif defined(_MSC_VER) && (_MSC_VER >= 1200) && defined(_M_IX86)
    32 #define MMX_ASMBLIT 1
    33 #define MSVC_ASMBLIT 1
    34 #endif
    35 #endif /* SDL_ASSEMBLY_ROUTINES */
    36 
    37 /* Function to check the CPU flags */
    38 #include "SDL_cpuinfo.h"
    39 #if GCC_ASMBLIT
    40 #include "mmx.h"
    41 #elif MSVC_ASMBLIT
    42 #include <mmintrin.h>
    43 #include <mm3dnow.h>
    44 #endif
    45 
    46 /* Functions to perform alpha blended blitting */
    47 
    48 /* N->1 blending with per-surface alpha */
    49 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
    50 {
    51 	int width = info->d_width;
    52 	int height = info->d_height;
    53 	Uint8 *src = info->s_pixels;
    54 	int srcskip = info->s_skip;
    55 	Uint8 *dst = info->d_pixels;
    56 	int dstskip = info->d_skip;
    57 	Uint8 *palmap = info->table;
    58 	SDL_PixelFormat *srcfmt = info->src;
    59 	SDL_PixelFormat *dstfmt = info->dst;
    60 	int srcbpp = srcfmt->BytesPerPixel;
    61 
    62 	const unsigned A = srcfmt->alpha;
    63 
    64 	while ( height-- ) {
    65 	    DUFFS_LOOP4(
    66 	    {
    67 		Uint32 Pixel;
    68 		unsigned sR;
    69 		unsigned sG;
    70 		unsigned sB;
    71 		unsigned dR;
    72 		unsigned dG;
    73 		unsigned dB;
    74 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    75 		dR = dstfmt->palette->colors[*dst].r;
    76 		dG = dstfmt->palette->colors[*dst].g;
    77 		dB = dstfmt->palette->colors[*dst].b;
    78 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    79 		dR &= 0xff;
    80 		dG &= 0xff;
    81 		dB &= 0xff;
    82 		/* Pack RGB into 8bit pixel */
    83 		if ( palmap == NULL ) {
    84 		    *dst =((dR>>5)<<(3+2))|
    85 			  ((dG>>5)<<(2))|
    86 			  ((dB>>6)<<(0));
    87 		} else {
    88 		    *dst = palmap[((dR>>5)<<(3+2))|
    89 				  ((dG>>5)<<(2))  |
    90 				  ((dB>>6)<<(0))];
    91 		}
    92 		dst++;
    93 		src += srcbpp;
    94 	    },
    95 	    width);
    96 	    src += srcskip;
    97 	    dst += dstskip;
    98 	}
    99 }
   100 
   101 /* N->1 blending with pixel alpha */
   102 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
   103 {
   104 	int width = info->d_width;
   105 	int height = info->d_height;
   106 	Uint8 *src = info->s_pixels;
   107 	int srcskip = info->s_skip;
   108 	Uint8 *dst = info->d_pixels;
   109 	int dstskip = info->d_skip;
   110 	Uint8 *palmap = info->table;
   111 	SDL_PixelFormat *srcfmt = info->src;
   112 	SDL_PixelFormat *dstfmt = info->dst;
   113 	int srcbpp = srcfmt->BytesPerPixel;
   114 
   115 	/* FIXME: fix alpha bit field expansion here too? */
   116 	while ( height-- ) {
   117 	    DUFFS_LOOP4(
   118 	    {
   119 		Uint32 Pixel;
   120 		unsigned sR;
   121 		unsigned sG;
   122 		unsigned sB;
   123 		unsigned sA;
   124 		unsigned dR;
   125 		unsigned dG;
   126 		unsigned dB;
   127 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   128 		dR = dstfmt->palette->colors[*dst].r;
   129 		dG = dstfmt->palette->colors[*dst].g;
   130 		dB = dstfmt->palette->colors[*dst].b;
   131 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   132 		dR &= 0xff;
   133 		dG &= 0xff;
   134 		dB &= 0xff;
   135 		/* Pack RGB into 8bit pixel */
   136 		if ( palmap == NULL ) {
   137 		    *dst =((dR>>5)<<(3+2))|
   138 			  ((dG>>5)<<(2))|
   139 			  ((dB>>6)<<(0));
   140 		} else {
   141 		    *dst = palmap[((dR>>5)<<(3+2))|
   142 				  ((dG>>5)<<(2))  |
   143 				  ((dB>>6)<<(0))  ];
   144 		}
   145 		dst++;
   146 		src += srcbpp;
   147 	    },
   148 	    width);
   149 	    src += srcskip;
   150 	    dst += dstskip;
   151 	}
   152 }
   153 
   154 /* colorkeyed N->1 blending with per-surface alpha */
   155 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
   156 {
   157 	int width = info->d_width;
   158 	int height = info->d_height;
   159 	Uint8 *src = info->s_pixels;
   160 	int srcskip = info->s_skip;
   161 	Uint8 *dst = info->d_pixels;
   162 	int dstskip = info->d_skip;
   163 	Uint8 *palmap = info->table;
   164 	SDL_PixelFormat *srcfmt = info->src;
   165 	SDL_PixelFormat *dstfmt = info->dst;
   166 	int srcbpp = srcfmt->BytesPerPixel;
   167 	Uint32 ckey = srcfmt->colorkey;
   168 
   169 	const int A = srcfmt->alpha;
   170 
   171 	while ( height-- ) {
   172 	    DUFFS_LOOP(
   173 	    {
   174 		Uint32 Pixel;
   175 		unsigned sR;
   176 		unsigned sG;
   177 		unsigned sB;
   178 		unsigned dR;
   179 		unsigned dG;
   180 		unsigned dB;
   181 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   182 		if ( Pixel != ckey ) {
   183 		    dR = dstfmt->palette->colors[*dst].r;
   184 		    dG = dstfmt->palette->colors[*dst].g;
   185 		    dB = dstfmt->palette->colors[*dst].b;
   186 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   187 		    dR &= 0xff;
   188 		    dG &= 0xff;
   189 		    dB &= 0xff;
   190 		    /* Pack RGB into 8bit pixel */
   191 		    if ( palmap == NULL ) {
   192 			*dst =((dR>>5)<<(3+2))|
   193 			      ((dG>>5)<<(2)) |
   194 			      ((dB>>6)<<(0));
   195 		    } else {
   196 			*dst = palmap[((dR>>5)<<(3+2))|
   197 				      ((dG>>5)<<(2))  |
   198 				      ((dB>>6)<<(0))  ];
   199 		    }
   200 		}
   201 		dst++;
   202 		src += srcbpp;
   203 	    },
   204 	    width);
   205 	    src += srcskip;
   206 	    dst += dstskip;
   207 	}
   208 }
   209 
   210 #if GCC_ASMBLIT
   211 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   212 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
   213 {
   214 	int width = info->d_width;
   215 	int height = info->d_height;
   216 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   217 	int srcskip = info->s_skip >> 2;
   218 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   219 	int dstskip = info->d_skip >> 2;
   220 	Uint32 dalpha = info->dst->Amask;
   221 	Uint8 load[8];
   222 
   223 	*(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */
   224 	movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
   225 	*(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */
   226 	movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
   227 	movd_m2r(dalpha, mm7); /* dst alpha mask */
   228 	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
   229 	while(height--) {
   230 		DUFFS_LOOP_DOUBLE2(
   231 		{
   232 			Uint32 s = *srcp++;
   233 			Uint32 d = *dstp;
   234 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   235 				   + (s & d & 0x00010101)) | dalpha;
   236 		},{
   237 			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   238 			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   239 
   240 			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
   241 			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
   242 
   243 			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
   244 			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
   245 			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
   246 			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
   247 			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
   248 			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
   249 			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
   250 			
   251 			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   252 			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
   253 			dstp += 2;
   254 			srcp += 2;
   255 		}, width);
   256 		srcp += srcskip;
   257 		dstp += dstskip;
   258 	}
   259 	emms();
   260 }
   261 
   262 /* fast RGB888->(A)RGB888 blending with surface alpha */
   263 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
   264 {
   265 	SDL_PixelFormat* df = info->dst;
   266 	unsigned alpha = info->src->alpha;
   267 
   268 	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   269 			/* only call a128 version when R,G,B occupy lower bits */
   270 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
   271 	} else {
   272 		int width = info->d_width;
   273 		int height = info->d_height;
   274 		Uint32 *srcp = (Uint32 *)info->s_pixels;
   275 		int srcskip = info->s_skip >> 2;
   276 		Uint32 *dstp = (Uint32 *)info->d_pixels;
   277 		int dstskip = info->d_skip >> 2;
   278 
   279 		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
   280 		/* form the alpha mult */
   281 		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
   282 		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   283 		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
   284 		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
   285 		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
   286 		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
   287 		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
   288 			/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
   289 		movd_m2r(df->Amask, mm7); /* dst alpha mask */
   290 		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
   291 		
   292 		while(height--) {
   293 			DUFFS_LOOP_DOUBLE2({
   294 				/* One Pixel Blend */
   295 				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   296 				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   297 				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
   298 				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
   299 
   300 				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   301 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   302 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   303 				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   304 
   305 				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
   306 				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   307 				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
   308 				++srcp;
   309 				++dstp;
   310 			},{
   311 				/* Two Pixels Blend */
   312 				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
   313 				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   314 				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
   315 				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   316 
   317 				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
   318 				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
   319 				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
   320 				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
   321 
   322 				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
   323 				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
   324 				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
   325 				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
   326 
   327 				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
   328 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   329 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   330 				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
   331 
   332 				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
   333 				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
   334 				
   335 				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
   336 
   337   				srcp += 2;
   338   				dstp += 2;
   339   			}, width);
   340 			srcp += srcskip;
   341 			dstp += dstskip;
   342 		}
   343 		emms();
   344 	}
   345 }
   346 
   347 /* fast ARGB888->(A)RGB888 blending with pixel alpha
        *
        * For every pixel the source alpha is isolated with the source
        * format's Amask and one of three paths is taken:
        *   - alpha == 0:      the destination pixel is left untouched;
        *   - alpha == Amask:  the source colour channels are copied and the
        *                      remaining destination bits are kept;
        *   - otherwise:       dst += ((src - dst) * alpha) >> 8 per 16-bit
        *                      lane, with the alpha lane of the multiplier
        *                      cleared so the destination alpha survives.
        *
        * Register roles set up before the loop (per the inline comments):
        *   mm6 = zero, mm7 = multiplication mask, mm0 = channel mask,
        *   mm3 = ~channel mask, mm5 = alpha shift count.
        * The instruction order matters; the registers are shared between
        * the setup code and both MMX branches of the loop body.
        */
   348 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
   349 {
   350 	int width = info->d_width;
   351 	int height = info->d_height;
   352 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   353 	int srcskip = info->s_skip >> 2; /* skip counted in Uint32 units */
   354 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   355 	int dstskip = info->d_skip >> 2; /* skip counted in Uint32 units */
   356 	SDL_PixelFormat* sf = info->src;
   357 	Uint32 amask = sf->Amask;
   358 
   359 	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
   360 	/* form multiplication mask */
   361 	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
   362 	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
   363 	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
   364 	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
   365 	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
   366 	/* form channel masks */
   367 	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
   368 	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
   369 	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
   370 	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
   371 	/* get alpha channel shift */
   372 	movd_m2r(sf->Ashift, mm5); /* Ashift -> mm5 */
   373 
   374 	while(height--) {
   375 	    DUFFS_LOOP4({
   376 		Uint32 alpha = *srcp & amask; /* source alpha, still in its bit position */
   377 		/* FIXME: Here we special-case opaque alpha since the
   378 			compositing used (>>8 instead of /255) doesn't handle
   379 			it correctly. Also special-case alpha=0 for speed?
   380 			Benchmark this! */
   381 		if(alpha == 0) {
   382 			/* do nothing */
   383 		} else if(alpha == amask) {
   384 			/* opaque alpha -- copy RGB, keep dst alpha */
   385 			/* using MMX here to free up regular registers for other things */
   386 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   387 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   388 			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
   389 			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
   390 			por_r2r(mm1, mm2); /* src | dst -> mm2 */
   391 			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
   392 		} else {
   393 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   394 			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
   395 
   396 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   397 			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
   398 
   399 			__asm__ __volatile__ (
   400 				"movd %0, %%mm4"
   401 				: : "r" (alpha) ); /* 0000A000 -> mm4 */
   402 			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
   403 			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   404 			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
   405 			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
   406 
   407 			/* blend */		    
   408 			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   409 			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   410 			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
   411 			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   412 			
   413 			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
   414 			movd_r2m(mm2, *dstp);/* mm2 -> dst */
   415 		}
   416 		++srcp;
   417 		++dstp;
   418 	    }, width);
   419 	    /* step over the padding at the end of each row */
   420 	    srcp += srcskip;
   421 	    dstp += dstskip;
   422 	}
   423 	emms(); /* leave MMX state so the FPU is usable again */
   424 }
   424 /* End GCC_ASMBLIT */
   425 
   426 #elif MSVC_ASMBLIT
   427 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   428 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
   429 {
   430 	int width = info->d_width;
   431 	int height = info->d_height;
   432 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   433 	int srcskip = info->s_skip >> 2;
   434 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   435 	int dstskip = info->d_skip >> 2;
   436 	Uint32 dalpha = info->dst->Amask;
   437 
   438 	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   439 	
   440 	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
   441 	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
   442 	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
   443 
   444 	while (height--) {
   445 		int n = width;
   446 		if ( n & 1 ) {
   447 			Uint32 s = *srcp++;
   448 			Uint32 d = *dstp;
   449 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   450 				   + (s & d & 0x00010101)) | dalpha;
   451 			n--;
   452 		}
   453 		
   454 		for (n >>= 1; n > 0; --n) {
   455 			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
   456 			dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
   457 
   458 			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
   459 			src2 = src1; /* 2 x src -> src2(ARGBARGB) */
   460 
   461 			dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
   462 			src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
   463 			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
   464 			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
   465 
   466 			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
   467 			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
   468 			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
   469 			dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
   470 			
   471 			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
   472 			dstp += 2;
   473 			srcp += 2;
   474 		}
   475 		
   476 		srcp += srcskip;
   477 		dstp += dstskip;
   478 	}
   479 	_mm_empty();
   480 }
   481 
   482 /* fast RGB888->(A)RGB888 blending with surface alpha */
   483 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
   484 {
   485 	SDL_PixelFormat* df = info->dst;
   486 	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   487 	unsigned alpha = info->src->alpha;
   488 
   489 	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   490 			/* only call a128 version when R,G,B occupy lower bits */
   491 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
   492 	} else {
   493 		int width = info->d_width;
   494 		int height = info->d_height;
   495 		Uint32 *srcp = (Uint32 *)info->s_pixels;
   496 		int srcskip = info->s_skip >> 2;
   497 		Uint32 *dstp = (Uint32 *)info->d_pixels;
   498 		int dstskip = info->d_skip >> 2;
   499 		Uint32 dalpha = df->Amask;
   500 		Uint32 amult;
   501 
   502 		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   503 		
   504 		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
   505 		/* form the alpha mult */
   506 		amult = alpha | (alpha << 8);
   507 		amult = amult | (amult << 16);
   508 		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
   509 		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
   510 		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   511 			/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   512 		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
   513 		
   514 		while (height--) {
   515 			int n = width;
   516 			if (n & 1) {
   517 				/* One Pixel Blend */
   518 				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
   519 				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   520 
   521 				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   522 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   523 
   524 				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
   525 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   526 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
   527 				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   528 				
   529 				dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   530 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   531 				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   532 
   533 				++srcp;
   534 				++dstp;
   535 				
   536 				n--;
   537 			}
   538 
   539 			for (n >>= 1; n > 0; --n) {
   540 				/* Two Pixels Blend */
   541 				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
   542 				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
   543 				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   544 				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   545 
   546 				dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
   547 				dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
   548 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   549 				dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   550 
   551 				src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   552 				src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
   553 				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
   554 				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   555 
   556 				src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
   557 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   558 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
   559 				dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   560 				
   561 				dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   562 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   563 
   564 				*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
   565 
   566 				srcp += 2;
   567 				dstp += 2;
   568 			}
   569 			srcp += srcskip;
   570 			dstp += dstskip;
   571 		}
   572 		_mm_empty();
   573 	}
   574 }
   575 
   576 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   577 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
   578 {
   579 	int width = info->d_width;
   580 	int height = info->d_height;
   581 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   582 	int srcskip = info->s_skip >> 2;
   583 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   584 	int dstskip = info->d_skip >> 2;
   585 	SDL_PixelFormat* sf = info->src;
   586 	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   587 	Uint32 amask = sf->Amask;
   588 	Uint32 ashift = sf->Ashift;
   589 	Uint64 multmask;
   590 
   591 	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
   592 
   593 	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
   594 	multmask = ~(0xFFFFi64 << (ashift * 2));
   595 	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
   596 
   597 	while(height--) {
   598 		DUFFS_LOOP4({
   599 		Uint32 alpha = *srcp & amask;
   600 		if (alpha == 0) {
   601 			/* do nothing */
   602 		} else if (alpha == amask) {
   603 			/* opaque alpha -- copy RGB, keep dst alpha */
   604 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   605 		} else {
   606 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   607 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   608 
   609 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   610 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   611 
   612 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   613 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   614 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   615 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   616 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   617 
   618 			/* blend */		    
   619 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   620 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   621 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   622 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   623 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   624 			
   625 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   626 		}
   627 		++srcp;
   628 		++dstp;
   629 	    }, width);
   630 	    srcp += srcskip;
   631 	    dstp += dstskip;
   632 	}
   633 	_mm_empty();
   634 }
   635 /* End MSVC_ASMBLIT */
   636 
   637 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   638 
   639 #if SDL_ALTIVEC_BLITTERS
   640 #if __MWERKS__
   641 #pragma altivec_model on
   642 #endif
   643 #if HAVE_ALTIVEC_H
   644 #include <altivec.h>
   645 #endif
   646 #include <assert.h>
   647 
   648 #if (defined(__MACOSX__) && (__GNUC__ < 4))
   649     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   650         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
   651     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   652         (vector unsigned short) ( a,b,c,d,e,f,g,h )
   653 #else
   654     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   655         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
   656     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   657         (vector unsigned short) { a,b,c,d,e,f,g,h }
   658 #endif
   659 
   660 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
   661 #define VECPRINT(msg, v) do { \
   662     vector unsigned int tmpvec = (vector unsigned int)(v); \
   663     unsigned int *vp = (unsigned int *)&tmpvec; \
   664     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   665 } while (0)
   666 
   667 /* the permuation vector that takes the high bytes out of all the appropriate shorts 
   668     (vector unsigned char)(
   669         0x00, 0x10, 0x02, 0x12,
   670         0x04, 0x14, 0x06, 0x16,
   671         0x08, 0x18, 0x0A, 0x1A,
   672         0x0C, 0x1C, 0x0E, 0x1E );
   673 */
   674 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
   675 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
   676 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
   677 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
   678     ? vec_lvsl(0, src) \
   679     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
   680 
   681    
   682 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
   683     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
   684     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
   685     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
   686     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
   687     /* valpha2 is 255-alpha */ \
   688     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
   689     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
   690     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
   691     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
   692     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
   693     /* add source and dest */ \
   694     vtemp1 = vec_add(vtemp1, vtemp3); \
   695     vtemp2 = vec_add(vtemp2, vtemp4); \
   696     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
   697     vtemp1 = vec_add(vtemp1, v1_16); \
   698     vtemp3 = vec_sr(vtemp1, v8_16); \
   699     vtemp1 = vec_add(vtemp1, vtemp3); \
   700     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
   701     vtemp2 = vec_add(vtemp2, v1_16); \
   702     vtemp4 = vec_sr(vtemp2, v8_16); \
   703     vtemp2 = vec_add(vtemp2, vtemp4); \
   704     /* (>>8) and get ARGBARGBARGBARGB */ \
   705     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   706 } while (0)
   707  
   708 /* Calculate the permute vector used for 32->32 swizzling */
   709 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
   710                                   const SDL_PixelFormat *dstfmt)
   711 {
   712     /*
   713      * We have to assume that the bits that aren't used by other
   714      *  colors is alpha, and it's one complete byte, since some formats
   715      *  leave alpha with a zero mask, but we should still swizzle the bits.
   716      */
   717     /* ARGB */
   718     const static struct SDL_PixelFormat default_pixel_format = {
   719         NULL, 0, 0,
   720         0, 0, 0, 0,
   721         16, 8, 0, 24,
   722         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
   723         0, 0};
   724     if (!srcfmt) {
   725         srcfmt = &default_pixel_format;
   726     }
   727     if (!dstfmt) {
   728         dstfmt = &default_pixel_format;
   729     }
   730     const vector unsigned char plus = VECUINT8_LITERAL
   731                                             ( 0x00, 0x00, 0x00, 0x00,
   732                                               0x04, 0x04, 0x04, 0x04,
   733                                               0x08, 0x08, 0x08, 0x08,
   734                                               0x0C, 0x0C, 0x0C, 0x0C );
   735     vector unsigned char vswiz;
   736     vector unsigned int srcvec;
   737 #define RESHIFT(X) (3 - ((X) >> 3))
   738     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   739     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   740     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   741     Uint32 amask;
   742     /* Use zero for alpha if either surface doesn't have alpha */
   743     if (dstfmt->Amask) {
   744         amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
   745     } else {
   746         amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
   747     }
   748 #undef RESHIFT  
   749     ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
   750     vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
   751     return(vswiz);
   752 }
   753 
   754 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
   755 {
   756     int height = info->d_height;
   757     Uint8 *src = (Uint8 *)info->s_pixels;
   758     int srcskip = info->s_skip;
   759     Uint8 *dst = (Uint8 *)info->d_pixels;
   760     int dstskip = info->d_skip;
   761     SDL_PixelFormat *srcfmt = info->src;
   762 
   763     vector unsigned char v0 = vec_splat_u8(0);
   764     vector unsigned short v8_16 = vec_splat_u16(8);
   765     vector unsigned short v1_16 = vec_splat_u16(1);
   766     vector unsigned short v2_16 = vec_splat_u16(2);
   767     vector unsigned short v3_16 = vec_splat_u16(3);
   768     vector unsigned int v8_32 = vec_splat_u32(8);
   769     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
   770     vector unsigned short v3f = VECUINT16_LITERAL(
   771         0x003f, 0x003f, 0x003f, 0x003f,
   772         0x003f, 0x003f, 0x003f, 0x003f);
   773     vector unsigned short vfc = VECUINT16_LITERAL(
   774         0x00fc, 0x00fc, 0x00fc, 0x00fc,
   775         0x00fc, 0x00fc, 0x00fc, 0x00fc);
   776 
   777     /* 
   778         0x10 - 0x1f is the alpha
   779         0x00 - 0x0e evens are the red
   780         0x01 - 0x0f odds are zero
   781     */
   782     vector unsigned char vredalpha1 = VECUINT8_LITERAL(
   783         0x10, 0x00, 0x01, 0x01,
   784         0x10, 0x02, 0x01, 0x01,
   785         0x10, 0x04, 0x01, 0x01,
   786         0x10, 0x06, 0x01, 0x01
   787     );
   788     vector unsigned char vredalpha2 = (vector unsigned char)(
   789         vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
   790     );
   791     /*
   792         0x00 - 0x0f is ARxx ARxx ARxx ARxx
   793         0x11 - 0x0f odds are blue
   794     */
   795     vector unsigned char vblue1 = VECUINT8_LITERAL(
   796         0x00, 0x01, 0x02, 0x11,
   797         0x04, 0x05, 0x06, 0x13,
   798         0x08, 0x09, 0x0a, 0x15,
   799         0x0c, 0x0d, 0x0e, 0x17
   800     );
   801     vector unsigned char vblue2 = (vector unsigned char)(
   802         vec_add((vector unsigned int)vblue1, v8_32)
   803     );
   804     /*
   805         0x00 - 0x0f is ARxB ARxB ARxB ARxB
   806         0x10 - 0x0e evens are green
   807     */
   808     vector unsigned char vgreen1 = VECUINT8_LITERAL(
   809         0x00, 0x01, 0x10, 0x03,
   810         0x04, 0x05, 0x12, 0x07,
   811         0x08, 0x09, 0x14, 0x0b,
   812         0x0c, 0x0d, 0x16, 0x0f
   813     );
   814     vector unsigned char vgreen2 = (vector unsigned char)(
   815         vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
   816     );
   817     vector unsigned char vgmerge = VECUINT8_LITERAL(
   818         0x00, 0x02, 0x00, 0x06,
   819         0x00, 0x0a, 0x00, 0x0e,
   820         0x00, 0x12, 0x00, 0x16,
   821         0x00, 0x1a, 0x00, 0x1e);
   822     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
   823     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
   824     vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
   825 
   826     vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
   827     vf800 = vec_sl(vf800, vec_splat_u16(8));
   828 
   829     while(height--) {
   830         int extrawidth;
   831         vector unsigned char valigner;
   832         vector unsigned char vsrc;
   833         vector unsigned char voverflow;
   834         int width = info->d_width;
   835 
   836 #define ONE_PIXEL_BLEND(condition, widthvar) \
   837         while (condition) { \
   838             Uint32 Pixel; \
   839             unsigned sR, sG, sB, dR, dG, dB, sA; \
   840             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   841             if(sA) { \
   842                 unsigned short dstpixel = *((unsigned short *)dst); \
   843                 dR = (dstpixel >> 8) & 0xf8; \
   844                 dG = (dstpixel >> 3) & 0xfc; \
   845                 dB = (dstpixel << 3) & 0xf8; \
   846                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   847                 *((unsigned short *)dst) = ( \
   848                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
   849                 ); \
   850             } \
   851             src += 4; \
   852             dst += 2; \
   853             widthvar--; \
   854         }
   855         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
   856         extrawidth = (width % 8);
   857         valigner = VEC_ALIGNER(src);
   858         vsrc = (vector unsigned char)vec_ld(0, src);
   859         width -= extrawidth;
   860         while (width) {
   861             vector unsigned char valpha;
   862             vector unsigned char vsrc1, vsrc2;
   863             vector unsigned char vdst1, vdst2;
   864             vector unsigned short vR, vG, vB;
   865             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   866 
   867             /* Load 8 pixels from src as ARGB */
   868             voverflow = (vector unsigned char)vec_ld(15, src);
   869             vsrc = vec_perm(vsrc, voverflow, valigner);
   870             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
   871             src += 16;
   872             vsrc = (vector unsigned char)vec_ld(15, src);
   873             voverflow = vec_perm(voverflow, vsrc, valigner);
   874             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
   875             src += 16;
   876 
   877             /* Load 8 pixels from dst as XRGB */
   878             voverflow = vec_ld(0, dst);
   879             vR = vec_and((vector unsigned short)voverflow, vf800);
   880             vB = vec_sl((vector unsigned short)voverflow, v3_16);
   881             vG = vec_sl(vB, v2_16);
   882             vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
   883             vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
   884             vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
   885             vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
   886             vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
   887             vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
   888 
   889             /* Alpha blend 8 pixels as ARGB */
   890             valpha = vec_perm(vsrc1, v0, valphaPermute);
   891             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
   892             valpha = vec_perm(vsrc2, v0, valphaPermute);
   893             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
   894 
   895             /* Convert 8 pixels to 565 */
   896             vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
   897             vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
   898             vgpixel = vec_and(vgpixel, vfc);
   899             vgpixel = vec_sl(vgpixel, v3_16);
   900             vrpixel = vec_sl(vpixel, v1_16);
   901             vrpixel = vec_and(vrpixel, vf800);
   902             vbpixel = vec_and(vpixel, v3f);
   903             vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
   904             vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
   905             
   906             /* Store 8 pixels */
   907             vec_st(vdst1, 0, dst);
   908 
   909             width -= 8;
   910             dst += 16;
   911         }
   912         ONE_PIXEL_BLEND((extrawidth), extrawidth);
   913 #undef ONE_PIXEL_BLEND
   914         src += srcskip;
   915         dst += dstskip;
   916     }
   917 }
   918 
   919 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
   920 {
   921     unsigned alpha = info->src->alpha;
   922     int height = info->d_height;
   923     Uint32 *srcp = (Uint32 *)info->s_pixels;
   924     int srcskip = info->s_skip >> 2;
   925     Uint32 *dstp = (Uint32 *)info->d_pixels;
   926     int dstskip = info->d_skip >> 2;
   927     SDL_PixelFormat *srcfmt = info->src;
   928     SDL_PixelFormat *dstfmt = info->dst;
   929     unsigned sA = srcfmt->alpha;
   930     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   931     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   932     Uint32 ckey = info->src->colorkey;
   933     vector unsigned char mergePermute;
   934     vector unsigned char vsrcPermute;
   935     vector unsigned char vdstPermute;
   936     vector unsigned char vsdstPermute;
   937     vector unsigned char valpha;
   938     vector unsigned char valphamask;
   939     vector unsigned char vbits;
   940     vector unsigned char v0;
   941     vector unsigned short v1;
   942     vector unsigned short v8;
   943     vector unsigned int vckey;
   944     vector unsigned int vrgbmask;
   945 
   946     mergePermute = VEC_MERGE_PERMUTE();
   947     v0 = vec_splat_u8(0);
   948     v1 = vec_splat_u16(1);
   949     v8 = vec_splat_u16(8);
   950 
   951     /* set the alpha to 255 on the destination surf */
   952     valphamask = VEC_ALPHA_MASK();
   953 
   954     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   955     vdstPermute = calc_swizzle32(NULL, dstfmt);
   956     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   957 
   958     /* set a vector full of alpha and 255-alpha */
   959     ((unsigned char *)&valpha)[0] = alpha;
   960     valpha = vec_splat(valpha, 0);
   961     vbits = (vector unsigned char)vec_splat_s8(-1);
   962 
   963     ckey &= rgbmask;
   964     ((unsigned int *)(char*)&vckey)[0] = ckey;
   965     vckey = vec_splat(vckey, 0);
   966     ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
   967     vrgbmask = vec_splat(vrgbmask, 0);
   968 
   969     while(height--) {
   970         int width = info->d_width;
   971 #define ONE_PIXEL_BLEND(condition, widthvar) \
   972         while (condition) { \
   973             Uint32 Pixel; \
   974             unsigned sR, sG, sB, dR, dG, dB; \
   975             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
   976             if(sA && Pixel != ckey) { \
   977                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
   978                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
   979                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   980                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
   981             } \
   982             dstp++; \
   983             srcp++; \
   984             widthvar--; \
   985         }
   986         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   987         if (width > 0) {
   988             int extrawidth = (width % 4);
   989             vector unsigned char valigner = VEC_ALIGNER(srcp);
   990             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   991             width -= extrawidth;
   992             while (width) {
   993                 vector unsigned char vsel;
   994                 vector unsigned char voverflow;
   995                 vector unsigned char vd;
   996                 vector unsigned char vd_orig;
   997 
   998                 /* s = *srcp */
   999                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1000                 vs = vec_perm(vs, voverflow, valigner);
  1001                 
  1002                 /* vsel is set for items that match the key */
  1003                 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
  1004                 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
  1005 
  1006                 /* permute to source format */
  1007                 vs = vec_perm(vs, valpha, vsrcPermute);
  1008 
  1009                 /* d = *dstp */
  1010                 vd = (vector unsigned char)vec_ld(0, dstp);
  1011                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
  1012 
  1013                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1014 
  1015                 /* set the alpha channel to full on */
  1016                 vd = vec_or(vd, valphamask);
  1017 
  1018                 /* mask out color key */
  1019                 vd = vec_sel(vd, vd_orig, vsel);
  1020                 
  1021                 /* permute to dest format */
  1022                 vd = vec_perm(vd, vbits, vdstPermute);
  1023 
  1024                 /* *dstp = res */
  1025                 vec_st((vector unsigned int)vd, 0, dstp);
  1026                 
  1027                 srcp += 4;
  1028                 dstp += 4;
  1029                 width -= 4;
  1030                 vs = voverflow;
  1031             }
  1032             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1033         }
  1034 #undef ONE_PIXEL_BLEND
  1035  
  1036         srcp += srcskip;
  1037         dstp += dstskip;
  1038     }
  1039 }
  1040 
  1041 
  1042 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
  1043 {
  1044     int width = info->d_width;
  1045     int height = info->d_height;
  1046     Uint32 *srcp = (Uint32 *)info->s_pixels;
  1047     int srcskip = info->s_skip >> 2;
  1048     Uint32 *dstp = (Uint32 *)info->d_pixels;
  1049     int dstskip = info->d_skip >> 2;
  1050     SDL_PixelFormat *srcfmt = info->src;
  1051     SDL_PixelFormat *dstfmt = info->dst;
  1052     vector unsigned char mergePermute;
  1053     vector unsigned char valphaPermute;
  1054     vector unsigned char vsrcPermute;
  1055     vector unsigned char vdstPermute;
  1056     vector unsigned char vsdstPermute;
  1057     vector unsigned char valphamask;
  1058     vector unsigned char vpixelmask;
  1059     vector unsigned char v0;
  1060     vector unsigned short v1;
  1061     vector unsigned short v8;
  1062 
  1063     v0 = vec_splat_u8(0);
  1064     v1 = vec_splat_u16(1);
  1065     v8 = vec_splat_u16(8);
  1066     mergePermute = VEC_MERGE_PERMUTE();
  1067     valphamask = VEC_ALPHA_MASK();
  1068     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
  1069     vpixelmask = vec_nor(valphamask, v0);
  1070     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1071     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1072     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1073 
  1074 	while ( height-- ) {
  1075         width = info->d_width;
  1076 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1077             Uint32 Pixel; \
  1078             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
  1079             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
  1080             if(sA) { \
  1081               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
  1082               ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1083               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
  1084             } \
  1085             ++srcp; \
  1086             ++dstp; \
  1087             widthvar--; \
  1088         }
  1089         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1090         if (width > 0) {
  1091             /* vsrcPermute */
  1092             /* vdstPermute */
  1093             int extrawidth = (width % 4);
  1094             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1095             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1096             width -= extrawidth;
  1097             while (width) {
  1098                 vector unsigned char voverflow;
  1099                 vector unsigned char vd;
  1100                 vector unsigned char valpha;
  1101                 vector unsigned char vdstalpha;
  1102                 /* s = *srcp */
  1103                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1104                 vs = vec_perm(vs, voverflow, valigner);
  1105                 vs = vec_perm(vs, v0, vsrcPermute);
  1106 
  1107                 valpha = vec_perm(vs, v0, valphaPermute);
  1108                 
  1109                 /* d = *dstp */
  1110                 vd = (vector unsigned char)vec_ld(0, dstp);
  1111                 vd = vec_perm(vd, v0, vsdstPermute);
  1112                 vdstalpha = vec_and(vd, valphamask);
  1113 
  1114                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1115 
  1116                 /* set the alpha to the dest alpha */
  1117                 vd = vec_and(vd, vpixelmask);
  1118                 vd = vec_or(vd, vdstalpha);
  1119                 vd = vec_perm(vd, v0, vdstPermute);
  1120 
  1121                 /* *dstp = res */
  1122                 vec_st((vector unsigned int)vd, 0, dstp);
  1123                 
  1124                 srcp += 4;
  1125                 dstp += 4;
  1126                 width -= 4;
  1127                 vs = voverflow;
  1128 
  1129             }
  1130             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1131         }
  1132 	    srcp += srcskip;
  1133 	    dstp += dstskip;
  1134 #undef ONE_PIXEL_BLEND
  1135 	}
  1136 }
  1137 
  1138 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1139 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
  1140 {
  1141 	int width = info->d_width;
  1142 	int height = info->d_height;
  1143 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1144 	int srcskip = info->s_skip >> 2;
  1145 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1146 	int dstskip = info->d_skip >> 2;
  1147     vector unsigned char mergePermute;
  1148     vector unsigned char valphaPermute;
  1149     vector unsigned char valphamask;
  1150     vector unsigned char vpixelmask;
  1151     vector unsigned char v0;
  1152     vector unsigned short v1;
  1153     vector unsigned short v8;
  1154     v0 = vec_splat_u8(0);
  1155     v1 = vec_splat_u16(1);
  1156     v8 = vec_splat_u16(8);
  1157     mergePermute = VEC_MERGE_PERMUTE();
  1158     valphamask = VEC_ALPHA_MASK();
  1159     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
  1160     
  1161  
  1162     vpixelmask = vec_nor(valphamask, v0);
  1163 	while(height--) {
  1164         width = info->d_width;
  1165 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1166         while ((condition)) { \
  1167             Uint32 dalpha; \
  1168             Uint32 d; \
  1169             Uint32 s1; \
  1170             Uint32 d1; \
  1171             Uint32 s = *srcp; \
  1172             Uint32 alpha = s >> 24; \
  1173             if(alpha) { \
  1174               if(alpha == SDL_ALPHA_OPAQUE) { \
  1175                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
  1176               } else { \
  1177                 d = *dstp; \
  1178                 dalpha = d & 0xff000000; \
  1179                 s1 = s & 0xff00ff; \
  1180                 d1 = d & 0xff00ff; \
  1181                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
  1182                 s &= 0xff00; \
  1183                 d &= 0xff00; \
  1184                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1185                 *dstp = d1 | d | dalpha; \
  1186               } \
  1187             } \
  1188             ++srcp; \
  1189             ++dstp; \
  1190             widthvar--; \
  1191 	    }
  1192         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1193         if (width > 0) {
  1194             int extrawidth = (width % 4);
  1195             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1196             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1197             width -= extrawidth;
  1198             while (width) {
  1199                 vector unsigned char voverflow;
  1200                 vector unsigned char vd;
  1201                 vector unsigned char valpha;
  1202                 vector unsigned char vdstalpha;
  1203                 /* s = *srcp */
  1204                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1205                 vs = vec_perm(vs, voverflow, valigner);
  1206 
  1207                 valpha = vec_perm(vs, v0, valphaPermute);
  1208                 
  1209                 /* d = *dstp */
  1210                 vd = (vector unsigned char)vec_ld(0, dstp);
  1211                 vdstalpha = vec_and(vd, valphamask);
  1212 
  1213                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1214 
  1215                 /* set the alpha to the dest alpha */
  1216                 vd = vec_and(vd, vpixelmask);
  1217                 vd = vec_or(vd, vdstalpha);
  1218 
  1219                 /* *dstp = res */
  1220                 vec_st((vector unsigned int)vd, 0, dstp);
  1221                 
  1222                 srcp += 4;
  1223                 dstp += 4;
  1224                 width -= 4;
  1225                 vs = voverflow;
  1226             }
  1227             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1228         }
  1229 	    srcp += srcskip;
  1230 	    dstp += dstskip;
  1231 	}
  1232 #undef ONE_PIXEL_BLEND
  1233 }
  1234 
  1235 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
  1236 {
  1237     /* XXX : 6 */
  1238 	unsigned alpha = info->src->alpha;
  1239     int height = info->d_height;
  1240     Uint32 *srcp = (Uint32 *)info->s_pixels;
  1241     int srcskip = info->s_skip >> 2;
  1242     Uint32 *dstp = (Uint32 *)info->d_pixels;
  1243     int dstskip = info->d_skip >> 2;
  1244     SDL_PixelFormat *srcfmt = info->src;
  1245     SDL_PixelFormat *dstfmt = info->dst;
  1246 	unsigned sA = srcfmt->alpha;
  1247 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1248     vector unsigned char mergePermute;
  1249     vector unsigned char vsrcPermute;
  1250     vector unsigned char vdstPermute;
  1251     vector unsigned char vsdstPermute;
  1252     vector unsigned char valpha;
  1253     vector unsigned char valphamask;
  1254     vector unsigned char vbits;
  1255     vector unsigned short v1;
  1256     vector unsigned short v8;
  1257 
  1258     mergePermute = VEC_MERGE_PERMUTE();
  1259     v1 = vec_splat_u16(1);
  1260     v8 = vec_splat_u16(8);
  1261 
  1262     /* set the alpha to 255 on the destination surf */
  1263     valphamask = VEC_ALPHA_MASK();
  1264 
  1265     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1266     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1267     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1268 
  1269     /* set a vector full of alpha and 255-alpha */
  1270     ((unsigned char *)&valpha)[0] = alpha;
  1271     valpha = vec_splat(valpha, 0);
  1272     vbits = (vector unsigned char)vec_splat_s8(-1);
  1273 
  1274     while(height--) {
  1275         int width = info->d_width;
  1276 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1277             Uint32 Pixel; \
  1278             unsigned sR, sG, sB, dR, dG, dB; \
  1279             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  1280             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1281             ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1282             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1283             ++srcp; \
  1284             ++dstp; \
  1285             widthvar--; \
  1286         }
  1287         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1288         if (width > 0) {
  1289             int extrawidth = (width % 4);
  1290             vector unsigned char valigner = vec_lvsl(0, srcp);
  1291             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1292             width -= extrawidth;
  1293             while (width) {
  1294                 vector unsigned char voverflow;
  1295                 vector unsigned char vd;
  1296 
  1297                 /* s = *srcp */
  1298                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1299                 vs = vec_perm(vs, voverflow, valigner);
  1300                 vs = vec_perm(vs, valpha, vsrcPermute);
  1301                 
  1302                 /* d = *dstp */
  1303                 vd = (vector unsigned char)vec_ld(0, dstp);
  1304                 vd = vec_perm(vd, vd, vsdstPermute);
  1305 
  1306                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1307 
  1308                 /* set the alpha channel to full on */
  1309                 vd = vec_or(vd, valphamask);
  1310                 vd = vec_perm(vd, vbits, vdstPermute);
  1311 
  1312                 /* *dstp = res */
  1313                 vec_st((vector unsigned int)vd, 0, dstp);
  1314                 
  1315                 srcp += 4;
  1316                 dstp += 4;
  1317                 width -= 4;
  1318                 vs = voverflow;
  1319             }
  1320             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1321         }
  1322 #undef ONE_PIXEL_BLEND
  1323  
  1324         srcp += srcskip;
  1325         dstp += dstskip;
  1326     }
  1327 
  1328 }
  1329 
  1330 
  1331 /* fast RGB888->(A)RGB888 blending */
  1332 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
  1333 {
  1334 	unsigned alpha = info->src->alpha;
  1335     int height = info->d_height;
  1336     Uint32 *srcp = (Uint32 *)info->s_pixels;
  1337     int srcskip = info->s_skip >> 2;
  1338     Uint32 *dstp = (Uint32 *)info->d_pixels;
  1339     int dstskip = info->d_skip >> 2;
  1340     vector unsigned char mergePermute;
  1341     vector unsigned char valpha;
  1342     vector unsigned char valphamask;
  1343     vector unsigned short v1;
  1344     vector unsigned short v8;
  1345 
  1346     mergePermute = VEC_MERGE_PERMUTE();
  1347     v1 = vec_splat_u16(1);
  1348     v8 = vec_splat_u16(8);
  1349 
  1350     /* set the alpha to 255 on the destination surf */
  1351     valphamask = VEC_ALPHA_MASK();
  1352 
  1353     /* set a vector full of alpha and 255-alpha */
  1354     ((unsigned char *)&valpha)[0] = alpha;
  1355     valpha = vec_splat(valpha, 0);
  1356 
  1357     while(height--) {
  1358         int width = info->d_width;
  1359 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1360             Uint32 s = *srcp; \
  1361             Uint32 d = *dstp; \
  1362             Uint32 s1 = s & 0xff00ff; \
  1363             Uint32 d1 = d & 0xff00ff; \
  1364             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  1365                  & 0xff00ff; \
  1366             s &= 0xff00; \
  1367             d &= 0xff00; \
  1368             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1369             *dstp = d1 | d | 0xff000000; \
  1370             ++srcp; \
  1371             ++dstp; \
  1372             widthvar--; \
  1373         }
  1374         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1375         if (width > 0) {
  1376             int extrawidth = (width % 4);
  1377             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1378             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1379             width -= extrawidth;
  1380             while (width) {
  1381                 vector unsigned char voverflow;
  1382                 vector unsigned char vd;
  1383 
  1384                 /* s = *srcp */
  1385                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1386                 vs = vec_perm(vs, voverflow, valigner);
  1387                 
  1388                 /* d = *dstp */
  1389                 vd = (vector unsigned char)vec_ld(0, dstp);
  1390 
  1391                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1392 
  1393                 /* set the alpha channel to full on */
  1394                 vd = vec_or(vd, valphamask);
  1395 
  1396                 /* *dstp = res */
  1397                 vec_st((vector unsigned int)vd, 0, dstp);
  1398                 
  1399                 srcp += 4;
  1400                 dstp += 4;
  1401                 width -= 4;
  1402                 vs = voverflow;
  1403             }
  1404             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1405         }
  1406 #undef ONE_PIXEL_BLEND
  1407  
  1408         srcp += srcskip;
  1409         dstp += dstskip;
  1410     }
  1411 }
  1412 #if __MWERKS__
  1413 #pragma altivec_model off
  1414 #endif
  1415 #endif /* SDL_ALTIVEC_BLITTERS */
  1416 
  1417 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1418 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
  1419 {
  1420 	int width = info->d_width;
  1421 	int height = info->d_height;
  1422 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1423 	int srcskip = info->s_skip >> 2;
  1424 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1425 	int dstskip = info->d_skip >> 2;
  1426 
  1427 	while(height--) {
  1428 	    DUFFS_LOOP4({
  1429 		    Uint32 s = *srcp++;
  1430 		    Uint32 d = *dstp;
  1431 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1432 			       + (s & d & 0x00010101)) | 0xff000000;
  1433 	    }, width);
  1434 	    srcp += srcskip;
  1435 	    dstp += dstskip;
  1436 	}
  1437 }
  1438 
  1439 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1440 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
  1441 {
  1442 	unsigned alpha = info->src->alpha;
  1443 	if(alpha == 128) {
  1444 		BlitRGBtoRGBSurfaceAlpha128(info);
  1445 	} else {
  1446 		int width = info->d_width;
  1447 		int height = info->d_height;
  1448 		Uint32 *srcp = (Uint32 *)info->s_pixels;
  1449 		int srcskip = info->s_skip >> 2;
  1450 		Uint32 *dstp = (Uint32 *)info->d_pixels;
  1451 		int dstskip = info->d_skip >> 2;
  1452 		Uint32 s;
  1453 		Uint32 d;
  1454 		Uint32 s1;
  1455 		Uint32 d1;
  1456 
  1457 		while(height--) {
  1458 			DUFFS_LOOP_DOUBLE2({
  1459 				/* One Pixel Blend */
  1460 				s = *srcp;
  1461 				d = *dstp;
  1462 				s1 = s & 0xff00ff;
  1463 				d1 = d & 0xff00ff;
  1464 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
  1465 				     & 0xff00ff;
  1466 				s &= 0xff00;
  1467 				d &= 0xff00;
  1468 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1469 				*dstp = d1 | d | 0xff000000;
  1470 				++srcp;
  1471 				++dstp;
  1472 			},{
  1473 			        /* Two Pixels Blend */
  1474 				s = *srcp;
  1475 				d = *dstp;
  1476 				s1 = s & 0xff00ff;
  1477 				d1 = d & 0xff00ff;
  1478 				d1 += (s1 - d1) * alpha >> 8;
  1479 				d1 &= 0xff00ff;
  1480 				     
  1481 				s = ((s & 0xff00) >> 8) | 
  1482 					((srcp[1] & 0xff00) << 8);
  1483 				d = ((d & 0xff00) >> 8) |
  1484 					((dstp[1] & 0xff00) << 8);
  1485 				d += (s - d) * alpha >> 8;
  1486 				d &= 0x00ff00ff;
  1487 				
  1488 				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
  1489 				++srcp;
  1490 				
  1491 			        s1 = *srcp;
  1492 				d1 = *dstp;
  1493 				s1 &= 0xff00ff;
  1494 				d1 &= 0xff00ff;
  1495 				d1 += (s1 - d1) * alpha >> 8;
  1496 				d1 &= 0xff00ff;
  1497 				
  1498 				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
  1499 				++srcp;
  1500 				++dstp;
  1501 			}, width);
  1502 			srcp += srcskip;
  1503 			dstp += dstskip;
  1504 		}
  1505 	}
  1506 }
  1507 
  1508 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1509 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
  1510 {
  1511 	int width = info->d_width;
  1512 	int height = info->d_height;
  1513 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1514 	int srcskip = info->s_skip >> 2;
  1515 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1516 	int dstskip = info->d_skip >> 2;
  1517 
  1518 	while(height--) {
  1519 	    DUFFS_LOOP4({
  1520 		Uint32 dalpha;
  1521 		Uint32 d;
  1522 		Uint32 s1;
  1523 		Uint32 d1;
  1524 		Uint32 s = *srcp;
  1525 		Uint32 alpha = s >> 24;
  1526 		/* FIXME: Here we special-case opaque alpha since the
  1527 		   compositioning used (>>8 instead of /255) doesn't handle
  1528 		   it correctly. Also special-case alpha=0 for speed?
  1529 		   Benchmark this! */
  1530 		if(alpha) {   
  1531 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1532 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1533 		  } else {
  1534 		    /*
  1535 		     * take out the middle component (green), and process
  1536 		     * the other two in parallel. One multiply less.
  1537 		     */
  1538 		    d = *dstp;
  1539 		    dalpha = d & 0xff000000;
  1540 		    s1 = s & 0xff00ff;
  1541 		    d1 = d & 0xff00ff;
  1542 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1543 		    s &= 0xff00;
  1544 		    d &= 0xff00;
  1545 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1546 		    *dstp = d1 | d | dalpha;
  1547 		  }
  1548 		}
  1549 		++srcp;
  1550 		++dstp;
  1551 	    }, width);
  1552 	    srcp += srcskip;
  1553 	    dstp += dstskip;
  1554 	}
  1555 }
  1556 
  1557 #if GCC_ASMBLIT
  1558 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1559 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
  1560 {
  1561 	int width = info->d_width;
  1562 	int height = info->d_height;
  1563 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1564 	int srcskip = info->s_skip >> 2;
  1565 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1566 	int dstskip = info->d_skip >> 2;
  1567 	SDL_PixelFormat* sf = info->src;
  1568 	Uint32 amask = sf->Amask;
  1569 
  1570 	__asm__ (
  1571 	/* make mm6 all zeros. */
  1572 	"pxor       %%mm6, %%mm6\n"
  1573 	
  1574 	/* Make a mask to preserve the alpha. */
  1575 	"movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
  1576 	"punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
  1577 	"pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
  1578 	"movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
  1579 	"pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
  1580 
  1581 	/* form channel masks */
  1582 	"movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
  1583 	"packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
  1584 	"packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
  1585 	"pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
  1586 	
  1587 	/* get alpha channel shift */
  1588 	"movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
  1589 
  1590 	  : /* nothing */ : "m" (sf->Amask), "m" (sf->Ashift) );
  1591 
  1592 	while(height--) {
  1593 
  1594 	    DUFFS_LOOP4({
  1595 		Uint32 alpha;
  1596 
  1597 		__asm__ (
  1598 		"prefetch 64(%0)\n"
  1599 		"prefetch 64(%1)\n"
  1600 			: : "r" (srcp), "r" (dstp) );
  1601 
  1602 		alpha = *srcp & amask;
  1603 		/* FIXME: Here we special-case opaque alpha since the
  1604 		   compositioning used (>>8 instead of /255) doesn't handle
  1605 		   it correctly. Also special-case alpha=0 for speed?
  1606 		   Benchmark this! */
  1607 		if(alpha == 0) {
  1608 		    /* do nothing */
  1609 		}
  1610 		else if(alpha == amask) {
  1611 			/* opaque alpha -- copy RGB, keep dst alpha */
  1612 		    /* using MMX here to free up regular registers for other things */
  1613 			    __asm__ (
  1614 		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
  1615 		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
  1616 		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
  1617 		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
  1618 		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
  1619 		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
  1620 
  1621 		     : : "r" (srcp), "r" (dstp) );
  1622 		} 
  1623 
  1624 		else {
  1625 			    __asm__ (
  1626 		    /* load in the source, and dst. */
  1627 		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
  1628 		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
  1629 
  1630 		    /* Move the src alpha into mm2 */
  1631 
  1632 		    /* if supporting pshufw */
  1633 		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
  1634 		    /*"psrlw     $8, %%mm2\n" */
  1635 		    
  1636 		    /* else: */
  1637 		    "movd       %2,    %%mm2\n"
  1638 		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
  1639 		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
  1640 		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
  1641 		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
  1642 
  1643 		    /* move the colors into words. */
  1644 		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
  1645 		    "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
  1646 
  1647 		    /* src - dst */
  1648 		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
  1649 
  1650 		    /* A * (src-dst) */
  1651 		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
  1652 		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
  1653 		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
  1654 
  1655 		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
  1656 		    
  1657 		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
  1658 
  1659 		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
  1660 
  1661 		}
  1662 		++srcp;
  1663 		++dstp;
  1664 	    }, width);
  1665 	    srcp += srcskip;
  1666 	    dstp += dstskip;
  1667 	}
  1668 
  1669 	__asm__ (
  1670 	"emms\n"
  1671 		:   );
  1672 }
  1673 /* End GCC_ASMBLIT*/
  1674 
  1675 #elif MSVC_ASMBLIT
  1676 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1677 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
  1678 {
  1679 	int width = info->d_width;
  1680 	int height = info->d_height;
  1681 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1682 	int srcskip = info->s_skip >> 2;
  1683 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1684 	int dstskip = info->d_skip >> 2;
  1685 	SDL_PixelFormat* sf = info->src;
  1686 	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  1687 	Uint32 amask = sf->Amask;
  1688 	Uint32 ashift = sf->Ashift;
  1689 	Uint64 multmask;
  1690 	
  1691 	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1692 
  1693 	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
  1694 	multmask = ~(0xFFFFi64 << (ashift * 2));
  1695 	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
  1696 
  1697 	while(height--) {
  1698 	    DUFFS_LOOP4({
  1699 		Uint32 alpha;
  1700 
  1701 		_m_prefetch(srcp + 16);
  1702 		_m_prefetch(dstp + 16);
  1703 
  1704 		alpha = *srcp & amask;
  1705 		if (alpha == 0) {
  1706 			/* do nothing */
  1707 		} else if (alpha == amask) {
  1708 			/* copy RGB, keep dst alpha */
  1709 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  1710 		} else {
  1711 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  1712 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  1713 
  1714 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  1715 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  1716 
  1717 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  1718 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  1719 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  1720 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  1721 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  1722 
  1723 			/* blend */		    
  1724 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  1725 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  1726 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  1727 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  1728 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  1729 			
  1730 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  1731 		}
  1732 		++srcp;
  1733 		++dstp;
  1734 	    }, width);
  1735 	    srcp += srcskip;
  1736 	    dstp += dstskip;
  1737 	}
  1738 	_mm_empty();
  1739 }
  1740 /* End MSVC_ASMBLIT */
  1741 
  1742 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  1743 
  1744 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  1745 
  1746 /* blend a single 16 bit pixel at 50% */
  1747 #define BLEND16_50(d, s, mask)						\
  1748 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
  1749 
  1750 /* blend two 16 bit pixels at 50% */
  1751 #define BLEND2x16_50(d, s, mask)					     \
  1752 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1753 	 + (s & d & (~(mask | mask << 16))))
  1754 
  1755 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
  1756 {
  1757 	int width = info->d_width;
  1758 	int height = info->d_height;
  1759 	Uint16 *srcp = (Uint16 *)info->s_pixels;
  1760 	int srcskip = info->s_skip >> 1;
  1761 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  1762 	int dstskip = info->d_skip >> 1;
  1763 
  1764 	while(height--) {
  1765 		if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
  1766 			/*
  1767 			 * Source and destination not aligned, pipeline it.
  1768 			 * This is mostly a win for big blits but no loss for
  1769 			 * small ones
  1770 			 */
  1771 			Uint32 prev_sw;
  1772 			int w = width;
  1773 
  1774 			/* handle odd destination */
  1775 			if((uintptr_t)dstp & 2) {
  1776 				Uint16 d = *dstp, s = *srcp;
  1777 				*dstp = BLEND16_50(d, s, mask);
  1778 				dstp++;
  1779 				srcp++;
  1780 				w--;
  1781 			}
  1782 			srcp++;	/* srcp is now 32-bit aligned */
  1783 
  1784 			/* bootstrap pipeline with first halfword */
  1785 			prev_sw = ((Uint32 *)srcp)[-1];
  1786 
  1787 			while(w > 1) {
  1788 				Uint32 sw, dw, s;
  1789 				sw = *(Uint32 *)srcp;
  1790 				dw = *(Uint32 *)dstp;
  1791 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1792 				s = (prev_sw << 16) + (sw >> 16);
  1793 #else
  1794 				s = (prev_sw >> 16) + (sw << 16);
  1795 #endif
  1796 				prev_sw = sw;
  1797 				*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
  1798 				dstp += 2;
  1799 				srcp += 2;
  1800 				w -= 2;
  1801 			}
  1802 
  1803 			/* final pixel if any */
  1804 			if(w) {
  1805 				Uint16 d = *dstp, s;
  1806 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1807 				s = (Uint16)prev_sw;
  1808 #else
  1809 				s = (Uint16)(prev_sw >> 16);
  1810 #endif
  1811 				*dstp = BLEND16_50(d, s, mask);
  1812 				srcp++;
  1813 				dstp++;
  1814 			}
  1815 			srcp += srcskip - 1;
  1816 			dstp += dstskip;
  1817 		} else {
  1818 			/* source and destination are aligned */
  1819 			int w = width;
  1820 
  1821 			/* first odd pixel? */
  1822 			if((uintptr_t)srcp & 2) {
  1823 				Uint16 d = *dstp, s = *srcp;
  1824 				*dstp = BLEND16_50(d, s, mask);
  1825 				srcp++;
  1826 				dstp++;
  1827 				w--;
  1828 			}
  1829 			/* srcp and dstp are now 32-bit aligned */
  1830 
  1831 			while(w > 1) {
  1832 				Uint32 sw = *(Uint32 *)srcp;
  1833 				Uint32 dw = *(Uint32 *)dstp;
  1834 				*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
  1835 				srcp += 2;
  1836 				dstp += 2;
  1837 				w -= 2;
  1838 			}
  1839 
  1840 			/* last odd pixel? */
  1841 			if(w) {
  1842 				Uint16 d = *dstp, s = *srcp;
  1843 				*dstp = BLEND16_50(d, s, mask);
  1844 				srcp++;
  1845 				dstp++;
  1846 			}
  1847 			srcp += srcskip;
  1848 			dstp += dstskip;
  1849 		}
  1850 	}
  1851 }
  1852 
  1853 #if GCC_ASMBLIT
  1854 /* fast RGB565->RGB565 blending with surface alpha */
  1855 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
  1856 {
  1857 	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
  1858 	if(alpha == 128) {
  1859 		Blit16to16SurfaceAlpha128(info, 0xf7de);
  1860 	} else {
  1861 		int width = info->d_width;
  1862 		int height = info->d_height;
  1863 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  1864 		int srcskip = info->s_skip >> 1;
  1865 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  1866 		int dstskip = info->d_skip >> 1;
  1867 		Uint32 s, d;
  1868 		Uint8 load[8];
  1869 	  
  1870 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
  1871 		*(Uint64 *)load = alpha;
  1872 		alpha >>= 3;		/* downscale alpha to 5 bits */
  1873 
  1874 		movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
  1875 		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
  1876 		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
  1877 		/* position alpha to allow for mullo and mulhi on diff channels
  1878 		   to reduce the number of operations */
  1879 		psllq_i2r(3, mm0);
  1880 	  
  1881 		/* Setup the 565 color channel masks */
  1882 		*(Uint64 *)load = 0x07E007E007E007E0ULL;
  1883 		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
  1884 		*(Uint64 *)load = 0x001F001F001F001FULL;
  1885 		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
  1886 		while(height--) {
  1887 			DUFFS_LOOP_QUATRO2(
  1888 			{
  1889 				s = *srcp++;
  1890 				d = *dstp;
  1891 				/*
  1892 				 * shift out the middle component (green) to
  1893 				 * the high 16 bits, and process all three RGB
  1894 				 * components at the same time.
  1895 				 */
  1896 				s = (s | s << 16) & 0x07e0f81f;
  1897 				d = (d | d << 16) & 0x07e0f81f;
  1898 				d += (s - d) * alpha >> 5;
  1899 				d &= 0x07e0f81f;
  1900 				*dstp++ = d | d >> 16;
  1901 			},{
  1902 				s = *srcp++;
  1903 				d = *dstp;
  1904 				/*
  1905 				 * shift out the middle component (green) to
  1906 				 * the high 16 bits, and process all three RGB
  1907 				 * components at the same time.
  1908 				 */
  1909 				s = (s | s << 16) & 0x07e0f81f;
  1910 				d = (d | d << 16) & 0x07e0f81f;
  1911 				d += (s - d) * alpha >> 5;
  1912 				d &= 0x07e0f81f;
  1913 				*dstp++ = d | d >> 16;
  1914 				s = *srcp++;
  1915 				d = *dstp;
  1916 				/*
  1917 				 * shift out the middle component (green) to
  1918 				 * the high 16 bits, and process all three RGB
  1919 				 * components at the same time.
  1920 				 */
  1921 				s = (s | s << 16) & 0x07e0f81f;
  1922 				d = (d | d << 16) & 0x07e0f81f;
  1923 				d += (s - d) * alpha >> 5;
  1924 				d &= 0x07e0f81f;
  1925 				*dstp++ = d | d >> 16;
  1926 			},{
  1927 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  1928 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  1929 
  1930 				/* red -- does not need a mask since the right shift clears
  1931 				   the uninteresting bits */
  1932 				movq_r2r(mm2, mm5); /* src -> mm5 */
  1933 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  1934 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
  1935 				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
  1936 
  1937 				/* blend */
  1938 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  1939 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  1940 				/* alpha used is actually 11 bits
  1941 				   11 + 5 = 16 bits, so the sign bits are lost */
  1942 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  1943 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  1944 				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
  1945 
  1946 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  1947 
  1948 				/* green -- process the bits in place */
  1949 				movq_r2r(mm2, mm5); /* src -> mm5 */
  1950 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  1951 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  1952 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  1953 
  1954 				/* blend */
  1955 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  1956 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  1957 				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
  1958 				   bits are gone and the sign bits present */
  1959 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  1960 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  1961 
  1962 				por_r2r(mm6, mm1); /* save new greens in dsts */
  1963 
  1964 				/* blue */
  1965 				movq_r2r(mm2, mm5); /* src -> mm5 */
  1966 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  1967 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  1968 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  1969 
  1970 				/* blend */
  1971 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  1972 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  1973 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  1974 				   the interesting bits will need to be MASKed */
  1975 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  1976 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  1977 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  1978 
  1979 				por_r2r(mm6, mm1); /* save new blues in dsts */
  1980 
  1981 				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
  1982 
  1983 				srcp += 4;
  1984 				dstp += 4;
  1985 			}, width);			
  1986 			srcp += srcskip;
  1987 			dstp += dstskip;
  1988 		}
  1989 		emms();
  1990 	}
  1991 }
  1992 
  1993 /* fast RGB555->RGB555 blending with surface alpha */
  1994 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
  1995 {
  1996 	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
  1997 	if(alpha == 128) {
  1998 		Blit16to16SurfaceAlpha128(info, 0xfbde);
  1999 	} else {
  2000 		int width = info->d_width;
  2001 		int height = info->d_height;
  2002 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2003 		int srcskip = info->s_skip >> 1;
  2004 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2005 		int dstskip = info->d_skip >> 1;
  2006 		Uint32 s, d;
  2007 		Uint8 load[8];
  2008 	  
  2009 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
  2010 		*(Uint64 *)load = alpha;
  2011 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2012 
  2013 		movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
  2014 		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
  2015 		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
  2016 		/* position alpha to allow for mullo and mulhi on diff channels
  2017 		   to reduce the number of operations */
  2018 		psllq_i2r(3, mm0);
  2019 
  2020 		/* Setup the 555 color channel masks */
  2021 		*(Uint64 *)load = 0x03E003E003E003E0ULL;
  2022 		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
  2023 		*(Uint64 *)load = 0x001F001F001F001FULL;
  2024 		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
  2025 		while(height--) {
  2026 			DUFFS_LOOP_QUATRO2(
  2027 			{
  2028 				s = *srcp++;
  2029 				d = *dstp;
  2030 				/*
  2031 				 * shift out the middle component (green) to
  2032 				 * the high 16 bits, and process all three RGB
  2033 				 * components at the same time.
  2034 				 */
  2035 				s = (s | s << 16) & 0x03e07c1f;
  2036 				d = (d | d << 16) & 0x03e07c1f;
  2037 				d += (s - d) * alpha >> 5;
  2038 				d &= 0x03e07c1f;
  2039 				*dstp++ = d | d >> 16;
  2040 			},{
  2041 				s = *srcp++;
  2042 				d = *dstp;
  2043 				/*
  2044 				 * shift out the middle component (green) to
  2045 				 * the high 16 bits, and process all three RGB
  2046 				 * components at the same time.
  2047 				 */
  2048 				s = (s | s << 16) & 0x03e07c1f;
  2049 				d = (d | d << 16) & 0x03e07c1f;
  2050 				d += (s - d) * alpha >> 5;
  2051 				d &= 0x03e07c1f;
  2052 				*dstp++ = d | d >> 16;
  2053 			        s = *srcp++;
  2054 				d = *dstp;
  2055 				/*
  2056 				 * shift out the middle component (green) to
  2057 				 * the high 16 bits, and process all three RGB
  2058 				 * components at the same time.
  2059 				 */
  2060 				s = (s | s << 16) & 0x03e07c1f;
  2061 				d = (d | d << 16) & 0x03e07c1f;
  2062 				d += (s - d) * alpha >> 5;
  2063 				d &= 0x03e07c1f;
  2064 				*dstp++ = d | d >> 16;
  2065 			},{
  2066 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2067 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2068 
  2069 				/* red -- process the bits in place */
  2070 				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
  2071 					/* by reusing the GREEN mask we free up another mmx
  2072 					   register to accumulate the result */
  2073 
  2074 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2075 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2076 				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
  2077 				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
  2078 
  2079 				/* blend */
  2080 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2081 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2082 				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
  2083 				   cleared by a MASK below */
  2084 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2085 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2086 				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
  2087 
  2088 				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
  2089 
  2090 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2091 
  2092 				/* green -- process the bits in place */
  2093 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2094 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2095 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2096 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2097 
  2098 				/* blend */
  2099 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2100 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2101 				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
  2102 				   bits are gone and the sign bits present */
  2103 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2104 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2105 
  2106 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2107 
  2108 				/* blue */
  2109 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2110 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2111 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2112 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2113 
  2114 				/* blend */
  2115 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2116 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2117 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2118 				   the interesting bits will need to be MASKed */
  2119 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2120 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2121 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2122 
  2123 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2124 
  2125 				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
  2126 
  2127 				srcp += 4;
  2128 				dstp += 4;
  2129 			}, width);			
  2130 			srcp += srcskip;
  2131 			dstp += dstskip;
  2132 		}
  2133 		emms();
  2134 	}
  2135 }
  2136 /* End GCC_ASMBLIT */
  2137 
  2138 #elif MSVC_ASMBLIT
  2139 /* fast RGB565->RGB565 blending with surface alpha */
  2140 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
  2141 {
  2142 	unsigned alpha = info->src->alpha;
  2143 	if(alpha == 128) {
  2144 		Blit16to16SurfaceAlpha128(info, 0xf7de);
  2145 	} else {
  2146 		int width = info->d_width;
  2147 		int height = info->d_height;
  2148 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2149 		int srcskip = info->s_skip >> 1;
  2150 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2151 		int dstskip = info->d_skip >> 1;
  2152 		Uint32 s, d;
  2153 	  
  2154 		__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  2155 
  2156 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
  2157 		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
  2158 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2159 
  2160 		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  2161 		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  2162 		/* position alpha to allow for mullo and mulhi on diff channels
  2163 		   to reduce the number of operations */
  2164 		mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2165 	  
  2166 		/* Setup the 565 color channel masks */
  2167 		gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
  2168 		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
  2169 		
  2170 		while(height--) {
  2171 			DUFFS_LOOP_QUATRO2(
  2172 			{
  2173 				s = *srcp++;
  2174 				d = *dstp;
  2175 				/*
  2176 				 * shift out the middle component (green) to
  2177 				 * the high 16 bits, and process all three RGB
  2178 				 * components at the same time.
  2179 				 */
  2180 				s = (s | s << 16) & 0x07e0f81f;
  2181 				d = (d | d << 16) & 0x07e0f81f;
  2182 				d += (s - d) * alpha >> 5;
  2183 				d &= 0x07e0f81f;
  2184 				*dstp++ = (Uint16)(d | d >> 16);
  2185 			},{
  2186 				s = *srcp++;
  2187 				d = *dstp;
  2188 				/*
  2189 				 * shift out the middle component (green) to
  2190 				 * the high 16 bits, and process all three RGB
  2191 				 * components at the same time.
  2192 				 */
  2193 				s = (s | s << 16) & 0x07e0f81f;
  2194 				d = (d | d << 16) & 0x07e0f81f;
  2195 				d += (s - d) * alpha >> 5;
  2196 				d &= 0x07e0f81f;
  2197 				*dstp++ = (Uint16)(d | d >> 16);
  2198 				s = *srcp++;
  2199 				d = *dstp;
  2200 				/*
  2201 				 * shift out the middle component (green) to
  2202 				 * the high 16 bits, and process all three RGB
  2203 				 * components at the same time.
  2204 				 */
  2205 				s = (s | s << 16) & 0x07e0f81f;
  2206 				d = (d | d << 16) & 0x07e0f81f;
  2207 				d += (s - d) * alpha >> 5;
  2208 				d &= 0x07e0f81f;
  2209 				*dstp++ = (Uint16)(d | d >> 16);
  2210 			},{
  2211 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2212 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2213 
  2214 				/* red */
  2215 				src2 = src1;
  2216 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
  2217 
  2218 				dst2 = dst1;
  2219 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
  2220 
  2221 				/* blend */
  2222 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2223 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2224 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2225 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2226 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
  2227 
  2228 				mm_res = dst2; /* RED -> mm_res */
  2229 
  2230 				/* green -- process the bits in place */
  2231 				src2 = src1;
  2232 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2233 
  2234 				dst2 = dst1;
  2235 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2236 
  2237 				/* blend */
  2238 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2239 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2240 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2241 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2242 
  2243 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2244 
  2245 				/* blue */
  2246 				src2 = src1;
  2247 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2248 
  2249 				dst2 = dst1;
  2250 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2251 
  2252 				/* blend */
  2253 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2254 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2255 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2256 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2257 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2258 
  2259 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2260 
  2261 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2262 
  2263 				srcp += 4;
  2264 				dstp += 4;
  2265 			}, width);			
  2266 			srcp += srcskip;
  2267 			dstp += dstskip;
  2268 		}
  2269 		_mm_empty();
  2270 	}
  2271 }
  2272 
  2273 /* fast RGB555->RGB555 blending with surface alpha */
  2274 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
  2275 {
  2276 	unsigned alpha = info->src->alpha;
  2277 	if(alpha == 128) {
  2278 		Blit16to16SurfaceAlpha128(info, 0xfbde);
  2279 	} else {
  2280 		int width = info->d_width;
  2281 		int height = info->d_height;
  2282 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2283 		int srcskip = info->s_skip >> 1;
  2284 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2285 		int dstskip = info->d_skip >> 1;
  2286 		Uint32 s, d;
  2287 	  
  2288 		__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  2289 
  2290 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
  2291 		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
  2292 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2293 
  2294 		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  2295 		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  2296 		/* position alpha to allow for mullo and mulhi on diff channels
  2297 		   to reduce the number of operations */
  2298 		mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2299 	  
  2300 		/* Setup the 555 color channel masks */
  2301 		rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
  2302 		gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
  2303 		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
  2304 
  2305 		while(height--) {
  2306 			DUFFS_LOOP_QUATRO2(
  2307 			{
  2308 				s = *srcp++;
  2309 				d = *dstp;
  2310 				/*
  2311 				 * shift out the middle component (green) to
  2312 				 * the high 16 bits, and process all three RGB
  2313 				 * components at the same time.
  2314 				 */
  2315 				s = (s | s << 16) & 0x03e07c1f;
  2316 				d = (d | d << 16) & 0x03e07c1f;
  2317 				d += (s - d) * alpha >> 5;
  2318 				d &= 0x03e07c1f;
  2319 				*dstp++ = (Uint16)(d | d >> 16);
  2320 			},{
  2321 				s = *srcp++;
  2322 				d = *dstp;
  2323 				/*
  2324 				 * shift out the middle component (green) to
  2325 				 * the high 16 bits, and process all three RGB
  2326 				 * components at the same time.
  2327 				 */
  2328 				s = (s | s << 16) & 0x03e07c1f;
  2329 				d = (d | d << 16) & 0x03e07c1f;
  2330 				d += (s - d) * alpha >> 5;
  2331 				d &= 0x03e07c1f;
  2332 				*dstp++ = (Uint16)(d | d >> 16);
  2333 			        s = *srcp++;
  2334 				d = *dstp;
  2335 				/*
  2336 				 * shift out the middle component (green) to
  2337 				 * the high 16 bits, and process all three RGB
  2338 				 * components at the same time.
  2339 				 */
  2340 				s = (s | s << 16) & 0x03e07c1f;
  2341 				d = (d | d << 16) & 0x03e07c1f;
  2342 				d += (s - d) * alpha >> 5;
  2343 				d &= 0x03e07c1f;
  2344 				*dstp++ = (Uint16)(d | d >> 16);
  2345 			},{
  2346 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2347 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2348 
  2349 				/* red -- process the bits in place */
  2350 				src2 = src1;
  2351 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
  2352 
  2353 				dst2 = dst1;
  2354 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
  2355 
  2356 				/* blend */
  2357 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2358 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2359 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2360 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2361 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
  2362 
  2363 				mm_res = dst2; /* RED -> mm_res */
  2364 				
  2365 				/* green -- process the bits in place */
  2366 				src2 = src1;
  2367 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2368 
  2369 				dst2 = dst1;
  2370 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2371 
  2372 				/* blend */
  2373 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2374 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2375 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2376 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2377 
  2378 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2379 
  2380 				/* blue */
  2381 				src2 = src1; /* src -> src2 */
  2382 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2383 
  2384 				dst2 = dst1; /* dst -> dst2 */
  2385 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2386 
  2387 				/* blend */
  2388 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2389 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2390 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2391 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2392 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2393 
  2394 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2395 
  2396 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2397 
  2398 				srcp += 4;
  2399 				dstp += 4;
  2400 			}, width);			
  2401 			srcp += srcskip;
  2402 			dstp += dstskip;
  2403 		}
  2404 		_mm_empty();
  2405 	}
  2406 }
  2407 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  2408 
  2409 /* fast RGB565->RGB565 blending with surface alpha */
  2410 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
  2411 {
  2412 	unsigned alpha = info->src->alpha;
  2413 	if(alpha == 128) {
  2414 		Blit16to16SurfaceAlpha128(info, 0xf7de);
  2415 	} else {
  2416 		int width = info->d_width;
  2417 		int height = info->d_height;
  2418 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2419 		int srcskip = info->s_skip >> 1;
  2420 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2421 		int dstskip = info->d_skip >> 1;
  2422 		alpha >>= 3;	/* downscale alpha to 5 bits */
  2423 
  2424 		while(height--) {
  2425 			DUFFS_LOOP4({
  2426 				Uint32 s = *srcp++;
  2427 				Uint32 d = *dstp;
  2428 				/*
  2429 				 * shift out the middle component (green) to
  2430 				 * the high 16 bits, and process all three RGB
  2431 				 * components at the same time.
  2432 				 */
  2433 				s = (s | s << 16) & 0x07e0f81f;
  2434 				d = (d | d << 16) & 0x07e0f81f;
  2435 				d += (s - d) * alpha >> 5;
  2436 				d &= 0x07e0f81f;
  2437 				*dstp++ = (Uint16)(d | d >> 16);
  2438 			}, width);
  2439 			srcp += srcskip;
  2440 			dstp += dstskip;
  2441 		}
  2442 	}
  2443 }
  2444 
  2445 /* fast RGB555->RGB555 blending with surface alpha */
  2446 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
  2447 {
  2448 	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
  2449 	if(alpha == 128) {
  2450 		Blit16to16SurfaceAlpha128(info, 0xfbde);
  2451 	} else {
  2452 		int width = info->d_width;
  2453 		int height = info->d_height;
  2454 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2455 		int srcskip = info->s_skip >> 1;
  2456 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2457 		int dstskip = info->d_skip >> 1;
  2458 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2459 
  2460 		while(height--) {
  2461 			DUFFS_LOOP4({
  2462 				Uint32 s = *srcp++;
  2463 				Uint32 d = *dstp;
  2464 				/*
  2465 				 * shift out the middle component (green) to
  2466 				 * the high 16 bits, and process all three RGB
  2467 				 * components at the same time.
  2468 				 */
  2469 				s = (s | s << 16) & 0x03e07c1f;
  2470 				d = (d | d << 16) & 0x03e07c1f;
  2471 				d += (s - d) * alpha >> 5;
  2472 				d &= 0x03e07c1f;
  2473 				*dstp++ = (Uint16)(d | d >> 16);
  2474 			}, width);
  2475 			srcp += srcskip;
  2476 			dstp += dstskip;
  2477 		}
  2478 	}
  2479 }
  2480 
  2481 /* fast ARGB8888->RGB565 blending with pixel alpha */
  2482 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
  2483 {
  2484 	int width = info->d_width;
  2485 	int height = info->d_height;
  2486 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  2487 	int srcskip = info->s_skip >> 2;
  2488 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  2489 	int dstskip = info->d_skip >> 1;
  2490 
  2491 	while(height--) {
  2492 	    DUFFS_LOOP4({
  2493 		Uint32 s = *srcp;
  2494 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  2495 		/* FIXME: Here we special-case opaque alpha since the
  2496 		   compositioning used (>>8 instead of /255) doesn't handle
  2497 		   it correctly. Also special-case alpha=0 for speed?
  2498 		   Benchmark this! */
  2499 		if(alpha) {   
  2500 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2501 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  2502 		  } else {
  2503 		    Uint32 d = *dstp;
  2504 		    /*
  2505 		     * convert source and destination to G0RAB65565
  2506 		     * and blend all components at the same time
  2507 		     */
  2508 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  2509 		      + (s >> 3 & 0x1f);
  2510 		    d = (d | d << 16) & 0x07e0f81f;
  2511 		    d += (s - d) * alpha >> 5;
  2512 		    d &= 0x07e0f81f;
  2513 		    *dstp = (Uint16)(d | d >> 16);
  2514 		  }
  2515 		}
  2516 		srcp++;
  2517 		dstp++;
  2518 	    }, width);
  2519 	    srcp += srcskip;
  2520 	    dstp += dstskip;
  2521 	}
  2522 }
  2523 
  2524 /* fast ARGB8888->RGB555 blending with pixel alpha */
  2525 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
  2526 {
  2527 	int width = info->d_width;
  2528 	int height = info->d_height;
  2529 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  2530 	int srcskip = info->s_skip >> 2;
  2531 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  2532 	int dstskip = info->d_skip >> 1;
  2533 
  2534 	while(height--) {
  2535 	    DUFFS_LOOP4({
  2536 		unsigned alpha;
  2537 		Uint32 s = *srcp;
  2538 		alpha = s >> 27; /* downscale alpha to 5 bits */
  2539 		/* FIXME: Here we special-case opaque alpha since the
  2540 		   compositioning used (>>8 instead of /255) doesn't handle
  2541 		   it correctly. Also special-case alpha=0 for speed?
  2542 		   Benchmark this! */
  2543 		if(alpha) {   
  2544 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2545 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  2546 		  } else {
  2547 		    Uint32 d = *dstp;
  2548 		    /*
  2549 		     * convert source and destination to G0RAB65565
  2550 		     * and blend all components at the same time
  2551 		     */
  2552 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  2553 		      + (s >> 3 & 0x1f);
  2554 		    d = (d | d << 16) & 0x03e07c1f;
  2555 		    d += (s - d) * alpha >> 5;
  2556 		    d &= 0x03e07c1f;
  2557 		    *dstp = (Uint16)(d | d >> 16);
  2558 		  }
  2559 		}
  2560 		srcp++;
  2561 		dstp++;
  2562 	    }, width);
  2563 	    srcp += srcskip;
  2564 	    dstp += dstskip;
  2565 	}
  2566 }
  2567 
  2568 /* General (slow) N->N blending with per-surface alpha */
  2569 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
  2570 {
  2571 	int width = info->d_width;
  2572 	int height = info->d_height;
  2573 	Uint8 *src = info->s_pixels;
  2574 	int srcskip = info->s_skip;
  2575 	Uint8 *dst = info->d_pixels;
  2576 	int dstskip = info->d_skip;
  2577 	SDL_PixelFormat *srcfmt = info->src;
  2578 	SDL_PixelFormat *dstfmt = info->dst;
  2579 	int srcbpp = srcfmt->BytesPerPixel;
  2580 	int dstbpp = dstfmt->BytesPerPixel;
  2581 	unsigned sA = srcfmt->alpha;
  2582 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2583 
  2584 	if(sA) {
  2585 	  while ( height-- ) {
  2586 	    DUFFS_LOOP4(
  2587 	    {
  2588 		Uint32 Pixel;
  2589 		unsigned sR;
  2590 		unsigned sG;
  2591 		unsigned sB;
  2592 		unsigned dR;
  2593 		unsigned dG;
  2594 		unsigned dB;
  2595 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2596 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2597 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2598 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2599 		src += srcbpp;
  2600 		dst += dstbpp;
  2601 	    },
  2602 	    width);
  2603 	    src += srcskip;
  2604 	    dst += dstskip;
  2605 	  }
  2606 	}
  2607 }
  2608 
  2609 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2610 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
  2611 {
  2612 	int width = info->d_width;
  2613 	int height = info->d_height;
  2614 	Uint8 *src = info->s_pixels;
  2615 	int srcskip = info->s_skip;
  2616 	Uint8 *dst = info->d_pixels;
  2617 	int dstskip = info->d_skip;
  2618 	SDL_PixelFormat *srcfmt = info->src;
  2619 	SDL_PixelFormat *dstfmt = info->dst;
  2620 	Uint32 ckey = srcfmt->colorkey;
  2621 	int srcbpp = srcfmt->BytesPerPixel;
  2622 	int dstbpp = dstfmt->BytesPerPixel;
  2623 	unsigned sA = srcfmt->alpha;
  2624 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2625 
  2626 	while ( height-- ) {
  2627 	    DUFFS_LOOP4(
  2628 	    {
  2629 		Uint32 Pixel;
  2630 		unsigned sR;
  2631 		unsigned sG;
  2632 		unsigned sB;
  2633 		unsigned dR;
  2634 		unsigned dG;
  2635 		unsigned dB;
  2636 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2637 		if(sA && Pixel != ckey) {
  2638 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2639 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2640 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2641 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2642 		}
  2643 		src += srcbpp;
  2644 		dst += dstbpp;
  2645 	    },
  2646 	    width);
  2647 	    src += srcskip;
  2648 	    dst += dstskip;
  2649 	}
  2650 }
  2651 
  2652 /* General (slow) N->N blending with pixel alpha */
  2653 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
  2654 {
  2655 	int width = info->d_width;
  2656 	int height = info->d_height;
  2657 	Uint8 *src = info->s_pixels;
  2658 	int srcskip = info->s_skip;
  2659 	Uint8 *dst = info->d_pixels;
  2660 	int dstskip = info->d_skip;
  2661 	SDL_PixelFormat *srcfmt = info->src;
  2662 	SDL_PixelFormat *dstfmt = info->dst;
  2663 
  2664 	int  srcbpp;
  2665 	int  dstbpp;
  2666 
  2667 	/* Set up some basic variables */
  2668 	srcbpp = srcfmt->BytesPerPixel;
  2669 	dstbpp = dstfmt->BytesPerPixel;
  2670 
  2671 	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
  2672 	   quite right. for <8bpp source alpha, it gets them very wrong
  2673 	   (check all macros!)
  2674 	   It is unclear whether there is a good general solution that doesn't
  2675 	   need a branch (or a divide). */
  2676 	while ( height-- ) {
  2677 	    DUFFS_LOOP4(
  2678 	    {
  2679 		Uint32 Pixel;
  2680 		unsigned sR;
  2681 		unsigned sG;
  2682 		unsigned sB;
  2683 		unsigned dR;
  2684 		unsigned dG;
  2685 		unsigned dB;
  2686 		unsigned sA;
  2687 		unsigned dA;
  2688 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  2689 		if(sA) {
  2690 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  2691 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2692 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2693 		}
  2694 		src += srcbpp;
  2695 		dst += dstbpp;
  2696 	    },
  2697 	    width);
  2698 	    src += srcskip;
  2699 	    dst += dstskip;
  2700 	}
  2701 }
  2702 
  2703 
  2704 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
  2705 {
  2706     SDL_PixelFormat *sf = surface->format;
  2707     SDL_PixelFormat *df = surface->map->dst->format;
  2708 
  2709     if(sf->Amask == 0) {
  2710 	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
  2711 	    if(df->BytesPerPixel == 1)
  2712 		return BlitNto1SurfaceAlphaKey;
  2713 	    else
  2714 #if SDL_ALTIVEC_BLITTERS
  2715 	if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2716 	    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
  2717             return Blit32to32SurfaceAlphaKeyAltivec;
  2718         else
  2719 #endif
  2720             return BlitNtoNSurfaceAlphaKey;
  2721 	} else {
  2722 	    /* Per-surface alpha blits */
  2723 	    switch(df->BytesPerPixel) {
  2724 	    case 1:
  2725 		return BlitNto1SurfaceAlpha;
  2726 
  2727 	    case 2:
  2728 		if(surface->map->identity) {
  2729 		    if(df->Gmask == 0x7e0)
  2730 		    {
  2731 #if MMX_ASMBLIT
  2732 		if(SDL_HasMMX())
  2733 			return Blit565to565SurfaceAlphaMMX;
  2734 		else
  2735 #endif
  2736 			return Blit565to565SurfaceAlpha;
  2737 		    }
  2738 		    else if(df->Gmask == 0x3e0)
  2739 		    {
  2740 #if MMX_ASMBLIT
  2741 		if(SDL_HasMMX())
  2742 			return Blit555to555SurfaceAlphaMMX;
  2743 		else
  2744 #endif
  2745 			return Blit555to555SurfaceAlpha;
  2746 		    }
  2747 		}
  2748 		return BlitNtoNSurfaceAlpha;
  2749 
  2750 	    case 4:
  2751 		if(sf->Rmask == df->Rmask
  2752 		   && sf->Gmask == df->Gmask
  2753 		   && sf->Bmask == df->Bmask
  2754 		   && sf->BytesPerPixel == 4)
  2755 		{
  2756 #if MMX_ASMBLIT
  2757 			if(sf->Rshift % 8 == 0
  2758 			   && sf->Gshift % 8 == 0
  2759 			   && sf->Bshift % 8 == 0
  2760 			   && SDL_HasMMX())
  2761 			    return BlitRGBtoRGBSurfaceAlphaMMX;
  2762 #endif
  2763 			if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
  2764 			{
  2765 #if SDL_ALTIVEC_BLITTERS
  2766 				if(!(surface->map->dst->flags & SDL_HWSURFACE)
  2767 					&& SDL_HasAltiVec())
  2768 					return BlitRGBtoRGBSurfaceAlphaAltivec;
  2769 #endif
  2770 				return BlitRGBtoRGBSurfaceAlpha;
  2771 			}
  2772 		}
  2773 #if SDL_ALTIVEC_BLITTERS
  2774 		if((sf->BytesPerPixel == 4) &&
  2775 		   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
  2776 			return Blit32to32SurfaceAlphaAltivec;
  2777 		else
  2778 #endif
  2779 			return BlitNtoNSurfaceAlpha;
  2780 
  2781 	    case 3:
  2782 	    default:
  2783 		return BlitNtoNSurfaceAlpha;
  2784 	    }
  2785 	}
  2786     } else {
  2787 	/* Per-pixel alpha blits */
  2788 	switch(df->BytesPerPixel) {
  2789 	case 1:
  2790 	    return BlitNto1PixelAlpha;
  2791 
  2792 	case 2:
  2793 #if SDL_ALTIVEC_BLITTERS
  2794 	if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
  2795            df->Gmask == 0x7e0 &&
  2796 	   df->Bmask == 0x1f && SDL_HasAltiVec())
  2797             return Blit32to565PixelAlphaAltivec;
  2798         else
  2799 #endif
  2800 	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2801 	       && sf->Gmask == 0xff00
  2802 	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2803 		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  2804 		if(df->Gmask == 0x7e0)
  2805 		    return BlitARGBto565PixelAlpha;
  2806 		else if(df->Gmask == 0x3e0)
  2807 		    return BlitARGBto555PixelAlpha;
  2808 	    }
  2809 	    return BlitNtoNPixelAlpha;
  2810 
  2811 	case 4:
  2812 	    if(sf->Rmask == df->Rmask
  2813 	       && sf->Gmask == df->Gmask
  2814 	       && sf->Bmask == df->Bmask
  2815 	       && sf->BytesPerPixel == 4)
  2816 	    {
  2817 #if MMX_ASMBLIT
  2818 		if(sf->Rshift % 8 == 0
  2819 		   && sf->Gshift % 8 == 0
  2820 		   && sf->Bshift % 8 == 0
  2821 		   && sf->Ashift % 8 == 0
  2822 		   && sf->Aloss == 0)
  2823 		{
  2824 			if(SDL_Has3DNow())
  2825 				return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2826 			if(SDL_HasMMX())
  2827 				return BlitRGBtoRGBPixelAlphaMMX;
  2828 		}
  2829 #endif
  2830 		if(sf->Amask == 0xff000000)
  2831 		{
  2832 #if SDL_ALTIVEC_BLITTERS
  2833 			if(!(surface->map->dst->flags & SDL_HWSURFACE)
  2834 				&& SDL_HasAltiVec())
  2835 				return BlitRGBtoRGBPixelAlphaAltivec;
  2836 #endif
  2837 			return BlitRGBtoRGBPixelAlpha;
  2838 		}
  2839 	    }
  2840 #if SDL_ALTIVEC_BLITTERS
  2841 	    if (sf->Amask && sf->BytesPerPixel == 4 &&
  2842 	        !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
  2843 		return Blit32to32PixelAlphaAltivec;
  2844 	    else
  2845 #endif
  2846 		return BlitNtoNPixelAlpha;
  2847 
  2848 	case 3:
  2849 	default:
  2850 	    return BlitNtoNPixelAlpha;
  2851 	}
  2852     }
  2853 }
  2854