src/video/SDL_blit_A.c
author Ryan C. Gordon <icculus@icculus.org>
Tue, 29 Sep 2009 04:03:58 +0000
branchSDL-1.2
changeset 4262 800f3cb78e45
parent 4159 a1b03ba2fcd0
child 4293 63b54ddd38ea
permissions -rw-r--r--
Fixed strict aliasing (or inline asm?) issue.

Some versions of GCC need this fix or alpha blending is broken.

Fixes Bugzilla #648.
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2009 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 /*
    28   In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
    29    Checking if _mm_free is #defined in malloc.h is the only way to
    30    determine if the Processor Pack is installed, as far as I can tell.
    31 */
    32 
    33 #if SDL_ASSEMBLY_ROUTINES
    34 #  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    35 #    define MMX_ASMBLIT 1
    36 #    define GCC_ASMBLIT 1
    37 #  elif defined(_MSC_VER) && defined(_M_IX86)
    38 #    if (_MSC_VER <= 1200)  
    39 #      include <malloc.h>   
    40 #      if defined(_mm_free)
    41 #          define HAVE_MMINTRIN_H 1
    42 #      endif
    43 #    else  /* Visual Studio > VC6 always has mmintrin.h */
    44 #      define HAVE_MMINTRIN_H 1
    45 #    endif
    46 #    if HAVE_MMINTRIN_H
    47 #      define MMX_ASMBLIT 1
    48 #      define MSVC_ASMBLIT 1
    49 #    endif
    50 #  endif
    51 #endif /* SDL_ASSEMBLY_ROUTINES */
    52 
    53 /* Function to check the CPU flags */
    54 #include "SDL_cpuinfo.h"
    55 #if GCC_ASMBLIT
    56 #include "mmx.h"
    57 #elif MSVC_ASMBLIT
    58 #include <mmintrin.h>
    59 #include <mm3dnow.h>
    60 #endif
    61 
    62 /* Functions to perform alpha blended blitting */
    63 
/* N->1 blending with per-surface alpha */
/* Blits an N-bytes-per-pixel RGB source onto an 8-bit paletted destination,
   blending every pixel against the destination with the source surface's
   global alpha value, then re-packing the result as an RGB332 index
   (optionally remapped through info->table). */
static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint8 *src = info->s_pixels;
	int srcskip = info->s_skip;	/* bytes from end of one src row to start of next */
	Uint8 *dst = info->d_pixels;
	int dstskip = info->d_skip;	/* bytes from end of one dst row to start of next */
	Uint8 *palmap = info->table;	/* optional RGB332 -> palette-index lookup table */
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;
	int srcbpp = srcfmt->BytesPerPixel;

	const unsigned A = srcfmt->alpha;	/* per-surface alpha, applied to every pixel */

	while ( height-- ) {
	    /* DUFFS_LOOP4 unrolls the per-pixel body four times per iteration */
	    DUFFS_LOOP4(
	    {
		Uint32 Pixel;
		unsigned sR;
		unsigned sG;
		unsigned sB;
		unsigned dR;
		unsigned dG;
		unsigned dB;
		/* fetch the source pixel and split it into R,G,B components */
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
		/* destination color comes from the palette entry at the current index */
		dR = dstfmt->palette->colors[*dst].r;
		dG = dstfmt->palette->colors[*dst].g;
		dB = dstfmt->palette->colors[*dst].b;
		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
		dR &= 0xff;
		dG &= 0xff;
		dB &= 0xff;
		/* Pack RGB into 8bit pixel */
		if ( palmap == NULL ) {
		    /* no map: destination palette is assumed to be an RGB332 ramp
		       (3 bits red, 3 bits green, 2 bits blue) */
		    *dst =((dR>>5)<<(3+2))|
			  ((dG>>5)<<(2))|
			  ((dB>>6)<<(0));
		} else {
		    /* remap the packed RGB332 value through the lookup table */
		    *dst = palmap[((dR>>5)<<(3+2))|
				  ((dG>>5)<<(2))  |
				  ((dB>>6)<<(0))];
		}
		dst++;
		src += srcbpp;
	    },
	    width);
	    src += srcskip;
	    dst += dstskip;
	}
}
   116 
/* N->1 blending with pixel alpha */
/* Same as BlitNto1SurfaceAlpha, but each source pixel carries its own
   alpha channel, extracted per pixel via DISEMBLE_RGBA. */
static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint8 *src = info->s_pixels;
	int srcskip = info->s_skip;	/* bytes to skip at the end of each src row */
	Uint8 *dst = info->d_pixels;
	int dstskip = info->d_skip;	/* bytes to skip at the end of each dst row */
	Uint8 *palmap = info->table;	/* optional RGB332 -> palette-index lookup table */
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;
	int srcbpp = srcfmt->BytesPerPixel;

	/* FIXME: fix alpha bit field expansion here too? */
	while ( height-- ) {
	    /* DUFFS_LOOP4 unrolls the per-pixel body four times per iteration */
	    DUFFS_LOOP4(
	    {
		Uint32 Pixel;
		unsigned sR;
		unsigned sG;
		unsigned sB;
		unsigned sA;
		unsigned dR;
		unsigned dG;
		unsigned dB;
		/* fetch the source pixel and split out R,G,B and per-pixel alpha */
		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
		/* destination color comes from the palette entry at the current index */
		dR = dstfmt->palette->colors[*dst].r;
		dG = dstfmt->palette->colors[*dst].g;
		dB = dstfmt->palette->colors[*dst].b;
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
		dR &= 0xff;
		dG &= 0xff;
		dB &= 0xff;
		/* Pack RGB into 8bit pixel */
		if ( palmap == NULL ) {
		    /* no map: destination palette is assumed to be an RGB332 ramp */
		    *dst =((dR>>5)<<(3+2))|
			  ((dG>>5)<<(2))|
			  ((dB>>6)<<(0));
		} else {
		    /* remap the packed RGB332 value through the lookup table */
		    *dst = palmap[((dR>>5)<<(3+2))|
				  ((dG>>5)<<(2))  |
				  ((dB>>6)<<(0))  ];
		}
		dst++;
		src += srcbpp;
	    },
	    width);
	    src += srcskip;
	    dst += dstskip;
	}
}
   169 
   170 /* colorkeyed N->1 blending with per-surface alpha */
   171 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
   172 {
   173 	int width = info->d_width;
   174 	int height = info->d_height;
   175 	Uint8 *src = info->s_pixels;
   176 	int srcskip = info->s_skip;
   177 	Uint8 *dst = info->d_pixels;
   178 	int dstskip = info->d_skip;
   179 	Uint8 *palmap = info->table;
   180 	SDL_PixelFormat *srcfmt = info->src;
   181 	SDL_PixelFormat *dstfmt = info->dst;
   182 	int srcbpp = srcfmt->BytesPerPixel;
   183 	Uint32 ckey = srcfmt->colorkey;
   184 
   185 	const int A = srcfmt->alpha;
   186 
   187 	while ( height-- ) {
   188 	    DUFFS_LOOP(
   189 	    {
   190 		Uint32 Pixel;
   191 		unsigned sR;
   192 		unsigned sG;
   193 		unsigned sB;
   194 		unsigned dR;
   195 		unsigned dG;
   196 		unsigned dB;
   197 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   198 		if ( Pixel != ckey ) {
   199 		    dR = dstfmt->palette->colors[*dst].r;
   200 		    dG = dstfmt->palette->colors[*dst].g;
   201 		    dB = dstfmt->palette->colors[*dst].b;
   202 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   203 		    dR &= 0xff;
   204 		    dG &= 0xff;
   205 		    dB &= 0xff;
   206 		    /* Pack RGB into 8bit pixel */
   207 		    if ( palmap == NULL ) {
   208 			*dst =((dR>>5)<<(3+2))|
   209 			      ((dG>>5)<<(2)) |
   210 			      ((dB>>6)<<(0));
   211 		    } else {
   212 			*dst = palmap[((dR>>5)<<(3+2))|
   213 				      ((dG>>5)<<(2))  |
   214 				      ((dB>>6)<<(0))  ];
   215 		    }
   216 		}
   217 		dst++;
   218 		src += srcbpp;
   219 	    },
   220 	    width);
   221 	    src += srcskip;
   222 	    dst += dstskip;
   223 	}
   224 }
   225 
   226 #if GCC_ASMBLIT
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/*
 * alpha == 128 reduces the blend to a per-channel average of src and dst.
 * The average is computed carry-safely as ((s & 0xfefefe) + (d & 0xfefefe)) >> 1
 * plus the shared low-bit carry term (s & d & 0x010101), two pixels per MMX op.
 */
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;	/* skip is in bytes; >>2 converts to pixels */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	Uint32 dalpha = info->dst->Amask;	/* forced into every output pixel */
	Uint64 load;

	load = 0x00fefefe00fefefeULL;/* alpha128 mask */
	movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
	load = 0x0001010100010101ULL;/* !alpha128 mask */
	movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
	movd_m2r(dalpha, mm7); /* dst alpha mask */
	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
	while(height--) {
		/* DUFFS_LOOP_DOUBLE2: first body handles a leftover single pixel
		   in plain C; second body handles two pixels at a time in MMX */
		DUFFS_LOOP_DOUBLE2(
		{
			Uint32 s = *srcp++;
			Uint32 d = *dstp;
			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
				   + (s & d & 0x00010101)) | dalpha;
		},{
			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */

			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */

			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
			
			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
			dstp += 2;
			srcp += 2;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}
	emms();	/* leave MMX state so the x87 FPU is usable again */
}
   277 
/* fast RGB888->(A)RGB888 blending with surface alpha */
/*
 * Per-channel blend: dst = dst + (((src - dst) * alpha) >> 8), computed in
 * 16-bit lanes with MMX.  Dispatches to the alpha==128 averaging special
 * case when R,G,B occupy the low 24 bits.
 */
static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
{
	SDL_PixelFormat* df = info->dst;
	unsigned alpha = info->src->alpha;

	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
			/* only call a128 version when R,G,B occupy lower bits */
		BlitRGBtoRGBSurfaceAlpha128MMX(info);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint32 *srcp = (Uint32 *)info->s_pixels;
		int srcskip = info->s_skip >> 2;	/* bytes -> pixels */
		Uint32 *dstp = (Uint32 *)info->d_pixels;
		int dstskip = info->d_skip >> 2;

		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
		/* form the alpha mult */
		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
		/* reuse 'alpha' as a scratch RGB channel mask built from the shifts */
		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
			/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
		movd_m2r(df->Amask, mm7); /* dst alpha mask */
		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
		
		while(height--) {
			/* DUFFS_LOOP_DOUBLE2: first body = one leftover pixel,
			   second body = two pixels per iteration */
			DUFFS_LOOP_DOUBLE2({
				/* One Pixel Blend */
				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */

				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */

				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
				++srcp;
				++dstp;
			},{
				/* Two Pixels Blend */
				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */

				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */

				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */

				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */

				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
				
				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */

  				srcp += 2;
  				dstp += 2;
  			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		emms();	/* leave MMX state so the x87 FPU is usable again */
	}
}
   362 
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * Per-pixel alpha blend in MMX 16-bit lanes; alpha==0 and fully-opaque
 * pixels are special-cased.  NOTE(review): the alpha value and shift are
 * loaded with raw __asm__ "movd" statements rather than the movd_m2r macro —
 * presumably the strict-aliasing/inline-asm workaround referenced by this
 * changeset (Bugzilla #648); confirm before changing.
 */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;	/* bytes -> pixels */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 amask = sf->Amask;

	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
	/* form multiplication mask */
	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
	/* form channel masks */
	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
	/* get alpha channel shift */
	__asm__ __volatile__ (
		"movd %0, %%mm5"
		: : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */

	while(height--) {
	    DUFFS_LOOP4({
		Uint32 alpha = *srcp & amask;
		/* FIXME: Here we special-case opaque alpha since the
			compositioning used (>>8 instead of /255) doesn't handle
			it correctly. Also special-case alpha=0 for speed?
			Benchmark this! */
		if(alpha == 0) {
			/* do nothing */
		} else if(alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
			/* using MMX here to free up regular registers for other things */
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
			por_r2r(mm1, mm2); /* src | dst -> mm2 */
			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
		} else {
			/* general case: dst += ((src - dst) * alpha) >> 8 per channel */
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */

			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */

			__asm__ __volatile__ (
				"movd %0, %%mm4"
				: : "r" (alpha) ); /* 0000A000 -> mm4 */
			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */

			/* blend */		    
			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
			
			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
			movd_r2m(mm2, *dstp);/* mm2 -> dst */
		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}
	emms();	/* leave MMX state so the x87 FPU is usable again */
}
   442 /* End GCC_ASMBLIT */
   443 
   444 #elif MSVC_ASMBLIT
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
/*
 * MSVC intrinsics version.  alpha == 128 reduces the blend to a per-channel
 * average: ((s & 0xfefefe) + (d & 0xfefefe)) >> 1 plus the shared low-bit
 * carry term (s & d & 0x010101), two pixels per __m64 operation.
 */
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;	/* skip is in bytes; >>2 converts to pixels */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	Uint32 dalpha = info->dst->Amask;	/* forced into every output pixel */

	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
	
	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */

	while (height--) {
		int n = width;
		/* handle an odd leading pixel in plain C so the main loop
		   can process aligned pairs */
		if ( n & 1 ) {
			Uint32 s = *srcp++;
			Uint32 d = *dstp;
			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
				   + (s & d & 0x00010101)) | dalpha;
			n--;
		}
		
		for (n >>= 1; n > 0; --n) {
			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
			dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */

			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
			src2 = src1; /* 2 x src -> src2(ARGBARGB) */

			dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
			src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */

			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
			dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
			
			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
			dstp += 2;
			srcp += 2;
		}
		
		srcp += srcskip;
		dstp += dstskip;
	}
	_mm_empty();	/* leave MMX state so the x87 FPU is usable again */
}
   499 
   500 /* fast RGB888->(A)RGB888 blending with surface alpha */
   501 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
   502 {
   503 	SDL_PixelFormat* df = info->dst;
   504 	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   505 	unsigned alpha = info->src->alpha;
   506 
   507 	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   508 			/* only call a128 version when R,G,B occupy lower bits */
   509 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
   510 	} else {
   511 		int width = info->d_width;
   512 		int height = info->d_height;
   513 		Uint32 *srcp = (Uint32 *)info->s_pixels;
   514 		int srcskip = info->s_skip >> 2;
   515 		Uint32 *dstp = (Uint32 *)info->d_pixels;
   516 		int dstskip = info->d_skip >> 2;
   517 		Uint32 dalpha = df->Amask;
   518 		Uint32 amult;
   519 
   520 		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   521 		
   522 		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
   523 		/* form the alpha mult */
   524 		amult = alpha | (alpha << 8);
   525 		amult = amult | (amult << 16);
   526 		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
   527 		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
   528 		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   529 			/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   530 		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
   531 		
   532 		while (height--) {
   533 			int n = width;
   534 			if (n & 1) {
   535 				/* One Pixel Blend */
   536 				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
   537 				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   538 
   539 				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   540 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   541 
   542 				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
   543 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   544 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
   545 				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   546 				
   547 				dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   548 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   549 				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   550 
   551 				++srcp;
   552 				++dstp;
   553 				
   554 				n--;
   555 			}
   556 
   557 			for (n >>= 1; n > 0; --n) {
   558 				/* Two Pixels Blend */
   559 				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
   560 				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
   561 				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   562 				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   563 
   564 				dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
   565 				dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
   566 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   567 				dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   568 
   569 				src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   570 				src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
   571 				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
   572 				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   573 
   574 				src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
   575 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   576 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
   577 				dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   578 				
   579 				dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   580 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   581 
   582 				*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
   583 
   584 				srcp += 2;
   585 				dstp += 2;
   586 			}
   587 			srcp += srcskip;
   588 			dstp += dstskip;
   589 		}
   590 		_mm_empty();
   591 	}
   592 }
   593 
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * MSVC intrinsics version.  Per-pixel alpha blend in 16-bit lanes;
 * alpha==0 and fully-opaque pixels are special-cased in plain C.
 */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;	/* bytes -> pixels */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
	Uint32 amask = sf->Amask;
	Uint32 ashift = sf->Ashift;
	Uint64 multmask;

	__m64 src1, dst1, mm_alpha, mm_zero, dmask;

	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
	/* after unpacking, each byte channel occupies a 16-bit lane, so the
	   alpha lane sits at bit position ashift*2; clear it in the mask.
	   (0xFFFFi64 is an MSVC-specific 64-bit literal; fine in this
	   MSVC_ASMBLIT-only branch.) */
	multmask = ~(0xFFFFi64 << (ashift * 2));
	dmask = *(__m64*) &multmask; /* mask clearing the 16-bit alpha lane -> dmask */

	while(height--) {
		DUFFS_LOOP4({
		Uint32 alpha = *srcp & amask;
		if (alpha == 0) {
			/* do nothing */
		} else if (alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
		} else {
			/* general case: dst += ((src - dst) * alpha) >> 8 per channel */
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */

			/* blend */		    
			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
			
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}
	_mm_empty();	/* leave MMX state so the x87 FPU is usable again */
}
   653 /* End MSVC_ASMBLIT */
   654 
   655 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   656 
   657 #if SDL_ALTIVEC_BLITTERS
   658 #if __MWERKS__
   659 #pragma altivec_model on
   660 #endif
   661 #if HAVE_ALTIVEC_H
   662 #include <altivec.h>
   663 #endif
   664 #include <assert.h>
   665 
   666 #if (defined(__MACOSX__) && (__GNUC__ < 4))
   667     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   668         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
   669     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   670         (vector unsigned short) ( a,b,c,d,e,f,g,h )
   671 #else
   672     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   673         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
   674     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   675         (vector unsigned short) { a,b,c,d,e,f,g,h }
   676 #endif
   677 
   678 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
   679 #define VECPRINT(msg, v) do { \
   680     vector unsigned int tmpvec = (vector unsigned int)(v); \
   681     unsigned int *vp = (unsigned int *)&tmpvec; \
   682     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   683 } while (0)
   684 
   685 /* the permutation vector that takes the high bytes out of all the appropriate shorts 
   686     (vector unsigned char)(
   687         0x00, 0x10, 0x02, 0x12,
   688         0x04, 0x14, 0x06, 0x16,
   689         0x08, 0x18, 0x0A, 0x1A,
   690         0x0C, 0x1C, 0x0E, 0x1E );
   691 */
   692 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
   693 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
   694 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
   695 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
   696     ? vec_lvsl(0, src) \
   697     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
   698 
   699    
   700 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
   701     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
   702     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
   703     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
   704     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
   705     /* valpha2 is 255-alpha */ \
   706     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
   707     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
   708     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
   709     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
   710     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
   711     /* add source and dest */ \
   712     vtemp1 = vec_add(vtemp1, vtemp3); \
   713     vtemp2 = vec_add(vtemp2, vtemp4); \
   714     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
   715     vtemp1 = vec_add(vtemp1, v1_16); \
   716     vtemp3 = vec_sr(vtemp1, v8_16); \
   717     vtemp1 = vec_add(vtemp1, vtemp3); \
   718     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
   719     vtemp2 = vec_add(vtemp2, v1_16); \
   720     vtemp4 = vec_sr(vtemp2, v8_16); \
   721     vtemp2 = vec_add(vtemp2, vtemp4); \
   722     /* (>>8) and get ARGBARGBARGBARGB */ \
   723     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   724 } while (0)
   725  
   726 /* Calculate the permute vector used for 32->32 swizzling */
   727 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
   728                                   const SDL_PixelFormat *dstfmt)
   729 {
   730     /*
   731      * We have to assume that the bits that aren't used by other
   732      *  colors is alpha, and it's one complete byte, since some formats
   733      *  leave alpha with a zero mask, but we should still swizzle the bits.
   734      */
   735     /* ARGB */
   736     const static struct SDL_PixelFormat default_pixel_format = {
   737         NULL, 0, 0,
   738         0, 0, 0, 0,
   739         16, 8, 0, 24,
   740         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
   741         0, 0};
   742     if (!srcfmt) {
   743         srcfmt = &default_pixel_format;
   744     }
   745     if (!dstfmt) {
   746         dstfmt = &default_pixel_format;
   747     }
   748     const vector unsigned char plus = VECUINT8_LITERAL
   749                                             ( 0x00, 0x00, 0x00, 0x00,
   750                                               0x04, 0x04, 0x04, 0x04,
   751                                               0x08, 0x08, 0x08, 0x08,
   752                                               0x0C, 0x0C, 0x0C, 0x0C );
   753     vector unsigned char vswiz;
   754     vector unsigned int srcvec;
   755 #define RESHIFT(X) (3 - ((X) >> 3))
   756     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   757     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   758     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   759     Uint32 amask;
   760     /* Use zero for alpha if either surface doesn't have alpha */
   761     if (dstfmt->Amask) {
   762         amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
   763     } else {
   764         amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
   765     }
   766 #undef RESHIFT  
   767     ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
   768     vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
   769     return(vswiz);
   770 }
   771 
/* Blit 32bpp pixels with per-pixel alpha onto a 16bpp 565 destination,
   eight pixels at a time with AltiVec.  Each row is handled in three
   phases: a scalar lead-in until dst is vector-aligned, the vector
   body, and a scalar tail for the leftover (width % 8) pixels. */
static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
{
    int height = info->d_height;
    Uint8 *src = (Uint8 *)info->s_pixels;
    int srcskip = info->s_skip;   /* skips are in bytes here (Uint8 ptrs) */
    Uint8 *dst = (Uint8 *)info->d_pixels;
    int dstskip = info->d_skip;
    SDL_PixelFormat *srcfmt = info->src;

    /* splatted constants used by the blend and the 565 pack/unpack */
    vector unsigned char v0 = vec_splat_u8(0);
    vector unsigned short v8_16 = vec_splat_u16(8);
    vector unsigned short v1_16 = vec_splat_u16(1);
    vector unsigned short v2_16 = vec_splat_u16(2);
    vector unsigned short v3_16 = vec_splat_u16(3);
    vector unsigned int v8_32 = vec_splat_u32(8);
    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
    /* per-lane masks used when rebuilding the 6-bit green field */
    vector unsigned short v3f = VECUINT16_LITERAL(
        0x003f, 0x003f, 0x003f, 0x003f,
        0x003f, 0x003f, 0x003f, 0x003f);
    vector unsigned short vfc = VECUINT16_LITERAL(
        0x00fc, 0x00fc, 0x00fc, 0x00fc,
        0x00fc, 0x00fc, 0x00fc, 0x00fc);

    /* 
        0x10 - 0x1f is the alpha
        0x00 - 0x0e evens are the red
        0x01 - 0x0f odds are zero
    */
    vector unsigned char vredalpha1 = VECUINT8_LITERAL(
        0x10, 0x00, 0x01, 0x01,
        0x10, 0x02, 0x01, 0x01,
        0x10, 0x04, 0x01, 0x01,
        0x10, 0x06, 0x01, 0x01
    );
    /* same selector with the red-byte indices advanced by 8, covering
       the second group of four pixels */
    vector unsigned char vredalpha2 = (vector unsigned char)(
        vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
    );
    /*
        0x00 - 0x0f is ARxx ARxx ARxx ARxx
        0x11 - 0x0f odds are blue
    */
    vector unsigned char vblue1 = VECUINT8_LITERAL(
        0x00, 0x01, 0x02, 0x11,
        0x04, 0x05, 0x06, 0x13,
        0x08, 0x09, 0x0a, 0x15,
        0x0c, 0x0d, 0x0e, 0x17
    );
    /* blue-byte indices advanced by 8 for the second four pixels */
    vector unsigned char vblue2 = (vector unsigned char)(
        vec_add((vector unsigned int)vblue1, v8_32)
    );
    /*
        0x00 - 0x0f is ARxB ARxB ARxB ARxB
        0x10 - 0x0e evens are green
    */
    vector unsigned char vgreen1 = VECUINT8_LITERAL(
        0x00, 0x01, 0x10, 0x03,
        0x04, 0x05, 0x12, 0x07,
        0x08, 0x09, 0x14, 0x0b,
        0x0c, 0x0d, 0x16, 0x0f
    );
    /* green-byte indices advanced by 8 for the second four pixels */
    vector unsigned char vgreen2 = (vector unsigned char)(
        vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
    );
    /* gathers the green bytes of 8 ARGB pixels into 16-bit lanes */
    vector unsigned char vgmerge = VECUINT8_LITERAL(
        0x00, 0x02, 0x00, 0x06,
        0x00, 0x0a, 0x00, 0x0e,
        0x00, 0x12, 0x00, 0x16,
        0x00, 0x1a, 0x00, 0x1e);
    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
    /* swizzles the source pixels into the canonical layout used below */
    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
    /* {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}: replicates byte 0 of
       each 4-byte pixel (the alpha byte after the swizzle) */
    vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));

    /* mask for the 565 red field, built from splat immediates */
    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
    vf800 = vec_sl(vf800, vec_splat_u16(8));

    while(height--) {
        int extrawidth;
        vector unsigned char valigner;
        vector unsigned char vsrc;
        vector unsigned char voverflow;
        int width = info->d_width;

#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB, sA; \
            DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
            if(sA) { \
                unsigned short dstpixel = *((unsigned short *)dst); \
                dR = (dstpixel >> 8) & 0xf8; \
                dG = (dstpixel >> 3) & 0xfc; \
                dB = (dstpixel << 3) & 0xf8; \
                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
                *((unsigned short *)dst) = ( \
                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
                ); \
            } \
            src += 4; \
            dst += 2; \
            widthvar--; \
        }
        /* scalar lead-in until dst reaches a vector-friendly boundary */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
        /* pixels beyond a multiple of 8 are finished by the scalar tail */
        extrawidth = (width % 8);
        valigner = VEC_ALIGNER(src);
        vsrc = (vector unsigned char)vec_ld(0, src);
        width -= extrawidth;
        while (width) {
            vector unsigned char valpha;
            vector unsigned char vsrc1, vsrc2;
            vector unsigned char vdst1, vdst2;
            vector unsigned short vR, vG, vB;
            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;

            /* Load 8 pixels from src as ARGB */
            voverflow = (vector unsigned char)vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);
            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
            src += 16;
            vsrc = (vector unsigned char)vec_ld(15, src);
            voverflow = vec_perm(voverflow, vsrc, valigner);
            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
            src += 16;

            /* Load 8 pixels from dst as XRGB (expand 565 -> 8888) */
            voverflow = vec_ld(0, dst);
            vR = vec_and((vector unsigned short)voverflow, vf800);
            vB = vec_sl((vector unsigned short)voverflow, v3_16);
            vG = vec_sl(vB, v2_16);
            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);

            /* Alpha blend 8 pixels as ARGB */
            valpha = vec_perm(vsrc1, v0, valphaPermute);
            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
            valpha = vec_perm(vsrc2, v0, valphaPermute);
            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);

            /* Convert 8 pixels to 565: vec_packpx yields 1555, so green
               is rebuilt separately at 6-bit precision and OR-ed in */
            vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
            vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
            vgpixel = vec_and(vgpixel, vfc);
            vgpixel = vec_sl(vgpixel, v3_16);
            vrpixel = vec_sl(vpixel, v1_16);
            vrpixel = vec_and(vrpixel, vf800);
            vbpixel = vec_and(vpixel, v3f);
            vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
            vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
            
            /* Store 8 pixels */
            vec_st(vdst1, 0, dst);

            width -= 8;
            dst += 16;
        }
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
        src += srcskip;
        dst += dstskip;
    }
}
   936 
   937 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
   938 {
   939     unsigned alpha = info->src->alpha;
   940     int height = info->d_height;
   941     Uint32 *srcp = (Uint32 *)info->s_pixels;
   942     int srcskip = info->s_skip >> 2;
   943     Uint32 *dstp = (Uint32 *)info->d_pixels;
   944     int dstskip = info->d_skip >> 2;
   945     SDL_PixelFormat *srcfmt = info->src;
   946     SDL_PixelFormat *dstfmt = info->dst;
   947     unsigned sA = srcfmt->alpha;
   948     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   949     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   950     Uint32 ckey = info->src->colorkey;
   951     vector unsigned char mergePermute;
   952     vector unsigned char vsrcPermute;
   953     vector unsigned char vdstPermute;
   954     vector unsigned char vsdstPermute;
   955     vector unsigned char valpha;
   956     vector unsigned char valphamask;
   957     vector unsigned char vbits;
   958     vector unsigned char v0;
   959     vector unsigned short v1;
   960     vector unsigned short v8;
   961     vector unsigned int vckey;
   962     vector unsigned int vrgbmask;
   963 
   964     mergePermute = VEC_MERGE_PERMUTE();
   965     v0 = vec_splat_u8(0);
   966     v1 = vec_splat_u16(1);
   967     v8 = vec_splat_u16(8);
   968 
   969     /* set the alpha to 255 on the destination surf */
   970     valphamask = VEC_ALPHA_MASK();
   971 
   972     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   973     vdstPermute = calc_swizzle32(NULL, dstfmt);
   974     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   975 
   976     /* set a vector full of alpha and 255-alpha */
   977     ((unsigned char *)&valpha)[0] = alpha;
   978     valpha = vec_splat(valpha, 0);
   979     vbits = (vector unsigned char)vec_splat_s8(-1);
   980 
   981     ckey &= rgbmask;
   982     ((unsigned int *)(char*)&vckey)[0] = ckey;
   983     vckey = vec_splat(vckey, 0);
   984     ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
   985     vrgbmask = vec_splat(vrgbmask, 0);
   986 
   987     while(height--) {
   988         int width = info->d_width;
   989 #define ONE_PIXEL_BLEND(condition, widthvar) \
   990         while (condition) { \
   991             Uint32 Pixel; \
   992             unsigned sR, sG, sB, dR, dG, dB; \
   993             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
   994             if(sA && Pixel != ckey) { \
   995                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
   996                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
   997                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   998                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
   999             } \
  1000             dstp++; \
  1001             srcp++; \
  1002             widthvar--; \
  1003         }
  1004         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1005         if (width > 0) {
  1006             int extrawidth = (width % 4);
  1007             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1008             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1009             width -= extrawidth;
  1010             while (width) {
  1011                 vector unsigned char vsel;
  1012                 vector unsigned char voverflow;
  1013                 vector unsigned char vd;
  1014                 vector unsigned char vd_orig;
  1015 
  1016                 /* s = *srcp */
  1017                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1018                 vs = vec_perm(vs, voverflow, valigner);
  1019                 
  1020                 /* vsel is set for items that match the key */
  1021                 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
  1022                 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
  1023 
  1024                 /* permute to source format */
  1025                 vs = vec_perm(vs, valpha, vsrcPermute);
  1026 
  1027                 /* d = *dstp */
  1028                 vd = (vector unsigned char)vec_ld(0, dstp);
  1029                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
  1030 
  1031                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1032 
  1033                 /* set the alpha channel to full on */
  1034                 vd = vec_or(vd, valphamask);
  1035 
  1036                 /* mask out color key */
  1037                 vd = vec_sel(vd, vd_orig, vsel);
  1038                 
  1039                 /* permute to dest format */
  1040                 vd = vec_perm(vd, vbits, vdstPermute);
  1041 
  1042                 /* *dstp = res */
  1043                 vec_st((vector unsigned int)vd, 0, dstp);
  1044                 
  1045                 srcp += 4;
  1046                 dstp += 4;
  1047                 width -= 4;
  1048                 vs = voverflow;
  1049             }
  1050             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1051         }
  1052 #undef ONE_PIXEL_BLEND
  1053  
  1054         srcp += srcskip;
  1055         dstp += dstskip;
  1056     }
  1057 }
  1058 
  1059 
/* Blit 32bpp -> 32bpp with per-pixel alpha, four pixels per AltiVec
   pass.  Arbitrary 32-bit channel orders are handled via
   calc_swizzle32; the destination's own alpha channel is preserved
   (the blend result's alpha bytes are replaced by the original
   destination alpha). */
static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
{
    int width = info->d_width;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;   /* skips in pixels (Uint32 ptrs) */
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    vector unsigned char mergePermute;
    vector unsigned char valphaPermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valphamask;
    vector unsigned char vpixelmask;
    vector unsigned char v0;
    vector unsigned short v1;
    vector unsigned short v8;

    v0 = vec_splat_u8(0);
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);
    mergePermute = VEC_MERGE_PERMUTE();
    valphamask = VEC_ALPHA_MASK();
    /* replicates byte 0 of each 4-byte pixel (alpha, once swizzled) */
    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
    /* complement of the alpha mask: selects the RGB bytes */
    vpixelmask = vec_nor(valphamask, v0);
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
    vdstPermute = calc_swizzle32(NULL, dstfmt);
    vsdstPermute = calc_swizzle32(dstfmt, NULL);

	while ( height-- ) {
        width = info->d_width;
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
            DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
            if(sA) { \
              DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
              ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
              ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
            } \
            ++srcp; \
            ++dstp; \
            widthvar--; \
        }
        /* scalar lead-in until dstp is 16-byte aligned */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            /* vsrcPermute */
            /* vdstPermute */
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;
                vector unsigned char valpha;
                vector unsigned char vdstalpha;
                /* s = *srcp */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                vs = vec_perm(vs, v0, vsrcPermute);

                valpha = vec_perm(vs, v0, valphaPermute);
                
                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd = vec_perm(vd, v0, vsdstPermute);
                vdstalpha = vec_and(vd, valphamask);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha to the dest alpha */
                vd = vec_and(vd, vpixelmask);
                vd = vec_or(vd, vdstalpha);
                vd = vec_perm(vd, v0, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);
                
                srcp += 4;
                dstp += 4;
                width -= 4;
                vs = voverflow;

            }
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
	    srcp += srcskip;
	    dstp += dstskip;
#undef ONE_PIXEL_BLEND
	}
}
  1155 
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
/* AltiVec version of BlitRGBtoRGBPixelAlpha: four pixels per pass.
   No format swizzling is done, so both surfaces are assumed to be in
   the same 8888 layout with alpha in the top byte (the scalar macro
   reads alpha as s >> 24).  The destination alpha is preserved. */
static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
    vector unsigned char mergePermute;
    vector unsigned char valphaPermute;
    vector unsigned char valphamask;
    vector unsigned char vpixelmask;
    vector unsigned char v0;
    vector unsigned short v1;
    vector unsigned short v8;
    v0 = vec_splat_u8(0);
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);
    mergePermute = VEC_MERGE_PERMUTE();
    valphamask = VEC_ALPHA_MASK();
    /* replicates byte 0 (the alpha byte) of each 4-byte pixel */
    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
    
 
    /* complement of the alpha mask: selects the RGB bytes */
    vpixelmask = vec_nor(valphamask, v0);
	while(height--) {
        width = info->d_width;
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while ((condition)) { \
            Uint32 dalpha; \
            Uint32 d; \
            Uint32 s1; \
            Uint32 d1; \
            Uint32 s = *srcp; \
            Uint32 alpha = s >> 24; \
            if(alpha) { \
              if(alpha == SDL_ALPHA_OPAQUE) { \
                *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
              } else { \
                d = *dstp; \
                dalpha = d & 0xff000000; \
                s1 = s & 0xff00ff; \
                d1 = d & 0xff00ff; \
                d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
                s &= 0xff00; \
                d &= 0xff00; \
                d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
                *dstp = d1 | d | dalpha; \
              } \
            } \
            ++srcp; \
            ++dstp; \
            widthvar--; \
	    }
        /* scalar lead-in until dstp is 16-byte aligned */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;
                vector unsigned char valpha;
                vector unsigned char vdstalpha;
                /* s = *srcp */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);

                valpha = vec_perm(vs, v0, valphaPermute);
                
                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vdstalpha = vec_and(vd, valphamask);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha to the dest alpha */
                vd = vec_and(vd, vpixelmask);
                vd = vec_or(vd, vdstalpha);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);
                
                srcp += 4;
                dstp += 4;
                width -= 4;
                vs = voverflow;
            }
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
	    srcp += srcskip;
	    dstp += dstskip;
	}
#undef ONE_PIXEL_BLEND
}
  1252 
/* Blit 32bpp -> 32bpp with a constant per-surface alpha, four pixels
   per AltiVec pass.  Arbitrary 32-bit channel orders are handled via
   calc_swizzle32; the destination alpha channel is forced to full on. */
static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
{
    /* XXX : 6 */
	unsigned alpha = info->src->alpha;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;   /* skips in pixels (Uint32 ptrs) */
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
	unsigned sA = srcfmt->alpha;
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
    vector unsigned char mergePermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valpha;
    vector unsigned char valphamask;
    vector unsigned char vbits;
    vector unsigned short v1;
    vector unsigned short v8;

    mergePermute = VEC_MERGE_PERMUTE();
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);

    /* set the alpha to 255 on the destination surf */
    valphamask = VEC_ALPHA_MASK();

    /* swizzles between the surface formats and the canonical layout */
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
    vdstPermute = calc_swizzle32(NULL, dstfmt);
    vsdstPermute = calc_swizzle32(dstfmt, NULL);

    /* set a vector full of alpha and 255-alpha */
    ((unsigned char *)&valpha)[0] = alpha;
    valpha = vec_splat(valpha, 0);
    vbits = (vector unsigned char)vec_splat_s8(-1);

    while(height--) {
        int width = info->d_width;
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB; \
            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
            ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
            ++srcp; \
            ++dstp; \
            widthvar--; \
        }
        /* scalar lead-in until dstp is 16-byte aligned */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;

                /* s = *srcp */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                vs = vec_perm(vs, valpha, vsrcPermute);
                
                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd = vec_perm(vd, vd, vsdstPermute);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha channel to full on */
                vd = vec_or(vd, valphamask);
                vd = vec_perm(vd, vbits, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);
                
                srcp += 4;
                dstp += 4;
                width -= 4;
                vs = voverflow;
            }
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
#undef ONE_PIXEL_BLEND
 
        srcp += srcskip;
        dstp += dstskip;
    }

}
  1347 
  1348 
/* fast RGB888->(A)RGB888 blending */
/* AltiVec version of BlitRGBtoRGBSurfaceAlpha: constant surface alpha,
   four pixels per pass.  No swizzling: both surfaces are assumed to
   share the same 8888 layout (the scalar macro uses 0xff00ff/0xff00
   masks directly).  The destination alpha is forced to full on. */
static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;   /* skips in pixels (Uint32 ptrs) */
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    vector unsigned char mergePermute;
    vector unsigned char valpha;
    vector unsigned char valphamask;
    vector unsigned short v1;
    vector unsigned short v8;

    mergePermute = VEC_MERGE_PERMUTE();
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);

    /* set the alpha to 255 on the destination surf */
    valphamask = VEC_ALPHA_MASK();

    /* set a vector full of alpha and 255-alpha */
    ((unsigned char *)&valpha)[0] = alpha;
    valpha = vec_splat(valpha, 0);

    while(height--) {
        int width = info->d_width;
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
            Uint32 s = *srcp; \
            Uint32 d = *dstp; \
            Uint32 s1 = s & 0xff00ff; \
            Uint32 d1 = d & 0xff00ff; \
            d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
                 & 0xff00ff; \
            s &= 0xff00; \
            d &= 0xff00; \
            d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
            *dstp = d1 | d | 0xff000000; \
            ++srcp; \
            ++dstp; \
            widthvar--; \
        }
        /* scalar lead-in until dstp is 16-byte aligned */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;

                /* s = *srcp */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                
                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha channel to full on */
                vd = vec_or(vd, valphamask);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);
                
                srcp += 4;
                dstp += 4;
                width -= 4;
                vs = voverflow;
            }
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
#undef ONE_PIXEL_BLEND
 
        srcp += srcskip;
        dstp += dstskip;
    }
}
  1430 #if __MWERKS__
  1431 #pragma altivec_model off
  1432 #endif
  1433 #endif /* SDL_ALTIVEC_BLITTERS */
  1434 
  1435 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1436 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
  1437 {
  1438 	int width = info->d_width;
  1439 	int height = info->d_height;
  1440 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1441 	int srcskip = info->s_skip >> 2;
  1442 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1443 	int dstskip = info->d_skip >> 2;
  1444 
  1445 	while(height--) {
  1446 	    DUFFS_LOOP4({
  1447 		    Uint32 s = *srcp++;
  1448 		    Uint32 d = *dstp;
  1449 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1450 			       + (s & d & 0x00010101)) | 0xff000000;
  1451 	    }, width);
  1452 	    srcp += srcskip;
  1453 	    dstp += dstskip;
  1454 	}
  1455 }
  1456 
/* fast RGB888->(A)RGB888 blending with surface alpha */
/* Scalar constant-alpha blend for same-layout 8888 surfaces.  The
   red+blue channels of a pixel are blended together in one multiply
   (0xff00ff mask); green is done separately.  alpha == 128 is routed
   to the cheaper averaging special case above. */
static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
	if(alpha == 128) {
		BlitRGBtoRGBSurfaceAlpha128(info);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint32 *srcp = (Uint32 *)info->s_pixels;
		int srcskip = info->s_skip >> 2;
		Uint32 *dstp = (Uint32 *)info->d_pixels;
		int dstskip = info->d_skip >> 2;
		Uint32 s;
		Uint32 d;
		Uint32 s1;
		Uint32 d1;

		while(height--) {
			DUFFS_LOOP_DOUBLE2({
				/* One Pixel Blend */
				s = *srcp;
				d = *dstp;
				s1 = s & 0xff00ff;
				d1 = d & 0xff00ff;
				d1 = (d1 + ((s1 - d1) * alpha >> 8))
				     & 0xff00ff;
				s &= 0xff00;
				d &= 0xff00;
				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
				*dstp = d1 | d | 0xff000000;
				++srcp;
				++dstp;
			},{
			        /* Two Pixels Blend */
				s = *srcp;
				d = *dstp;
				s1 = s & 0xff00ff;
				d1 = d & 0xff00ff;
				d1 += (s1 - d1) * alpha >> 8;
				d1 &= 0xff00ff;
				     
				/* pack this pixel's green and the next pixel's
				   green into one word so both blend with a
				   single multiply */
				s = ((s & 0xff00) >> 8) | 
					((srcp[1] & 0xff00) << 8);
				d = ((d & 0xff00) >> 8) |
					((dstp[1] & 0xff00) << 8);
				d += (s - d) * alpha >> 8;
				d &= 0x00ff00ff;
				
				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
				++srcp;
				
				/* second pixel: red+blue blended here, its
				   green taken from the packed word above */
			        s1 = *srcp;
				d1 = *dstp;
				s1 &= 0xff00ff;
				d1 &= 0xff00ff;
				d1 += (s1 - d1) * alpha >> 8;
				d1 &= 0xff00ff;
				
				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
				++srcp;
				++dstp;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
	}
}
  1525 
  1526 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1527 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
  1528 {
  1529 	int width = info->d_width;
  1530 	int height = info->d_height;
  1531 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1532 	int srcskip = info->s_skip >> 2;
  1533 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1534 	int dstskip = info->d_skip >> 2;
  1535 
  1536 	while(height--) {
  1537 	    DUFFS_LOOP4({
  1538 		Uint32 dalpha;
  1539 		Uint32 d;
  1540 		Uint32 s1;
  1541 		Uint32 d1;
  1542 		Uint32 s = *srcp;
  1543 		Uint32 alpha = s >> 24;
  1544 		/* FIXME: Here we special-case opaque alpha since the
  1545 		   compositioning used (>>8 instead of /255) doesn't handle
  1546 		   it correctly. Also special-case alpha=0 for speed?
  1547 		   Benchmark this! */
  1548 		if(alpha) {   
  1549 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1550 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1551 		  } else {
  1552 		    /*
  1553 		     * take out the middle component (green), and process
  1554 		     * the other two in parallel. One multiply less.
  1555 		     */
  1556 		    d = *dstp;
  1557 		    dalpha = d & 0xff000000;
  1558 		    s1 = s & 0xff00ff;
  1559 		    d1 = d & 0xff00ff;
  1560 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1561 		    s &= 0xff00;
  1562 		    d &= 0xff00;
  1563 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1564 		    *dstp = d1 | d | dalpha;
  1565 		  }
  1566 		}
  1567 		++srcp;
  1568 		++dstp;
  1569 	    }, width);
  1570 	    srcp += srcskip;
  1571 	    dstp += dstskip;
  1572 	}
  1573 }
  1574 
  1575 #if GCC_ASMBLIT
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
/* Constant MMX registers are set up once before the pixel loop and stay
   live across it:
     mm3 = ~channel mask (keeps dst alpha), mm4 = RGB channel mask,
     mm5 = alpha shift count, mm6 = zero, mm7 = 00FFFFFF multiply mask. */
static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 amask = sf->Amask;

	__asm__ (
	/* make mm6 all zeros. */
	"pxor       %%mm6, %%mm6\n"
	
	/* Make a mask to preserve the alpha. */
	"movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
	"punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
	"pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
	"movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
	"pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */

	/* form channel masks */
	"movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
	"packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
	"packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
	"pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
	
	/* get alpha channel shift */
	"movd      %1, %%mm5\n\t" /* Ashift -> mm5 */

	  : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );

	while(height--) {

	    DUFFS_LOOP4({
		Uint32 alpha;

		/* 3DNow! prefetch one cache line ahead of both streams */
		__asm__ (
		"prefetch 64(%0)\n"
		"prefetch 64(%1)\n"
			: : "r" (srcp), "r" (dstp) );

		alpha = *srcp & amask;
		/* FIXME: Here we special-case opaque alpha since the
		   compositioning used (>>8 instead of /255) doesn't handle
		   it correctly. Also special-case alpha=0 for speed?
		   Benchmark this! */
		if(alpha == 0) {
		    /* do nothing: fully transparent source pixel */
		}
		else if(alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
		    /* using MMX here to free up regular registers for other things */
			    __asm__ (
		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */

		     : : "r" (srcp), "r" (dstp) );
		} 

		else {
			    __asm__ (
		    /* load in the source, and dst. */
		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */

		    /* Move the src alpha into mm2 */

		    /* if supporting pshufw */
		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
		    /*"psrlw     $8, %%mm2\n" */
		    
		    /* else: */
		    "movd       %2,    %%mm2\n"
		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
		    "pand       %%mm7, %%mm2\n"              /* zero alpha word to preserve dest alpha */

		    /* move the colors into words. */
		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
		    "punpcklbw %%mm6, %%mm1\n"              /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */

		    /* src - dst */
		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */

		    /* A * (src-dst) */
		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */

		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
		    
		    "movd      %%mm0, (%1)\n"               /* result in mm0 */

		     : : "r" (srcp), "r" (dstp), "r" (alpha) );

		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}

	/* clear MMX state so the FPU is usable again */
	__asm__ (
	"emms\n"
		:   );
}
  1691 /* End GCC_ASMBLIT*/
  1692 
  1693 #elif MSVC_ASMBLIT
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
/* MSVC intrinsics version: requires MMX plus 3DNow! (_m_prefetch). */
static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
	Uint32 amask = sf->Amask;
	Uint32 ashift = sf->Ashift;
	Uint64 multmask;
	
	__m64 src1, dst1, mm_alpha, mm_zero, dmask;

	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
	/* 0xFFFFi64 is an MSVC-specific 64-bit literal; the mask clears the
	   alpha word of the multiplier so dst alpha survives the blend */
	multmask = ~(0xFFFFi64 << (ashift * 2));
	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */

	while(height--) {
	    DUFFS_LOOP4({
		Uint32 alpha;

		/* prefetch ahead of both pixel streams (3DNow!) */
		_m_prefetch(srcp + 16);
		_m_prefetch(dstp + 16);

		alpha = *srcp & amask;
		if (alpha == 0) {
			/* do nothing: fully transparent source pixel */
		} else if (alpha == amask) {
			/* copy RGB, keep dst alpha */
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
		} else {
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */

			/* blend */		    
			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
			
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}
	_mm_empty(); /* clear MMX state so the FPU is usable again */
}
  1758 /* End MSVC_ASMBLIT */
  1759 
  1760 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  1761 
  1762 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  1763 
  1764 /* blend a single 16 bit pixel at 50% */
  1765 #define BLEND16_50(d, s, mask)						\
  1766 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
  1767 
  1768 /* blend two 16 bit pixels at 50% */
  1769 #define BLEND2x16_50(d, s, mask)					     \
  1770 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1771 	 + (s & d & (~(mask | mask << 16))))
  1772 
/* Special-case blit for per-surface alpha == 128 (exactly 50%).
   `mask` has the low bit of every channel cleared for the target 16bpp
   format (0xf7de for 565, 0xfbde for 555) so BLEND2x16_50 can average two
   pixels in one 32-bit operation.  The two branches below handle the cases
   where src and dst rows have the same / different 16-bit alignment
   relative to 32-bit boundaries. */
static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint16 *srcp = (Uint16 *)info->s_pixels;
	int srcskip = info->s_skip >> 1;	/* skips are in pixels (bytes >> 1) */
	Uint16 *dstp = (Uint16 *)info->d_pixels;
	int dstskip = info->d_skip >> 1;

	while(height--) {
		if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
			/*
			 * Source and destination not aligned, pipeline it.
			 * This is mostly a win for big blits but no loss for
			 * small ones
			 */
			Uint32 prev_sw;
			int w = width;

			/* handle odd destination */
			if((uintptr_t)dstp & 2) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				dstp++;
				srcp++;
				w--;
			}
			srcp++;	/* srcp is now 32-bit aligned */

			/* bootstrap pipeline with first halfword */
			prev_sw = ((Uint32 *)srcp)[-1];

			while(w > 1) {
				Uint32 sw, dw, s;
				sw = *(Uint32 *)srcp;
				dw = *(Uint32 *)dstp;
				/* splice the previous and current 32-bit reads
				   into a value phase-aligned with dstp */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				s = (prev_sw << 16) + (sw >> 16);
#else
				s = (prev_sw >> 16) + (sw << 16);
#endif
				prev_sw = sw;
				*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
				dstp += 2;
				srcp += 2;
				w -= 2;
			}

			/* final pixel if any */
			if(w) {
				Uint16 d = *dstp, s;
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				s = (Uint16)prev_sw;
#else
				s = (Uint16)(prev_sw >> 16);
#endif
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
			}
			/* -1 compensates the extra srcp++ taken above */
			srcp += srcskip - 1;
			dstp += dstskip;
		} else {
			/* source and destination are aligned */
			int w = width;

			/* first odd pixel? */
			if((uintptr_t)srcp & 2) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
				w--;
			}
			/* srcp and dstp are now 32-bit aligned */

			while(w > 1) {
				Uint32 sw = *(Uint32 *)srcp;
				Uint32 dw = *(Uint32 *)dstp;
				*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
				srcp += 2;
				dstp += 2;
				w -= 2;
			}

			/* last odd pixel? */
			if(w) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
			}
			srcp += srcskip;
			dstp += dstskip;
		}
	}
}
  1870 
  1871 #if GCC_ASMBLIT
/* fast RGB565->RGB565 blending with surface alpha */
/* Misaligned head/tail pixels (via DUFFS_LOOP_QUATRO2) are blended in
   scalar code using the "green in the high 16 bits" trick; aligned groups
   of 4 pixels are blended per-channel in MMX.  mm0 = alpha<<3, mm4 = green
   mask, mm7 = blue mask -- all constant for the whole blit. */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha; /* per-surface alpha, 0..255 */
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
		Uint64 load;
	  
		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		load = alpha;
		alpha >>= 3;		/* downscale alpha to 5 bits */

		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);
	  
		/* Setup the 565 color channel masks */
		load = 0x07E007E007E007E0ULL;
		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
		load = 0x001F001F001F001FULL;
		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				/* 4-pixel MMX body */
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- does not need a mask since the right shift clears
				   the uninteresting bits */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* alpha used is actually 11 bits
				   11 + 5 = 16 bits, so the sign bits are lost */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);			
			srcp += srcskip;
			dstp += dstskip;
		}
		emms(); /* clear MMX state so the FPU is usable again */
	}
}
  2010 
/* fast RGB555->RGB555 blending with surface alpha */
/* Same structure as the 565 variant above, but with 555 masks.  Only two
   mask registers are available, so the green mask in mm4 is temporarily
   shifted left 5 bits to act as the red mask, then shifted back. */
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha; /* per-surface alpha, 0..255 */
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
		Uint64 load;
	  
		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		load = alpha;
		alpha >>= 3;		/* downscale alpha to 5 bits */

		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);

		/* Setup the 555 color channel masks */
		load = 0x03E003E003E003E0ULL;
		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
		load = 0x001F001F001F001FULL;
		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			},{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			        s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			},{
				/* 4-pixel MMX body */
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- process the bits in place */
				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
					/* by reusing the GREEN mask we free up another mmx
					   register to accumulate the result */

				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
				   cleared by a MASK below */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */

				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);			
			srcp += srcskip;
			dstp += dstskip;
		}
		emms(); /* clear MMX state so the FPU is usable again */
	}
}
  2154 /* End GCC_ASMBLIT */
  2155 
  2156 #elif MSVC_ASMBLIT
/* fast RGB565->RGB565 blending with surface alpha */
/* MSVC intrinsics version of the GCC routine of the same name: misaligned
   head/tail pixels go through the scalar "green in high 16 bits" trick;
   aligned groups of 4 pixels are blended per-channel with MMX. */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha; /* per-surface alpha, 0..255 */
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
	  
		__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;

		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
		alpha >>= 3;		/* downscale alpha to 5 bits */

		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		mm_alpha = _mm_slli_si64(mm_alpha, 3);
	  
		/* Setup the 565 color channel masks */
		gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
		
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				/* 4-pixel MMX body */
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */

				/* red */
				src2 = src1;
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */

				dst2 = dst1;
				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */

				mm_res = dst2; /* RED -> mm_res */

				/* green -- process the bits in place */
				src2 = src1;
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */

				/* blue */
				src2 = src1;
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */

				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);			
			srcp += srcskip;
			dstp += dstskip;
		}
		_mm_empty(); /* clear MMX state so the FPU is usable again */
	}
}
  2290 
/* fast RGB555->RGB555 blending with surface alpha */
/* MSVC intrinsics version: same structure as the 565 routine above, but
   with 555 channel masks and a dedicated red mask register. */
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha; /* per-surface alpha, 0..255 */
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
	  
		__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;

		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
		alpha >>= 3;		/* downscale alpha to 5 bits */

		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		mm_alpha = _mm_slli_si64(mm_alpha, 3);
	  
		/* Setup the 555 color channel masks */
		rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
		gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */

		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
			        s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				/* 4-pixel MMX body */
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */

				/* red -- process the bits in place */
				src2 = src1;
				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */

				mm_res = dst2; /* RED -> mm_res */
				
				/* green -- process the bits in place */
				src2 = src1;
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */

				/* blue */
				src2 = src1; /* src -> src2 */
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */

				dst2 = dst1; /* dst -> dst2 */
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */

				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);			
			srcp += srcskip;
			dstp += dstskip;
		}
		_mm_empty(); /* clear MMX state so the FPU is usable again */
	}
}
  2425 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  2426 
  2427 /* fast RGB565->RGB565 blending with surface alpha */
  2428 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
  2429 {
  2430 	unsigned alpha = info->src->alpha;
  2431 	if(alpha == 128) {
  2432 		Blit16to16SurfaceAlpha128(info, 0xf7de);
  2433 	} else {
  2434 		int width = info->d_width;
  2435 		int height = info->d_height;
  2436 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2437 		int srcskip = info->s_skip >> 1;
  2438 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2439 		int dstskip = info->d_skip >> 1;
  2440 		alpha >>= 3;	/* downscale alpha to 5 bits */
  2441 
  2442 		while(height--) {
  2443 			DUFFS_LOOP4({
  2444 				Uint32 s = *srcp++;
  2445 				Uint32 d = *dstp;
  2446 				/*
  2447 				 * shift out the middle component (green) to
  2448 				 * the high 16 bits, and process all three RGB
  2449 				 * components at the same time.
  2450 				 */
  2451 				s = (s | s << 16) & 0x07e0f81f;
  2452 				d = (d | d << 16) & 0x07e0f81f;
  2453 				d += (s - d) * alpha >> 5;
  2454 				d &= 0x07e0f81f;
  2455 				*dstp++ = (Uint16)(d | d >> 16);
  2456 			}, width);
  2457 			srcp += srcskip;
  2458 			dstp += dstskip;
  2459 		}
  2460 	}
  2461 }
  2462 
  2463 /* fast RGB555->RGB555 blending with surface alpha */
  2464 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
  2465 {
  2466 	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
  2467 	if(alpha == 128) {
  2468 		Blit16to16SurfaceAlpha128(info, 0xfbde);
  2469 	} else {
  2470 		int width = info->d_width;
  2471 		int height = info->d_height;
  2472 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2473 		int srcskip = info->s_skip >> 1;
  2474 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2475 		int dstskip = info->d_skip >> 1;
  2476 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2477 
  2478 		while(height--) {
  2479 			DUFFS_LOOP4({
  2480 				Uint32 s = *srcp++;
  2481 				Uint32 d = *dstp;
  2482 				/*
  2483 				 * shift out the middle component (green) to
  2484 				 * the high 16 bits, and process all three RGB
  2485 				 * components at the same time.
  2486 				 */
  2487 				s = (s | s << 16) & 0x03e07c1f;
  2488 				d = (d | d << 16) & 0x03e07c1f;
  2489 				d += (s - d) * alpha >> 5;
  2490 				d &= 0x03e07c1f;
  2491 				*dstp++ = (Uint16)(d | d >> 16);
  2492 			}, width);
  2493 			srcp += srcskip;
  2494 			dstp += dstskip;
  2495 		}
  2496 	}
  2497 }
  2498 
  2499 /* fast ARGB8888->RGB565 blending with pixel alpha */
  2500 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
  2501 {
  2502 	int width = info->d_width;
  2503 	int height = info->d_height;
  2504 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  2505 	int srcskip = info->s_skip >> 2;
  2506 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  2507 	int dstskip = info->d_skip >> 1;
  2508 
  2509 	while(height--) {
  2510 	    DUFFS_LOOP4({
  2511 		Uint32 s = *srcp;
  2512 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  2513 		/* FIXME: Here we special-case opaque alpha since the
  2514 		   compositioning used (>>8 instead of /255) doesn't handle
  2515 		   it correctly. Also special-case alpha=0 for speed?
  2516 		   Benchmark this! */
  2517 		if(alpha) {   
  2518 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2519 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  2520 		  } else {
  2521 		    Uint32 d = *dstp;
  2522 		    /*
  2523 		     * convert source and destination to G0RAB65565
  2524 		     * and blend all components at the same time
  2525 		     */
  2526 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  2527 		      + (s >> 3 & 0x1f);
  2528 		    d = (d | d << 16) & 0x07e0f81f;
  2529 		    d += (s - d) * alpha >> 5;
  2530 		    d &= 0x07e0f81f;
  2531 		    *dstp = (Uint16)(d | d >> 16);
  2532 		  }
  2533 		}
  2534 		srcp++;
  2535 		dstp++;
  2536 	    }, width);
  2537 	    srcp += srcskip;
  2538 	    dstp += dstskip;
  2539 	}
  2540 }
  2541 
  2542 /* fast ARGB8888->RGB555 blending with pixel alpha */
  2543 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
  2544 {
  2545 	int width = info->d_width;
  2546 	int height = info->d_height;
  2547 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  2548 	int srcskip = info->s_skip >> 2;
  2549 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  2550 	int dstskip = info->d_skip >> 1;
  2551 
  2552 	while(height--) {
  2553 	    DUFFS_LOOP4({
  2554 		unsigned alpha;
  2555 		Uint32 s = *srcp;
  2556 		alpha = s >> 27; /* downscale alpha to 5 bits */
  2557 		/* FIXME: Here we special-case opaque alpha since the
  2558 		   compositioning used (>>8 instead of /255) doesn't handle
  2559 		   it correctly. Also special-case alpha=0 for speed?
  2560 		   Benchmark this! */
  2561 		if(alpha) {   
  2562 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2563 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  2564 		  } else {
  2565 		    Uint32 d = *dstp;
  2566 		    /*
  2567 		     * convert source and destination to G0RAB65565
  2568 		     * and blend all components at the same time
  2569 		     */
  2570 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  2571 		      + (s >> 3 & 0x1f);
  2572 		    d = (d | d << 16) & 0x03e07c1f;
  2573 		    d += (s - d) * alpha >> 5;
  2574 		    d &= 0x03e07c1f;
  2575 		    *dstp = (Uint16)(d | d >> 16);
  2576 		  }
  2577 		}
  2578 		srcp++;
  2579 		dstp++;
  2580 	    }, width);
  2581 	    srcp += srcskip;
  2582 	    dstp += dstskip;
  2583 	}
  2584 }
  2585 
/* General (slow) N->N blending with per-surface alpha */
static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint8 *src = info->s_pixels;
	int srcskip = info->s_skip;	/* skips are in bytes; src/dst are byte pointers */
	Uint8 *dst = info->d_pixels;
	int dstskip = info->d_skip;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;
	int srcbpp = srcfmt->BytesPerPixel;
	int dstbpp = dstfmt->BytesPerPixel;
	unsigned sA = srcfmt->alpha;	/* per-surface alpha, applied to every pixel */
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;	/* alpha written out: opaque iff dst has an alpha channel */

	if(sA) {	/* sA == 0 leaves the destination unchanged, so skip the whole blit */
	  while ( height-- ) {
	    DUFFS_LOOP4(
	    {
		Uint32 Pixel;
		unsigned sR;
		unsigned sG;
		unsigned sB;
		unsigned dR;
		unsigned dG;
		unsigned dB;
		/* unpack src and dst in their own formats, blend the
		   components with sA, then repack into the dst format */
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
		src += srcbpp;
		dst += dstbpp;
	    },
	    width);
	    src += srcskip;
	    dst += dstskip;
	  }
	}
}
  2626 
  2627 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2628 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
  2629 {
  2630 	int width = info->d_width;
  2631 	int height = info->d_height;
  2632 	Uint8 *src = info->s_pixels;
  2633 	int srcskip = info->s_skip;
  2634 	Uint8 *dst = info->d_pixels;
  2635 	int dstskip = info->d_skip;
  2636 	SDL_PixelFormat *srcfmt = info->src;
  2637 	SDL_PixelFormat *dstfmt = info->dst;
  2638 	Uint32 ckey = srcfmt->colorkey;
  2639 	int srcbpp = srcfmt->BytesPerPixel;
  2640 	int dstbpp = dstfmt->BytesPerPixel;
  2641 	unsigned sA = srcfmt->alpha;
  2642 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2643 
  2644 	while ( height-- ) {
  2645 	    DUFFS_LOOP4(
  2646 	    {
  2647 		Uint32 Pixel;
  2648 		unsigned sR;
  2649 		unsigned sG;
  2650 		unsigned sB;
  2651 		unsigned dR;
  2652 		unsigned dG;
  2653 		unsigned dB;
  2654 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2655 		if(sA && Pixel != ckey) {
  2656 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2657 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2658 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2659 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2660 		}
  2661 		src += srcbpp;
  2662 		dst += dstbpp;
  2663 	    },
  2664 	    width);
  2665 	    src += srcskip;
  2666 	    dst += dstskip;
  2667 	}
  2668 }
  2669 
/* General (slow) N->N blending with pixel alpha */
static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint8 *src = info->s_pixels;
	int srcskip = info->s_skip;	/* skips are in bytes; src/dst are byte pointers */
	Uint8 *dst = info->d_pixels;
	int dstskip = info->d_skip;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;

	int  srcbpp;
	int  dstbpp;

	/* Set up some basic variables */
	srcbpp = srcfmt->BytesPerPixel;
	dstbpp = dstfmt->BytesPerPixel;

	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
	   quite right. for <8bpp source alpha, it gets them very wrong
	   (check all macros!)
	   It is unclear whether there is a good general solution that doesn't
	   need a branch (or a divide). */
	while ( height-- ) {
	    DUFFS_LOOP4(
	    {
		Uint32 Pixel;
		unsigned sR;
		unsigned sG;
		unsigned sB;
		unsigned dR;
		unsigned dG;
		unsigned dB;
		unsigned sA;	/* alpha is taken from each source pixel here */
		unsigned dA;
		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
		if(sA) {	/* transparent pixels leave the destination untouched */
		  /* unpack dst, blend with the pixel's own alpha, repack */
		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
		}
		src += srcbpp;
		dst += dstbpp;
	    },
	    width);
	    src += srcskip;
	    dst += dstskip;
	}
}
  2720 
  2721 
/*
 * Select the best available alpha blitter for this surface -> destination
 * mapping.  Dispatch order: source alpha channel present or not, colorkey,
 * destination depth, then channel-mask layout; CPU-specific variants
 * (MMX/3DNow!/AltiVec) are preferred when compiled in and detected at
 * runtime.  NOTE(review): blit_index is not examined in this function --
 * presumably kept for signature parity with the other blit-selection
 * routines; confirm against SDL_CalculateBlit callers.
 */
SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
{
    SDL_PixelFormat *sf = surface->format;
    SDL_PixelFormat *df = surface->map->dst->format;

    if(sf->Amask == 0) {
	/* No source alpha channel: per-surface alpha, possibly with colorkey */
	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
	    if(df->BytesPerPixel == 1)
		return BlitNto1SurfaceAlphaKey;
	    else
#if SDL_ALTIVEC_BLITTERS
	if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
	    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
            return Blit32to32SurfaceAlphaKeyAltivec;
        else
#endif
            return BlitNtoNSurfaceAlphaKey;
	} else {
	    /* Per-surface alpha blits */
	    switch(df->BytesPerPixel) {
	    case 1:
		return BlitNto1SurfaceAlpha;

	    case 2:
		/* identity mapping: src already matches dst, so the
		   specialized same-format 16bpp blitters can be used */
		if(surface->map->identity) {
		    if(df->Gmask == 0x7e0)	/* RGB565 */
		    {
#if MMX_ASMBLIT
		if(SDL_HasMMX())
			return Blit565to565SurfaceAlphaMMX;
		else
#endif
			return Blit565to565SurfaceAlpha;
		    }
		    else if(df->Gmask == 0x3e0)	/* RGB555 */
		    {
#if MMX_ASMBLIT
		if(SDL_HasMMX())
			return Blit555to555SurfaceAlphaMMX;
		else
#endif
			return Blit555to555SurfaceAlpha;
		    }
		}
		return BlitNtoNSurfaceAlpha;

	    case 4:
		/* 32bpp with identical channel masks on both sides */
		if(sf->Rmask == df->Rmask
		   && sf->Gmask == df->Gmask
		   && sf->Bmask == df->Bmask
		   && sf->BytesPerPixel == 4)
		{
#if MMX_ASMBLIT
			/* MMX path requires byte-aligned color channels */
			if(sf->Rshift % 8 == 0
			   && sf->Gshift % 8 == 0
			   && sf->Bshift % 8 == 0
			   && SDL_HasMMX())
			    return BlitRGBtoRGBSurfaceAlphaMMX;
#endif
			/* RGB packed into the low 24 bits */
			if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
			{
#if SDL_ALTIVEC_BLITTERS
				if(!(surface->map->dst->flags & SDL_HWSURFACE)
					&& SDL_HasAltiVec())
					return BlitRGBtoRGBSurfaceAlphaAltivec;
#endif
				return BlitRGBtoRGBSurfaceAlpha;
			}
		}
#if SDL_ALTIVEC_BLITTERS
		if((sf->BytesPerPixel == 4) &&
		   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
			return Blit32to32SurfaceAlphaAltivec;
		else
#endif
			return BlitNtoNSurfaceAlpha;

	    case 3:
	    default:
		return BlitNtoNSurfaceAlpha;
	    }
	}
    } else {
	/* Per-pixel alpha blits */
	switch(df->BytesPerPixel) {
	case 1:
	    return BlitNto1PixelAlpha;

	case 2:
#if SDL_ALTIVEC_BLITTERS
	if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
           df->Gmask == 0x7e0 &&
	   df->Bmask == 0x1f && SDL_HasAltiVec())
            return Blit32to565PixelAlphaAltivec;
        else
#endif
	    /* ARGB8888 (or the byte-swapped variant) down to 565/555 */
	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
	       && sf->Gmask == 0xff00
	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
		if(df->Gmask == 0x7e0)
		    return BlitARGBto565PixelAlpha;
		else if(df->Gmask == 0x3e0)
		    return BlitARGBto555PixelAlpha;
	    }
	    return BlitNtoNPixelAlpha;

	case 4:
	    /* 32bpp with identical channel masks on both sides */
	    if(sf->Rmask == df->Rmask
	       && sf->Gmask == df->Gmask
	       && sf->Bmask == df->Bmask
	       && sf->BytesPerPixel == 4)
	    {
#if MMX_ASMBLIT
		/* MMX/3DNow! paths require byte-aligned channels and a
		   full-precision (8-bit) alpha channel */
		if(sf->Rshift % 8 == 0
		   && sf->Gshift % 8 == 0
		   && sf->Bshift % 8 == 0
		   && sf->Ashift % 8 == 0
		   && sf->Aloss == 0)
		{
			if(SDL_Has3DNow())
				return BlitRGBtoRGBPixelAlphaMMX3DNOW;
			if(SDL_HasMMX())
				return BlitRGBtoRGBPixelAlphaMMX;
		}
#endif
		/* alpha in the top byte (ARGB8888-style layouts) */
		if(sf->Amask == 0xff000000)
		{
#if SDL_ALTIVEC_BLITTERS
			if(!(surface->map->dst->flags & SDL_HWSURFACE)
				&& SDL_HasAltiVec())
				return BlitRGBtoRGBPixelAlphaAltivec;
#endif
			return BlitRGBtoRGBPixelAlpha;
		}
	    }
#if SDL_ALTIVEC_BLITTERS
	    if (sf->Amask && sf->BytesPerPixel == 4 &&
	        !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
		return Blit32to32PixelAlphaAltivec;
	    else
#endif
		return BlitNtoNPixelAlpha;

	case 3:
	default:
	    return BlitNtoNPixelAlpha;
	}
    }
}
  2872