src/video/SDL_blit_A.c
author Ryan C. Gordon <icculus@icculus.org>
Sat, 10 Sep 2011 23:21:19 -0400
branch SDL-1.2
changeset 5883 739ad55fe50d
parent 4293 63b54ddd38ea
child 5906 867c4c3604b7
permissions -rw-r--r--
Disabled MMX blitters on GCC. They break the build on tons of machines now.
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2009 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 /*
    28   In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
    29    Checking if _mm_free is #defined in malloc.h is is the only way to
    30    determine if the Processor Pack is installed, as far as I can tell.
    31 */
    32 
    33 #if SDL_ASSEMBLY_ROUTINES
    34 #  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    35      /* forced MMX to 0...it breaks on most compilers now.  --ryan. */
    36 #    define MMX_ASMBLIT 0
    37 #    define GCC_ASMBLIT 1
    38 #  elif defined(_MSC_VER) && defined(_M_IX86)
    39 #    if (_MSC_VER <= 1200)  
    40 #      include <malloc.h>   
    41 #      if defined(_mm_free)
    42 #          define HAVE_MMINTRIN_H 1
    43 #      endif
    44 #    else  /* Visual Studio > VC6 always has mmintrin.h */
    45 #      define HAVE_MMINTRIN_H 1
    46 #    endif
    47 #    if HAVE_MMINTRIN_H
    48 #      define MMX_ASMBLIT 1
    49 #      define MSVC_ASMBLIT 1
    50 #    endif
    51 #  endif
    52 #endif /* SDL_ASSEMBLY_ROUTINES */
    53 
    54 /* Function to check the CPU flags */
    55 #include "SDL_cpuinfo.h"
    56 #if GCC_ASMBLIT
    57 #include "mmx.h"
    58 #elif MSVC_ASMBLIT
    59 #include <mmintrin.h>
    60 #include <mm3dnow.h>
    61 #endif
    62 
    63 /* Functions to perform alpha blended blitting */
    64 
    65 /* N->1 blending with per-surface alpha */
    66 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
    67 {
    68 	int width = info->d_width;
    69 	int height = info->d_height;
    70 	Uint8 *src = info->s_pixels;
    71 	int srcskip = info->s_skip;
    72 	Uint8 *dst = info->d_pixels;
    73 	int dstskip = info->d_skip;
    74 	Uint8 *palmap = info->table;
    75 	SDL_PixelFormat *srcfmt = info->src;
    76 	SDL_PixelFormat *dstfmt = info->dst;
    77 	int srcbpp = srcfmt->BytesPerPixel;
    78 
    79 	const unsigned A = srcfmt->alpha;
    80 
    81 	while ( height-- ) {
    82 	    DUFFS_LOOP4(
    83 	    {
    84 		Uint32 Pixel;
    85 		unsigned sR;
    86 		unsigned sG;
    87 		unsigned sB;
    88 		unsigned dR;
    89 		unsigned dG;
    90 		unsigned dB;
    91 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    92 		dR = dstfmt->palette->colors[*dst].r;
    93 		dG = dstfmt->palette->colors[*dst].g;
    94 		dB = dstfmt->palette->colors[*dst].b;
    95 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    96 		dR &= 0xff;
    97 		dG &= 0xff;
    98 		dB &= 0xff;
    99 		/* Pack RGB into 8bit pixel */
   100 		if ( palmap == NULL ) {
   101 		    *dst =((dR>>5)<<(3+2))|
   102 			  ((dG>>5)<<(2))|
   103 			  ((dB>>6)<<(0));
   104 		} else {
   105 		    *dst = palmap[((dR>>5)<<(3+2))|
   106 				  ((dG>>5)<<(2))  |
   107 				  ((dB>>6)<<(0))];
   108 		}
   109 		dst++;
   110 		src += srcbpp;
   111 	    },
   112 	    width);
   113 	    src += srcskip;
   114 	    dst += dstskip;
   115 	}
   116 }
   117 
   118 /* N->1 blending with pixel alpha */
   119 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
   120 {
   121 	int width = info->d_width;
   122 	int height = info->d_height;
   123 	Uint8 *src = info->s_pixels;
   124 	int srcskip = info->s_skip;
   125 	Uint8 *dst = info->d_pixels;
   126 	int dstskip = info->d_skip;
   127 	Uint8 *palmap = info->table;
   128 	SDL_PixelFormat *srcfmt = info->src;
   129 	SDL_PixelFormat *dstfmt = info->dst;
   130 	int srcbpp = srcfmt->BytesPerPixel;
   131 
   132 	/* FIXME: fix alpha bit field expansion here too? */
   133 	while ( height-- ) {
   134 	    DUFFS_LOOP4(
   135 	    {
   136 		Uint32 Pixel;
   137 		unsigned sR;
   138 		unsigned sG;
   139 		unsigned sB;
   140 		unsigned sA;
   141 		unsigned dR;
   142 		unsigned dG;
   143 		unsigned dB;
   144 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   145 		dR = dstfmt->palette->colors[*dst].r;
   146 		dG = dstfmt->palette->colors[*dst].g;
   147 		dB = dstfmt->palette->colors[*dst].b;
   148 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   149 		dR &= 0xff;
   150 		dG &= 0xff;
   151 		dB &= 0xff;
   152 		/* Pack RGB into 8bit pixel */
   153 		if ( palmap == NULL ) {
   154 		    *dst =((dR>>5)<<(3+2))|
   155 			  ((dG>>5)<<(2))|
   156 			  ((dB>>6)<<(0));
   157 		} else {
   158 		    *dst = palmap[((dR>>5)<<(3+2))|
   159 				  ((dG>>5)<<(2))  |
   160 				  ((dB>>6)<<(0))  ];
   161 		}
   162 		dst++;
   163 		src += srcbpp;
   164 	    },
   165 	    width);
   166 	    src += srcskip;
   167 	    dst += dstskip;
   168 	}
   169 }
   170 
   171 /* colorkeyed N->1 blending with per-surface alpha */
   172 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
   173 {
   174 	int width = info->d_width;
   175 	int height = info->d_height;
   176 	Uint8 *src = info->s_pixels;
   177 	int srcskip = info->s_skip;
   178 	Uint8 *dst = info->d_pixels;
   179 	int dstskip = info->d_skip;
   180 	Uint8 *palmap = info->table;
   181 	SDL_PixelFormat *srcfmt = info->src;
   182 	SDL_PixelFormat *dstfmt = info->dst;
   183 	int srcbpp = srcfmt->BytesPerPixel;
   184 	Uint32 ckey = srcfmt->colorkey;
   185 
   186 	const int A = srcfmt->alpha;
   187 
   188 	while ( height-- ) {
   189 	    DUFFS_LOOP(
   190 	    {
   191 		Uint32 Pixel;
   192 		unsigned sR;
   193 		unsigned sG;
   194 		unsigned sB;
   195 		unsigned dR;
   196 		unsigned dG;
   197 		unsigned dB;
   198 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   199 		if ( Pixel != ckey ) {
   200 		    dR = dstfmt->palette->colors[*dst].r;
   201 		    dG = dstfmt->palette->colors[*dst].g;
   202 		    dB = dstfmt->palette->colors[*dst].b;
   203 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   204 		    dR &= 0xff;
   205 		    dG &= 0xff;
   206 		    dB &= 0xff;
   207 		    /* Pack RGB into 8bit pixel */
   208 		    if ( palmap == NULL ) {
   209 			*dst =((dR>>5)<<(3+2))|
   210 			      ((dG>>5)<<(2)) |
   211 			      ((dB>>6)<<(0));
   212 		    } else {
   213 			*dst = palmap[((dR>>5)<<(3+2))|
   214 				      ((dG>>5)<<(2))  |
   215 				      ((dB>>6)<<(0))  ];
   216 		    }
   217 		}
   218 		dst++;
   219 		src += srcbpp;
   220 	    },
   221 	    width);
   222 	    src += srcskip;
   223 	    dst += dstskip;
   224 	}
   225 }
   226 
   227 #if GCC_ASMBLIT
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
   The 50% blend is the classic carry-save average, two pixels at a time:
   (s+d)/2 per byte channel == ((s & 0xfefefe) + (d & 0xfefefe)) >> 1
   plus the dropped carry bits (s & d & 0x010101); the destination alpha
   mask is then OR'd in so the written pixels are fully opaque. */
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2; /* skips are byte counts -> 4-byte pixels (TODO confirm against SDL_blit.h) */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	Uint32 dalpha = info->dst->Amask;
	Uint64 load;

	load = 0x00fefefe00fefefeULL;/* alpha128 mask */
	movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
	load = 0x0001010100010101ULL;/* !alpha128 mask */
	movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
	movd_m2r(dalpha, mm7); /* dst alpha mask */
	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
	while(height--) {
		/* first arm: scalar single-pixel blend; second arm: MMX
		   two-pixel blend (cf. the explicit n&1 split in the MSVC
		   version of this routine) */
		DUFFS_LOOP_DOUBLE2(
		{
			Uint32 s = *srcp++;
			Uint32 d = *dstp;
			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
				   + (s & d & 0x00010101)) | dalpha;
		},{
			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */

			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */

			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 (carry bits) */
			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
			
			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
			dstp += 2;
			srcp += 2;
		}, width);
		srcp += srcskip;
		dstp += dstskip;
	}
	emms(); /* clear MMX state so the x87 FPU is usable again */
}
   278 
/* fast RGB888->(A)RGB888 blending with surface alpha.
   Dispatches to the alpha==128 averaging special case when R,G,B occupy
   the low 24 bits; otherwise interpolates each channel with
   dst + ((src - dst) * alpha >> 8), with the alpha lane of the multiplier
   masked to 0 so the destination alpha is taken from Amask (mm7) instead. */
static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
{
	SDL_PixelFormat* df = info->dst;
	unsigned alpha = info->src->alpha;

	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
			/* only call a128 version when R,G,B occupy lower bits */
		BlitRGBtoRGBSurfaceAlpha128MMX(info);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint32 *srcp = (Uint32 *)info->s_pixels;
		int srcskip = info->s_skip >> 2; /* byte skip -> Uint32 pixel skip */
		Uint32 *dstp = (Uint32 *)info->d_pixels;
		int dstskip = info->d_skip >> 2;

		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
		/* form the alpha mult */
		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
		/* build an RGB-channels-only mask and knock the alpha word
		   out of the multiplier */
		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
			/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
		movd_m2r(df->Amask, mm7); /* dst alpha mask */
		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
		
		while(height--) {
			DUFFS_LOOP_DOUBLE2({
				/* One Pixel Blend */
				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */

				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */

				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
				++srcp;
				++dstp;
			},{
				/* Two Pixels Blend */
				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */

				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */

				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */

				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */

				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
				
				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */

  				srcp += 2;
  				dstp += 2;
  			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		emms(); /* clear MMX state so the x87 FPU is usable again */
	}
}
   363 
/* fast ARGB888->(A)RGB888 blending with pixel alpha.
   Per-pixel alpha with two fast paths: alpha==0 leaves the destination
   untouched, alpha==amask copies the source RGB while preserving the
   destination alpha.  The general path interpolates with
   dst + ((src - dst) * alpha >> 8); the mult mask in mm7 zeroes the
   alpha word of the multiplier so dst alpha survives the add. */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2; /* byte skip -> Uint32 pixel skip */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 amask = sf->Amask;

	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
	/* form multiplication mask */
	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
	/* form channel masks */
	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
	/* get alpha channel shift */
	__asm__ __volatile__ (
		"movd %0, %%mm5"
		: : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */

	while(height--) {
	    DUFFS_LOOP4({
		Uint32 alpha = *srcp & amask;
		/* FIXME: Here we special-case opaque alpha since the
			compositioning used (>>8 instead of /255) doesn't handle
			it correctly. Also special-case alpha=0 for speed?
			Benchmark this! */
		if(alpha == 0) {
			/* do nothing */
		} else if(alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
			/* using MMX here to free up regular registers for other things */
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
			por_r2r(mm1, mm2); /* src | dst -> mm2 */
			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
		} else {
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */

			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */

			__asm__ __volatile__ (
				"movd %0, %%mm4"
				: : "r" (alpha) ); /* 0000A000 -> mm4 */
			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */

			/* blend */		    
			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
			
			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
			movd_r2m(mm2, *dstp);/* mm2 -> dst */
		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}
	emms(); /* clear MMX state so the x87 FPU is usable again */
}
   443 /* End GCC_ASMBLIT */
   444 
   445 #elif MSVC_ASMBLIT
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
   MSVC intrinsics version of the same carry-save 50% average:
   (s+d)/2 per byte == ((s & 0xfefefe) + (d & 0xfefefe)) >> 1 plus the
   dropped carry bits (s & d & 0x010101), OR'd with the dst alpha mask. */
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2; /* byte skip -> Uint32 pixel skip */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	Uint32 dalpha = info->dst->Amask;

	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
	
	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */

	while (height--) {
		int n = width;
		/* peel one pixel in scalar code if the row width is odd */
		if ( n & 1 ) {
			Uint32 s = *srcp++;
			Uint32 d = *dstp;
			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
				   + (s & d & 0x00010101)) | dalpha;
			n--;
		}
		
		/* remaining pixels two at a time with MMX */
		for (n >>= 1; n > 0; --n) {
			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
			dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */

			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
			src2 = src1; /* 2 x src -> src2(ARGBARGB) */

			dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
			src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */

			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 (carry bits) */
			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
			dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
			
			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
			dstp += 2;
			srcp += 2;
		}
		
		srcp += srcskip;
		dstp += dstskip;
	}
	_mm_empty(); /* clear MMX state so the x87 FPU is usable again */
}
   500 
   501 /* fast RGB888->(A)RGB888 blending with surface alpha */
   502 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
   503 {
   504 	SDL_PixelFormat* df = info->dst;
   505 	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   506 	unsigned alpha = info->src->alpha;
   507 
   508 	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   509 			/* only call a128 version when R,G,B occupy lower bits */
   510 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
   511 	} else {
   512 		int width = info->d_width;
   513 		int height = info->d_height;
   514 		Uint32 *srcp = (Uint32 *)info->s_pixels;
   515 		int srcskip = info->s_skip >> 2;
   516 		Uint32 *dstp = (Uint32 *)info->d_pixels;
   517 		int dstskip = info->d_skip >> 2;
   518 		Uint32 dalpha = df->Amask;
   519 		Uint32 amult;
   520 
   521 		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   522 		
   523 		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
   524 		/* form the alpha mult */
   525 		amult = alpha | (alpha << 8);
   526 		amult = amult | (amult << 16);
   527 		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
   528 		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
   529 		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   530 			/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   531 		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
   532 		
   533 		while (height--) {
   534 			int n = width;
   535 			if (n & 1) {
   536 				/* One Pixel Blend */
   537 				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
   538 				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   539 
   540 				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   541 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   542 
   543 				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
   544 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   545 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
   546 				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   547 				
   548 				dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   549 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   550 				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   551 
   552 				++srcp;
   553 				++dstp;
   554 				
   555 				n--;
   556 			}
   557 
   558 			for (n >>= 1; n > 0; --n) {
   559 				/* Two Pixels Blend */
   560 				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
   561 				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
   562 				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   563 				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   564 
   565 				dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
   566 				dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
   567 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   568 				dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   569 
   570 				src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   571 				src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
   572 				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
   573 				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   574 
   575 				src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
   576 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   577 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
   578 				dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   579 				
   580 				dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   581 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   582 
   583 				*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
   584 
   585 				srcp += 2;
   586 				dstp += 2;
   587 			}
   588 			srcp += srcskip;
   589 			dstp += dstskip;
   590 		}
   591 		_mm_empty();
   592 	}
   593 }
   594 
/* fast ARGB888->(A)RGB888 blending with pixel alpha.
   MSVC intrinsics version: alpha==0 skips the pixel, alpha==amask copies
   the source RGB while keeping dst alpha, otherwise interpolates with
   dst + ((src - dst) * alpha >> 8); dmask zeroes the multiplier's alpha
   word so dst alpha survives the add. */
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2; /* byte skip -> Uint32 pixel skip */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
	Uint32 amask = sf->Amask;
	Uint32 ashift = sf->Ashift;
	Uint64 multmask;

	__m64 src1, dst1, mm_alpha, mm_zero, dmask;

	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
	/* 0xFFFFi64 is an MSVC-specific literal suffix (this branch is only
	   compiled under MSVC_ASMBLIT).  ashift*2 maps the alpha byte
	   position to its 16-bit lane after byte->word unpacking. */
	multmask = ~(0xFFFFi64 << (ashift * 2));
	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */

	while(height--) {
		DUFFS_LOOP4({
		Uint32 alpha = *srcp & amask;
		if (alpha == 0) {
			/* do nothing */
		} else if (alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
		} else {
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */

			/* blend */		    
			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
			
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}
	_mm_empty(); /* clear MMX state so the x87 FPU is usable again */
}
   654 /* End MSVC_ASMBLIT */
   655 
   656 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   657 
   658 #if SDL_ALTIVEC_BLITTERS
   659 #if __MWERKS__
   660 #pragma altivec_model on
   661 #endif
   662 #if HAVE_ALTIVEC_H
   663 #include <altivec.h>
   664 #endif
   665 #include <assert.h>
   666 
/* Vector literal syntax differs: old Apple GCC (< 4) uses parentheses,
   newer compilers use braces. */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) { a,b,c,d,e,f,g,h }
#endif

/* Nonzero when the pointer is not 16-byte aligned. */
#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
/* Debug helper: print a vector as four 32-bit hex words. */
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
} while (0)

/* the permutation vector that takes the high bytes out of all the appropriate shorts 
    (vector unsigned char)(
        0x00, 0x10, 0x02, 0x12,
        0x04, 0x14, 0x06, 0x16,
        0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E );
*/
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* Splat of 24 in each word, built as 12+12 (splat immediates are small). */
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* All-ones bytes shifted left by 24 per word: 0xFF000000 alpha mask. */
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/* Permute vector for loading src; the aligned branch uses lvsl(8)+8 —
   NOTE(review): presumably to keep a nonzero shift pattern, confirm
   against the loops that consume it. */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, src) \
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))

   
/* vd = blend of vs over vd with per-byte alpha valpha:
   (vs*valpha + vd*(255-valpha)) with the +1 and >>8 correction that
   approximates division by 255; result merged back to ARGB bytes. */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
    /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
    /* valpha2 is 255-alpha */ \
    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
    /* add source and dest */ \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
    vtemp1 = vec_add(vtemp1, v1_16); \
    vtemp3 = vec_sr(vtemp1, v8_16); \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
    vtemp2 = vec_add(vtemp2, v1_16); \
    vtemp4 = vec_sr(vtemp2, v8_16); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* (>>8) and get ARGBARGBARGBARGB */ \
    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
} while (0)
   726  
/* Calculate the permute vector used for 32->32 swizzling.
   Builds a vec_perm control vector that reorders the 4 bytes of each
   pixel from the source channel layout to the destination layout.
   Either format pointer may be NULL, in which case a default ARGB8888
   layout is assumed for that side. */
static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
                                  const SDL_PixelFormat *dstfmt)
{
    /*
     * We have to assume that the bits that aren't used by other
     *  colors is alpha, and it's one complete byte, since some formats
     *  leave alpha with a zero mask, but we should still swizzle the bits.
     */
    /* ARGB */
    const static struct SDL_PixelFormat default_pixel_format = {
        NULL, 0, 0,
        0, 0, 0, 0,
        16, 8, 0, 24,
        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
        0, 0};
    if (!srcfmt) {
        srcfmt = &default_pixel_format;
    }
    if (!dstfmt) {
        dstfmt = &default_pixel_format;
    }
    /* per-pixel base offsets 0,4,8,12 replicated across each 4-byte lane */
    const vector unsigned char plus = VECUINT8_LITERAL
                                            ( 0x00, 0x00, 0x00, 0x00,
                                              0x04, 0x04, 0x04, 0x04,
                                              0x08, 0x08, 0x08, 0x08,
                                              0x0C, 0x0C, 0x0C, 0x0C );
    vector unsigned char vswiz;
    vector unsigned int srcvec;
/* map a channel shift (0/8/16/24) to its big-endian byte index (3..0) */
#define RESHIFT(X) (3 - ((X) >> 3))
    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
    Uint32 amask;
    /* Use zero for alpha if either surface doesn't have alpha */
    if (dstfmt->Amask) {
        amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
    } else {
        amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
    }
#undef RESHIFT  
    /* splat the combined per-byte indices into all four words, then add
       the 0/4/8/12 lane offsets to form the full 16-byte permute */
    ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
    vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
    return(vswiz);
}
   772 
/*
 * AltiVec blit of a 32bpp source with per-pixel alpha onto a 16bpp RGB565
 * destination.  Each row runs in three phases: a scalar loop until dst is
 * 16-byte aligned, a vector loop that blends 8 pixels per iteration, then
 * a scalar loop for the remaining (width % 8) pixels.
 */
static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
{
    int height = info->d_height;
    Uint8 *src = (Uint8 *)info->s_pixels;
    int srcskip = info->s_skip;
    Uint8 *dst = (Uint8 *)info->d_pixels;
    int dstskip = info->d_skip;
    SDL_PixelFormat *srcfmt = info->src;

    /* Small splatted constants used as shift counts and masks below. */
    vector unsigned char v0 = vec_splat_u8(0);
    vector unsigned short v8_16 = vec_splat_u16(8);
    vector unsigned short v1_16 = vec_splat_u16(1);
    vector unsigned short v2_16 = vec_splat_u16(2);
    vector unsigned short v3_16 = vec_splat_u16(3);
    vector unsigned int v8_32 = vec_splat_u32(8);
    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
    /* 0x003f per lane: low 6 bits of a 16-bit lane (blue field source). */
    vector unsigned short v3f = VECUINT16_LITERAL(
        0x003f, 0x003f, 0x003f, 0x003f,
        0x003f, 0x003f, 0x003f, 0x003f);
    /* 0x00fc per lane: top 6 bits of an 8-bit green channel. */
    vector unsigned short vfc = VECUINT16_LITERAL(
        0x00fc, 0x00fc, 0x00fc, 0x00fc,
        0x00fc, 0x00fc, 0x00fc, 0x00fc);

    /* 
        0x10 - 0x1f is the alpha
        0x00 - 0x0e evens are the red
        0x01 - 0x0f odds are zero
    */
    vector unsigned char vredalpha1 = VECUINT8_LITERAL(
        0x10, 0x00, 0x01, 0x01,
        0x10, 0x02, 0x01, 0x01,
        0x10, 0x04, 0x01, 0x01,
        0x10, 0x06, 0x01, 0x01
    );
    /* Same permute shifted to cover pixels 4..7 (indices offset by 8). */
    vector unsigned char vredalpha2 = (vector unsigned char)(
        vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
    );
    /*
        0x00 - 0x0f is ARxx ARxx ARxx ARxx
        0x11 - 0x0f odds are blue
    */
    vector unsigned char vblue1 = VECUINT8_LITERAL(
        0x00, 0x01, 0x02, 0x11,
        0x04, 0x05, 0x06, 0x13,
        0x08, 0x09, 0x0a, 0x15,
        0x0c, 0x0d, 0x0e, 0x17
    );
    vector unsigned char vblue2 = (vector unsigned char)(
        vec_add((vector unsigned int)vblue1, v8_32)
    );
    /*
        0x00 - 0x0f is ARxB ARxB ARxB ARxB
        0x10 - 0x0e evens are green
    */
    vector unsigned char vgreen1 = VECUINT8_LITERAL(
        0x00, 0x01, 0x10, 0x03,
        0x04, 0x05, 0x12, 0x07,
        0x08, 0x09, 0x14, 0x0b,
        0x0c, 0x0d, 0x16, 0x0f
    );
    vector unsigned char vgreen2 = (vector unsigned char)(
        vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
    );
    /* Gathers the 8 green bytes from two blended ARGB vectors so green can
       be repacked into 565 with full 6-bit precision (vec_packpx only keeps
       5 bits per channel). */
    vector unsigned char vgmerge = VECUINT8_LITERAL(
        0x00, 0x02, 0x00, 0x06,
        0x00, 0x0a, 0x00, 0x0e,
        0x00, 0x12, 0x00, 0x16,
        0x00, 0x1a, 0x00, 0x1e);
    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
    /* Swizzle from the source surface's channel order to vector ARGB. */
    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
    /* lvsl(0,NULL) & 0x0C yields {0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12}:
       a permute that replicates each pixel's first byte (alpha) over its
       4 lanes. */
    vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));

    /* Build 0xf800 per 16-bit lane (565 red field): 0xf9f9... << 8. */
    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
    vf800 = vec_sl(vf800, vec_splat_u16(8));

    while(height--) {
        int extrawidth;
        vector unsigned char valigner;
        vector unsigned char vsrc;
        vector unsigned char voverflow;
        int width = info->d_width;

/* Scalar fallback: unpack one 32bpp src pixel, blend into the expanded
   565 dst pixel, repack.  Pixels with zero alpha leave dst untouched. */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB, sA; \
            DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
            if(sA) { \
                unsigned short dstpixel = *((unsigned short *)dst); \
                dR = (dstpixel >> 8) & 0xf8; \
                dG = (dstpixel >> 3) & 0xfc; \
                dB = (dstpixel << 3) & 0xf8; \
                ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
                *((unsigned short *)dst) = ( \
                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
                ); \
            } \
            src += 4; \
            dst += 2; \
            widthvar--; \
        }
        /* Blend single pixels until dst reaches 16-byte alignment. */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
        extrawidth = (width % 8);
        valigner = VEC_ALIGNER(src);
        vsrc = (vector unsigned char)vec_ld(0, src);
        width -= extrawidth;
        while (width) {
            vector unsigned char valpha;
            vector unsigned char vsrc1, vsrc2;
            vector unsigned char vdst1, vdst2;
            vector unsigned short vR, vG, vB;
            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;

            /* Load 8 pixels from src as ARGB (vec_perm handles any src
               misalignment via valigner). */
            voverflow = (vector unsigned char)vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);
            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
            src += 16;
            vsrc = (vector unsigned char)vec_ld(15, src);
            voverflow = vec_perm(voverflow, vsrc, valigner);
            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
            src += 16;

            /* Load 8 pixels from dst and widen 565 -> per-channel bytes
               spread across two ARGB vectors (vdst1 = pixels 0-3,
               vdst2 = pixels 4-7). */
            voverflow = vec_ld(0, dst);
            vR = vec_and((vector unsigned short)voverflow, vf800);
            vB = vec_sl((vector unsigned short)voverflow, v3_16);
            vG = vec_sl(vB, v2_16);
            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);

            /* Alpha blend 8 pixels as ARGB (VEC_MULTIPLY_ALPHA leaves the
               blended result in vdst1/vdst2). */
            valpha = vec_perm(vsrc1, v0, valphaPermute);
            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
            valpha = vec_perm(vsrc2, v0, valphaPermute);
            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);

            /* Convert 8 pixels back to 565: vec_packpx gives 1555, then
               red is realigned and the 6-bit green is merged in. */
            vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
            vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
            vgpixel = vec_and(vgpixel, vfc);
            vgpixel = vec_sl(vgpixel, v3_16);
            vrpixel = vec_sl(vpixel, v1_16);
            vrpixel = vec_and(vrpixel, vf800);
            vbpixel = vec_and(vpixel, v3f);
            vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
            vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
            
            /* Store 8 pixels */
            vec_st(vdst1, 0, dst);

            width -= 8;
            dst += 16;
        }
        /* Scalar blend for the up-to-7 leftover pixels of this row. */
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
        src += srcskip;
        dst += dstskip;
    }
}
   937 
/*
 * AltiVec 32bpp->32bpp blit with a single per-surface alpha value and a
 * source colorkey: pixels whose RGB matches the colorkey are skipped.
 * Rows run scalar until dst is 16-byte aligned, then 4 pixels per vector
 * iteration, then scalar for the (width % 4) tail.
 */
static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
{
    unsigned alpha = info->src->alpha;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    unsigned sA = srcfmt->alpha;
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
    Uint32 ckey = info->src->colorkey;
    vector unsigned char mergePermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valpha;
    vector unsigned char valphamask;
    vector unsigned char vbits;
    vector unsigned char v0;
    vector unsigned short v1;
    vector unsigned short v8;
    vector unsigned int vckey;
    vector unsigned int vrgbmask;

    mergePermute = VEC_MERGE_PERMUTE();
    v0 = vec_splat_u8(0);
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);

    /* set the alpha to 255 on the destination surf */
    valphamask = VEC_ALPHA_MASK();

    /* Permutes: src order -> ARGB, ARGB -> dst order, dst order -> ARGB. */
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
    vdstPermute = calc_swizzle32(NULL, dstfmt);
    vsdstPermute = calc_swizzle32(dstfmt, NULL);

    /* set a vector full of alpha and 255-alpha */
    ((unsigned char *)&valpha)[0] = alpha;
    valpha = vec_splat(valpha, 0);
    vbits = (vector unsigned char)vec_splat_s8(-1);

    /* Splat the colorkey and the RGB mask into full vectors. */
    ckey &= rgbmask;
    ((unsigned int *)(char*)&vckey)[0] = ckey;
    vckey = vec_splat(vckey, 0);
    ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
    vrgbmask = vec_splat(vrgbmask, 0);

    while(height--) {
        int width = info->d_width;
/* Scalar per-pixel blend with colorkey test.
   NOTE(review): this compares the raw Pixel against ckey, which has been
   masked with rgbmask above, while the vector loop masks the pixel with
   vrgbmask first -- so a src pixel whose non-RGB bits are set may be
   treated differently by the two paths; verify against SDL upstream. */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB; \
            RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
            if(sA && Pixel != ckey) { \
                RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
                DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
                ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
                ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
            } \
            dstp++; \
            srcp++; \
            widthvar--; \
        }
        /* Scalar until dst is 16-byte aligned. */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char vsel;
                vector unsigned char voverflow;
                vector unsigned char vd;
                vector unsigned char vd_orig;

                /* s = *srcp */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                
                /* vsel is set for items that match the key */
                vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
                vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);

                /* permute to source format */
                vs = vec_perm(vs, valpha, vsrcPermute);

                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd_orig = vd = vec_perm(vd, v0, vsdstPermute);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha channel to full on */
                vd = vec_or(vd, valphamask);

                /* mask out color key: keyed pixels keep the original dst */
                vd = vec_sel(vd, vd_orig, vsel);
                
                /* permute to dest format */
                vd = vec_perm(vd, vbits, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);
                
                srcp += 4;
                dstp += 4;
                width -= 4;
                vs = voverflow;
            }
            /* Scalar tail for the up-to-3 leftover pixels. */
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
#undef ONE_PIXEL_BLEND
 
        srcp += srcskip;
        dstp += dstskip;
    }
}
  1059 
  1060 
/*
 * AltiVec 32bpp->32bpp blit with per-pixel alpha, for arbitrary channel
 * orders on both surfaces (both sides are swizzled to ARGB, blended, then
 * swizzled back).  The destination's own alpha channel is preserved.
 */
static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
{
    int width = info->d_width;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    vector unsigned char mergePermute;
    vector unsigned char valphaPermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valphamask;
    vector unsigned char vpixelmask;
    vector unsigned char v0;
    vector unsigned short v1;
    vector unsigned short v8;

    v0 = vec_splat_u8(0);
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);
    mergePermute = VEC_MERGE_PERMUTE();
    valphamask = VEC_ALPHA_MASK();
    /* Replicates each pixel's alpha byte across its 4 lanes. */
    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
    /* Complement of the alpha mask: selects the RGB bytes. */
    vpixelmask = vec_nor(valphamask, v0);
    /* Permutes: src order -> ARGB, ARGB -> dst order, dst order -> ARGB. */
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
    vdstPermute = calc_swizzle32(NULL, dstfmt);
    vsdstPermute = calc_swizzle32(dstfmt, NULL);

	while ( height-- ) {
        width = info->d_width;
/* Scalar per-pixel blend; zero source alpha leaves dst untouched. */
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
            DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
            if(sA) { \
              DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
              ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
              ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
            } \
            ++srcp; \
            ++dstp; \
            widthvar--; \
        }
        /* Scalar until dst is 16-byte aligned. */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            /* vsrcPermute */
            /* vdstPermute */
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;
                vector unsigned char valpha;
                vector unsigned char vdstalpha;
                /* s = *srcp, swizzled to ARGB */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                vs = vec_perm(vs, v0, vsrcPermute);

                valpha = vec_perm(vs, v0, valphaPermute);
                
                /* d = *dstp, swizzled to ARGB; save its alpha bytes */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd = vec_perm(vd, v0, vsdstPermute);
                vdstalpha = vec_and(vd, valphamask);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha to the dest alpha */
                vd = vec_and(vd, vpixelmask);
                vd = vec_or(vd, vdstalpha);
                vd = vec_perm(vd, v0, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);
                
                srcp += 4;
                dstp += 4;
                width -= 4;
                vs = voverflow;

            }
            /* Scalar tail for the up-to-3 leftover pixels. */
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
	    srcp += srcskip;
	    dstp += dstskip;
#undef ONE_PIXEL_BLEND
	}
}
  1156 
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * Both surfaces are known to be ARGB8888 here, so no swizzling is needed:
 * the scalar path uses the 0xff00ff red/blue packing trick and the vector
 * path blends 4 pixels per iteration, preserving the destination alpha.
 */
static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
    vector unsigned char mergePermute;
    vector unsigned char valphaPermute;
    vector unsigned char valphamask;
    vector unsigned char vpixelmask;
    vector unsigned char v0;
    vector unsigned short v1;
    vector unsigned short v8;
    v0 = vec_splat_u8(0);
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);
    mergePermute = VEC_MERGE_PERMUTE();
    valphamask = VEC_ALPHA_MASK();
    /* Replicates each pixel's alpha byte across its 4 lanes. */
    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
    
 
    /* Complement of the alpha mask: selects the RGB bytes. */
    vpixelmask = vec_nor(valphamask, v0);
	while(height--) {
        width = info->d_width;
/* Scalar blend; alpha==0 skips the pixel, alpha==255 copies RGB and keeps
   the destination alpha (the >>8 blend is not exact for full opacity). */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while ((condition)) { \
            Uint32 dalpha; \
            Uint32 d; \
            Uint32 s1; \
            Uint32 d1; \
            Uint32 s = *srcp; \
            Uint32 alpha = s >> 24; \
            if(alpha) { \
              if(alpha == SDL_ALPHA_OPAQUE) { \
                *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
              } else { \
                d = *dstp; \
                dalpha = d & 0xff000000; \
                s1 = s & 0xff00ff; \
                d1 = d & 0xff00ff; \
                d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
                s &= 0xff00; \
                d &= 0xff00; \
                d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
                *dstp = d1 | d | dalpha; \
              } \
            } \
            ++srcp; \
            ++dstp; \
            widthvar--; \
	    }
        /* Scalar until dst is 16-byte aligned. */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;
                vector unsigned char valpha;
                vector unsigned char vdstalpha;
                /* s = *srcp */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);

                valpha = vec_perm(vs, v0, valphaPermute);
                
                /* d = *dstp; save its alpha bytes */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vdstalpha = vec_and(vd, valphamask);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha to the dest alpha */
                vd = vec_and(vd, vpixelmask);
                vd = vec_or(vd, vdstalpha);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);
                
                srcp += 4;
                dstp += 4;
                width -= 4;
                vs = voverflow;
            }
            /* Scalar tail for the up-to-3 leftover pixels. */
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
	    srcp += srcskip;
	    dstp += dstskip;
	}
#undef ONE_PIXEL_BLEND
}
  1253 
/*
 * AltiVec 32bpp->32bpp blit with a single per-surface alpha, for arbitrary
 * channel orders (both sides swizzled to ARGB, blended, swizzled back).
 * The destination alpha is forced to opaque where it has an alpha channel.
 */
static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
{
    /* XXX : 6 */
	unsigned alpha = info->src->alpha;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
	unsigned sA = srcfmt->alpha;
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
    vector unsigned char mergePermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valpha;
    vector unsigned char valphamask;
    vector unsigned char vbits;
    vector unsigned short v1;
    vector unsigned short v8;

    mergePermute = VEC_MERGE_PERMUTE();
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);

    /* set the alpha to 255 on the destination surf */
    valphamask = VEC_ALPHA_MASK();

    /* Permutes: src order -> ARGB, ARGB -> dst order, dst order -> ARGB. */
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
    vdstPermute = calc_swizzle32(NULL, dstfmt);
    vsdstPermute = calc_swizzle32(dstfmt, NULL);

    /* set a vector full of alpha and 255-alpha */
    ((unsigned char *)&valpha)[0] = alpha;
    valpha = vec_splat(valpha, 0);
    vbits = (vector unsigned char)vec_splat_s8(-1);

    while(height--) {
        int width = info->d_width;
/* Scalar per-pixel blend using the surface alpha sA. */
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB; \
            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
            ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
            ++srcp; \
            ++dstp; \
            widthvar--; \
        }
        /* Scalar until dst is 16-byte aligned. */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;

                /* s = *srcp, swizzled to ARGB */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                vs = vec_perm(vs, valpha, vsrcPermute);
                
                /* d = *dstp, swizzled to ARGB */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd = vec_perm(vd, vd, vsdstPermute);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha channel to full on */
                vd = vec_or(vd, valphamask);
                vd = vec_perm(vd, vbits, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);
                
                srcp += 4;
                dstp += 4;
                width -= 4;
                vs = voverflow;
            }
            /* Scalar tail for the up-to-3 leftover pixels. */
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
#undef ONE_PIXEL_BLEND
 
        srcp += srcskip;
        dstp += dstskip;
    }

}
  1348 
  1349 
/* fast RGB888->(A)RGB888 blending */
/*
 * Both surfaces are RGB888/ARGB8888 in the same order, so no swizzling:
 * the scalar path uses the 0xff00ff packing trick, and the vector path
 * blends 4 pixels per iteration, forcing destination alpha to opaque.
 */
static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;
    vector unsigned char mergePermute;
    vector unsigned char valpha;
    vector unsigned char valphamask;
    vector unsigned short v1;
    vector unsigned short v8;

    mergePermute = VEC_MERGE_PERMUTE();
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);

    /* set the alpha to 255 on the destination surf */
    valphamask = VEC_ALPHA_MASK();

    /* set a vector full of alpha and 255-alpha */
    ((unsigned char *)&valpha)[0] = alpha;
    valpha = vec_splat(valpha, 0);

    while(height--) {
        int width = info->d_width;
/* Scalar blend: red+blue in one multiply (0xff00ff lanes), green in a
   second; result alpha is forced to 0xff. */
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
            Uint32 s = *srcp; \
            Uint32 d = *dstp; \
            Uint32 s1 = s & 0xff00ff; \
            Uint32 d1 = d & 0xff00ff; \
            d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
                 & 0xff00ff; \
            s &= 0xff00; \
            d &= 0xff00; \
            d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
            *dstp = d1 | d | 0xff000000; \
            ++srcp; \
            ++dstp; \
            widthvar--; \
        }
        /* Scalar until dst is 16-byte aligned. */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;

                /* s = *srcp */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                
                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha channel to full on */
                vd = vec_or(vd, valphamask);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);
                
                srcp += 4;
                dstp += 4;
                width -= 4;
                vs = voverflow;
            }
            /* Scalar tail for the up-to-3 leftover pixels. */
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
#undef ONE_PIXEL_BLEND
 
        srcp += srcskip;
        dstp += dstskip;
    }
}
  1431 #if __MWERKS__
  1432 #pragma altivec_model off
  1433 #endif
  1434 #endif /* SDL_ALTIVEC_BLITTERS */
  1435 
  1436 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1437 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
  1438 {
  1439 	int width = info->d_width;
  1440 	int height = info->d_height;
  1441 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1442 	int srcskip = info->s_skip >> 2;
  1443 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1444 	int dstskip = info->d_skip >> 2;
  1445 
  1446 	while(height--) {
  1447 	    DUFFS_LOOP4({
  1448 		    Uint32 s = *srcp++;
  1449 		    Uint32 d = *dstp;
  1450 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1451 			       + (s & d & 0x00010101)) | 0xff000000;
  1452 	    }, width);
  1453 	    srcp += srcskip;
  1454 	    dstp += dstskip;
  1455 	}
  1456 }
  1457 
/* fast RGB888->(A)RGB888 blending with surface alpha */
/*
 * Blends with a single per-surface alpha value.  alpha==128 dispatches to
 * the cheaper averaging special case.  Otherwise pixels are blended with
 * the 0xff00ff trick: red+blue share one multiply, green gets another; in
 * the two-pixel path the green fields of adjacent pixels are packed into
 * one word so two greens also share a multiply.  Result alpha is opaque.
 */
static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
	if(alpha == 128) {
		BlitRGBtoRGBSurfaceAlpha128(info);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint32 *srcp = (Uint32 *)info->s_pixels;
		int srcskip = info->s_skip >> 2;
		Uint32 *dstp = (Uint32 *)info->d_pixels;
		int dstskip = info->d_skip >> 2;
		Uint32 s;
		Uint32 d;
		Uint32 s1;
		Uint32 d1;

		while(height--) {
			DUFFS_LOOP_DOUBLE2({
				/* One Pixel Blend */
				s = *srcp;
				d = *dstp;
				s1 = s & 0xff00ff;
				d1 = d & 0xff00ff;
				d1 = (d1 + ((s1 - d1) * alpha >> 8))
				     & 0xff00ff;
				s &= 0xff00;
				d &= 0xff00;
				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
				*dstp = d1 | d | 0xff000000;
				++srcp;
				++dstp;
			},{
			        /* Two Pixels Blend */
				s = *srcp;
				d = *dstp;
				s1 = s & 0xff00ff;
				d1 = d & 0xff00ff;
				d1 += (s1 - d1) * alpha >> 8;
				d1 &= 0xff00ff;
				/* pack this pixel's green (low half) with the
				   next pixel's green (high half) and blend
				   both with one multiply */
				s = ((s & 0xff00) >> 8) | 
					((srcp[1] & 0xff00) << 8);
				d = ((d & 0xff00) >> 8) |
					((dstp[1] & 0xff00) << 8);
				d += (s - d) * alpha >> 8;
				d &= 0x00ff00ff;
				
				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
				++srcp;
				/* second pixel: red+blue blend, then reuse the
				   green already blended in d's high half */
			        s1 = *srcp;
				d1 = *dstp;
				s1 &= 0xff00ff;
				d1 &= 0xff00ff;
				d1 += (s1 - d1) * alpha >> 8;
				d1 &= 0xff00ff;
				
				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
				++srcp;
				++dstp;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
	}
}
  1526 
  1527 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1528 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
  1529 {
  1530 	int width = info->d_width;
  1531 	int height = info->d_height;
  1532 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1533 	int srcskip = info->s_skip >> 2;
  1534 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1535 	int dstskip = info->d_skip >> 2;
  1536 
  1537 	while(height--) {
  1538 	    DUFFS_LOOP4({
  1539 		Uint32 dalpha;
  1540 		Uint32 d;
  1541 		Uint32 s1;
  1542 		Uint32 d1;
  1543 		Uint32 s = *srcp;
  1544 		Uint32 alpha = s >> 24;
  1545 		/* FIXME: Here we special-case opaque alpha since the
  1546 		   compositioning used (>>8 instead of /255) doesn't handle
  1547 		   it correctly. Also special-case alpha=0 for speed?
  1548 		   Benchmark this! */
  1549 		if(alpha) {   
  1550 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1551 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1552 		  } else {
  1553 		    /*
  1554 		     * take out the middle component (green), and process
  1555 		     * the other two in parallel. One multiply less.
  1556 		     */
  1557 		    d = *dstp;
  1558 		    dalpha = d & 0xff000000;
  1559 		    s1 = s & 0xff00ff;
  1560 		    d1 = d & 0xff00ff;
  1561 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1562 		    s &= 0xff00;
  1563 		    d &= 0xff00;
  1564 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1565 		    *dstp = d1 | d | dalpha;
  1566 		  }
  1567 		}
  1568 		++srcp;
  1569 		++dstp;
  1570 	    }, width);
  1571 	    srcp += srcskip;
  1572 	    dstp += dstskip;
  1573 	}
  1574 }
  1575 
  1576 #if GCC_ASMBLIT
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;	/* skips are in bytes; convert to pixels */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 amask = sf->Amask;

	/* Precompute constant masks into MMX registers once; mm3/mm4/mm5/
	   mm6/mm7 stay live across the whole blit and must not be clobbered
	   by the per-pixel asm blocks below. */
	__asm__ (
	/* make mm6 all zeros. */
	"pxor       %%mm6, %%mm6\n"
	
	/* Make a mask to preserve the alpha. */
	"movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
	"punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
	"pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
	"movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
	"pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */

	/* form channel masks */
	"movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
	"packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
	"packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
	"pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
	
	/* get alpha channel shift */
	"movd      %1, %%mm5\n\t" /* Ashift -> mm5 */

	  : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );

	while(height--) {

	    DUFFS_LOOP4({
		Uint32 alpha;

		/* 3DNow! prefetch of the cache lines 64 bytes ahead of
		   both streams */
		__asm__ (
		"prefetch 64(%0)\n"
		"prefetch 64(%1)\n"
			: : "r" (srcp), "r" (dstp) );

		alpha = *srcp & amask;
		/* FIXME: Here we special-case opaque alpha since the
		   compositioning used (>>8 instead of /255) doesn't handle
		   it correctly. Also special-case alpha=0 for speed?
		   Benchmark this! */
		if(alpha == 0) {
		    /* do nothing: fully transparent source pixel */
		}
		else if(alpha == amask) {
			/* opaque alpha -- copy RGB, keep dst alpha */
		    /* using MMX here to free up regular registers for other things */
			    __asm__ (
		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */

		     : : "r" (srcp), "r" (dstp) );
		} 

		else {
			    __asm__ (
		    /* load in the source, and dst. */
		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */

		    /* Move the src alpha into mm2 */

		    /* if supporting pshufw */
		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
		    /*"psrlw     $8, %%mm2\n" */
		    
		    /* else: */
		    "movd       %2,    %%mm2\n"
		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
		    "pand       %%mm7, %%mm2\n"              /* zero the alpha word to preserve dest alpha */

		    /* move the colors into words. */
		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
		    "punpcklbw %%mm6, %%mm1\n"              /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */

		    /* src - dst */
		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */

		    /* A * (src-dst) */
		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */

		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
		    
		    "movd      %%mm0, (%1)\n"               /* result in mm0 */

		     : : "r" (srcp), "r" (dstp), "r" (alpha) );

		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}

	/* clear MMX state so the FPU is usable again */
	__asm__ (
	"emms\n"
		:   );
}
  1692 /* End GCC_ASMBLIT*/
  1693 
  1694 #elif MSVC_ASMBLIT
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;	/* skips are in bytes; convert to pixels */
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;
	SDL_PixelFormat* sf = info->src;
	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
	Uint32 amask = sf->Amask;
	Uint32 ashift = sf->Ashift;
	Uint64 multmask;
	
	__m64 src1, dst1, mm_alpha, mm_zero, dmask;

	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
	/* 0xFFFFi64 is an MSVC-specific 64-bit literal; the mask zeroes the
	   16-bit lane holding alpha so the blend can't disturb dst alpha */
	multmask = ~(0xFFFFi64 << (ashift * 2));
	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */

	while(height--) {
	    DUFFS_LOOP4({
		Uint32 alpha;

		_m_prefetch(srcp + 16);
		_m_prefetch(dstp + 16);

		alpha = *srcp & amask;
		/* opaque alpha is special-cased because the >>8 compositing
		   (instead of /255) doesn't handle it exactly */
		if (alpha == 0) {
			/* do nothing: fully transparent source pixel */
		} else if (alpha == amask) {
			/* copy RGB, keep dst alpha */
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
		} else {
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */

			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */

			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */

			/* blend: dst += (src - dst) * alpha >> 8, per channel */
			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
			
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}
	_mm_empty();	/* clear MMX state so the FPU is usable again */
}
  1759 /* End MSVC_ASMBLIT */
  1760 
  1761 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  1762 
  1763 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  1764 
  1765 /* blend a single 16 bit pixel at 50% */
  1766 #define BLEND16_50(d, s, mask)						\
  1767 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
  1768 
  1769 /* blend two 16 bit pixels at 50% */
  1770 #define BLEND2x16_50(d, s, mask)					     \
  1771 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1772 	 + (s & d & (~(mask | mask << 16))))
  1773 
/*
 * 16bpp blit at exactly 50% opacity: every destination pixel becomes the
 * average of source and destination, two pixels per 32-bit word where
 * alignment allows. 'mask' is the per-format carry-safe channel mask
 * passed by the callers (0xf7de for 565, 0xfbde for 555) and is consumed
 * by the BLEND16_50 / BLEND2x16_50 macros.
 */
static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint16 *srcp = (Uint16 *)info->s_pixels;
	int srcskip = info->s_skip >> 1;	/* skips are in bytes; convert to pixels */
	Uint16 *dstp = (Uint16 *)info->d_pixels;
	int dstskip = info->d_skip >> 1;

	while(height--) {
		if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
			/*
			 * Source and destination not aligned, pipeline it.
			 * This is mostly a win for big blits but no loss for
			 * small ones
			 */
			Uint32 prev_sw;
			int w = width;

			/* handle odd destination */
			if((uintptr_t)dstp & 2) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				dstp++;
				srcp++;
				w--;
			}
			srcp++;	/* srcp is now 32-bit aligned */

			/* bootstrap pipeline with first halfword */
			prev_sw = ((Uint32 *)srcp)[-1];

			while(w > 1) {
				Uint32 sw, dw, s;
				sw = *(Uint32 *)srcp;
				dw = *(Uint32 *)dstp;
				/* stitch the two source pixels straddling the
				   word boundary so they line up with dst */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				s = (prev_sw << 16) + (sw >> 16);
#else
				s = (prev_sw >> 16) + (sw << 16);
#endif
				prev_sw = sw;
				*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
				dstp += 2;
				srcp += 2;
				w -= 2;
			}

			/* final pixel if any */
			if(w) {
				Uint16 d = *dstp, s;
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				s = (Uint16)prev_sw;
#else
				s = (Uint16)(prev_sw >> 16);
#endif
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
			}
			srcp += srcskip - 1;	/* -1 undoes the alignment srcp++ above */
			dstp += dstskip;
		} else {
			/* source and destination are aligned */
			int w = width;

			/* first odd pixel? */
			if((uintptr_t)srcp & 2) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
				w--;
			}
			/* srcp and dstp are now 32-bit aligned */

			while(w > 1) {
				Uint32 sw = *(Uint32 *)srcp;
				Uint32 dw = *(Uint32 *)dstp;
				*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
				srcp += 2;
				dstp += 2;
				w -= 2;
			}

			/* last odd pixel? */
			if(w) {
				Uint16 d = *dstp, s = *srcp;
				*dstp = BLEND16_50(d, s, mask);
				srcp++;
				dstp++;
			}
			srcp += srcskip;
			dstp += dstskip;
		}
	}
}
  1871 
  1872 #if GCC_ASMBLIT
/* fast RGB565->RGB565 blending with surface alpha */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;	/* per-surface alpha, 0..255 */
	if(alpha == 128) {
		/* exactly 50%: use the cheaper averaging blitter */
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
		Uint64 load;
	  
		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		load = alpha;
		alpha >>= 3;		/* downscale alpha to 5 bits */

		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);
	  
		/* Setup the 565 color channel masks */
		load = 0x07E007E007E007E0ULL;
		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
		load = 0x001F001F001F001FULL;
		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			/* first two bodies are scalar fix-ups (one and two
			   pixels); the third is the main 4-pixel MMX body --
			   see DUFFS_LOOP_QUATRO2 for how they are dispatched */
			DUFFS_LOOP_QUATRO2(
			{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- does not need a mask since the right shift clears
				   the uninteresting bits */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* alpha used is actually 11 bits
				   11 + 5 = 16 bits, so the sign bits are lost */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);			
			srcp += srcskip;
			dstp += dstskip;
		}
		emms();	/* clear MMX state so the FPU is usable again */
	}
}
  2011 
/* fast RGB555->RGB555 blending with surface alpha */
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;	/* per-surface alpha, 0..255 */
	if(alpha == 128) {
		/* exactly 50%: use the cheaper averaging blitter */
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
		Uint64 load;
	  
		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		load = alpha;
		alpha >>= 3;		/* downscale alpha to 5 bits */

		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);

		/* Setup the 555 color channel masks */
		load = 0x03E003E003E003E0ULL;
		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
		load = 0x001F001F001F001FULL;
		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			/* first two bodies are scalar fix-ups (one and two
			   pixels); the third is the main 4-pixel MMX body --
			   see DUFFS_LOOP_QUATRO2 for how they are dispatched */
			DUFFS_LOOP_QUATRO2(
			{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			},{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			},{
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- process the bits in place */
				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
					/* by reusing the GREEN mask we free up another mmx
					   register to accumulate the result */

				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
				   cleared by a MASK below */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */

				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);			
			srcp += srcskip;
			dstp += dstskip;
		}
		emms();	/* clear MMX state so the FPU is usable again */
	}
}
  2155 /* End GCC_ASMBLIT */
  2156 
  2157 #elif MSVC_ASMBLIT
/* fast RGB565->RGB565 blending with surface alpha */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;	/* per-surface alpha, 0..255 */
	if(alpha == 128) {
		/* exactly 50%: use the cheaper averaging blitter */
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
	  
		__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;

		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
		alpha >>= 3;		/* downscale alpha to 5 bits */

		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		mm_alpha = _mm_slli_si64(mm_alpha, 3);
	  
		/* Setup the 565 color channel masks */
		gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
		
		while(height--) {
			/* first two bodies are scalar fix-ups (one and two
			   pixels); the third is the main 4-pixel MMX body --
			   see DUFFS_LOOP_QUATRO2 for how they are dispatched */
			DUFFS_LOOP_QUATRO2(
			{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */

				/* red -- no mask needed; the right shift clears
				   the uninteresting bits */
				src2 = src1;
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */

				dst2 = dst1;
				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */

				mm_res = dst2; /* RED -> mm_res */

				/* green -- process the bits in place */
				src2 = src1;
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */

				/* blue */
				src2 = src1;
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */

				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);			
			srcp += srcskip;
			dstp += dstskip;
		}
		_mm_empty();	/* clear MMX state so the FPU is usable again */
	}
}
  2291 
/* fast RGB555->RGB555 blending with surface alpha */
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;	/* per-surface alpha, 0..255 */
	if(alpha == 128) {
		/* exactly 50%: use the cheaper averaging blitter */
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
	  
		__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;

		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
		alpha >>= 3;		/* downscale alpha to 5 bits */

		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		mm_alpha = _mm_slli_si64(mm_alpha, 3);
	  
		/* Setup the 555 color channel masks */
		rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
		gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */

		while(height--) {
			/* first two bodies are scalar fix-ups (one and two
			   pixels); the third is the main 4-pixel MMX body --
			   see DUFFS_LOOP_QUATRO2 for how they are dispatched */
			DUFFS_LOOP_QUATRO2(
			{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */

				/* red -- process the bits in place */
				src2 = src1;
				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */

				mm_res = dst2; /* RED -> mm_res */
				
				/* green -- process the bits in place */
				src2 = src1;
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */

				dst2 = dst1;
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */

				/* blue */
				src2 = src1; /* src -> src2 */
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */

				dst2 = dst1; /* dst -> dst2 */
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */

				/* blend */
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */

				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */

				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);			
			srcp += srcskip;
			dstp += dstskip;
		}
		_mm_empty();	/* clear MMX state so the FPU is usable again */
	}
}
  2426 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  2427 
  2428 /* fast RGB565->RGB565 blending with surface alpha */
  2429 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
  2430 {
  2431 	unsigned alpha = info->src->alpha;
  2432 	if(alpha == 128) {
  2433 		Blit16to16SurfaceAlpha128(info, 0xf7de);
  2434 	} else {
  2435 		int width = info->d_width;
  2436 		int height = info->d_height;
  2437 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2438 		int srcskip = info->s_skip >> 1;
  2439 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2440 		int dstskip = info->d_skip >> 1;
  2441 		alpha >>= 3;	/* downscale alpha to 5 bits */
  2442 
  2443 		while(height--) {
  2444 			DUFFS_LOOP4({
  2445 				Uint32 s = *srcp++;
  2446 				Uint32 d = *dstp;
  2447 				/*
  2448 				 * shift out the middle component (green) to
  2449 				 * the high 16 bits, and process all three RGB
  2450 				 * components at the same time.
  2451 				 */
  2452 				s = (s | s << 16) & 0x07e0f81f;
  2453 				d = (d | d << 16) & 0x07e0f81f;
  2454 				d += (s - d) * alpha >> 5;
  2455 				d &= 0x07e0f81f;
  2456 				*dstp++ = (Uint16)(d | d >> 16);
  2457 			}, width);
  2458 			srcp += srcskip;
  2459 			dstp += dstskip;
  2460 		}
  2461 	}
  2462 }
  2463 
  2464 /* fast RGB555->RGB555 blending with surface alpha */
static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha; /* per-surface alpha, 0-255 */
	if(alpha == 128) {
		/* exact 50/50 mix: use the shared averaging fast path
		   (0xfbde masks out the low bit of each 555 component) */
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;	/* skips are in bytes; pixels are 16-bit */
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		alpha >>= 3;		/* downscale alpha to 5 bits */

		while(height--) {
			DUFFS_LOOP4({
				Uint32 s = *srcp++;
				Uint32 d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
	}
}
  2499 
  2500 /* fast ARGB8888->RGB565 blending with pixel alpha */
  2501 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
  2502 {
  2503 	int width = info->d_width;
  2504 	int height = info->d_height;
  2505 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  2506 	int srcskip = info->s_skip >> 2;
  2507 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  2508 	int dstskip = info->d_skip >> 1;
  2509 
  2510 	while(height--) {
  2511 	    DUFFS_LOOP4({
  2512 		Uint32 s = *srcp;
  2513 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  2514 		/* FIXME: Here we special-case opaque alpha since the
  2515 		   compositioning used (>>8 instead of /255) doesn't handle
  2516 		   it correctly. Also special-case alpha=0 for speed?
  2517 		   Benchmark this! */
  2518 		if(alpha) {   
  2519 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2520 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  2521 		  } else {
  2522 		    Uint32 d = *dstp;
  2523 		    /*
  2524 		     * convert source and destination to G0RAB65565
  2525 		     * and blend all components at the same time
  2526 		     */
  2527 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  2528 		      + (s >> 3 & 0x1f);
  2529 		    d = (d | d << 16) & 0x07e0f81f;
  2530 		    d += (s - d) * alpha >> 5;
  2531 		    d &= 0x07e0f81f;
  2532 		    *dstp = (Uint16)(d | d >> 16);
  2533 		  }
  2534 		}
  2535 		srcp++;
  2536 		dstp++;
  2537 	    }, width);
  2538 	    srcp += srcskip;
  2539 	    dstp += dstskip;
  2540 	}
  2541 }
  2542 
  2543 /* fast ARGB8888->RGB555 blending with pixel alpha */
  2544 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
  2545 {
  2546 	int width = info->d_width;
  2547 	int height = info->d_height;
  2548 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  2549 	int srcskip = info->s_skip >> 2;
  2550 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  2551 	int dstskip = info->d_skip >> 1;
  2552 
  2553 	while(height--) {
  2554 	    DUFFS_LOOP4({
  2555 		unsigned alpha;
  2556 		Uint32 s = *srcp;
  2557 		alpha = s >> 27; /* downscale alpha to 5 bits */
  2558 		/* FIXME: Here we special-case opaque alpha since the
  2559 		   compositioning used (>>8 instead of /255) doesn't handle
  2560 		   it correctly. Also special-case alpha=0 for speed?
  2561 		   Benchmark this! */
  2562 		if(alpha) {   
  2563 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2564 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  2565 		  } else {
  2566 		    Uint32 d = *dstp;
  2567 		    /*
  2568 		     * convert source and destination to G0RAB65565
  2569 		     * and blend all components at the same time
  2570 		     */
  2571 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  2572 		      + (s >> 3 & 0x1f);
  2573 		    d = (d | d << 16) & 0x03e07c1f;
  2574 		    d += (s - d) * alpha >> 5;
  2575 		    d &= 0x03e07c1f;
  2576 		    *dstp = (Uint16)(d | d >> 16);
  2577 		  }
  2578 		}
  2579 		srcp++;
  2580 		dstp++;
  2581 	    }, width);
  2582 	    srcp += srcskip;
  2583 	    dstp += dstskip;
  2584 	}
  2585 }
  2586 
  2587 /* General (slow) N->N blending with per-surface alpha */
  2588 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
  2589 {
  2590 	int width = info->d_width;
  2591 	int height = info->d_height;
  2592 	Uint8 *src = info->s_pixels;
  2593 	int srcskip = info->s_skip;
  2594 	Uint8 *dst = info->d_pixels;
  2595 	int dstskip = info->d_skip;
  2596 	SDL_PixelFormat *srcfmt = info->src;
  2597 	SDL_PixelFormat *dstfmt = info->dst;
  2598 	int srcbpp = srcfmt->BytesPerPixel;
  2599 	int dstbpp = dstfmt->BytesPerPixel;
  2600 	unsigned sA = srcfmt->alpha;
  2601 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2602 
  2603 	if(sA) {
  2604 	  while ( height-- ) {
  2605 	    DUFFS_LOOP4(
  2606 	    {
  2607 		Uint32 Pixel;
  2608 		unsigned sR;
  2609 		unsigned sG;
  2610 		unsigned sB;
  2611 		unsigned dR;
  2612 		unsigned dG;
  2613 		unsigned dB;
  2614 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2615 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2616 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2617 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2618 		src += srcbpp;
  2619 		dst += dstbpp;
  2620 	    },
  2621 	    width);
  2622 	    src += srcskip;
  2623 	    dst += dstskip;
  2624 	  }
  2625 	}
  2626 }
  2627 
  2628 /* General (slow) colorkeyed N->N blending with per-surface alpha */
static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint8 *src = info->s_pixels;
	int srcskip = info->s_skip;
	Uint8 *dst = info->d_pixels;
	int dstskip = info->d_skip;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;
	Uint32 ckey = srcfmt->colorkey;	/* source pixels equal to this are skipped */
	int srcbpp = srcfmt->BytesPerPixel;
	int dstbpp = dstfmt->BytesPerPixel;
	unsigned sA = srcfmt->alpha;	/* per-surface alpha applied to every pixel */
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;	/* alpha written to dst */

	while ( height-- ) {
	    DUFFS_LOOP4(
	    {
		Uint32 Pixel;
		unsigned sR;
		unsigned sG;
		unsigned sB;
		unsigned dR;
		unsigned dG;
		unsigned dB;
		/* fetch the raw source pixel first so the colorkey can be
		   tested before any unpacking work is done */
		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
		/* skip colorkeyed pixels; sA==0 makes the whole blit a no-op */
		if(sA && Pixel != ckey) {
		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
		}
		src += srcbpp;
		dst += dstbpp;
	    },
	    width);
	    src += srcskip;
	    dst += dstskip;
	}
}
  2670 
  2671 /* General (slow) N->N blending with pixel alpha */
static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
{
	int width = info->d_width;
	int height = info->d_height;
	Uint8 *src = info->s_pixels;
	int srcskip = info->s_skip;
	Uint8 *dst = info->d_pixels;
	int dstskip = info->d_skip;
	SDL_PixelFormat *srcfmt = info->src;
	SDL_PixelFormat *dstfmt = info->dst;

	int  srcbpp;
	int  dstbpp;

	/* Set up some basic variables */
	srcbpp = srcfmt->BytesPerPixel;
	dstbpp = dstfmt->BytesPerPixel;

	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
	   quite right. for <8bpp source alpha, it gets them very wrong
	   (check all macros!)
	   It is unclear whether there is a good general solution that doesn't
	   need a branch (or a divide). */
	while ( height-- ) {
	    DUFFS_LOOP4(
	    {
		Uint32 Pixel;
		unsigned sR;
		unsigned sG;
		unsigned sB;
		unsigned dR;
		unsigned dG;
		unsigned dB;
		unsigned sA;	/* per-pixel alpha from the source */
		unsigned dA;
		/* unpack the source pixel (including its alpha) first so
		   fully transparent pixels can be skipped cheaply */
		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
		if(sA) {
		  /* Pixel is reused as scratch for the destination unpack */
		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
		}
		src += srcbpp;
		dst += dstbpp;
	    },
	    width);
	    src += srcskip;
	    dst += dstskip;
	}
}
  2721 
  2722 
/*
 * Pick the best alpha-blit routine for this surface/destination pair.
 * Dispatch is on the source Amask (per-pixel vs. per-surface alpha),
 * the SDL_SRCCOLORKEY flag, the pixel formats, and -- where compiled
 * in -- runtime CPU feature checks (MMX/3DNow!/AltiVec).
 * blit_index is unused here.
 */
SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
{
    SDL_PixelFormat *sf = surface->format;
    SDL_PixelFormat *df = surface->map->dst->format;

    if(sf->Amask == 0) {
	/* No source alpha channel: per-surface alpha */
	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
	    if(df->BytesPerPixel == 1)
		return BlitNto1SurfaceAlphaKey;
	    else
#if SDL_ALTIVEC_BLITTERS
	if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
	    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
            return Blit32to32SurfaceAlphaKeyAltivec;
        else
#endif
            return BlitNtoNSurfaceAlphaKey;
	} else {
	    /* Per-surface alpha blits */
	    switch(df->BytesPerPixel) {
	    case 1:
		return BlitNto1SurfaceAlpha;

	    case 2:
		/* identity map means src and dst share the 16-bit layout */
		if(surface->map->identity) {
		    if(df->Gmask == 0x7e0)	/* RGB565 */
		    {
#if MMX_ASMBLIT
		if(SDL_HasMMX())
			return Blit565to565SurfaceAlphaMMX;
		else
#endif
			return Blit565to565SurfaceAlpha;
		    }
		    else if(df->Gmask == 0x3e0)	/* RGB555 */
		    {
#if MMX_ASMBLIT
		if(SDL_HasMMX())
			return Blit555to555SurfaceAlphaMMX;
		else
#endif
			return Blit555to555SurfaceAlpha;
		    }
		}
		return BlitNtoNSurfaceAlpha;

	    case 4:
		/* matching 32-bit RGB layouts get the fast paths */
		if(sf->Rmask == df->Rmask
		   && sf->Gmask == df->Gmask
		   && sf->Bmask == df->Bmask
		   && sf->BytesPerPixel == 4)
		{
#if MMX_ASMBLIT
			if(sf->Rshift % 8 == 0
			   && sf->Gshift % 8 == 0
			   && sf->Bshift % 8 == 0
			   && SDL_HasMMX())
			    return BlitRGBtoRGBSurfaceAlphaMMX;
#endif
			if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
			{
#if SDL_ALTIVEC_BLITTERS
				if(!(surface->map->dst->flags & SDL_HWSURFACE)
					&& SDL_HasAltiVec())
					return BlitRGBtoRGBSurfaceAlphaAltivec;
#endif
				return BlitRGBtoRGBSurfaceAlpha;
			}
		}
#if SDL_ALTIVEC_BLITTERS
		if((sf->BytesPerPixel == 4) &&
		   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
			return Blit32to32SurfaceAlphaAltivec;
		else
#endif
			return BlitNtoNSurfaceAlpha;

	    case 3:
	    default:
		return BlitNtoNSurfaceAlpha;
	    }
	}
    } else {
	/* Per-pixel alpha blits */
	switch(df->BytesPerPixel) {
	case 1:
	    return BlitNto1PixelAlpha;

	case 2:
#if SDL_ALTIVEC_BLITTERS
	if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
           df->Gmask == 0x7e0 &&
	   df->Bmask == 0x1f && SDL_HasAltiVec())
            return Blit32to565PixelAlphaAltivec;
        else
#endif
	    /* ARGB8888 (or ABGR8888) source to 565/555 destination */
	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
	       && sf->Gmask == 0xff00
	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
		if(df->Gmask == 0x7e0)
		    return BlitARGBto565PixelAlpha;
		else if(df->Gmask == 0x3e0)
		    return BlitARGBto555PixelAlpha;
	    }
	    return BlitNtoNPixelAlpha;

	case 4:
	    /* matching 32-bit RGB layouts get the fast paths */
	    if(sf->Rmask == df->Rmask
	       && sf->Gmask == df->Gmask
	       && sf->Bmask == df->Bmask
	       && sf->BytesPerPixel == 4)
	    {
#if MMX_ASMBLIT
		if(sf->Rshift % 8 == 0
		   && sf->Gshift % 8 == 0
		   && sf->Bshift % 8 == 0
		   && sf->Ashift % 8 == 0
		   && sf->Aloss == 0)
		{
			if(SDL_Has3DNow())
				return BlitRGBtoRGBPixelAlphaMMX3DNOW;
			if(SDL_HasMMX())
				return BlitRGBtoRGBPixelAlphaMMX;
		}
#endif
		if(sf->Amask == 0xff000000)
		{
#if SDL_ALTIVEC_BLITTERS
			if(!(surface->map->dst->flags & SDL_HWSURFACE)
				&& SDL_HasAltiVec())
				return BlitRGBtoRGBPixelAlphaAltivec;
#endif
			return BlitRGBtoRGBPixelAlpha;
		}
	    }
#if SDL_ALTIVEC_BLITTERS
	    if (sf->Amask && sf->BytesPerPixel == 4 &&
	        !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
		return Blit32to32PixelAlphaAltivec;
	    else
#endif
		return BlitNtoNPixelAlpha;

	case 3:
	default:
	    return BlitNtoNPixelAlpha;
	}
    }
}
  2873