src/video/SDL_blit_A.c
author Ben Avison <bavison@riscosopen.org>
Thu, 31 Oct 2019 14:00:28 +0300
branch SDL-1.2
changeset 13221 3705e81df6ff
parent 13220 0ae1ddca5e85
permissions -rw-r--r--
ARM: NEON assembly optimization for function BlitARGBto565PixelAlpha
---
src/video/SDL_blit_A.c | 31 ++++++++--
src/video/arm/pixman-arm-neon-asm.S | 88 +++++++++++++++++++++++++++++
2 files changed, 114 insertions(+), 5 deletions(-)
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2012 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 /*
    28   In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
    29    Checking if _mm_free is #defined in malloc.h is the only way to
    30    determine if the Processor Pack is installed, as far as I can tell.
    31 */
    32 
    33 #if SDL_ASSEMBLY_ROUTINES
    34 #  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    35      /* forced MMX to 0...it breaks on most compilers now.  --ryan. */
    36 #    define MMX_ASMBLIT 0
    37 #    define GCC_ASMBLIT 0
    38 #  elif defined(_MSC_VER) && defined(_M_IX86)
    39 #    if (_MSC_VER <= 1200)  
    40 #      include <malloc.h>   
    41 #      if defined(_mm_free)
    42 #          define HAVE_MMINTRIN_H 1
    43 #      endif
    44 #    else  /* Visual Studio > VC6 always has mmintrin.h */
    45 #      define HAVE_MMINTRIN_H 1
    46 #    endif
    47 #    if HAVE_MMINTRIN_H
    48 #      define MMX_ASMBLIT 1
    49 #      define MSVC_ASMBLIT 1
    50 #    endif
    51 #  endif
    52 #endif /* SDL_ASSEMBLY_ROUTINES */
    53 
    54 /* Function to check the CPU flags */
    55 #include "SDL_cpuinfo.h"
    56 #if GCC_ASMBLIT
    57 #include "mmx.h"
    58 #elif MSVC_ASMBLIT
    59 #include <mmintrin.h>
    60 #include <mm3dnow.h>
    61 #endif
    62 
    63 /* Functions to perform alpha blended blitting */
    64 
    65 /* N->1 blending with per-surface alpha */
    66 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
    67 {
    68 	int width = info->d_width;
    69 	int height = info->d_height;
    70 	Uint8 *src = info->s_pixels;
    71 	int srcskip = info->s_skip;
    72 	Uint8 *dst = info->d_pixels;
    73 	int dstskip = info->d_skip;
    74 	Uint8 *palmap = info->table;
    75 	SDL_PixelFormat *srcfmt = info->src;
    76 	SDL_PixelFormat *dstfmt = info->dst;
    77 	int srcbpp = srcfmt->BytesPerPixel;
    78 
    79 	const unsigned A = srcfmt->alpha;
    80 
    81 	while ( height-- ) {
    82 	    DUFFS_LOOP4(
    83 	    {
    84 		Uint32 Pixel;
    85 		unsigned sR;
    86 		unsigned sG;
    87 		unsigned sB;
    88 		unsigned dR;
    89 		unsigned dG;
    90 		unsigned dB;
    91 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    92 		dR = dstfmt->palette->colors[*dst].r;
    93 		dG = dstfmt->palette->colors[*dst].g;
    94 		dB = dstfmt->palette->colors[*dst].b;
    95 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    96 		dR &= 0xff;
    97 		dG &= 0xff;
    98 		dB &= 0xff;
    99 		/* Pack RGB into 8bit pixel */
   100 		if ( palmap == NULL ) {
   101 		    *dst =((dR>>5)<<(3+2))|
   102 			  ((dG>>5)<<(2))|
   103 			  ((dB>>6)<<(0));
   104 		} else {
   105 		    *dst = palmap[((dR>>5)<<(3+2))|
   106 				  ((dG>>5)<<(2))  |
   107 				  ((dB>>6)<<(0))];
   108 		}
   109 		dst++;
   110 		src += srcbpp;
   111 	    },
   112 	    width);
   113 	    src += srcskip;
   114 	    dst += dstskip;
   115 	}
   116 }
   117 
   118 /* N->1 blending with pixel alpha */
   119 static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
   120 {
   121 	int width = info->d_width;
   122 	int height = info->d_height;
   123 	Uint8 *src = info->s_pixels;
   124 	int srcskip = info->s_skip;
   125 	Uint8 *dst = info->d_pixels;
   126 	int dstskip = info->d_skip;
   127 	Uint8 *palmap = info->table;
   128 	SDL_PixelFormat *srcfmt = info->src;
   129 	SDL_PixelFormat *dstfmt = info->dst;
   130 	int srcbpp = srcfmt->BytesPerPixel;
   131 
   132 	/* FIXME: fix alpha bit field expansion here too? */
   133 	while ( height-- ) {
   134 	    DUFFS_LOOP4(
   135 	    {
   136 		Uint32 Pixel;
   137 		unsigned sR;
   138 		unsigned sG;
   139 		unsigned sB;
   140 		unsigned sA;
   141 		unsigned dR;
   142 		unsigned dG;
   143 		unsigned dB;
   144 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   145 		dR = dstfmt->palette->colors[*dst].r;
   146 		dG = dstfmt->palette->colors[*dst].g;
   147 		dB = dstfmt->palette->colors[*dst].b;
   148 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   149 		dR &= 0xff;
   150 		dG &= 0xff;
   151 		dB &= 0xff;
   152 		/* Pack RGB into 8bit pixel */
   153 		if ( palmap == NULL ) {
   154 		    *dst =((dR>>5)<<(3+2))|
   155 			  ((dG>>5)<<(2))|
   156 			  ((dB>>6)<<(0));
   157 		} else {
   158 		    *dst = palmap[((dR>>5)<<(3+2))|
   159 				  ((dG>>5)<<(2))  |
   160 				  ((dB>>6)<<(0))  ];
   161 		}
   162 		dst++;
   163 		src += srcbpp;
   164 	    },
   165 	    width);
   166 	    src += srcskip;
   167 	    dst += dstskip;
   168 	}
   169 }
   170 
   171 /* colorkeyed N->1 blending with per-surface alpha */
   172 static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
   173 {
   174 	int width = info->d_width;
   175 	int height = info->d_height;
   176 	Uint8 *src = info->s_pixels;
   177 	int srcskip = info->s_skip;
   178 	Uint8 *dst = info->d_pixels;
   179 	int dstskip = info->d_skip;
   180 	Uint8 *palmap = info->table;
   181 	SDL_PixelFormat *srcfmt = info->src;
   182 	SDL_PixelFormat *dstfmt = info->dst;
   183 	int srcbpp = srcfmt->BytesPerPixel;
   184 	Uint32 ckey = srcfmt->colorkey;
   185 
   186 	const int A = srcfmt->alpha;
   187 
   188 	while ( height-- ) {
   189 	    DUFFS_LOOP(
   190 	    {
   191 		Uint32 Pixel;
   192 		unsigned sR;
   193 		unsigned sG;
   194 		unsigned sB;
   195 		unsigned dR;
   196 		unsigned dG;
   197 		unsigned dB;
   198 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   199 		if ( Pixel != ckey ) {
   200 		    dR = dstfmt->palette->colors[*dst].r;
   201 		    dG = dstfmt->palette->colors[*dst].g;
   202 		    dB = dstfmt->palette->colors[*dst].b;
   203 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   204 		    dR &= 0xff;
   205 		    dG &= 0xff;
   206 		    dB &= 0xff;
   207 		    /* Pack RGB into 8bit pixel */
   208 		    if ( palmap == NULL ) {
   209 			*dst =((dR>>5)<<(3+2))|
   210 			      ((dG>>5)<<(2)) |
   211 			      ((dB>>6)<<(0));
   212 		    } else {
   213 			*dst = palmap[((dR>>5)<<(3+2))|
   214 				      ((dG>>5)<<(2))  |
   215 				      ((dB>>6)<<(0))  ];
   216 		    }
   217 		}
   218 		dst++;
   219 		src += srcbpp;
   220 	    },
   221 	    width);
   222 	    src += srcskip;
   223 	    dst += dstskip;
   224 	}
   225 }
   226 
   227 #if GCC_ASMBLIT
   228 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   229 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
   230 {
   231 	int width = info->d_width;
   232 	int height = info->d_height;
   233 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   234 	int srcskip = info->s_skip >> 2;
   235 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   236 	int dstskip = info->d_skip >> 2;
   237 	Uint32 dalpha = info->dst->Amask;
   238 	Uint64 load;
   239 
   240 	load = 0x00fefefe00fefefeULL;/* alpha128 mask */
   241 	movq_m2r(load, mm4); /* alpha128 mask -> mm4 */
   242 	load = 0x0001010100010101ULL;/* !alpha128 mask */
   243 	movq_m2r(load, mm3); /* !alpha128 mask -> mm3 */
   244 	movd_m2r(dalpha, mm7); /* dst alpha mask */
   245 	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
   246 	while(height--) {
   247 		DUFFS_LOOP_DOUBLE2(
   248 		{
   249 			Uint32 s = *srcp++;
   250 			Uint32 d = *dstp;
   251 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   252 				   + (s & d & 0x00010101)) | dalpha;
   253 		},{
   254 			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   255 			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   256 
   257 			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
   258 			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
   259 
   260 			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
   261 			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
   262 			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
   263 			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
   264 			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
   265 			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
   266 			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
   267 			
   268 			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   269 			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
   270 			dstp += 2;
   271 			srcp += 2;
   272 		}, width);
   273 		srcp += srcskip;
   274 		dstp += dstskip;
   275 	}
   276 	emms();
   277 }
   278 
   279 /* fast RGB888->(A)RGB888 blending with surface alpha */
   280 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
   281 {
   282 	SDL_PixelFormat* df = info->dst;
   283 	unsigned alpha = info->src->alpha;
   284 
   285 	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   286 			/* only call a128 version when R,G,B occupy lower bits */
   287 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
   288 	} else {
   289 		int width = info->d_width;
   290 		int height = info->d_height;
   291 		Uint32 *srcp = (Uint32 *)info->s_pixels;
   292 		int srcskip = info->s_skip >> 2;
   293 		Uint32 *dstp = (Uint32 *)info->d_pixels;
   294 		int dstskip = info->d_skip >> 2;
   295 
   296 		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
   297 		/* form the alpha mult */
   298 		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
   299 		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   300 		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
   301 		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
   302 		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
   303 		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
   304 		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
   305 			/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
   306 		movd_m2r(df->Amask, mm7); /* dst alpha mask */
   307 		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
   308 		
   309 		while(height--) {
   310 			DUFFS_LOOP_DOUBLE2({
   311 				/* One Pixel Blend */
   312 				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   313 				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   314 				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
   315 				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
   316 
   317 				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   318 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   319 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   320 				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   321 
   322 				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
   323 				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
   324 				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
   325 				++srcp;
   326 				++dstp;
   327 			},{
   328 				/* Two Pixels Blend */
   329 				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
   330 				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
   331 				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
   332 				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
   333 
   334 				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
   335 				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
   336 				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
   337 				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
   338 
   339 				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
   340 				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
   341 				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
   342 				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
   343 
   344 				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
   345 				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   346 				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
   347 				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
   348 
   349 				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
   350 				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
   351 				
   352 				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
   353 
   354   				srcp += 2;
   355   				dstp += 2;
   356   			}, width);
   357 			srcp += srcskip;
   358 			dstp += dstskip;
   359 		}
   360 		emms();
   361 	}
   362 }
   363 
   364 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   365 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
   366 {
   367 	int width = info->d_width;
   368 	int height = info->d_height;
   369 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   370 	int srcskip = info->s_skip >> 2;
   371 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   372 	int dstskip = info->d_skip >> 2;
   373 	SDL_PixelFormat* sf = info->src;
   374 	Uint32 amask = sf->Amask;
   375 
   376 	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
   377 	/* form multiplication mask */
   378 	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
   379 	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
   380 	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
   381 	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
   382 	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
   383 	/* form channel masks */
   384 	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
   385 	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
   386 	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
   387 	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
   388 	/* get alpha channel shift */
   389 	__asm__ __volatile__ (
   390 		"movd %0, %%mm5"
   391 		: : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
   392 
   393 	while(height--) {
   394 	    DUFFS_LOOP4({
   395 		Uint32 alpha = *srcp & amask;
   396 		/* FIXME: Here we special-case opaque alpha since the
   397 			compositioning used (>>8 instead of /255) doesn't handle
   398 			it correctly. Also special-case alpha=0 for speed?
   399 			Benchmark this! */
   400 		if(alpha == 0) {
   401 			/* do nothing */
   402 		} else if(alpha == amask) {
   403 			/* opaque alpha -- copy RGB, keep dst alpha */
   404 			/* using MMX here to free up regular registers for other things */
   405 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   406 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   407 			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
   408 			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
   409 			por_r2r(mm1, mm2); /* src | dst -> mm2 */
   410 			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
   411 		} else {
   412 			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
   413 			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
   414 
   415 			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
   416 			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
   417 
   418 			__asm__ __volatile__ (
   419 				"movd %0, %%mm4"
   420 				: : "r" (alpha) ); /* 0000A000 -> mm4 */
   421 			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
   422 			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
   423 			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
   424 			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
   425 
   426 			/* blend */		    
   427 			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
   428 			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
   429 			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
   430 			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
   431 			
   432 			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
   433 			movd_r2m(mm2, *dstp);/* mm2 -> dst */
   434 		}
   435 		++srcp;
   436 		++dstp;
   437 	    }, width);
   438 	    srcp += srcskip;
   439 	    dstp += dstskip;
   440 	}
   441 	emms();
   442 }
   443 /* End GCC_ASMBLIT */
   444 
   445 #elif MSVC_ASMBLIT
   446 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   447 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
   448 {
   449 	int width = info->d_width;
   450 	int height = info->d_height;
   451 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   452 	int srcskip = info->s_skip >> 2;
   453 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   454 	int dstskip = info->d_skip >> 2;
   455 	Uint32 dalpha = info->dst->Amask;
   456 
   457 	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   458 	
   459 	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
   460 	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
   461 	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
   462 
   463 	while (height--) {
   464 		int n = width;
   465 		if ( n & 1 ) {
   466 			Uint32 s = *srcp++;
   467 			Uint32 d = *dstp;
   468 			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   469 				   + (s & d & 0x00010101)) | dalpha;
   470 			n--;
   471 		}
   472 		
   473 		for (n >>= 1; n > 0; --n) {
   474 			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
   475 			dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
   476 
   477 			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
   478 			src2 = src1; /* 2 x src -> src2(ARGBARGB) */
   479 
   480 			dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
   481 			src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
   482 			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
   483 			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
   484 
   485 			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
   486 			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
   487 			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
   488 			dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
   489 			
   490 			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
   491 			dstp += 2;
   492 			srcp += 2;
   493 		}
   494 		
   495 		srcp += srcskip;
   496 		dstp += dstskip;
   497 	}
   498 	_mm_empty();
   499 }
   500 
   501 /* fast RGB888->(A)RGB888 blending with surface alpha */
   502 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
   503 {
   504 	SDL_PixelFormat* df = info->dst;
   505 	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   506 	unsigned alpha = info->src->alpha;
   507 
   508 	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   509 			/* only call a128 version when R,G,B occupy lower bits */
   510 		BlitRGBtoRGBSurfaceAlpha128MMX(info);
   511 	} else {
   512 		int width = info->d_width;
   513 		int height = info->d_height;
   514 		Uint32 *srcp = (Uint32 *)info->s_pixels;
   515 		int srcskip = info->s_skip >> 2;
   516 		Uint32 *dstp = (Uint32 *)info->d_pixels;
   517 		int dstskip = info->d_skip >> 2;
   518 		Uint32 dalpha = df->Amask;
   519 		Uint32 amult;
   520 
   521 		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   522 		
   523 		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
   524 		/* form the alpha mult */
   525 		amult = alpha | (alpha << 8);
   526 		amult = amult | (amult << 16);
   527 		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
   528 		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
   529 		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   530 			/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   531 		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
   532 		
   533 		while (height--) {
   534 			int n = width;
   535 			if (n & 1) {
   536 				/* One Pixel Blend */
   537 				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
   538 				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   539 
   540 				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   541 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   542 
   543 				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
   544 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   545 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
   546 				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   547 				
   548 				dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   549 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   550 				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   551 
   552 				++srcp;
   553 				++dstp;
   554 				
   555 				n--;
   556 			}
   557 
   558 			for (n >>= 1; n > 0; --n) {
   559 				/* Two Pixels Blend */
   560 				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
   561 				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
   562 				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   563 				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   564 
   565 				dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
   566 				dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
   567 				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   568 				dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   569 
   570 				src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   571 				src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
   572 				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
   573 				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   574 
   575 				src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
   576 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   577 				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
   578 				dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   579 				
   580 				dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   581 				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   582 
   583 				*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
   584 
   585 				srcp += 2;
   586 				dstp += 2;
   587 			}
   588 			srcp += srcskip;
   589 			dstp += dstskip;
   590 		}
   591 		_mm_empty();
   592 	}
   593 }
   594 
   595 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   596 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
   597 {
   598 	int width = info->d_width;
   599 	int height = info->d_height;
   600 	Uint32 *srcp = (Uint32 *)info->s_pixels;
   601 	int srcskip = info->s_skip >> 2;
   602 	Uint32 *dstp = (Uint32 *)info->d_pixels;
   603 	int dstskip = info->d_skip >> 2;
   604 	SDL_PixelFormat* sf = info->src;
   605 	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   606 	Uint32 amask = sf->Amask;
   607 	Uint32 ashift = sf->Ashift;
   608 	Uint64 multmask;
   609 
   610 	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
   611 
   612 	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
   613 	multmask = ~(0xFFFFi64 << (ashift * 2));
   614 	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
   615 
   616 	while(height--) {
   617 		DUFFS_LOOP4({
   618 		Uint32 alpha = *srcp & amask;
   619 		if (alpha == 0) {
   620 			/* do nothing */
   621 		} else if (alpha == amask) {
   622 			/* opaque alpha -- copy RGB, keep dst alpha */
   623 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   624 		} else {
   625 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   626 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   627 
   628 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   629 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   630 
   631 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   632 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   633 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   634 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   635 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   636 
   637 			/* blend */		    
   638 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   639 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   640 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   641 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   642 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   643 			
   644 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   645 		}
   646 		++srcp;
   647 		++dstp;
   648 	    }, width);
   649 	    srcp += srcskip;
   650 	    dstp += dstskip;
   651 	}
   652 	_mm_empty();
   653 }
   654 /* End MSVC_ASMBLIT */
   655 
   656 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   657 
   658 #if SDL_ALTIVEC_BLITTERS
   659 #if __MWERKS__
   660 #pragma altivec_model on
   661 #endif
   662 #if HAVE_ALTIVEC_H
   663 #include <altivec.h>
   664 #endif
   665 #include <assert.h>
   666 
   667 #if (defined(__MACOSX__) && (__GNUC__ < 4))
   668     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   669         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
   670     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   671         (vector unsigned short) ( a,b,c,d,e,f,g,h )
   672 #else
   673     #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   674         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
   675     #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   676         (vector unsigned short) { a,b,c,d,e,f,g,h }
   677 #endif
   678 
   679 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
   680 #define VECPRINT(msg, v) do { \
   681     vector unsigned int tmpvec = (vector unsigned int)(v); \
   682     unsigned int *vp = (unsigned int *)&tmpvec; \
   683     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   684 } while (0)
   685 
   686 /* the permuation vector that takes the high bytes out of all the appropriate shorts 
   687     (vector unsigned char)(
   688         0x00, 0x10, 0x02, 0x12,
   689         0x04, 0x14, 0x06, 0x16,
   690         0x08, 0x18, 0x0A, 0x1A,
   691         0x0C, 0x1C, 0x0E, 0x1E );
   692 */
   693 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
   694 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
   695 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
   696 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
   697     ? vec_lvsl(0, src) \
   698     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
   699 
   700    
   701 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
   702     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
   703     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
   704     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
   705     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
   706     /* valpha2 is 255-alpha */ \
   707     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
   708     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
   709     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
   710     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
   711     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
   712     /* add source and dest */ \
   713     vtemp1 = vec_add(vtemp1, vtemp3); \
   714     vtemp2 = vec_add(vtemp2, vtemp4); \
   715     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
   716     vtemp1 = vec_add(vtemp1, v1_16); \
   717     vtemp3 = vec_sr(vtemp1, v8_16); \
   718     vtemp1 = vec_add(vtemp1, vtemp3); \
   719     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
   720     vtemp2 = vec_add(vtemp2, v1_16); \
   721     vtemp4 = vec_sr(vtemp2, v8_16); \
   722     vtemp2 = vec_add(vtemp2, vtemp4); \
   723     /* (>>8) and get ARGBARGBARGBARGB */ \
   724     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   725 } while (0)
   726  
   727 /* Calculate the permute vector used for 32->32 swizzling */
   728 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
   729                                   const SDL_PixelFormat *dstfmt)
   730 {
   731     /*
   732      * We have to assume that the bits that aren't used by other
   733      *  colors is alpha, and it's one complete byte, since some formats
   734      *  leave alpha with a zero mask, but we should still swizzle the bits.
   735      */
   736     /* ARGB */
   737     const static struct SDL_PixelFormat default_pixel_format = {
   738         NULL, 0, 0,
   739         0, 0, 0, 0,
   740         16, 8, 0, 24,
   741         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
   742         0, 0};
   743     if (!srcfmt) {
   744         srcfmt = &default_pixel_format;
   745     }
   746     if (!dstfmt) {
   747         dstfmt = &default_pixel_format;
   748     }
   749     const vector unsigned char plus = VECUINT8_LITERAL
   750                                             ( 0x00, 0x00, 0x00, 0x00,
   751                                               0x04, 0x04, 0x04, 0x04,
   752                                               0x08, 0x08, 0x08, 0x08,
   753                                               0x0C, 0x0C, 0x0C, 0x0C );
   754     vector unsigned char vswiz;
   755     vector unsigned int srcvec;
   756 #define RESHIFT(X) (3 - ((X) >> 3))
   757     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   758     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   759     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   760     Uint32 amask;
   761     /* Use zero for alpha if either surface doesn't have alpha */
   762     if (dstfmt->Amask) {
   763         amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
   764     } else {
   765         amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
   766     }
   767 #undef RESHIFT  
   768     ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
   769     vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
   770     return(vswiz);
   771 }
   772 
   773 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
   774 {
   775     int height = info->d_height;
   776     Uint8 *src = (Uint8 *)info->s_pixels;
   777     int srcskip = info->s_skip;
   778     Uint8 *dst = (Uint8 *)info->d_pixels;
   779     int dstskip = info->d_skip;
   780     SDL_PixelFormat *srcfmt = info->src;
   781 
   782     vector unsigned char v0 = vec_splat_u8(0);
   783     vector unsigned short v8_16 = vec_splat_u16(8);
   784     vector unsigned short v1_16 = vec_splat_u16(1);
   785     vector unsigned short v2_16 = vec_splat_u16(2);
   786     vector unsigned short v3_16 = vec_splat_u16(3);
   787     vector unsigned int v8_32 = vec_splat_u32(8);
   788     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
   789     vector unsigned short v3f = VECUINT16_LITERAL(
   790         0x003f, 0x003f, 0x003f, 0x003f,
   791         0x003f, 0x003f, 0x003f, 0x003f);
   792     vector unsigned short vfc = VECUINT16_LITERAL(
   793         0x00fc, 0x00fc, 0x00fc, 0x00fc,
   794         0x00fc, 0x00fc, 0x00fc, 0x00fc);
   795 
   796     /* 
   797         0x10 - 0x1f is the alpha
   798         0x00 - 0x0e evens are the red
   799         0x01 - 0x0f odds are zero
   800     */
   801     vector unsigned char vredalpha1 = VECUINT8_LITERAL(
   802         0x10, 0x00, 0x01, 0x01,
   803         0x10, 0x02, 0x01, 0x01,
   804         0x10, 0x04, 0x01, 0x01,
   805         0x10, 0x06, 0x01, 0x01
   806     );
   807     vector unsigned char vredalpha2 = (vector unsigned char)(
   808         vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
   809     );
   810     /*
   811         0x00 - 0x0f is ARxx ARxx ARxx ARxx
   812         0x11 - 0x0f odds are blue
   813     */
   814     vector unsigned char vblue1 = VECUINT8_LITERAL(
   815         0x00, 0x01, 0x02, 0x11,
   816         0x04, 0x05, 0x06, 0x13,
   817         0x08, 0x09, 0x0a, 0x15,
   818         0x0c, 0x0d, 0x0e, 0x17
   819     );
   820     vector unsigned char vblue2 = (vector unsigned char)(
   821         vec_add((vector unsigned int)vblue1, v8_32)
   822     );
   823     /*
   824         0x00 - 0x0f is ARxB ARxB ARxB ARxB
   825         0x10 - 0x0e evens are green
   826     */
   827     vector unsigned char vgreen1 = VECUINT8_LITERAL(
   828         0x00, 0x01, 0x10, 0x03,
   829         0x04, 0x05, 0x12, 0x07,
   830         0x08, 0x09, 0x14, 0x0b,
   831         0x0c, 0x0d, 0x16, 0x0f
   832     );
   833     vector unsigned char vgreen2 = (vector unsigned char)(
   834         vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
   835     );
   836     vector unsigned char vgmerge = VECUINT8_LITERAL(
   837         0x00, 0x02, 0x00, 0x06,
   838         0x00, 0x0a, 0x00, 0x0e,
   839         0x00, 0x12, 0x00, 0x16,
   840         0x00, 0x1a, 0x00, 0x1e);
   841     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
   842     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
   843     vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
   844 
   845     vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
   846     vf800 = vec_sl(vf800, vec_splat_u16(8));
   847 
   848     while(height--) {
   849         int extrawidth;
   850         vector unsigned char valigner;
   851         vector unsigned char vsrc;
   852         vector unsigned char voverflow;
   853         int width = info->d_width;
   854 
   855 #define ONE_PIXEL_BLEND(condition, widthvar) \
   856         while (condition) { \
   857             Uint32 Pixel; \
   858             unsigned sR, sG, sB, dR, dG, dB, sA; \
   859             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   860             if(sA) { \
   861                 unsigned short dstpixel = *((unsigned short *)dst); \
   862                 dR = (dstpixel >> 8) & 0xf8; \
   863                 dG = (dstpixel >> 3) & 0xfc; \
   864                 dB = (dstpixel << 3) & 0xf8; \
   865                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   866                 *((unsigned short *)dst) = ( \
   867                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
   868                 ); \
   869             } \
   870             src += 4; \
   871             dst += 2; \
   872             widthvar--; \
   873         }
   874         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
   875         extrawidth = (width % 8);
   876         valigner = VEC_ALIGNER(src);
   877         vsrc = (vector unsigned char)vec_ld(0, src);
   878         width -= extrawidth;
   879         while (width) {
   880             vector unsigned char valpha;
   881             vector unsigned char vsrc1, vsrc2;
   882             vector unsigned char vdst1, vdst2;
   883             vector unsigned short vR, vG, vB;
   884             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   885 
   886             /* Load 8 pixels from src as ARGB */
   887             voverflow = (vector unsigned char)vec_ld(15, src);
   888             vsrc = vec_perm(vsrc, voverflow, valigner);
   889             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
   890             src += 16;
   891             vsrc = (vector unsigned char)vec_ld(15, src);
   892             voverflow = vec_perm(voverflow, vsrc, valigner);
   893             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
   894             src += 16;
   895 
   896             /* Load 8 pixels from dst as XRGB */
   897             voverflow = vec_ld(0, dst);
   898             vR = vec_and((vector unsigned short)voverflow, vf800);
   899             vB = vec_sl((vector unsigned short)voverflow, v3_16);
   900             vG = vec_sl(vB, v2_16);
   901             vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
   902             vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
   903             vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
   904             vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
   905             vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
   906             vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
   907 
   908             /* Alpha blend 8 pixels as ARGB */
   909             valpha = vec_perm(vsrc1, v0, valphaPermute);
   910             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
   911             valpha = vec_perm(vsrc2, v0, valphaPermute);
   912             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
   913 
   914             /* Convert 8 pixels to 565 */
   915             vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
   916             vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
   917             vgpixel = vec_and(vgpixel, vfc);
   918             vgpixel = vec_sl(vgpixel, v3_16);
   919             vrpixel = vec_sl(vpixel, v1_16);
   920             vrpixel = vec_and(vrpixel, vf800);
   921             vbpixel = vec_and(vpixel, v3f);
   922             vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
   923             vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
   924             
   925             /* Store 8 pixels */
   926             vec_st(vdst1, 0, dst);
   927 
   928             width -= 8;
   929             dst += 16;
   930         }
   931         ONE_PIXEL_BLEND((extrawidth), extrawidth);
   932 #undef ONE_PIXEL_BLEND
   933         src += srcskip;
   934         dst += dstskip;
   935     }
   936 }
   937 
   938 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
   939 {
   940     unsigned alpha = info->src->alpha;
   941     int height = info->d_height;
   942     Uint32 *srcp = (Uint32 *)info->s_pixels;
   943     int srcskip = info->s_skip >> 2;
   944     Uint32 *dstp = (Uint32 *)info->d_pixels;
   945     int dstskip = info->d_skip >> 2;
   946     SDL_PixelFormat *srcfmt = info->src;
   947     SDL_PixelFormat *dstfmt = info->dst;
   948     unsigned sA = srcfmt->alpha;
   949     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   950     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   951     Uint32 ckey = info->src->colorkey;
   952     vector unsigned char mergePermute;
   953     vector unsigned char vsrcPermute;
   954     vector unsigned char vdstPermute;
   955     vector unsigned char vsdstPermute;
   956     vector unsigned char valpha;
   957     vector unsigned char valphamask;
   958     vector unsigned char vbits;
   959     vector unsigned char v0;
   960     vector unsigned short v1;
   961     vector unsigned short v8;
   962     vector unsigned int vckey;
   963     vector unsigned int vrgbmask;
   964 
   965     mergePermute = VEC_MERGE_PERMUTE();
   966     v0 = vec_splat_u8(0);
   967     v1 = vec_splat_u16(1);
   968     v8 = vec_splat_u16(8);
   969 
   970     /* set the alpha to 255 on the destination surf */
   971     valphamask = VEC_ALPHA_MASK();
   972 
   973     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   974     vdstPermute = calc_swizzle32(NULL, dstfmt);
   975     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   976 
   977     /* set a vector full of alpha and 255-alpha */
   978     ((unsigned char *)&valpha)[0] = alpha;
   979     valpha = vec_splat(valpha, 0);
   980     vbits = (vector unsigned char)vec_splat_s8(-1);
   981 
   982     ckey &= rgbmask;
   983     ((unsigned int *)(char*)&vckey)[0] = ckey;
   984     vckey = vec_splat(vckey, 0);
   985     ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
   986     vrgbmask = vec_splat(vrgbmask, 0);
   987 
   988     while(height--) {
   989         int width = info->d_width;
   990 #define ONE_PIXEL_BLEND(condition, widthvar) \
   991         while (condition) { \
   992             Uint32 Pixel; \
   993             unsigned sR, sG, sB, dR, dG, dB; \
   994             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
   995             if(sA && Pixel != ckey) { \
   996                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
   997                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
   998                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   999                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1000             } \
  1001             dstp++; \
  1002             srcp++; \
  1003             widthvar--; \
  1004         }
  1005         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1006         if (width > 0) {
  1007             int extrawidth = (width % 4);
  1008             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1009             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1010             width -= extrawidth;
  1011             while (width) {
  1012                 vector unsigned char vsel;
  1013                 vector unsigned char voverflow;
  1014                 vector unsigned char vd;
  1015                 vector unsigned char vd_orig;
  1016 
  1017                 /* s = *srcp */
  1018                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1019                 vs = vec_perm(vs, voverflow, valigner);
  1020                 
  1021                 /* vsel is set for items that match the key */
  1022                 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
  1023                 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
  1024 
  1025                 /* permute to source format */
  1026                 vs = vec_perm(vs, valpha, vsrcPermute);
  1027 
  1028                 /* d = *dstp */
  1029                 vd = (vector unsigned char)vec_ld(0, dstp);
  1030                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
  1031 
  1032                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1033 
  1034                 /* set the alpha channel to full on */
  1035                 vd = vec_or(vd, valphamask);
  1036 
  1037                 /* mask out color key */
  1038                 vd = vec_sel(vd, vd_orig, vsel);
  1039                 
  1040                 /* permute to dest format */
  1041                 vd = vec_perm(vd, vbits, vdstPermute);
  1042 
  1043                 /* *dstp = res */
  1044                 vec_st((vector unsigned int)vd, 0, dstp);
  1045                 
  1046                 srcp += 4;
  1047                 dstp += 4;
  1048                 width -= 4;
  1049                 vs = voverflow;
  1050             }
  1051             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1052         }
  1053 #undef ONE_PIXEL_BLEND
  1054  
  1055         srcp += srcskip;
  1056         dstp += dstskip;
  1057     }
  1058 }
  1059 
  1060 
  1061 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
  1062 {
  1063     int width = info->d_width;
  1064     int height = info->d_height;
  1065     Uint32 *srcp = (Uint32 *)info->s_pixels;
  1066     int srcskip = info->s_skip >> 2;
  1067     Uint32 *dstp = (Uint32 *)info->d_pixels;
  1068     int dstskip = info->d_skip >> 2;
  1069     SDL_PixelFormat *srcfmt = info->src;
  1070     SDL_PixelFormat *dstfmt = info->dst;
  1071     vector unsigned char mergePermute;
  1072     vector unsigned char valphaPermute;
  1073     vector unsigned char vsrcPermute;
  1074     vector unsigned char vdstPermute;
  1075     vector unsigned char vsdstPermute;
  1076     vector unsigned char valphamask;
  1077     vector unsigned char vpixelmask;
  1078     vector unsigned char v0;
  1079     vector unsigned short v1;
  1080     vector unsigned short v8;
  1081 
  1082     v0 = vec_splat_u8(0);
  1083     v1 = vec_splat_u16(1);
  1084     v8 = vec_splat_u16(8);
  1085     mergePermute = VEC_MERGE_PERMUTE();
  1086     valphamask = VEC_ALPHA_MASK();
  1087     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
  1088     vpixelmask = vec_nor(valphamask, v0);
  1089     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1090     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1091     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1092 
  1093 	while ( height-- ) {
  1094         width = info->d_width;
  1095 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1096             Uint32 Pixel; \
  1097             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
  1098             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
  1099             if(sA) { \
  1100               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
  1101               ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1102               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
  1103             } \
  1104             ++srcp; \
  1105             ++dstp; \
  1106             widthvar--; \
  1107         }
  1108         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1109         if (width > 0) {
  1110             /* vsrcPermute */
  1111             /* vdstPermute */
  1112             int extrawidth = (width % 4);
  1113             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1114             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1115             width -= extrawidth;
  1116             while (width) {
  1117                 vector unsigned char voverflow;
  1118                 vector unsigned char vd;
  1119                 vector unsigned char valpha;
  1120                 vector unsigned char vdstalpha;
  1121                 /* s = *srcp */
  1122                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1123                 vs = vec_perm(vs, voverflow, valigner);
  1124                 vs = vec_perm(vs, v0, vsrcPermute);
  1125 
  1126                 valpha = vec_perm(vs, v0, valphaPermute);
  1127                 
  1128                 /* d = *dstp */
  1129                 vd = (vector unsigned char)vec_ld(0, dstp);
  1130                 vd = vec_perm(vd, v0, vsdstPermute);
  1131                 vdstalpha = vec_and(vd, valphamask);
  1132 
  1133                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1134 
  1135                 /* set the alpha to the dest alpha */
  1136                 vd = vec_and(vd, vpixelmask);
  1137                 vd = vec_or(vd, vdstalpha);
  1138                 vd = vec_perm(vd, v0, vdstPermute);
  1139 
  1140                 /* *dstp = res */
  1141                 vec_st((vector unsigned int)vd, 0, dstp);
  1142                 
  1143                 srcp += 4;
  1144                 dstp += 4;
  1145                 width -= 4;
  1146                 vs = voverflow;
  1147 
  1148             }
  1149             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1150         }
  1151 	    srcp += srcskip;
  1152 	    dstp += dstskip;
  1153 #undef ONE_PIXEL_BLEND
  1154 	}
  1155 }
  1156 
  1157 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1158 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
  1159 {
  1160 	int width = info->d_width;
  1161 	int height = info->d_height;
  1162 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1163 	int srcskip = info->s_skip >> 2;
  1164 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1165 	int dstskip = info->d_skip >> 2;
  1166     vector unsigned char mergePermute;
  1167     vector unsigned char valphaPermute;
  1168     vector unsigned char valphamask;
  1169     vector unsigned char vpixelmask;
  1170     vector unsigned char v0;
  1171     vector unsigned short v1;
  1172     vector unsigned short v8;
  1173     v0 = vec_splat_u8(0);
  1174     v1 = vec_splat_u16(1);
  1175     v8 = vec_splat_u16(8);
  1176     mergePermute = VEC_MERGE_PERMUTE();
  1177     valphamask = VEC_ALPHA_MASK();
  1178     valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
  1179     
  1180  
  1181     vpixelmask = vec_nor(valphamask, v0);
  1182 	while(height--) {
  1183         width = info->d_width;
  1184 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1185         while ((condition)) { \
  1186             Uint32 dalpha; \
  1187             Uint32 d; \
  1188             Uint32 s1; \
  1189             Uint32 d1; \
  1190             Uint32 s = *srcp; \
  1191             Uint32 alpha = s >> 24; \
  1192             if(alpha) { \
  1193               if(alpha == SDL_ALPHA_OPAQUE) { \
  1194                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
  1195               } else { \
  1196                 d = *dstp; \
  1197                 dalpha = d & 0xff000000; \
  1198                 s1 = s & 0xff00ff; \
  1199                 d1 = d & 0xff00ff; \
  1200                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
  1201                 s &= 0xff00; \
  1202                 d &= 0xff00; \
  1203                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1204                 *dstp = d1 | d | dalpha; \
  1205               } \
  1206             } \
  1207             ++srcp; \
  1208             ++dstp; \
  1209             widthvar--; \
  1210 	    }
  1211         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1212         if (width > 0) {
  1213             int extrawidth = (width % 4);
  1214             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1215             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1216             width -= extrawidth;
  1217             while (width) {
  1218                 vector unsigned char voverflow;
  1219                 vector unsigned char vd;
  1220                 vector unsigned char valpha;
  1221                 vector unsigned char vdstalpha;
  1222                 /* s = *srcp */
  1223                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1224                 vs = vec_perm(vs, voverflow, valigner);
  1225 
  1226                 valpha = vec_perm(vs, v0, valphaPermute);
  1227                 
  1228                 /* d = *dstp */
  1229                 vd = (vector unsigned char)vec_ld(0, dstp);
  1230                 vdstalpha = vec_and(vd, valphamask);
  1231 
  1232                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1233 
  1234                 /* set the alpha to the dest alpha */
  1235                 vd = vec_and(vd, vpixelmask);
  1236                 vd = vec_or(vd, vdstalpha);
  1237 
  1238                 /* *dstp = res */
  1239                 vec_st((vector unsigned int)vd, 0, dstp);
  1240                 
  1241                 srcp += 4;
  1242                 dstp += 4;
  1243                 width -= 4;
  1244                 vs = voverflow;
  1245             }
  1246             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1247         }
  1248 	    srcp += srcskip;
  1249 	    dstp += dstskip;
  1250 	}
  1251 #undef ONE_PIXEL_BLEND
  1252 }
  1253 
  1254 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
  1255 {
  1256     /* XXX : 6 */
  1257 	unsigned alpha = info->src->alpha;
  1258     int height = info->d_height;
  1259     Uint32 *srcp = (Uint32 *)info->s_pixels;
  1260     int srcskip = info->s_skip >> 2;
  1261     Uint32 *dstp = (Uint32 *)info->d_pixels;
  1262     int dstskip = info->d_skip >> 2;
  1263     SDL_PixelFormat *srcfmt = info->src;
  1264     SDL_PixelFormat *dstfmt = info->dst;
  1265 	unsigned sA = srcfmt->alpha;
  1266 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1267     vector unsigned char mergePermute;
  1268     vector unsigned char vsrcPermute;
  1269     vector unsigned char vdstPermute;
  1270     vector unsigned char vsdstPermute;
  1271     vector unsigned char valpha;
  1272     vector unsigned char valphamask;
  1273     vector unsigned char vbits;
  1274     vector unsigned short v1;
  1275     vector unsigned short v8;
  1276 
  1277     mergePermute = VEC_MERGE_PERMUTE();
  1278     v1 = vec_splat_u16(1);
  1279     v8 = vec_splat_u16(8);
  1280 
  1281     /* set the alpha to 255 on the destination surf */
  1282     valphamask = VEC_ALPHA_MASK();
  1283 
  1284     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1285     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1286     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1287 
  1288     /* set a vector full of alpha and 255-alpha */
  1289     ((unsigned char *)&valpha)[0] = alpha;
  1290     valpha = vec_splat(valpha, 0);
  1291     vbits = (vector unsigned char)vec_splat_s8(-1);
  1292 
  1293     while(height--) {
  1294         int width = info->d_width;
  1295 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1296             Uint32 Pixel; \
  1297             unsigned sR, sG, sB, dR, dG, dB; \
  1298             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  1299             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1300             ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1301             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1302             ++srcp; \
  1303             ++dstp; \
  1304             widthvar--; \
  1305         }
  1306         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1307         if (width > 0) {
  1308             int extrawidth = (width % 4);
  1309             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1310             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1311             width -= extrawidth;
  1312             while (width) {
  1313                 vector unsigned char voverflow;
  1314                 vector unsigned char vd;
  1315 
  1316                 /* s = *srcp */
  1317                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1318                 vs = vec_perm(vs, voverflow, valigner);
  1319                 vs = vec_perm(vs, valpha, vsrcPermute);
  1320                 
  1321                 /* d = *dstp */
  1322                 vd = (vector unsigned char)vec_ld(0, dstp);
  1323                 vd = vec_perm(vd, vd, vsdstPermute);
  1324 
  1325                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1326 
  1327                 /* set the alpha channel to full on */
  1328                 vd = vec_or(vd, valphamask);
  1329                 vd = vec_perm(vd, vbits, vdstPermute);
  1330 
  1331                 /* *dstp = res */
  1332                 vec_st((vector unsigned int)vd, 0, dstp);
  1333                 
  1334                 srcp += 4;
  1335                 dstp += 4;
  1336                 width -= 4;
  1337                 vs = voverflow;
  1338             }
  1339             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1340         }
  1341 #undef ONE_PIXEL_BLEND
  1342  
  1343         srcp += srcskip;
  1344         dstp += dstskip;
  1345     }
  1346 
  1347 }
  1348 
  1349 
  1350 /* fast RGB888->(A)RGB888 blending */
  1351 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
  1352 {
  1353 	unsigned alpha = info->src->alpha;
  1354     int height = info->d_height;
  1355     Uint32 *srcp = (Uint32 *)info->s_pixels;
  1356     int srcskip = info->s_skip >> 2;
  1357     Uint32 *dstp = (Uint32 *)info->d_pixels;
  1358     int dstskip = info->d_skip >> 2;
  1359     vector unsigned char mergePermute;
  1360     vector unsigned char valpha;
  1361     vector unsigned char valphamask;
  1362     vector unsigned short v1;
  1363     vector unsigned short v8;
  1364 
  1365     mergePermute = VEC_MERGE_PERMUTE();
  1366     v1 = vec_splat_u16(1);
  1367     v8 = vec_splat_u16(8);
  1368 
  1369     /* set the alpha to 255 on the destination surf */
  1370     valphamask = VEC_ALPHA_MASK();
  1371 
  1372     /* set a vector full of alpha and 255-alpha */
  1373     ((unsigned char *)&valpha)[0] = alpha;
  1374     valpha = vec_splat(valpha, 0);
  1375 
  1376     while(height--) {
  1377         int width = info->d_width;
  1378 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1379             Uint32 s = *srcp; \
  1380             Uint32 d = *dstp; \
  1381             Uint32 s1 = s & 0xff00ff; \
  1382             Uint32 d1 = d & 0xff00ff; \
  1383             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  1384                  & 0xff00ff; \
  1385             s &= 0xff00; \
  1386             d &= 0xff00; \
  1387             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1388             *dstp = d1 | d | 0xff000000; \
  1389             ++srcp; \
  1390             ++dstp; \
  1391             widthvar--; \
  1392         }
  1393         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1394         if (width > 0) {
  1395             int extrawidth = (width % 4);
  1396             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1397             vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
  1398             width -= extrawidth;
  1399             while (width) {
  1400                 vector unsigned char voverflow;
  1401                 vector unsigned char vd;
  1402 
  1403                 /* s = *srcp */
  1404                 voverflow = (vector unsigned char)vec_ld(15, srcp);
  1405                 vs = vec_perm(vs, voverflow, valigner);
  1406                 
  1407                 /* d = *dstp */
  1408                 vd = (vector unsigned char)vec_ld(0, dstp);
  1409 
  1410                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1411 
  1412                 /* set the alpha channel to full on */
  1413                 vd = vec_or(vd, valphamask);
  1414 
  1415                 /* *dstp = res */
  1416                 vec_st((vector unsigned int)vd, 0, dstp);
  1417                 
  1418                 srcp += 4;
  1419                 dstp += 4;
  1420                 width -= 4;
  1421                 vs = voverflow;
  1422             }
  1423             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1424         }
  1425 #undef ONE_PIXEL_BLEND
  1426  
  1427         srcp += srcskip;
  1428         dstp += dstskip;
  1429     }
  1430 }
  1431 #if __MWERKS__
  1432 #pragma altivec_model off
  1433 #endif
  1434 #endif /* SDL_ALTIVEC_BLITTERS */
  1435 
  1436 #if SDL_ARM_SIMD_BLITTERS
  1437 void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
  1438 
  1439 static void BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo *info)
  1440 {
  1441 	int32_t width = info->d_width;
  1442 	int32_t height = info->d_height;
  1443 	uint16_t *dstp = (uint16_t *)info->d_pixels;
  1444 	int32_t dststride = width + (info->d_skip >> 1);
  1445 	uint32_t *srcp = (uint32_t *)info->s_pixels;
  1446 	int32_t srcstride = width + (info->s_skip >> 2);
  1447 
  1448 	BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
  1449 }
  1450 
  1451 void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
  1452 
  1453 static void BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo *info)
  1454 {
  1455 	int32_t width = info->d_width;
  1456 	int32_t height = info->d_height;
  1457 	uint32_t *dstp = (uint32_t *)info->d_pixels;
  1458 	int32_t dststride = width + (info->d_skip >> 2);
  1459 	uint32_t *srcp = (uint32_t *)info->s_pixels;
  1460 	int32_t srcstride = width + (info->s_skip >> 2);
  1461 
  1462 	BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
  1463 }
  1464 #endif
  1465 
  1466 #if SDL_ARM_NEON_BLITTERS
  1467 void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
  1468 
  1469 static void BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo *info)
  1470 {
  1471     int32_t width = info->d_width;
  1472     int32_t height = info->d_height;
  1473     uint16_t *dstp = (uint16_t *)info->d_pixels;
  1474     int32_t dststride = width + (info->d_skip >> 1);
  1475     uint32_t *srcp = (uint32_t *)info->s_pixels;
  1476     int32_t srcstride = width + (info->s_skip >> 2);
  1477 
  1478     BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
  1479 }
  1480 
  1481 void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
  1482 
  1483 static void BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo *info)
  1484 {
  1485 	int32_t width = info->d_width;
  1486 	int32_t height = info->d_height;
  1487 	uint32_t *dstp = (uint32_t *)info->d_pixels;
  1488 	int32_t dststride = width + (info->d_skip >> 2);
  1489 	uint32_t *srcp = (uint32_t *)info->s_pixels;
  1490 	int32_t srcstride = width + (info->s_skip >> 2);
  1491 
  1492 	BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
  1493 }
  1494 #endif
  1495 
  1496 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1497 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
  1498 {
  1499 	int width = info->d_width;
  1500 	int height = info->d_height;
  1501 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1502 	int srcskip = info->s_skip >> 2;
  1503 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1504 	int dstskip = info->d_skip >> 2;
  1505 
  1506 	while(height--) {
  1507 	    DUFFS_LOOP4({
  1508 		    Uint32 s = *srcp++;
  1509 		    Uint32 d = *dstp;
  1510 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1511 			       + (s & d & 0x00010101)) | 0xff000000;
  1512 	    }, width);
  1513 	    srcp += srcskip;
  1514 	    dstp += dstskip;
  1515 	}
  1516 }
  1517 
  1518 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1519 static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
  1520 {
  1521 	unsigned alpha = info->src->alpha;
  1522 	if(alpha == 128) {
  1523 		BlitRGBtoRGBSurfaceAlpha128(info);
  1524 	} else {
  1525 		int width = info->d_width;
  1526 		int height = info->d_height;
  1527 		Uint32 *srcp = (Uint32 *)info->s_pixels;
  1528 		int srcskip = info->s_skip >> 2;
  1529 		Uint32 *dstp = (Uint32 *)info->d_pixels;
  1530 		int dstskip = info->d_skip >> 2;
  1531 		Uint32 s;
  1532 		Uint32 d;
  1533 		Uint32 s1;
  1534 		Uint32 d1;
  1535 
  1536 		while(height--) {
  1537 			DUFFS_LOOP_DOUBLE2({
  1538 				/* One Pixel Blend */
  1539 				s = *srcp;
  1540 				d = *dstp;
  1541 				s1 = s & 0xff00ff;
  1542 				d1 = d & 0xff00ff;
  1543 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
  1544 				     & 0xff00ff;
  1545 				s &= 0xff00;
  1546 				d &= 0xff00;
  1547 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1548 				*dstp = d1 | d | 0xff000000;
  1549 				++srcp;
  1550 				++dstp;
  1551 			},{
  1552 			        /* Two Pixels Blend */
  1553 				s = *srcp;
  1554 				d = *dstp;
  1555 				s1 = s & 0xff00ff;
  1556 				d1 = d & 0xff00ff;
  1557 				d1 += (s1 - d1) * alpha >> 8;
  1558 				d1 &= 0xff00ff;
  1559 				     
  1560 				s = ((s & 0xff00) >> 8) | 
  1561 					((srcp[1] & 0xff00) << 8);
  1562 				d = ((d & 0xff00) >> 8) |
  1563 					((dstp[1] & 0xff00) << 8);
  1564 				d += (s - d) * alpha >> 8;
  1565 				d &= 0x00ff00ff;
  1566 				
  1567 				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
  1568 				++srcp;
  1569 				
  1570 			        s1 = *srcp;
  1571 				d1 = *dstp;
  1572 				s1 &= 0xff00ff;
  1573 				d1 &= 0xff00ff;
  1574 				d1 += (s1 - d1) * alpha >> 8;
  1575 				d1 &= 0xff00ff;
  1576 				
  1577 				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
  1578 				++srcp;
  1579 				++dstp;
  1580 			}, width);
  1581 			srcp += srcskip;
  1582 			dstp += dstskip;
  1583 		}
  1584 	}
  1585 }
  1586 
  1587 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1588 static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
  1589 {
  1590 	int width = info->d_width;
  1591 	int height = info->d_height;
  1592 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1593 	int srcskip = info->s_skip >> 2;
  1594 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1595 	int dstskip = info->d_skip >> 2;
  1596 
  1597 	while(height--) {
  1598 	    DUFFS_LOOP4({
  1599 		Uint32 dalpha;
  1600 		Uint32 d;
  1601 		Uint32 s1;
  1602 		Uint32 d1;
  1603 		Uint32 s = *srcp;
  1604 		Uint32 alpha = s >> 24;
  1605 		/* FIXME: Here we special-case opaque alpha since the
  1606 		   compositioning used (>>8 instead of /255) doesn't handle
  1607 		   it correctly. Also special-case alpha=0 for speed?
  1608 		   Benchmark this! */
  1609 		if(alpha) {   
  1610 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1611 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1612 		  } else {
  1613 		    /*
  1614 		     * take out the middle component (green), and process
  1615 		     * the other two in parallel. One multiply less.
  1616 		     */
  1617 		    d = *dstp;
  1618 		    dalpha = d & 0xff000000;
  1619 		    s1 = s & 0xff00ff;
  1620 		    d1 = d & 0xff00ff;
  1621 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1622 		    s &= 0xff00;
  1623 		    d &= 0xff00;
  1624 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1625 		    *dstp = d1 | d | dalpha;
  1626 		  }
  1627 		}
  1628 		++srcp;
  1629 		++dstp;
  1630 	    }, width);
  1631 	    srcp += srcskip;
  1632 	    dstp += dstskip;
  1633 	}
  1634 }
  1635 
  1636 #if GCC_ASMBLIT
  1637 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1638 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
  1639 {
  1640 	int width = info->d_width;
  1641 	int height = info->d_height;
  1642 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1643 	int srcskip = info->s_skip >> 2;
  1644 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1645 	int dstskip = info->d_skip >> 2;
  1646 	SDL_PixelFormat* sf = info->src;
  1647 	Uint32 amask = sf->Amask;
  1648 
  1649 	__asm__ (
  1650 	/* make mm6 all zeros. */
  1651 	"pxor       %%mm6, %%mm6\n"
  1652 	
  1653 	/* Make a mask to preserve the alpha. */
  1654 	"movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
  1655 	"punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
  1656 	"pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
  1657 	"movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
  1658 	"pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
  1659 
  1660 	/* form channel masks */
  1661 	"movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
  1662 	"packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
  1663 	"packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
  1664 	"pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
  1665 	
  1666 	/* get alpha channel shift */
  1667 	"movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
  1668 
  1669 	  : /* nothing */ : "rm" (amask), "rm" ((Uint32) sf->Ashift) );
  1670 
  1671 	while(height--) {
  1672 
  1673 	    DUFFS_LOOP4({
  1674 		Uint32 alpha;
  1675 
  1676 		__asm__ (
  1677 		"prefetch 64(%0)\n"
  1678 		"prefetch 64(%1)\n"
  1679 			: : "r" (srcp), "r" (dstp) );
  1680 
  1681 		alpha = *srcp & amask;
  1682 		/* FIXME: Here we special-case opaque alpha since the
  1683 		   compositioning used (>>8 instead of /255) doesn't handle
  1684 		   it correctly. Also special-case alpha=0 for speed?
  1685 		   Benchmark this! */
  1686 		if(alpha == 0) {
  1687 		    /* do nothing */
  1688 		}
  1689 		else if(alpha == amask) {
  1690 			/* opaque alpha -- copy RGB, keep dst alpha */
  1691 		    /* using MMX here to free up regular registers for other things */
  1692 			    __asm__ (
  1693 		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
  1694 		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
  1695 		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
  1696 		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
  1697 		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
  1698 		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
  1699 
  1700 		     : : "r" (srcp), "r" (dstp) );
  1701 		} 
  1702 
  1703 		else {
  1704 			    __asm__ (
  1705 		    /* load in the source, and dst. */
  1706 		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
  1707 		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
  1708 
  1709 		    /* Move the src alpha into mm2 */
  1710 
  1711 		    /* if supporting pshufw */
  1712 		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
  1713 		    /*"psrlw     $8, %%mm2\n" */
  1714 		    
  1715 		    /* else: */
  1716 		    "movd       %2,    %%mm2\n"
  1717 		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
  1718 		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
  1719 		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
  1720 		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
  1721 
  1722 		    /* move the colors into words. */
  1723 		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
  1724 		    "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
  1725 
  1726 		    /* src - dst */
  1727 		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
  1728 
  1729 		    /* A * (src-dst) */
  1730 		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
  1731 		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
  1732 		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
  1733 
  1734 		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
  1735 		    
  1736 		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
  1737 
  1738 		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
  1739 
  1740 		}
  1741 		++srcp;
  1742 		++dstp;
  1743 	    }, width);
  1744 	    srcp += srcskip;
  1745 	    dstp += dstskip;
  1746 	}
  1747 
  1748 	__asm__ (
  1749 	"emms\n"
  1750 		:   );
  1751 }
  1752 /* End GCC_ASMBLIT*/
  1753 
  1754 #elif MSVC_ASMBLIT
  1755 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1756 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
  1757 {
  1758 	int width = info->d_width;
  1759 	int height = info->d_height;
  1760 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  1761 	int srcskip = info->s_skip >> 2;
  1762 	Uint32 *dstp = (Uint32 *)info->d_pixels;
  1763 	int dstskip = info->d_skip >> 2;
  1764 	SDL_PixelFormat* sf = info->src;
  1765 	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  1766 	Uint32 amask = sf->Amask;
  1767 	Uint32 ashift = sf->Ashift;
  1768 	Uint64 multmask;
  1769 	
  1770 	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1771 
  1772 	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
  1773 	multmask = ~(0xFFFFi64 << (ashift * 2));
  1774 	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
  1775 
  1776 	while(height--) {
  1777 	    DUFFS_LOOP4({
  1778 		Uint32 alpha;
  1779 
  1780 		_m_prefetch(srcp + 16);
  1781 		_m_prefetch(dstp + 16);
  1782 
  1783 		alpha = *srcp & amask;
  1784 		if (alpha == 0) {
  1785 			/* do nothing */
  1786 		} else if (alpha == amask) {
  1787 			/* copy RGB, keep dst alpha */
  1788 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  1789 		} else {
  1790 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  1791 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  1792 
  1793 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  1794 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  1795 
  1796 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  1797 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  1798 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  1799 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  1800 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  1801 
  1802 			/* blend */		    
  1803 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  1804 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  1805 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  1806 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  1807 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  1808 			
  1809 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  1810 		}
  1811 		++srcp;
  1812 		++dstp;
  1813 	    }, width);
  1814 	    srcp += srcskip;
  1815 	    dstp += dstskip;
  1816 	}
  1817 	_mm_empty();
  1818 }
  1819 /* End MSVC_ASMBLIT */
  1820 
  1821 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  1822 
  /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */

  /*
   * blend a single 16 bit pixel at 50%
   *
   * `mask` selects the bits that take part in the halved sum (every channel
   * bit except the lowest one of each channel); the bits outside the mask are
   * added back when they are set in both pixels.  All arguments are
   * parenthesised so that expressions can be passed safely.
   */
  #define BLEND16_50(d, s, mask)						\
  	(((((s) & (mask)) + ((d) & (mask))) >> 1)			\
  	 + ((s) & (d) & (~(mask) & 0xffff)))

  /*
   * 16-bit channel mask replicated into both halves of a 32-bit word.
   * The arithmetic is done on unsigned values: shifting a promoted (signed)
   * 16-bit mask such as 0xf7de left by 16 would overflow int.
   */
  #define BLEND2x16_MASK(mask)						\
  	(((mask) & 0xffffU) | (((mask) & 0xffffU) << 16))

  /* blend two 16 bit pixels at 50% */
  #define BLEND2x16_50(d, s, mask)					\
  	((((s) & BLEND2x16_MASK(mask)) >> 1)				\
  	 + (((d) & BLEND2x16_MASK(mask)) >> 1)				\
  	 + ((s) & (d) & ~BLEND2x16_MASK(mask)))
  1833 
  /*
   * Blend a 16bpp source onto a 16bpp destination at 50%.
   *
   * `mask` is handed to BLEND16_50 / BLEND2x16_50; the callers in this file
   * pass 0xf7de and 0xfbde.
   *
   * Pixels are processed two at a time through 32-bit loads and stores
   * whenever possible.  Each row takes one of two paths depending on whether
   * the source and destination pointers agree in bit 1 of their address:
   *   - different: the destination is brought to a 32-bit boundary and the
   *     source words are re-assembled from two consecutive aligned loads;
   *   - equal: both pointers reach a 32-bit boundary together and are read
   *     directly.
   */
  1834 static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
  1835 {
  1836 	int width = info->d_width;
  1837 	int height = info->d_height;
  1838 	Uint16 *srcp = (Uint16 *)info->s_pixels;
  1839 	int srcskip = info->s_skip >> 1;
  1840 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  1841 	int dstskip = info->d_skip >> 1;
  1842 
  1843 	while(height--) {
  1844 		if(((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
  1845 			/*
  1846 			 * Source and destination not aligned, pipeline it.
  1847 			 * This is mostly a win for big blits but no loss for
  1848 			 * small ones
  1849 			 */
  1850 			Uint32 prev_sw;
  1851 			int w = width;
  1852 
  1853 			/* handle odd destination */
  1854 			if((uintptr_t)dstp & 2) {
  1855 				Uint16 d = *dstp, s = *srcp;
  1856 				*dstp = BLEND16_50(d, s, mask);
  1857 				dstp++;
  1858 				srcp++;
  1859 				w--;
  1860 			}
  1861 			srcp++;	/* srcp is now 32-bit aligned */
  1862 
  1863 			/* bootstrap pipeline with first halfword */
       			/* NOTE(review): this loads the aligned word that ends at the
       			   current source pixel, so it also touches the halfword in
       			   front of it */
  1864 			prev_sw = ((Uint32 *)srcp)[-1];
  1865 
  1866 			while(w > 1) {
  1867 				Uint32 sw, dw, s;
  1868 				sw = *(Uint32 *)srcp;
  1869 				dw = *(Uint32 *)dstp;
       				/* join the pending halfword of the previous word
       				   with the leading halfword of the new one */
  1870 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1871 				s = (prev_sw << 16) + (sw >> 16);
  1872 #else
  1873 				s = (prev_sw >> 16) + (sw << 16);
  1874 #endif
  1875 				prev_sw = sw;
  1876 				*(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
  1877 				dstp += 2;
  1878 				srcp += 2;
  1879 				w -= 2;
  1880 			}
  1881 
  1882 			/* final pixel if any */
  1883 			if(w) {
  1884 				Uint16 d = *dstp, s;
       				/* the last source pixel is already held in prev_sw */
  1885 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1886 				s = (Uint16)prev_sw;
  1887 #else
  1888 				s = (Uint16)(prev_sw >> 16);
  1889 #endif
  1890 				*dstp = BLEND16_50(d, s, mask);
  1891 				srcp++;
  1892 				dstp++;
  1893 			}
       			/* the -1 takes back the extra srcp++ made for alignment above */
  1894 			srcp += srcskip - 1;
  1895 			dstp += dstskip;
  1896 		} else {
  1897 			/* source and destination are aligned */
  1898 			int w = width;
  1899 
  1900 			/* first odd pixel? */
  1901 			if((uintptr_t)srcp & 2) {
  1902 				Uint16 d = *dstp, s = *srcp;
  1903 				*dstp = BLEND16_50(d, s, mask);
  1904 				srcp++;
  1905 				dstp++;
  1906 				w--;
  1907 			}
  1908 			/* srcp and dstp are now 32-bit aligned */
  1909 
  1910 			while(w > 1) {
  1911 				Uint32 sw = *(Uint32 *)srcp;
  1912 				Uint32 dw = *(Uint32 *)dstp;
  1913 				*(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
  1914 				srcp += 2;
  1915 				dstp += 2;
  1916 				w -= 2;
  1917 			}
  1918 
  1919 			/* last odd pixel? */
  1920 			if(w) {
  1921 				Uint16 d = *dstp, s = *srcp;
  1922 				*dstp = BLEND16_50(d, s, mask);
  1923 				srcp++;
  1924 				dstp++;
  1925 			}
  1926 			srcp += srcskip;
  1927 			dstp += dstskip;
  1928 		}
  1929 	}
  1930 }
  1931 
  1932 #if GCC_ASMBLIT
  1933 /* fast RGB565->RGB565 blending with surface alpha
        *
        * GCC variant built on the movq_m2r/psrlw_i2r/... MMX helper macros.
        * alpha == 128 is delegated to Blit16to16SurfaceAlpha128() with mask
        * 0xf7de.  For any other value the low three bits of alpha are cleared;
        * the scalar bodies then work with the 5-bit value (alpha >> 3), while
        * the four-pixel MMX body uses the cleared value shifted left by three
        * bits, replicated into every 16-bit lane of mm0.
        *
        * The three bodies handed to DUFFS_LOOP_QUATRO2 blend one, two and four
        * pixels respectively.  Register use in the MMX body:
        *   mm0 alpha, mm4 green mask, mm7 blue mask (all set up once),
        *   mm2 source pixels, mm3 destination pixels, mm5/mm6 scratch,
        *   mm1 accumulated result.
        */
  1934 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
  1935 {
  1936 	unsigned alpha = info->src->alpha; /* full 8-bit surface alpha; reduced to 5 bits further down */
  1937 	if(alpha == 128) {
  1938 		Blit16to16SurfaceAlpha128(info, 0xf7de);
  1939 	} else {
  1940 		int width = info->d_width;
  1941 		int height = info->d_height;
  1942 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  1943 		int srcskip = info->s_skip >> 1;
  1944 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  1945 		int dstskip = info->d_skip >> 1;
  1946 		Uint32 s, d;
  1947 		Uint64 load;
  1948 	  
  1949 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
  1950 		load = alpha;
  1951 		alpha >>= 3;		/* downscale alpha to 5 bits */
  1952 
  1953 		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
  1954 		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
  1955 		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
  1956 		/* position alpha to allow for mullo and mulhi on diff channels
  1957 		   to reduce the number of operations */
  1958 		psllq_i2r(3, mm0);
  1959 	  
  1960 		/* Setup the 565 color channel masks */
  1961 		load = 0x07E007E007E007E0ULL;
  1962 		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
  1963 		load = 0x001F001F001F001FULL;
  1964 		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
  1965 		while(height--) {
  1966 			DUFFS_LOOP_QUATRO2(
  1967 			{
       				/* one pixel, scalar */
  1968 				s = *srcp++;
  1969 				d = *dstp;
  1970 				/*
  1971 				 * shift out the middle component (green) to
  1972 				 * the high 16 bits, and process all three RGB
  1973 				 * components at the same time.
  1974 				 */
  1975 				s = (s | s << 16) & 0x07e0f81f;
  1976 				d = (d | d << 16) & 0x07e0f81f;
  1977 				d += (s - d) * alpha >> 5;
  1978 				d &= 0x07e0f81f;
  1979 				*dstp++ = d | d >> 16;
  1980 			},{
       				/* two pixels, scalar */
  1981 				s = *srcp++;
  1982 				d = *dstp;
  1983 				/*
  1984 				 * shift out the middle component (green) to
  1985 				 * the high 16 bits, and process all three RGB
  1986 				 * components at the same time.
  1987 				 */
  1988 				s = (s | s << 16) & 0x07e0f81f;
  1989 				d = (d | d << 16) & 0x07e0f81f;
  1990 				d += (s - d) * alpha >> 5;
  1991 				d &= 0x07e0f81f;
  1992 				*dstp++ = d | d >> 16;
  1993 				s = *srcp++;
  1994 				d = *dstp;
  1995 				/*
  1996 				 * shift out the middle component (green) to
  1997 				 * the high 16 bits, and process all three RGB
  1998 				 * components at the same time.
  1999 				 */
  2000 				s = (s | s << 16) & 0x07e0f81f;
  2001 				d = (d | d << 16) & 0x07e0f81f;
  2002 				d += (s - d) * alpha >> 5;
  2003 				d &= 0x07e0f81f;
  2004 				*dstp++ = d | d >> 16;
  2005 			},{
       				/* four pixels, MMX */
  2006 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2007 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2008 
  2009 				/* red -- does not need a mask since the right shift clears
  2010 				   the uninteresting bits */
  2011 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2012 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2013 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
  2014 				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
  2015 
  2016 				/* blend */
  2017 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2018 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2019 				/* alpha used is actually 11 bits
  2020 				   11 + 5 = 16 bits, so the sign bits are lost */
  2021 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2022 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2023 				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
  2024 
  2025 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2026 
  2027 				/* green -- process the bits in place */
  2028 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2029 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2030 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2031 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2032 
  2033 				/* blend */
  2034 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2035 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2036 				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
  2037 				   bits are gone and the sign bits present */
  2038 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2039 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2040 
  2041 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2042 
  2043 				/* blue */
  2044 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2045 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2046 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2047 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2048 
  2049 				/* blend */
  2050 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2051 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2052 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2053 				   the interesting bits will need to be MASKed */
  2054 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2055 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2056 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2057 
  2058 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2059 
  2060 				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
  2061 
  2062 				srcp += 4;
  2063 				dstp += 4;
  2064 			}, width);			
  2065 			srcp += srcskip;
  2066 			dstp += dstskip;
  2067 		}
  2068 		emms();
  2069 	}
  2070 }
  2071 
  2072 /* fast RGB555->RGB555 blending with surface alpha
        *
        * GCC variant built on the movq_m2r/psrlw_i2r/... MMX helper macros.
        * alpha == 128 is delegated to Blit16to16SurfaceAlpha128() with mask
        * 0xfbde.  For any other value the low three bits of alpha are cleared;
        * the scalar bodies then work with the 5-bit value (alpha >> 3), while
        * the four-pixel MMX body uses the cleared value shifted left by three
        * bits, replicated into every 16-bit lane of mm0.
        *
        * The three bodies handed to DUFFS_LOOP_QUATRO2 blend one, two and four
        * pixels respectively.  In the MMX body mm4 (green mask) is temporarily
        * shifted into a red mask and shifted back before the green step, so
        * the order of those statements must not be changed.
        */
  2073 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
  2074 {
  2075 	unsigned alpha = info->src->alpha; /* full 8-bit surface alpha; reduced to 5 bits further down */
  2076 	if(alpha == 128) {
  2077 		Blit16to16SurfaceAlpha128(info, 0xfbde);
  2078 	} else {
  2079 		int width = info->d_width;
  2080 		int height = info->d_height;
  2081 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2082 		int srcskip = info->s_skip >> 1;
  2083 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2084 		int dstskip = info->d_skip >> 1;
  2085 		Uint32 s, d;
  2086 		Uint64 load;
  2087 	  
  2088 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
  2089 		load = alpha;
  2090 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2091 
  2092 		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
  2093 		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
  2094 		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
  2095 		/* position alpha to allow for mullo and mulhi on diff channels
  2096 		   to reduce the number of operations */
  2097 		psllq_i2r(3, mm0);
  2098 
  2099 		/* Setup the 555 color channel masks */
  2100 		load = 0x03E003E003E003E0ULL;
  2101 		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
  2102 		load = 0x001F001F001F001FULL;
  2103 		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
  2104 		while(height--) {
  2105 			DUFFS_LOOP_QUATRO2(
  2106 			{
       				/* one pixel, scalar */
  2107 				s = *srcp++;
  2108 				d = *dstp;
  2109 				/*
  2110 				 * shift out the middle component (green) to
  2111 				 * the high 16 bits, and process all three RGB
  2112 				 * components at the same time.
  2113 				 */
  2114 				s = (s | s << 16) & 0x03e07c1f;
  2115 				d = (d | d << 16) & 0x03e07c1f;
  2116 				d += (s - d) * alpha >> 5;
  2117 				d &= 0x03e07c1f;
  2118 				*dstp++ = d | d >> 16;
  2119 			},{
       				/* two pixels, scalar */
  2120 				s = *srcp++;
  2121 				d = *dstp;
  2122 				/*
  2123 				 * shift out the middle component (green) to
  2124 				 * the high 16 bits, and process all three RGB
  2125 				 * components at the same time.
  2126 				 */
  2127 				s = (s | s << 16) & 0x03e07c1f;
  2128 				d = (d | d << 16) & 0x03e07c1f;
  2129 				d += (s - d) * alpha >> 5;
  2130 				d &= 0x03e07c1f;
  2131 				*dstp++ = d | d >> 16;
  2132 			        s = *srcp++;
  2133 				d = *dstp;
  2134 				/*
  2135 				 * shift out the middle component (green) to
  2136 				 * the high 16 bits, and process all three RGB
  2137 				 * components at the same time.
  2138 				 */
  2139 				s = (s | s << 16) & 0x03e07c1f;
  2140 				d = (d | d << 16) & 0x03e07c1f;
  2141 				d += (s - d) * alpha >> 5;
  2142 				d &= 0x03e07c1f;
  2143 				*dstp++ = d | d >> 16;
  2144 			},{
       				/* four pixels, MMX */
  2145 				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
  2146 				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
  2147 
  2148 				/* red -- process the bits in place */
  2149 				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
  2150 					/* by reusing the GREEN mask we free up another mmx
  2151 					   register to accumulate the result */
  2152 
  2153 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2154 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2155 				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
  2156 				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
  2157 
  2158 				/* blend */
  2159 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2160 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2161 				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
  2162 				   cleared by a MASK below */
  2163 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2164 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2165 				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
  2166 
  2167 				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
  2168 
  2169 				movq_r2r(mm6, mm1); /* save new reds in dsts */
  2170 
  2171 				/* green -- process the bits in place */
  2172 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2173 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2174 				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
  2175 				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
  2176 
  2177 				/* blend */
  2178 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2179 				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2180 				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
  2181 				   bits are gone and the sign bits present */
  2182 				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
  2183 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2184 
  2185 				por_r2r(mm6, mm1); /* save new greens in dsts */
  2186 
  2187 				/* blue */
  2188 				movq_r2r(mm2, mm5); /* src -> mm5 */
  2189 				movq_r2r(mm3, mm6); /* dst -> mm6 */
  2190 				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
  2191 				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2192 
  2193 				/* blend */
  2194 				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
  2195 				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
  2196 				/* 11 + 5 = 16 bits, so the sign bits are lost and
  2197 				   the interesting bits will need to be MASKed */
  2198 				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
  2199 				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
  2200 				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
  2201 
  2202 				por_r2r(mm6, mm1); /* save new blues in dsts */
  2203 
  2204 				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
  2205 
  2206 				srcp += 4;
  2207 				dstp += 4;
  2208 			}, width);			
  2209 			srcp += srcskip;
  2210 			dstp += dstskip;
  2211 		}
  2212 		emms();
  2213 	}
  2214 }
  2215 /* End GCC_ASMBLIT */
  2216 
  2217 #elif MSVC_ASMBLIT
  2218 /* fast RGB565->RGB565 blending with surface alpha
        *
        * MSVC variant written with __m64 intrinsics.
        * alpha == 128 is delegated to Blit16to16SurfaceAlpha128() with mask
        * 0xf7de.  For any other value the low three bits of alpha are cleared;
        * the scalar bodies then work with the 5-bit value (alpha >> 3), while
        * the four-pixel body uses mm_alpha: the cleared value shifted left by
        * three bits, replicated into every 16-bit lane.
        *
        * The three bodies handed to DUFFS_LOOP_QUATRO2 blend one, two and four
        * pixels respectively.  In the four-pixel body red is shifted down to
        * the low bits, blended and shifted back; green is blended in place
        * with the high half of the product; blue is blended in the low bits
        * and masked afterwards.
        */
  2219 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
  2220 {
  2221 	unsigned alpha = info->src->alpha;
  2222 	if(alpha == 128) {
  2223 		Blit16to16SurfaceAlpha128(info, 0xf7de);
  2224 	} else {
  2225 		int width = info->d_width;
  2226 		int height = info->d_height;
  2227 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2228 		int srcskip = info->s_skip >> 1;
  2229 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2230 		int dstskip = info->d_skip >> 1;
  2231 		Uint32 s, d;
  2232 	  
  2233 		__m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  2234 
  2235 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
  2236 		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
  2237 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2238 
  2239 		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  2240 		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  2241 		/* position alpha to allow for mullo and mulhi on diff channels
  2242 		   to reduce the number of operations */
  2243 		mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2244 	  
  2245 		/* Setup the 565 color channel masks */
  2246 		gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
  2247 		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
  2248 		
  2249 		while(height--) {
  2250 			DUFFS_LOOP_QUATRO2(
  2251 			{
       				/* one pixel, scalar */
  2252 				s = *srcp++;
  2253 				d = *dstp;
  2254 				/*
  2255 				 * shift out the middle component (green) to
  2256 				 * the high 16 bits, and process all three RGB
  2257 				 * components at the same time.
  2258 				 */
  2259 				s = (s | s << 16) & 0x07e0f81f;
  2260 				d = (d | d << 16) & 0x07e0f81f;
  2261 				d += (s - d) * alpha >> 5;
  2262 				d &= 0x07e0f81f;
  2263 				*dstp++ = (Uint16)(d | d >> 16);
  2264 			},{
       				/* two pixels, scalar */
  2265 				s = *srcp++;
  2266 				d = *dstp;
  2267 				/*
  2268 				 * shift out the middle component (green) to
  2269 				 * the high 16 bits, and process all three RGB
  2270 				 * components at the same time.
  2271 				 */
  2272 				s = (s | s << 16) & 0x07e0f81f;
  2273 				d = (d | d << 16) & 0x07e0f81f;
  2274 				d += (s - d) * alpha >> 5;
  2275 				d &= 0x07e0f81f;
  2276 				*dstp++ = (Uint16)(d | d >> 16);
  2277 				s = *srcp++;
  2278 				d = *dstp;
  2279 				/*
  2280 				 * shift out the middle component (green) to
  2281 				 * the high 16 bits, and process all three RGB
  2282 				 * components at the same time.
  2283 				 */
  2284 				s = (s | s << 16) & 0x07e0f81f;
  2285 				d = (d | d << 16) & 0x07e0f81f;
  2286 				d += (s - d) * alpha >> 5;
  2287 				d &= 0x07e0f81f;
  2288 				*dstp++ = (Uint16)(d | d >> 16);
  2289 			},{
       				/* four pixels, MMX */
  2290 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2291 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2292 
  2293 				/* red */
  2294 				src2 = src1;
  2295 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
  2296 
  2297 				dst2 = dst1;
  2298 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
  2299 
  2300 				/* blend */
  2301 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2302 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2303 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2304 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2305 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
  2306 
  2307 				mm_res = dst2; /* RED -> mm_res */
  2308 
  2309 				/* green -- process the bits in place */
  2310 				src2 = src1;
  2311 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2312 
  2313 				dst2 = dst1;
  2314 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2315 
  2316 				/* blend */
  2317 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2318 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2319 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2320 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2321 
  2322 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2323 
  2324 				/* blue */
  2325 				src2 = src1;
  2326 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2327 
  2328 				dst2 = dst1;
  2329 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2330 
  2331 				/* blend */
  2332 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2333 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2334 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2335 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2336 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2337 
  2338 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2339 
  2340 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2341 
  2342 				srcp += 4;
  2343 				dstp += 4;
  2344 			}, width);			
  2345 			srcp += srcskip;
  2346 			dstp += dstskip;
  2347 		}
  2348 		_mm_empty();
  2349 	}
  2350 }
  2351 
  2352 /* fast RGB555->RGB555 blending with surface alpha */
  2353 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
  2354 {
  2355 	unsigned alpha = info->src->alpha;
  2356 	if(alpha == 128) {
  2357 		Blit16to16SurfaceAlpha128(info, 0xfbde);
  2358 	} else {
  2359 		int width = info->d_width;
  2360 		int height = info->d_height;
  2361 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2362 		int srcskip = info->s_skip >> 1;
  2363 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2364 		int dstskip = info->d_skip >> 1;
  2365 		Uint32 s, d;
  2366 	  
  2367 		__m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  2368 
  2369 		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
  2370 		mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
  2371 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2372 
  2373 		mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  2374 		mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  2375 		/* position alpha to allow for mullo and mulhi on diff channels
  2376 		   to reduce the number of operations */
  2377 		mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2378 	  
  2379 		/* Setup the 555 color channel masks */
  2380 		rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
  2381 		gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
  2382 		bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
  2383 
  2384 		while(height--) {
  2385 			DUFFS_LOOP_QUATRO2(
  2386 			{
  2387 				s = *srcp++;
  2388 				d = *dstp;
  2389 				/*
  2390 				 * shift out the middle component (green) to
  2391 				 * the high 16 bits, and process all three RGB
  2392 				 * components at the same time.
  2393 				 */
  2394 				s = (s | s << 16) & 0x03e07c1f;
  2395 				d = (d | d << 16) & 0x03e07c1f;
  2396 				d += (s - d) * alpha >> 5;
  2397 				d &= 0x03e07c1f;
  2398 				*dstp++ = (Uint16)(d | d >> 16);
  2399 			},{
  2400 				s = *srcp++;
  2401 				d = *dstp;
  2402 				/*
  2403 				 * shift out the middle component (green) to
  2404 				 * the high 16 bits, and process all three RGB
  2405 				 * components at the same time.
  2406 				 */
  2407 				s = (s | s << 16) & 0x03e07c1f;
  2408 				d = (d | d << 16) & 0x03e07c1f;
  2409 				d += (s - d) * alpha >> 5;
  2410 				d &= 0x03e07c1f;
  2411 				*dstp++ = (Uint16)(d | d >> 16);
  2412 			        s = *srcp++;
  2413 				d = *dstp;
  2414 				/*
  2415 				 * shift out the middle component (green) to
  2416 				 * the high 16 bits, and process all three RGB
  2417 				 * components at the same time.
  2418 				 */
  2419 				s = (s | s << 16) & 0x03e07c1f;
  2420 				d = (d | d << 16) & 0x03e07c1f;
  2421 				d += (s - d) * alpha >> 5;
  2422 				d &= 0x03e07c1f;
  2423 				*dstp++ = (Uint16)(d | d >> 16);
  2424 			},{
  2425 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  2426 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  2427 
  2428 				/* red -- process the bits in place */
  2429 				src2 = src1;
  2430 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
  2431 
  2432 				dst2 = dst1;
  2433 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
  2434 
  2435 				/* blend */
  2436 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2437 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2438 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2439 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2440 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
  2441 
  2442 				mm_res = dst2; /* RED -> mm_res */
  2443 				
  2444 				/* green -- process the bits in place */
  2445 				src2 = src1;
  2446 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  2447 
  2448 				dst2 = dst1;
  2449 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  2450 
  2451 				/* blend */
  2452 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2453 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2454 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  2455 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2456 
  2457 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  2458 
  2459 				/* blue */
  2460 				src2 = src1; /* src -> src2 */
  2461 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  2462 
  2463 				dst2 = dst1; /* dst -> dst2 */
  2464 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  2465 
  2466 				/* blend */
  2467 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  2468 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  2469 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  2470 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  2471 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  2472 
  2473 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  2474 
  2475 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  2476 
  2477 				srcp += 4;
  2478 				dstp += 4;
  2479 			}, width);			
  2480 			srcp += srcskip;
  2481 			dstp += dstskip;
  2482 		}
  2483 		_mm_empty();
  2484 	}
  2485 }
  2486 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  2487 
  2488 /* fast RGB565->RGB565 blending with surface alpha */
  2489 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
  2490 {
  2491 	unsigned alpha = info->src->alpha;
  2492 	if(alpha == 128) {
  2493 		Blit16to16SurfaceAlpha128(info, 0xf7de);
  2494 	} else {
  2495 		int width = info->d_width;
  2496 		int height = info->d_height;
  2497 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2498 		int srcskip = info->s_skip >> 1;
  2499 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2500 		int dstskip = info->d_skip >> 1;
  2501 		alpha >>= 3;	/* downscale alpha to 5 bits */
  2502 
  2503 		while(height--) {
  2504 			DUFFS_LOOP4({
  2505 				Uint32 s = *srcp++;
  2506 				Uint32 d = *dstp;
  2507 				/*
  2508 				 * shift out the middle component (green) to
  2509 				 * the high 16 bits, and process all three RGB
  2510 				 * components at the same time.
  2511 				 */
  2512 				s = (s | s << 16) & 0x07e0f81f;
  2513 				d = (d | d << 16) & 0x07e0f81f;
  2514 				d += (s - d) * alpha >> 5;
  2515 				d &= 0x07e0f81f;
  2516 				*dstp++ = (Uint16)(d | d >> 16);
  2517 			}, width);
  2518 			srcp += srcskip;
  2519 			dstp += dstskip;
  2520 		}
  2521 	}
  2522 }
  2523 
  2524 /* fast RGB555->RGB555 blending with surface alpha */
  2525 static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
  2526 {
  2527 	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
  2528 	if(alpha == 128) {
  2529 		Blit16to16SurfaceAlpha128(info, 0xfbde);
  2530 	} else {
  2531 		int width = info->d_width;
  2532 		int height = info->d_height;
  2533 		Uint16 *srcp = (Uint16 *)info->s_pixels;
  2534 		int srcskip = info->s_skip >> 1;
  2535 		Uint16 *dstp = (Uint16 *)info->d_pixels;
  2536 		int dstskip = info->d_skip >> 1;
  2537 		alpha >>= 3;		/* downscale alpha to 5 bits */
  2538 
  2539 		while(height--) {
  2540 			DUFFS_LOOP4({
  2541 				Uint32 s = *srcp++;
  2542 				Uint32 d = *dstp;
  2543 				/*
  2544 				 * shift out the middle component (green) to
  2545 				 * the high 16 bits, and process all three RGB
  2546 				 * components at the same time.
  2547 				 */
  2548 				s = (s | s << 16) & 0x03e07c1f;
  2549 				d = (d | d << 16) & 0x03e07c1f;
  2550 				d += (s - d) * alpha >> 5;
  2551 				d &= 0x03e07c1f;
  2552 				*dstp++ = (Uint16)(d | d >> 16);
  2553 			}, width);
  2554 			srcp += srcskip;
  2555 			dstp += dstskip;
  2556 		}
  2557 	}
  2558 }
  2559 
  2560 /* fast ARGB8888->RGB565 blending with pixel alpha */
  2561 static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
  2562 {
  2563 	int width = info->d_width;
  2564 	int height = info->d_height;
  2565 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  2566 	int srcskip = info->s_skip >> 2;
  2567 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  2568 	int dstskip = info->d_skip >> 1;
  2569 
  2570 	while(height--) {
  2571 	    DUFFS_LOOP4({
  2572 		Uint32 s = *srcp;
  2573 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  2574 		/* FIXME: Here we special-case opaque alpha since the
  2575 		   compositioning used (>>8 instead of /255) doesn't handle
  2576 		   it correctly. Also special-case alpha=0 for speed?
  2577 		   Benchmark this! */
  2578 		if(alpha) {   
  2579 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2580 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  2581 		  } else {
  2582 		    Uint32 d = *dstp;
  2583 		    /*
  2584 		     * convert source and destination to G0RAB65565
  2585 		     * and blend all components at the same time
  2586 		     */
  2587 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  2588 		      + (s >> 3 & 0x1f);
  2589 		    d = (d | d << 16) & 0x07e0f81f;
  2590 		    d += (s - d) * alpha >> 5;
  2591 		    d &= 0x07e0f81f;
  2592 		    *dstp = (Uint16)(d | d >> 16);
  2593 		  }
  2594 		}
  2595 		srcp++;
  2596 		dstp++;
  2597 	    }, width);
  2598 	    srcp += srcskip;
  2599 	    dstp += dstskip;
  2600 	}
  2601 }
  2602 
  2603 /* fast ARGB8888->RGB555 blending with pixel alpha */
  2604 static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
  2605 {
  2606 	int width = info->d_width;
  2607 	int height = info->d_height;
  2608 	Uint32 *srcp = (Uint32 *)info->s_pixels;
  2609 	int srcskip = info->s_skip >> 2;
  2610 	Uint16 *dstp = (Uint16 *)info->d_pixels;
  2611 	int dstskip = info->d_skip >> 1;
  2612 
  2613 	while(height--) {
  2614 	    DUFFS_LOOP4({
  2615 		unsigned alpha;
  2616 		Uint32 s = *srcp;
  2617 		alpha = s >> 27; /* downscale alpha to 5 bits */
  2618 		/* FIXME: Here we special-case opaque alpha since the
  2619 		   compositioning used (>>8 instead of /255) doesn't handle
  2620 		   it correctly. Also special-case alpha=0 for speed?
  2621 		   Benchmark this! */
  2622 		if(alpha) {   
  2623 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  2624 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  2625 		  } else {
  2626 		    Uint32 d = *dstp;
  2627 		    /*
  2628 		     * convert source and destination to G0RAB65565
  2629 		     * and blend all components at the same time
  2630 		     */
  2631 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  2632 		      + (s >> 3 & 0x1f);
  2633 		    d = (d | d << 16) & 0x03e07c1f;
  2634 		    d += (s - d) * alpha >> 5;
  2635 		    d &= 0x03e07c1f;
  2636 		    *dstp = (Uint16)(d | d >> 16);
  2637 		  }
  2638 		}
  2639 		srcp++;
  2640 		dstp++;
  2641 	    }, width);
  2642 	    srcp += srcskip;
  2643 	    dstp += dstskip;
  2644 	}
  2645 }
  2646 
  2647 /* General (slow) N->N blending with per-surface alpha */
  2648 static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
  2649 {
  2650 	int width = info->d_width;
  2651 	int height = info->d_height;
  2652 	Uint8 *src = info->s_pixels;
  2653 	int srcskip = info->s_skip;
  2654 	Uint8 *dst = info->d_pixels;
  2655 	int dstskip = info->d_skip;
  2656 	SDL_PixelFormat *srcfmt = info->src;
  2657 	SDL_PixelFormat *dstfmt = info->dst;
  2658 	int srcbpp = srcfmt->BytesPerPixel;
  2659 	int dstbpp = dstfmt->BytesPerPixel;
  2660 	unsigned sA = srcfmt->alpha;
  2661 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2662 
  2663 	if(sA) {
  2664 	  while ( height-- ) {
  2665 	    DUFFS_LOOP4(
  2666 	    {
  2667 		Uint32 Pixel;
  2668 		unsigned sR;
  2669 		unsigned sG;
  2670 		unsigned sB;
  2671 		unsigned dR;
  2672 		unsigned dG;
  2673 		unsigned dB;
  2674 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2675 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2676 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2677 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2678 		src += srcbpp;
  2679 		dst += dstbpp;
  2680 	    },
  2681 	    width);
  2682 	    src += srcskip;
  2683 	    dst += dstskip;
  2684 	  }
  2685 	}
  2686 }
  2687 
  2688 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2689 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
  2690 {
  2691 	int width = info->d_width;
  2692 	int height = info->d_height;
  2693 	Uint8 *src = info->s_pixels;
  2694 	int srcskip = info->s_skip;
  2695 	Uint8 *dst = info->d_pixels;
  2696 	int dstskip = info->d_skip;
  2697 	SDL_PixelFormat *srcfmt = info->src;
  2698 	SDL_PixelFormat *dstfmt = info->dst;
  2699 	Uint32 ckey = srcfmt->colorkey;
  2700 	int srcbpp = srcfmt->BytesPerPixel;
  2701 	int dstbpp = dstfmt->BytesPerPixel;
  2702 	unsigned sA = srcfmt->alpha;
  2703 	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2704 
  2705 	while ( height-- ) {
  2706 	    DUFFS_LOOP4(
  2707 	    {
  2708 		Uint32 Pixel;
  2709 		unsigned sR;
  2710 		unsigned sG;
  2711 		unsigned sB;
  2712 		unsigned dR;
  2713 		unsigned dG;
  2714 		unsigned dB;
  2715 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2716 		if(sA && Pixel != ckey) {
  2717 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2718 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2719 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2720 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2721 		}
  2722 		src += srcbpp;
  2723 		dst += dstbpp;
  2724 	    },
  2725 	    width);
  2726 	    src += srcskip;
  2727 	    dst += dstskip;
  2728 	}
  2729 }
  2730 
  2731 /* General (slow) N->N blending with pixel alpha */
  2732 static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
  2733 {
  2734 	int width = info->d_width;
  2735 	int height = info->d_height;
  2736 	Uint8 *src = info->s_pixels;
  2737 	int srcskip = info->s_skip;
  2738 	Uint8 *dst = info->d_pixels;
  2739 	int dstskip = info->d_skip;
  2740 	SDL_PixelFormat *srcfmt = info->src;
  2741 	SDL_PixelFormat *dstfmt = info->dst;
  2742 
  2743 	int  srcbpp;
  2744 	int  dstbpp;
  2745 
  2746 	/* Set up some basic variables */
  2747 	srcbpp = srcfmt->BytesPerPixel;
  2748 	dstbpp = dstfmt->BytesPerPixel;
  2749 
  2750 	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
  2751 	   quite right. for <8bpp source alpha, it gets them very wrong
  2752 	   (check all macros!)
  2753 	   It is unclear whether there is a good general solution that doesn't
  2754 	   need a branch (or a divide). */
  2755 	while ( height-- ) {
  2756 	    DUFFS_LOOP4(
  2757 	    {
  2758 		Uint32 Pixel;
  2759 		unsigned sR;
  2760 		unsigned sG;
  2761 		unsigned sB;
  2762 		unsigned dR;
  2763 		unsigned dG;
  2764 		unsigned dB;
  2765 		unsigned sA;
  2766 		unsigned dA;
  2767 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  2768 		if(sA) {
  2769 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  2770 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2771 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2772 		}
  2773 		src += srcbpp;
  2774 		dst += dstbpp;
  2775 	    },
  2776 	    width);
  2777 	    src += srcskip;
  2778 	    dst += dstskip;
  2779 	}
  2780 }
  2781 
  2782 
  2783 SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
  2784 {
  2785     SDL_PixelFormat *sf = surface->format;
  2786     SDL_PixelFormat *df = surface->map->dst->format;
  2787 
  2788     if(sf->Amask == 0) {
  2789 	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
  2790 	    if(df->BytesPerPixel == 1)
  2791 		return BlitNto1SurfaceAlphaKey;
  2792 	    else
  2793 #if SDL_ALTIVEC_BLITTERS
  2794 	if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2795 	    !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
  2796             return Blit32to32SurfaceAlphaKeyAltivec;
  2797         else
  2798 #endif
  2799             return BlitNtoNSurfaceAlphaKey;
  2800 	} else {
  2801 	    /* Per-surface alpha blits */
  2802 	    switch(df->BytesPerPixel) {
  2803 	    case 1:
  2804 		return BlitNto1SurfaceAlpha;
  2805 
  2806 	    case 2:
  2807 		if(surface->map->identity) {
  2808 		    if(df->Gmask == 0x7e0)
  2809 		    {
  2810 #if MMX_ASMBLIT
  2811 		if(SDL_HasMMX())
  2812 			return Blit565to565SurfaceAlphaMMX;
  2813 		else
  2814 #endif
  2815 			return Blit565to565SurfaceAlpha;
  2816 		    }
  2817 		    else if(df->Gmask == 0x3e0)
  2818 		    {
  2819 #if MMX_ASMBLIT
  2820 		if(SDL_HasMMX())
  2821 			return Blit555to555SurfaceAlphaMMX;
  2822 		else
  2823 #endif
  2824 			return Blit555to555SurfaceAlpha;
  2825 		    }
  2826 		}
  2827 		return BlitNtoNSurfaceAlpha;
  2828 
  2829 	    case 4:
  2830 		if(sf->Rmask == df->Rmask
  2831 		   && sf->Gmask == df->Gmask
  2832 		   && sf->Bmask == df->Bmask
  2833 		   && sf->BytesPerPixel == 4)
  2834 		{
  2835 #if MMX_ASMBLIT
  2836 			if(sf->Rshift % 8 == 0
  2837 			   && sf->Gshift % 8 == 0
  2838 			   && sf->Bshift % 8 == 0
  2839 			   && SDL_HasMMX())
  2840 			    return BlitRGBtoRGBSurfaceAlphaMMX;
  2841 #endif
  2842 			if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
  2843 			{
  2844 #if SDL_ALTIVEC_BLITTERS
  2845 				if(!(surface->map->dst->flags & SDL_HWSURFACE)
  2846 					&& SDL_HasAltiVec())
  2847 					return BlitRGBtoRGBSurfaceAlphaAltivec;
  2848 #endif
  2849 				return BlitRGBtoRGBSurfaceAlpha;
  2850 			}
  2851 		}
  2852 #if SDL_ALTIVEC_BLITTERS
  2853 		if((sf->BytesPerPixel == 4) &&
  2854 		   !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
  2855 			return Blit32to32SurfaceAlphaAltivec;
  2856 		else
  2857 #endif
  2858 			return BlitNtoNSurfaceAlpha;
  2859 
  2860 	    case 3:
  2861 	    default:
  2862 		return BlitNtoNSurfaceAlpha;
  2863 	    }
  2864 	}
  2865     } else {
  2866 	/* Per-pixel alpha blits */
  2867 	switch(df->BytesPerPixel) {
  2868 	case 1:
  2869 	    return BlitNto1PixelAlpha;
  2870 
  2871 	case 2:
  2872 #if SDL_ALTIVEC_BLITTERS
  2873 	if(sf->BytesPerPixel == 4 && !(surface->map->dst->flags & SDL_HWSURFACE) &&
  2874            df->Gmask == 0x7e0 &&
  2875 	   df->Bmask == 0x1f && SDL_HasAltiVec())
  2876             return Blit32to565PixelAlphaAltivec;
  2877         else
  2878 #endif
  2879 #if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
  2880 		if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2881 		   && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
  2882 		   && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2883 		   || (sf->Bmask == 0xff && df->Bmask == 0x1f)))
  2884 		{
  2885 #if SDL_ARM_NEON_BLITTERS
  2886 		    if(SDL_HasARMNEON())
  2887 		        return BlitARGBto565PixelAlphaARMNEON;
  2888 #endif
  2889 #if SDL_ARM_SIMD_BLITTERS
  2890 		    if(SDL_HasARMSIMD())
  2891 		        return BlitARGBto565PixelAlphaARMSIMD;
  2892 #endif
  2893 		}
  2894 #endif
  2895 	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2896 	       && sf->Gmask == 0xff00
  2897 	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2898 		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  2899 		if(df->Gmask == 0x7e0)
  2900 		    return BlitARGBto565PixelAlpha;
  2901 		else if(df->Gmask == 0x3e0)
  2902 		    return BlitARGBto555PixelAlpha;
  2903 	    }
  2904 	    return BlitNtoNPixelAlpha;
  2905 
  2906 	case 4:
  2907 	    if(sf->Rmask == df->Rmask
  2908 	       && sf->Gmask == df->Gmask
  2909 	       && sf->Bmask == df->Bmask
  2910 	       && sf->BytesPerPixel == 4)
  2911 	    {
  2912 #if MMX_ASMBLIT
  2913 		if(sf->Rshift % 8 == 0
  2914 		   && sf->Gshift % 8 == 0
  2915 		   && sf->Bshift % 8 == 0
  2916 		   && sf->Ashift % 8 == 0
  2917 		   && sf->Aloss == 0)
  2918 		{
  2919 			if(SDL_Has3DNow())
  2920 				return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2921 			if(SDL_HasMMX())
  2922 				return BlitRGBtoRGBPixelAlphaMMX;
  2923 		}
  2924 #endif
  2925 		if(sf->Amask == 0xff000000)
  2926 		{
  2927 #if SDL_ALTIVEC_BLITTERS
  2928 			if(!(surface->map->dst->flags & SDL_HWSURFACE)
  2929 				&& SDL_HasAltiVec())
  2930 				return BlitRGBtoRGBPixelAlphaAltivec;
  2931 #endif
  2932 #if SDL_ARM_NEON_BLITTERS
  2933 			if (SDL_HasARMNEON())
  2934 				return BlitRGBtoRGBPixelAlphaARMNEON;
  2935 #endif
  2936 #if SDL_ARM_SIMD_BLITTERS
  2937 			if (SDL_HasARMSIMD())
  2938 				return BlitRGBtoRGBPixelAlphaARMSIMD;
  2939 #endif
  2940 			return BlitRGBtoRGBPixelAlpha;
  2941 		}
  2942 	    }
  2943 #if SDL_ALTIVEC_BLITTERS
  2944 	    if (sf->Amask && sf->BytesPerPixel == 4 &&
  2945 	        !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
  2946 		return Blit32to32PixelAlphaAltivec;
  2947 	    else
  2948 #endif
  2949 		return BlitNtoNPixelAlpha;
  2950 
  2951 	case 3:
  2952 	default:
  2953 	    return BlitNtoNPixelAlpha;
  2954 	}
  2955     }
  2956 }
  2957