src/video/SDL_blit_A.c
author Sam Lantinga <slouken@libsdl.org>
Sat, 02 Mar 2013 20:44:16 -0800
changeset 6950 1ddb72193079
parent 6885 700f1b25f77f
child 7502 6ff02ff3cf06
permissions -rw-r--r--
Added a mouse ID to the mouse events, which is set to the special value SDL_TOUCH_MOUSEID for mouse events simulated by touch input.
     1 /*
     2   Simple DirectMedia Layer
     3   Copyright (C) 1997-2013 Sam Lantinga <slouken@libsdl.org>
     4 
     5   This software is provided 'as-is', without any express or implied
     6   warranty.  In no event will the authors be held liable for any damages
     7   arising from the use of this software.
     8 
     9   Permission is granted to anyone to use this software for any purpose,
    10   including commercial applications, and to alter it and redistribute it
    11   freely, subject to the following restrictions:
    12 
    13   1. The origin of this software must not be misrepresented; you must not
    14      claim that you wrote the original software. If you use this software
    15      in a product, an acknowledgment in the product documentation would be
    16      appreciated but is not required.
    17   2. Altered source versions must be plainly marked as such, and must not be
    18      misrepresented as being the original software.
    19   3. This notice may not be removed or altered from any source distribution.
    20 */
    21 #include "SDL_config.h"
    22 
    23 #include "SDL_video.h"
    24 #include "SDL_blit.h"
    25 
    26 /* Functions to perform alpha blended blitting */
    27 
    28 /* N->1 blending with per-surface alpha */
    29 static void
    30 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    31 {
    32     int width = info->dst_w;
    33     int height = info->dst_h;
    34     Uint8 *src = info->src;
    35     int srcskip = info->src_skip;
    36     Uint8 *dst = info->dst;
    37     int dstskip = info->dst_skip;
    38     Uint8 *palmap = info->table;
    39     SDL_PixelFormat *srcfmt = info->src_fmt;
    40     SDL_PixelFormat *dstfmt = info->dst_fmt;
    41     int srcbpp = srcfmt->BytesPerPixel;
    42 
    43     const unsigned A = info->a;
    44 
    45     while (height--) {
    46 	    /* *INDENT-OFF* */
    47 	    DUFFS_LOOP4(
    48 	    {
    49 		Uint32 Pixel;
    50 		unsigned sR;
    51 		unsigned sG;
    52 		unsigned sB;
    53 		unsigned dR;
    54 		unsigned dG;
    55 		unsigned dB;
    56 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    57 		dR = dstfmt->palette->colors[*dst].r;
    58 		dG = dstfmt->palette->colors[*dst].g;
    59 		dB = dstfmt->palette->colors[*dst].b;
    60 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    61 		dR &= 0xff;
    62 		dG &= 0xff;
    63 		dB &= 0xff;
    64 		/* Pack RGB into 8bit pixel */
    65 		if ( palmap == NULL ) {
    66 		    *dst =((dR>>5)<<(3+2))|
    67 			  ((dG>>5)<<(2))|
    68 			  ((dB>>6)<<(0));
    69 		} else {
    70 		    *dst = palmap[((dR>>5)<<(3+2))|
    71 				  ((dG>>5)<<(2))  |
    72 				  ((dB>>6)<<(0))];
    73 		}
    74 		dst++;
    75 		src += srcbpp;
    76 	    },
    77 	    width);
    78 	    /* *INDENT-ON* */
    79         src += srcskip;
    80         dst += dstskip;
    81     }
    82 }
    83 
    84 /* N->1 blending with pixel alpha */
    85 static void
    86 BlitNto1PixelAlpha(SDL_BlitInfo * info)
    87 {
    88     int width = info->dst_w;
    89     int height = info->dst_h;
    90     Uint8 *src = info->src;
    91     int srcskip = info->src_skip;
    92     Uint8 *dst = info->dst;
    93     int dstskip = info->dst_skip;
    94     Uint8 *palmap = info->table;
    95     SDL_PixelFormat *srcfmt = info->src_fmt;
    96     SDL_PixelFormat *dstfmt = info->dst_fmt;
    97     int srcbpp = srcfmt->BytesPerPixel;
    98 
    99     while (height--) {
   100 	    /* *INDENT-OFF* */
   101 	    DUFFS_LOOP4(
   102 	    {
   103 		Uint32 Pixel;
   104 		unsigned sR;
   105 		unsigned sG;
   106 		unsigned sB;
   107 		unsigned sA;
   108 		unsigned dR;
   109 		unsigned dG;
   110 		unsigned dB;
   111 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   112 		dR = dstfmt->palette->colors[*dst].r;
   113 		dG = dstfmt->palette->colors[*dst].g;
   114 		dB = dstfmt->palette->colors[*dst].b;
   115 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   116 		dR &= 0xff;
   117 		dG &= 0xff;
   118 		dB &= 0xff;
   119 		/* Pack RGB into 8bit pixel */
   120 		if ( palmap == NULL ) {
   121 		    *dst =((dR>>5)<<(3+2))|
   122 			  ((dG>>5)<<(2))|
   123 			  ((dB>>6)<<(0));
   124 		} else {
   125 		    *dst = palmap[((dR>>5)<<(3+2))|
   126 				  ((dG>>5)<<(2))  |
   127 				  ((dB>>6)<<(0))  ];
   128 		}
   129 		dst++;
   130 		src += srcbpp;
   131 	    },
   132 	    width);
   133 	    /* *INDENT-ON* */
   134         src += srcskip;
   135         dst += dstskip;
   136     }
   137 }
   138 
   139 /* colorkeyed N->1 blending with per-surface alpha */
   140 static void
   141 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   142 {
   143     int width = info->dst_w;
   144     int height = info->dst_h;
   145     Uint8 *src = info->src;
   146     int srcskip = info->src_skip;
   147     Uint8 *dst = info->dst;
   148     int dstskip = info->dst_skip;
   149     Uint8 *palmap = info->table;
   150     SDL_PixelFormat *srcfmt = info->src_fmt;
   151     SDL_PixelFormat *dstfmt = info->dst_fmt;
   152     int srcbpp = srcfmt->BytesPerPixel;
   153     Uint32 ckey = info->colorkey;
   154 
   155     const int A = info->a;
   156 
   157     while (height--) {
   158 	    /* *INDENT-OFF* */
   159 	    DUFFS_LOOP(
   160 	    {
   161 		Uint32 Pixel;
   162 		unsigned sR;
   163 		unsigned sG;
   164 		unsigned sB;
   165 		unsigned dR;
   166 		unsigned dG;
   167 		unsigned dB;
   168 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   169 		if ( Pixel != ckey ) {
   170 		    dR = dstfmt->palette->colors[*dst].r;
   171 		    dG = dstfmt->palette->colors[*dst].g;
   172 		    dB = dstfmt->palette->colors[*dst].b;
   173 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   174 		    dR &= 0xff;
   175 		    dG &= 0xff;
   176 		    dB &= 0xff;
   177 		    /* Pack RGB into 8bit pixel */
   178 		    if ( palmap == NULL ) {
   179 			*dst =((dR>>5)<<(3+2))|
   180 			      ((dG>>5)<<(2)) |
   181 			      ((dB>>6)<<(0));
   182 		    } else {
   183 			*dst = palmap[((dR>>5)<<(3+2))|
   184 				      ((dG>>5)<<(2))  |
   185 				      ((dB>>6)<<(0))  ];
   186 		    }
   187 		}
   188 		dst++;
   189 		src += srcbpp;
   190 	    },
   191 	    width);
   192 	    /* *INDENT-ON* */
   193         src += srcskip;
   194         dst += dstskip;
   195     }
   196 }
   197 
   198 #ifdef __MMX__
   199 
   200 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   201 static void
   202 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   203 {
   204     int width = info->dst_w;
   205     int height = info->dst_h;
   206     Uint32 *srcp = (Uint32 *) info->src;
   207     int srcskip = info->src_skip >> 2;
   208     Uint32 *dstp = (Uint32 *) info->dst;
   209     int dstskip = info->dst_skip >> 2;
   210     Uint32 dalpha = info->dst_fmt->Amask;
   211 
   212     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   213 
   214     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   215     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   216     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   217 
   218     while (height--) {
   219         int n = width;
   220         if (n & 1) {
   221             Uint32 s = *srcp++;
   222             Uint32 d = *dstp;
   223             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   224                        + (s & d & 0x00010101)) | dalpha;
   225             n--;
   226         }
   227 
   228         for (n >>= 1; n > 0; --n) {
   229             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   230             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   231 
   232             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   233             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   234 
   235             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   236             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   237             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   238             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   239 
   240             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   241             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   242             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   243             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   244 
   245             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   246             dstp += 2;
   247             srcp += 2;
   248         }
   249 
   250         srcp += srcskip;
   251         dstp += dstskip;
   252     }
   253     _mm_empty();
   254 }
   255 
   256 /* fast RGB888->(A)RGB888 blending with surface alpha */
   257 static void
   258 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   259 {
   260     SDL_PixelFormat *df = info->dst_fmt;
   261     Uint32 chanmask;
   262     unsigned alpha = info->a;
   263 
   264     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   265         /* only call a128 version when R,G,B occupy lower bits */
   266         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   267     } else {
   268         int width = info->dst_w;
   269         int height = info->dst_h;
   270         Uint32 *srcp = (Uint32 *) info->src;
   271         int srcskip = info->src_skip >> 2;
   272         Uint32 *dstp = (Uint32 *) info->dst;
   273         int dstskip = info->dst_skip >> 2;
   274         Uint32 dalpha = df->Amask;
   275         Uint32 amult;
   276 
   277         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   278 
   279         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   280         /* form the alpha mult */
   281         amult = alpha | (alpha << 8);
   282         amult = amult | (amult << 16);
   283         chanmask =
   284             (0xff << df->Rshift) | (0xff << df->
   285                                     Gshift) | (0xff << df->Bshift);
   286         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   287         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   288         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   289         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   290 
   291         while (height--) {
   292             int n = width;
   293             if (n & 1) {
   294                 /* One Pixel Blend */
   295                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   296                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   297 
   298                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   299                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   300 
   301                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
   302                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   303                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   304                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   305 
   306                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   307                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   308                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   309 
   310                 ++srcp;
   311                 ++dstp;
   312 
   313                 n--;
   314             }
   315 
   316             for (n >>= 1; n > 0; --n) {
   317                 /* Two Pixels Blend */
   318                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   319                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   320                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   321                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   322 
   323                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   324                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   325                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   326                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   327 
   328                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   329                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   330                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   331                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   332 
   333                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   334                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   335                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   336                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   337 
   338                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   339                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   340 
   341                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   342 
   343                 srcp += 2;
   344                 dstp += 2;
   345             }
   346             srcp += srcskip;
   347             dstp += dstskip;
   348         }
   349         _mm_empty();
   350     }
   351 }
   352 
   353 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   354 static void
   355 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   356 {
   357     int width = info->dst_w;
   358     int height = info->dst_h;
   359     Uint32 *srcp = (Uint32 *) info->src;
   360     int srcskip = info->src_skip >> 2;
   361     Uint32 *dstp = (Uint32 *) info->dst;
   362     int dstskip = info->dst_skip >> 2;
   363     SDL_PixelFormat *sf = info->src_fmt;
   364     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   365     Uint32 amask = sf->Amask;
   366     Uint32 ashift = sf->Ashift;
   367     Uint64 multmask;
   368 
   369     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   370 
   371     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   372     multmask = 0xFFFF;
   373     multmask <<= (ashift * 2);
   374     multmask = ~multmask;
   375     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   376 
   377     while (height--) {
   378 		/* *INDENT-OFF* */
   379 		DUFFS_LOOP4({
   380 		Uint32 alpha = *srcp & amask;
   381 		if (alpha == 0) {
   382 			/* do nothing */
   383 		} else if (alpha == amask) {
   384 			/* opaque alpha -- copy RGB, keep dst alpha */
   385 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   386 		} else {
   387 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   388 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   389 
   390 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   391 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   392 
   393 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   394 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   395 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   396 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   397 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   398 
   399 			/* blend */		    
   400 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   401 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   402 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   403 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   404 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   405 			
   406 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   407 		}
   408 		++srcp;
   409 		++dstp;
   410 	    }, width);
   411 		/* *INDENT-ON* */
   412         srcp += srcskip;
   413         dstp += dstskip;
   414     }
   415     _mm_empty();
   416 }
   417 
   418 #endif /* __MMX__ */
   419 
   420 #if SDL_ALTIVEC_BLITTERS
   421 #if __MWERKS__
   422 #pragma altivec_model on
   423 #endif
   424 #if HAVE_ALTIVEC_H
   425 #include <altivec.h>
   426 #endif
   427 
/*
 * Vector literal helpers.
 * VECUINT8_LITERAL builds a 16-element vector unsigned char and
 * VECUINT16_LITERAL an 8-element vector unsigned short.  When building for
 * Mac OS X with __GNUC__ < 4 the parenthesized literal form is used; every
 * other configuration uses the brace-initializer form.
 */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) { a,b,c,d,e,f,g,h }
#endif
   439 
   440 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
   441 #define VECPRINT(msg, v) do { \
   442     vector unsigned int tmpvec = (vector unsigned int)(v); \
   443     unsigned int *vp = (unsigned int *)&tmpvec; \
   444     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   445 } while (0)
   446 
/* the permutation vector that takes the high bytes out of all the appropriate shorts
    (vector unsigned char)(
        0x00, 0x10, 0x02, 0x12,
        0x04, 0x14, 0x06, 0x16,
        0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E );
   It is built at run time by adding 0x000F to each halfword of the
   vec_lvsl(0, NULL) result.
*/
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* Four 32-bit lanes each holding 24, formed as 12 + 12 */
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* All-ones bits shifted left by 24 in every 32-bit lane, viewed as bytes */
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/* Permute vector for loading from src: vec_lvsl(0, src) when src is not
   16-byte aligned, otherwise vec_lvsl(8, src) with 8 added to every byte */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, src) \
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
   460 
   461 
/*
 * Blend vs over vd byte-by-byte and leave the result in vd.
 * Each byte of vs is multiplied by the matching byte of valpha, each byte
 * of vd by the complement of valpha, and the two 16-bit products are
 * added.  The sum t is then adjusted to (t + 1) + ((t + 1) >> 8) and the
 * bytes selected by mergePermute are gathered into vd.
 * v1_16 and v8_16 must be vectors of unsigned shorts holding 1 and 8.
 * vd is both read and written; vs and valpha are only read.
 */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
    /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
    /* valpha2 is 255-alpha */ \
    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
    /* add source and dest */ \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
    vtemp1 = vec_add(vtemp1, v1_16); \
    vtemp3 = vec_sr(vtemp1, v8_16); \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
    vtemp2 = vec_add(vtemp2, v1_16); \
    vtemp4 = vec_sr(vtemp2, v8_16); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* (>>8) and get ARGBARGBARGBARGB */ \
    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
} while (0)
   487 
   488 /* Calculate the permute vector used for 32->32 swizzling */
   489 static vector unsigned char
   490 calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
   491 {
   492     /*
   493      * We have to assume that the bits that aren't used by other
   494      *  colors is alpha, and it's one complete byte, since some formats
   495      *  leave alpha with a zero mask, but we should still swizzle the bits.
   496      */
   497     /* ARGB */
   498     const static struct SDL_PixelFormat default_pixel_format = {
   499         0, NULL, 0, 0,
   500         {0, 0},
   501         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
   502         0, 0, 0, 0,
   503         16, 8, 0, 24,
   504         0, NULL
   505     };
   506     if (!srcfmt) {
   507         srcfmt = &default_pixel_format;
   508     }
   509     if (!dstfmt) {
   510         dstfmt = &default_pixel_format;
   511     }
   512     const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
   513                                                        0x04, 0x04, 0x04, 0x04,
   514                                                        0x08, 0x08, 0x08, 0x08,
   515                                                        0x0C, 0x0C, 0x0C,
   516                                                        0x0C);
   517     vector unsigned char vswiz;
   518     vector unsigned int srcvec;
   519 #define RESHIFT(X) (3 - ((X) >> 3))
   520     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   521     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   522     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   523     Uint32 amask;
   524     /* Use zero for alpha if either surface doesn't have alpha */
   525     if (dstfmt->Amask) {
   526         amask =
   527             ((srcfmt->Amask) ? RESHIFT(srcfmt->
   528                                        Ashift) : 0x10) << (dstfmt->Ashift);
   529     } else {
   530         amask =
   531             0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
   532                           0xFFFFFFFF);
   533     }
   534 #undef RESHIFT
   535     ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
   536     vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
   537     return (vswiz);
   538 }
   539 
/*
 * Altivec blit: 32-bit source with per-pixel alpha blended onto a 16-bit
 * 5-6-5 destination.
 *
 * Each row is handled in three phases:
 *   1. scalar blends (ONE_PIXEL_BLEND) until dst is 16-byte aligned or the
 *      row is exhausted,
 *   2. vector blends of 8 pixels per iteration (32 source bytes, 16
 *      destination bytes),
 *   3. scalar blends for the remaining (width % 8) pixels.
 * In the vector phase the destination pixels are expanded to two vectors
 * of four 32-bit pixels, blended with VEC_MULTIPLY_ALPHA, and re-packed
 * to 5-6-5 before being stored.
 */
static void
Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
{
    int height = info->dst_h;
    Uint8 *src = (Uint8 *) info->src;
    int srcskip = info->src_skip;
    Uint8 *dst = (Uint8 *) info->dst;
    int dstskip = info->dst_skip;
    SDL_PixelFormat *srcfmt = info->src_fmt;

    /* splatted constants used as shift counts and masks below */
    vector unsigned char v0 = vec_splat_u8(0);
    vector unsigned short v8_16 = vec_splat_u16(8);
    vector unsigned short v1_16 = vec_splat_u16(1);
    vector unsigned short v2_16 = vec_splat_u16(2);
    vector unsigned short v3_16 = vec_splat_u16(3);
    vector unsigned int v8_32 = vec_splat_u32(8);
    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
    vector unsigned short v3f =
        VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
                          0x003f, 0x003f, 0x003f, 0x003f);
    vector unsigned short vfc =
        VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
                          0x00fc, 0x00fc, 0x00fc, 0x00fc);

    /*
       0x10 - 0x1f is the alpha
       0x00 - 0x0e evens are the red
       0x01 - 0x0f odds are zero
     */
    vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
                                                       0x10, 0x02, 0x01, 0x01,
                                                       0x10, 0x04, 0x01, 0x01,
                                                       0x10, 0x06, 0x01,
                                                       0x01);
    /* same pattern with 8 << 16 added to every 32-bit lane, i.e. the red
       selector byte moved on by 8 to address the second four pixels */
    vector unsigned char vredalpha2 =
        (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
                                        vec_sl(v8_32, v16_32))
        );
    /*
       0x00 - 0x0f is ARxx ARxx ARxx ARxx
       0x11 - 0x0f odds are blue
     */
    vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
                                                   0x04, 0x05, 0x06, 0x13,
                                                   0x08, 0x09, 0x0a, 0x15,
                                                   0x0c, 0x0d, 0x0e, 0x17);
    /* blue selector byte moved on by 8 for the second four pixels */
    vector unsigned char vblue2 =
        (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
        );
    /*
       0x00 - 0x0f is ARxB ARxB ARxB ARxB
       0x10 - 0x0e evens are green
     */
    vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
                                                    0x04, 0x05, 0x12, 0x07,
                                                    0x08, 0x09, 0x14, 0x0b,
                                                    0x0c, 0x0d, 0x16, 0x0f);
    /* green selector byte moved on by 8 for the second four pixels */
    vector unsigned char vgreen2 =
        (vector unsigned
         char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8_32, v8_32))
        );
    /* selects byte 2 of every 32-bit pixel of the two blended vectors into
       the low byte of each halfword; the high byte takes byte 0 of the
       first vector and is cleared afterwards by the vfc mask */
    vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
                                                    0x00, 0x0a, 0x00, 0x0e,
                                                    0x00, 0x12, 0x00, 0x16,
                                                    0x00, 0x1a, 0x00, 0x1e);
    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
    /* swizzle from the source format to the default (ARGB) layout */
    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
    /* byte indices 0,0,0,0,4,4,4,4,...: replicates the first byte of each
       32-bit pixel (the alpha byte after the swizzle) across that pixel */
    vector unsigned char valphaPermute =
        vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));

    /* NOTE(review): vec_splat_u8(-7) is 0xF9 in every byte, so after the
       shift each halfword holds 0xF900 rather than the 0xF800 the name
       implies (the scalar path masks red with 0xf8) -- confirm whether -8
       was intended. */
    vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
    vf800 = vec_sl(vf800, vec_splat_u16(8));

    while (height--) {
        int extrawidth;
        vector unsigned char valigner;
        vector unsigned char vsrc;
        vector unsigned char voverflow;
        int width = info->dst_w;

/* Scalar fallback: blend one 32-bit source pixel onto one 5-6-5 destination
   pixel while `condition` holds, decrementing `widthvar` per pixel.  Source
   pixels whose alpha is zero leave the destination unchanged. */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB, sA; \
            DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
            if(sA) { \
                unsigned short dstpixel = *((unsigned short *)dst); \
                dR = (dstpixel >> 8) & 0xf8; \
                dG = (dstpixel >> 3) & 0xfc; \
                dB = (dstpixel << 3) & 0xf8; \
                ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
                *((unsigned short *)dst) = ( \
                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
                ); \
            } \
            src += 4; \
            dst += 2; \
            widthvar--; \
        }
        /* phase 1: scalar blends until dst is 16-byte aligned */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
        extrawidth = (width % 8);
        valigner = VEC_ALIGNER(src);
        vsrc = (vector unsigned char) vec_ld(0, src);
        width -= extrawidth;
        /* phase 2: width is now a multiple of 8; blend 8 pixels per pass */
        while (width) {
            vector unsigned char valpha;
            vector unsigned char vsrc1, vsrc2;
            vector unsigned char vdst1, vdst2;
            vector unsigned short vR, vG, vB;
            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;

            /* Load 8 pixels from src as ARGB */
            voverflow = (vector unsigned char) vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);
            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
            src += 16;
            vsrc = (vector unsigned char) vec_ld(15, src);
            voverflow = vec_perm(voverflow, vsrc, valigner);
            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
            src += 16;

            /* Load 8 pixels from dst as XRGB */
            voverflow = vec_ld(0, dst);
            vR = vec_and((vector unsigned short) voverflow, vf800);
            vB = vec_sl((vector unsigned short) voverflow, v3_16);
            vG = vec_sl(vB, v2_16);
            /* first four destination pixels */
            vdst1 =
                (vector unsigned char) vec_perm((vector unsigned char) vR,
                                                (vector unsigned char) vR,
                                                vredalpha1);
            vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
            vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
            /* second four destination pixels */
            vdst2 =
                (vector unsigned char) vec_perm((vector unsigned char) vR,
                                                (vector unsigned char) vR,
                                                vredalpha2);
            vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
            vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);

            /* Alpha blend 8 pixels as ARGB */
            valpha = vec_perm(vsrc1, v0, valphaPermute);
            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
                               v8_16);
            valpha = vec_perm(vsrc2, v0, valphaPermute);
            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
                               v8_16);

            /* Convert 8 pixels to 565 */
            vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
                                                        vdst1,
                                                        (vector unsigned int)
                                                        vdst2);
            vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
            vgpixel = vec_and(vgpixel, vfc);
            vgpixel = vec_sl(vgpixel, v3_16);
            vrpixel = vec_sl(vpixel, v1_16);
            vrpixel = vec_and(vrpixel, vf800);
            vbpixel = vec_and(vpixel, v3f);
            vdst1 =
                vec_or((vector unsigned char) vrpixel,
                       (vector unsigned char) vgpixel);
            vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);

            /* Store 8 pixels */
            vec_st(vdst1, 0, dst);

            width -= 8;
            dst += 16;
        }
        /* phase 3: scalar blends for the leftover pixels of the row */
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
        src += srcskip;
        dst += dstskip;
    }
}
   715 
   716 static void
   717 Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
   718 {
   719     int height = info->dst_h;
   720     Uint32 *srcp = (Uint32 *) info->src;
   721     int srcskip = info->src_skip >> 2;
   722     Uint32 *dstp = (Uint32 *) info->dst;
   723     int dstskip = info->dst_skip >> 2;
   724     SDL_PixelFormat *srcfmt = info->src_fmt;
   725     SDL_PixelFormat *dstfmt = info->dst_fmt;
   726     unsigned sA = info->a;
   727     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   728     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   729     Uint32 ckey = info->colorkey;
   730     vector unsigned char mergePermute;
   731     vector unsigned char vsrcPermute;
   732     vector unsigned char vdstPermute;
   733     vector unsigned char vsdstPermute;
   734     vector unsigned char valpha;
   735     vector unsigned char valphamask;
   736     vector unsigned char vbits;
   737     vector unsigned char v0;
   738     vector unsigned short v1;
   739     vector unsigned short v8;
   740     vector unsigned int vckey;
   741     vector unsigned int vrgbmask;
   742 
   743     mergePermute = VEC_MERGE_PERMUTE();
   744     v0 = vec_splat_u8(0);
   745     v1 = vec_splat_u16(1);
   746     v8 = vec_splat_u16(8);
   747 
   748     /* set the alpha to 255 on the destination surf */
   749     valphamask = VEC_ALPHA_MASK();
   750 
   751     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   752     vdstPermute = calc_swizzle32(NULL, dstfmt);
   753     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   754 
   755     /* set a vector full of alpha and 255-alpha */
   756     ((unsigned char *) &valpha)[0] = sA;
   757     valpha = vec_splat(valpha, 0);
   758     vbits = (vector unsigned char) vec_splat_s8(-1);
   759 
   760     ckey &= rgbmask;
   761     ((unsigned int *) (char *) &vckey)[0] = ckey;
   762     vckey = vec_splat(vckey, 0);
   763     ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
   764     vrgbmask = vec_splat(vrgbmask, 0);
   765 
   766     while (height--) {
   767         int width = info->dst_w;
   768 #define ONE_PIXEL_BLEND(condition, widthvar) \
   769         while (condition) { \
   770             Uint32 Pixel; \
   771             unsigned sR, sG, sB, dR, dG, dB; \
   772             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
   773             if(sA && Pixel != ckey) { \
   774                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
   775                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
   776                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   777                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
   778             } \
   779             dstp++; \
   780             srcp++; \
   781             widthvar--; \
   782         }
   783         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   784         if (width > 0) {
   785             int extrawidth = (width % 4);
   786             vector unsigned char valigner = VEC_ALIGNER(srcp);
   787             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
   788             width -= extrawidth;
   789             while (width) {
   790                 vector unsigned char vsel;
   791                 vector unsigned char voverflow;
   792                 vector unsigned char vd;
   793                 vector unsigned char vd_orig;
   794 
   795                 /* s = *srcp */
   796                 voverflow = (vector unsigned char) vec_ld(15, srcp);
   797                 vs = vec_perm(vs, voverflow, valigner);
   798 
   799                 /* vsel is set for items that match the key */
   800                 vsel =
   801                     (vector unsigned char) vec_and((vector unsigned int) vs,
   802                                                    vrgbmask);
   803                 vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
   804                                                         vsel, vckey);
   805 
   806                 /* permute to source format */
   807                 vs = vec_perm(vs, valpha, vsrcPermute);
   808 
   809                 /* d = *dstp */
   810                 vd = (vector unsigned char) vec_ld(0, dstp);
   811                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
   812 
   813                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   814 
   815                 /* set the alpha channel to full on */
   816                 vd = vec_or(vd, valphamask);
   817 
   818                 /* mask out color key */
   819                 vd = vec_sel(vd, vd_orig, vsel);
   820 
   821                 /* permute to dest format */
   822                 vd = vec_perm(vd, vbits, vdstPermute);
   823 
   824                 /* *dstp = res */
   825                 vec_st((vector unsigned int) vd, 0, dstp);
   826 
   827                 srcp += 4;
   828                 dstp += 4;
   829                 width -= 4;
   830                 vs = voverflow;
   831             }
   832             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   833         }
   834 #undef ONE_PIXEL_BLEND
   835 
   836         srcp += srcskip;
   837         dstp += dstskip;
   838     }
   839 }
   840 
   841 
   842 static void
   843 Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)
   844 {
           /*
            * Blend a 32-bit source onto a 32-bit destination using the alpha
            * carried by each source pixel (AltiVec version).  Pixel layouts come
            * from info->src_fmt and info->dst_fmt.  The destination alpha is
            * preserved: the scalar loop re-assembles the pixel with the dA it
            * just read, and the vector loop saves vdstalpha before blending and
            * ORs it back afterwards.
            */
   845     int width = info->dst_w;
   846     int height = info->dst_h;
   847     Uint32 *srcp = (Uint32 *) info->src;
           /* skips are given in bytes; >> 2 converts them to 32-bit pixels */
   848     int srcskip = info->src_skip >> 2;
   849     Uint32 *dstp = (Uint32 *) info->dst;
   850     int dstskip = info->dst_skip >> 2;
   851     SDL_PixelFormat *srcfmt = info->src_fmt;
   852     SDL_PixelFormat *dstfmt = info->dst_fmt;
   853     vector unsigned char mergePermute;
   854     vector unsigned char valphaPermute;
   855     vector unsigned char vsrcPermute;
   856     vector unsigned char vdstPermute;
   857     vector unsigned char vsdstPermute;
   858     vector unsigned char valphamask;
   859     vector unsigned char vpixelmask;
   860     vector unsigned char v0;
   861     vector unsigned short v1;
   862     vector unsigned short v8;
   863 
   864     v0 = vec_splat_u8(0);
   865     v1 = vec_splat_u16(1);
   866     v8 = vec_splat_u16(8);
   867     mergePermute = VEC_MERGE_PERMUTE();
   868     valphamask = VEC_ALPHA_MASK();
           /*
            * vec_lvsl on address 0 yields the byte indices 0..15; masking with
            * 0xC rounds each index down to the first byte of its 32-bit lane, so
            * this permute copies byte 0 of every pixel over that whole pixel.
            */
   869     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
           /* bitwise complement of valphamask (nor with zero) */
   870     vpixelmask = vec_nor(valphamask, v0);
           /*
            * NOTE(review): calc_swizzle32 is defined outside this chunk;
            * presumably these are src format -> working layout, working layout
            * -> dst format, and dst format -> working layout - confirm.
            */
   871     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   872     vdstPermute = calc_swizzle32(NULL, dstfmt);
   873     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   874 
   875     while (height--) {
   876         width = info->dst_w;
               /*
                * Scalar blend of one pixel per iteration while `condition`
                * holds; source pixels with zero alpha are skipped.
                */
   877 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   878             Uint32 Pixel; \
   879             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
   880             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   881             if(sA) { \
   882               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
   883               ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   884               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
   885             } \
   886             ++srcp; \
   887             ++dstp; \
   888             widthvar--; \
   889         }
               /* scalar pixels while UNALIGNED_PTR(dstp) holds and pixels remain */
   890         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   891         if (width > 0) {
   892             /* vsrcPermute */
   893             /* vdstPermute */
                   /* the vector loop takes 4 pixels per pass; the rest is done in scalar code */
   894             int extrawidth = (width % 4);
   895             vector unsigned char valigner = VEC_ALIGNER(srcp);
   896             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
   897             width -= extrawidth;
   898             while (width) {
   899                 vector unsigned char voverflow;
   900                 vector unsigned char vd;
   901                 vector unsigned char valpha;
   902                 vector unsigned char vdstalpha;
   903                 /* s = *srcp */
                       /* two loads are combined through valigner, then swizzled with vsrcPermute */
   904                 voverflow = (vector unsigned char) vec_ld(15, srcp);
   905                 vs = vec_perm(vs, voverflow, valigner);
   906                 vs = vec_perm(vs, v0, vsrcPermute);
   907 
                       /* per-pixel alpha replicated over each pixel's four bytes */
   908                 valpha = vec_perm(vs, v0, valphaPermute);
   909 
   910                 /* d = *dstp */
   911                 vd = (vector unsigned char) vec_ld(0, dstp);
   912                 vd = vec_perm(vd, v0, vsdstPermute);
                       /* remember the destination alpha bits before blending */
   913                 vdstalpha = vec_and(vd, valphamask);
   914 
   915                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   916 
   917                 /* set the alpha to the dest alpha */
   918                 vd = vec_and(vd, vpixelmask);
   919                 vd = vec_or(vd, vdstalpha);
   920                 vd = vec_perm(vd, v0, vdstPermute);
   921 
   922                 /* *dstp = res */
   923                 vec_st((vector unsigned int) vd, 0, dstp);
   924 
   925                 srcp += 4;
   926                 dstp += 4;
   927                 width -= 4;
                       /* the second load becomes the first half of the next source vector */
   928                 vs = voverflow;
   929 
   930             }
                   /* leftover (width % 4) pixels */
   931             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   932         }
   933         srcp += srcskip;
   934         dstp += dstskip;
   935 #undef ONE_PIXEL_BLEND
   936     }
   937 }
   938 
   939 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
       /*
        * AltiVec version.  No format permutes are applied here: the scalar code
        * hard-codes the alpha in bits 24-31 (s >> 24, 0xff000000) and the vector
        * code blends the loaded bytes as they are.  Both paths keep the
        * destination alpha bits (dalpha / vdstalpha).
        */
   940 static void
   941 BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
   942 {
   943     int width = info->dst_w;
   944     int height = info->dst_h;
   945     Uint32 *srcp = (Uint32 *) info->src;
           /* skips are given in bytes; >> 2 converts them to 32-bit pixels */
   946     int srcskip = info->src_skip >> 2;
   947     Uint32 *dstp = (Uint32 *) info->dst;
   948     int dstskip = info->dst_skip >> 2;
   949     vector unsigned char mergePermute;
   950     vector unsigned char valphaPermute;
   951     vector unsigned char valphamask;
   952     vector unsigned char vpixelmask;
   953     vector unsigned char v0;
   954     vector unsigned short v1;
   955     vector unsigned short v8;
   956     v0 = vec_splat_u8(0);
   957     v1 = vec_splat_u16(1);
   958     v8 = vec_splat_u16(8);
   959     mergePermute = VEC_MERGE_PERMUTE();
   960     valphamask = VEC_ALPHA_MASK();
           /*
            * vec_lvsl on address 0 yields the byte indices 0..15; masking with
            * 0xC rounds each index down to the first byte of its 32-bit lane, so
            * this permute copies byte 0 of every pixel over that whole pixel.
            */
   961     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   962 
   963 
           /* bitwise complement of valphamask (nor with zero) */
   964     vpixelmask = vec_nor(valphamask, v0);
   965     while (height--) {
   966         width = info->dst_w;
               /*
                * Scalar blend of one pixel per iteration while `condition`
                * holds.  alpha == 0 leaves the destination alone, alpha ==
                * SDL_ALPHA_OPAQUE copies the source RGB, anything else blends
                * red/blue together (mask 0xff00ff) and green on its own (0xff00).
                */
   967 #define ONE_PIXEL_BLEND(condition, widthvar) \
   968         while ((condition)) { \
   969             Uint32 dalpha; \
   970             Uint32 d; \
   971             Uint32 s1; \
   972             Uint32 d1; \
   973             Uint32 s = *srcp; \
   974             Uint32 alpha = s >> 24; \
   975             if(alpha) { \
   976               if(alpha == SDL_ALPHA_OPAQUE) { \
   977                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
   978               } else { \
   979                 d = *dstp; \
   980                 dalpha = d & 0xff000000; \
   981                 s1 = s & 0xff00ff; \
   982                 d1 = d & 0xff00ff; \
   983                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
   984                 s &= 0xff00; \
   985                 d &= 0xff00; \
   986                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
   987                 *dstp = d1 | d | dalpha; \
   988               } \
   989             } \
   990             ++srcp; \
   991             ++dstp; \
   992             widthvar--; \
   993 	    }
               /* scalar pixels while UNALIGNED_PTR(dstp) holds and pixels remain */
   994         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   995         if (width > 0) {
                   /* the vector loop takes 4 pixels per pass; the rest is done in scalar code */
   996             int extrawidth = (width % 4);
   997             vector unsigned char valigner = VEC_ALIGNER(srcp);
   998             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
   999             width -= extrawidth;
  1000             while (width) {
  1001                 vector unsigned char voverflow;
  1002                 vector unsigned char vd;
  1003                 vector unsigned char valpha;
  1004                 vector unsigned char vdstalpha;
  1005                 /* s = *srcp */
                       /* two loads are combined through valigner */
  1006                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1007                 vs = vec_perm(vs, voverflow, valigner);
  1008 
                       /* per-pixel alpha replicated over each pixel's four bytes */
  1009                 valpha = vec_perm(vs, v0, valphaPermute);
  1010 
  1011                 /* d = *dstp */
  1012                 vd = (vector unsigned char) vec_ld(0, dstp);
                       /* remember the destination alpha bits before blending */
  1013                 vdstalpha = vec_and(vd, valphamask);
  1014 
  1015                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1016 
  1017                 /* set the alpha to the dest alpha */
  1018                 vd = vec_and(vd, vpixelmask);
  1019                 vd = vec_or(vd, vdstalpha);
  1020 
  1021                 /* *dstp = res */
  1022                 vec_st((vector unsigned int) vd, 0, dstp);
  1023 
  1024                 srcp += 4;
  1025                 dstp += 4;
  1026                 width -= 4;
                       /* the second load becomes the first half of the next source vector */
  1027                 vs = voverflow;
  1028             }
                   /* leftover (width % 4) pixels */
  1029             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1030         }
  1031         srcp += srcskip;
  1032         dstp += dstskip;
  1033     }
  1034 #undef ONE_PIXEL_BLEND
  1035 }
  1036 
  1037 static void
  1038 Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info)
  1039 {
           /*
            * Blend a 32-bit source onto a 32-bit destination with one alpha
            * value for the whole surface (info->a), AltiVec version.  Pixel
            * layouts come from info->src_fmt and info->dst_fmt.  The scalar code
            * writes dA as the resulting alpha (SDL_ALPHA_OPAQUE when the
            * destination format has an alpha mask, 0 otherwise); the vector code
            * ORs valphamask into the result before storing.
            */
  1040     /* XXX : 6 */
  1041     int height = info->dst_h;
  1042     Uint32 *srcp = (Uint32 *) info->src;
           /* skips are given in bytes; >> 2 converts them to 32-bit pixels */
  1043     int srcskip = info->src_skip >> 2;
  1044     Uint32 *dstp = (Uint32 *) info->dst;
  1045     int dstskip = info->dst_skip >> 2;
  1046     SDL_PixelFormat *srcfmt = info->src_fmt;
  1047     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1048     unsigned sA = info->a;
  1049     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1050     vector unsigned char mergePermute;
  1051     vector unsigned char vsrcPermute;
  1052     vector unsigned char vdstPermute;
  1053     vector unsigned char vsdstPermute;
  1054     vector unsigned char valpha;
  1055     vector unsigned char valphamask;
  1056     vector unsigned char vbits;
  1057     vector unsigned short v1;
  1058     vector unsigned short v8;
  1059 
  1060     mergePermute = VEC_MERGE_PERMUTE();
  1061     v1 = vec_splat_u16(1);
  1062     v8 = vec_splat_u16(8);
  1063 
  1064     /* set the alpha to 255 on the destination surf */
  1065     valphamask = VEC_ALPHA_MASK();
  1066 
           /*
            * NOTE(review): calc_swizzle32 is defined outside this chunk;
            * presumably these are src format -> working layout, working layout
            * -> dst format, and dst format -> working layout - confirm.
            */
  1067     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1068     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1069     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1070 
  1071     /* fill every byte of valpha with the surface alpha */
  1072     ((unsigned char *) &valpha)[0] = sA;
  1073     valpha = vec_splat(valpha, 0);
           /* all-ones vector, used as the second permute input when converting to dst format */
  1074     vbits = (vector unsigned char) vec_splat_s8(-1);
  1075 
  1076     while (height--) {
  1077         int width = info->dst_w;
               /* scalar blend of one pixel per iteration while `condition` holds */
  1078 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1079             Uint32 Pixel; \
  1080             unsigned sR, sG, sB, dR, dG, dB; \
  1081             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  1082             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1083             ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1084             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1085             ++srcp; \
  1086             ++dstp; \
  1087             widthvar--; \
  1088         }
               /* scalar pixels while UNALIGNED_PTR(dstp) holds and pixels remain */
  1089         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1090         if (width > 0) {
                   /* the vector loop takes 4 pixels per pass; the rest is done in scalar code */
  1091             int extrawidth = (width % 4);
  1092             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1093             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1094             width -= extrawidth;
  1095             while (width) {
  1096                 vector unsigned char voverflow;
  1097                 vector unsigned char vd;
  1098 
  1099                 /* s = *srcp */
                       /* two loads are combined through valigner, then swizzled with vsrcPermute */
  1100                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1101                 vs = vec_perm(vs, voverflow, valigner);
  1102                 vs = vec_perm(vs, valpha, vsrcPermute);
  1103 
  1104                 /* d = *dstp */
  1105                 vd = (vector unsigned char) vec_ld(0, dstp);
  1106                 vd = vec_perm(vd, vd, vsdstPermute);
  1107 
  1108                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1109 
  1110                 /* set the alpha channel to full on */
  1111                 vd = vec_or(vd, valphamask);
  1112                 vd = vec_perm(vd, vbits, vdstPermute);
  1113 
  1114                 /* *dstp = res */
  1115                 vec_st((vector unsigned int) vd, 0, dstp);
  1116 
  1117                 srcp += 4;
  1118                 dstp += 4;
  1119                 width -= 4;
                       /* the second load becomes the first half of the next source vector */
  1120                 vs = voverflow;
  1121             }
                   /* leftover (width % 4) pixels */
  1122             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1123         }
  1124 #undef ONE_PIXEL_BLEND
  1125 
  1126         srcp += srcskip;
  1127         dstp += dstskip;
  1128     }
  1129 
  1130 }
  1131 
  1132 
  1133 /* fast RGB888->(A)RGB888 blending */
       /*
        * AltiVec version using one alpha value for the whole surface (info->a).
        * No format permutes are applied: the scalar code hard-codes the channel
        * masks (0xff00ff, 0xff00) and writes 0xff000000 as the resulting alpha,
        * and the vector code ORs valphamask into the result before storing.
        */
  1134 static void
  1135 BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
  1136 {
  1137     unsigned alpha = info->a;
  1138     int height = info->dst_h;
  1139     Uint32 *srcp = (Uint32 *) info->src;
           /* skips are given in bytes; >> 2 converts them to 32-bit pixels */
  1140     int srcskip = info->src_skip >> 2;
  1141     Uint32 *dstp = (Uint32 *) info->dst;
  1142     int dstskip = info->dst_skip >> 2;
  1143     vector unsigned char mergePermute;
  1144     vector unsigned char valpha;
  1145     vector unsigned char valphamask;
  1146     vector unsigned short v1;
  1147     vector unsigned short v8;
  1148 
  1149     mergePermute = VEC_MERGE_PERMUTE();
  1150     v1 = vec_splat_u16(1);
  1151     v8 = vec_splat_u16(8);
  1152 
  1153     /* set the alpha to 255 on the destination surf */
  1154     valphamask = VEC_ALPHA_MASK();
  1155 
  1156     /* fill every byte of valpha with the surface alpha */
  1157     ((unsigned char *) &valpha)[0] = alpha;
  1158     valpha = vec_splat(valpha, 0);
  1159 
  1160     while (height--) {
  1161         int width = info->dst_w;
               /*
                * Scalar blend of one pixel per iteration while `condition`
                * holds: red/blue are blended together (mask 0xff00ff), green on
                * its own (0xff00), and the top byte is forced to 0xff.
                */
  1162 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1163             Uint32 s = *srcp; \
  1164             Uint32 d = *dstp; \
  1165             Uint32 s1 = s & 0xff00ff; \
  1166             Uint32 d1 = d & 0xff00ff; \
  1167             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  1168                  & 0xff00ff; \
  1169             s &= 0xff00; \
  1170             d &= 0xff00; \
  1171             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1172             *dstp = d1 | d | 0xff000000; \
  1173             ++srcp; \
  1174             ++dstp; \
  1175             widthvar--; \
  1176         }
               /* scalar pixels while UNALIGNED_PTR(dstp) holds and pixels remain */
  1177         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1178         if (width > 0) {
                   /* the vector loop takes 4 pixels per pass; the rest is done in scalar code */
  1179             int extrawidth = (width % 4);
  1180             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1181             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1182             width -= extrawidth;
  1183             while (width) {
  1184                 vector unsigned char voverflow;
  1185                 vector unsigned char vd;
  1186 
  1187                 /* s = *srcp */
                       /* two loads are combined through valigner */
  1188                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1189                 vs = vec_perm(vs, voverflow, valigner);
  1190 
  1191                 /* d = *dstp */
  1192                 vd = (vector unsigned char) vec_ld(0, dstp);
  1193 
  1194                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1195 
  1196                 /* set the alpha channel to full on */
  1197                 vd = vec_or(vd, valphamask);
  1198 
  1199                 /* *dstp = res */
  1200                 vec_st((vector unsigned int) vd, 0, dstp);
  1201 
  1202                 srcp += 4;
  1203                 dstp += 4;
  1204                 width -= 4;
                       /* the second load becomes the first half of the next source vector */
  1205                 vs = voverflow;
  1206             }
                   /* leftover (width % 4) pixels */
  1207             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1208         }
  1209 #undef ONE_PIXEL_BLEND
  1210 
  1211         srcp += srcskip;
  1212         dstp += dstskip;
  1213     }
  1214 }
  1215 
  1216 #if __MWERKS__
  1217 #pragma altivec_model off
  1218 #endif
  1219 #endif /* SDL_ALTIVEC_BLITTERS */
  1220 
  1221 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1222 static void
  1223 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
  1224 {
  1225     int width = info->dst_w;
  1226     int height = info->dst_h;
  1227     Uint32 *srcp = (Uint32 *) info->src;
  1228     int srcskip = info->src_skip >> 2;
  1229     Uint32 *dstp = (Uint32 *) info->dst;
  1230     int dstskip = info->dst_skip >> 2;
  1231 
  1232     while (height--) {
  1233 	    /* *INDENT-OFF* */
  1234 	    DUFFS_LOOP4({
  1235 		    Uint32 s = *srcp++;
  1236 		    Uint32 d = *dstp;
  1237 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1238 			       + (s & d & 0x00010101)) | 0xff000000;
  1239 	    }, width);
  1240 	    /* *INDENT-ON* */
  1241         srcp += srcskip;
  1242         dstp += dstskip;
  1243     }
  1244 }
  1245 
  1246 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1247 static void
  1248 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
  1249 {
  1250     unsigned alpha = info->a;
  1251     if (alpha == 128) {
  1252         BlitRGBtoRGBSurfaceAlpha128(info);
  1253     } else {
  1254         int width = info->dst_w;
  1255         int height = info->dst_h;
  1256         Uint32 *srcp = (Uint32 *) info->src;
  1257         int srcskip = info->src_skip >> 2;
  1258         Uint32 *dstp = (Uint32 *) info->dst;
  1259         int dstskip = info->dst_skip >> 2;
  1260         Uint32 s;
  1261         Uint32 d;
  1262         Uint32 s1;
  1263         Uint32 d1;
  1264 
  1265         while (height--) {
  1266 			/* *INDENT-OFF* */
  1267 			DUFFS_LOOP4({
  1268 				s = *srcp;
  1269 				d = *dstp;
  1270 				s1 = s & 0xff00ff;
  1271 				d1 = d & 0xff00ff;
  1272 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
  1273 				     & 0xff00ff;
  1274 				s &= 0xff00;
  1275 				d &= 0xff00;
  1276 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1277 				*dstp = d1 | d | 0xff000000;
  1278 				++srcp;
  1279 				++dstp;
  1280 			}, width);
  1281 			/* *INDENT-ON* */
  1282             srcp += srcskip;
  1283             dstp += dstskip;
  1284         }
  1285     }
  1286 }
  1287 
  1288 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1289 static void
  1290 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
  1291 {
  1292     int width = info->dst_w;
  1293     int height = info->dst_h;
  1294     Uint32 *srcp = (Uint32 *) info->src;
  1295     int srcskip = info->src_skip >> 2;
  1296     Uint32 *dstp = (Uint32 *) info->dst;
  1297     int dstskip = info->dst_skip >> 2;
  1298 
  1299     while (height--) {
  1300 	    /* *INDENT-OFF* */
  1301 	    DUFFS_LOOP4({
  1302 		Uint32 dalpha;
  1303 		Uint32 d;
  1304 		Uint32 s1;
  1305 		Uint32 d1;
  1306 		Uint32 s = *srcp;
  1307 		Uint32 alpha = s >> 24;
  1308 		/* FIXME: Here we special-case opaque alpha since the
  1309 		   compositioning used (>>8 instead of /255) doesn't handle
  1310 		   it correctly. Also special-case alpha=0 for speed?
  1311 		   Benchmark this! */
  1312 		if(alpha) {   
  1313 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1314 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1315 		  } else {
  1316 		    /*
  1317 		     * take out the middle component (green), and process
  1318 		     * the other two in parallel. One multiply less.
  1319 		     */
  1320 		    d = *dstp;
  1321 		    dalpha = d & 0xff000000;
  1322 		    s1 = s & 0xff00ff;
  1323 		    d1 = d & 0xff00ff;
  1324 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1325 		    s &= 0xff00;
  1326 		    d &= 0xff00;
  1327 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1328 		    *dstp = d1 | d | dalpha;
  1329 		  }
  1330 		}
  1331 		++srcp;
  1332 		++dstp;
  1333 	    }, width);
  1334 	    /* *INDENT-ON* */
  1335         srcp += srcskip;
  1336         dstp += dstskip;
  1337     }
  1338 }
  1339 
  1340 #ifdef __3dNOW__
  1341 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1342 static void
  1343 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1344 {
  1345     int width = info->dst_w;
  1346     int height = info->dst_h;
  1347     Uint32 *srcp = (Uint32 *) info->src;
  1348     int srcskip = info->src_skip >> 2;
  1349     Uint32 *dstp = (Uint32 *) info->dst;
  1350     int dstskip = info->dst_skip >> 2;
  1351     SDL_PixelFormat *sf = info->src_fmt;
  1352     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  1353     Uint32 amask = sf->Amask;
  1354     Uint32 ashift = sf->Ashift;
  1355     Uint64 multmask;
  1356 
  1357     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1358 
  1359     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
  1360     multmask = 0xFFFF;
  1361     multmask <<= (ashift * 2);
  1362     multmask = ~multmask;
  1363     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
  1364 
  1365     while (height--) {
  1366 	    /* *INDENT-OFF* */
  1367 	    DUFFS_LOOP4({
  1368 		Uint32 alpha;
  1369 
  1370 		_m_prefetch(srcp + 16);
  1371 		_m_prefetch(dstp + 16);
  1372 
  1373 		alpha = *srcp & amask;
  1374 		if (alpha == 0) {
  1375 			/* do nothing */
  1376 		} else if (alpha == amask) {
  1377 			/* copy RGB, keep dst alpha */
  1378 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  1379 		} else {
  1380 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  1381 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  1382 
  1383 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  1384 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  1385 
  1386 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  1387 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  1388 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  1389 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  1390 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  1391 
  1392 			/* blend */		    
  1393 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  1394 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  1395 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  1396 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  1397 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  1398 			
  1399 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  1400 		}
  1401 		++srcp;
  1402 		++dstp;
  1403 	    }, width);
  1404 	    /* *INDENT-ON* */
  1405         srcp += srcskip;
  1406         dstp += dstskip;
  1407     }
  1408     _mm_empty();
  1409 }
  1410 
  1411 #endif /* __3dNOW__ */
  1412 
  /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */

  /*
   * Blend a single 16 bit pixel at 50%.
   *
   * `mask` selects every channel bit except the lowest bit of each channel
   * (e.g. 0xf7de for RGB565).  The masked values are added and halved; the
   * low channel bits that are set in both s and d are then added back.
   * Arguments are evaluated more than once, so pass side-effect-free
   * expressions.
   */
  #define BLEND16_50(d, s, mask)                                          \
      (((((s) & (mask)) + ((d) & (mask))) >> 1) +                         \
       ((s) & (d) & (~(mask) & 0xffff)))

  /*
   * Blend two 16 bit pixels packed in one 32 bit word at 50%.
   *
   * The 16 bit mask is replicated into both halves.  It is converted to
   * Uint32 before shifting: a Uint16 promotes to (signed) int, and shifting
   * a value such as 0xf7de left by 16 would overflow int, which is undefined
   * behaviour.
   */
  #define BLEND2x16_50(d, s, mask)                                        \
      ((((s) & ((Uint32) (mask) | (Uint32) (mask) << 16)) >> 1) +         \
       (((d) & ((Uint32) (mask) | (Uint32) (mask) << 16)) >> 1) +         \
       ((s) & (d) & ~((Uint32) (mask) | (Uint32) (mask) << 16)))
  1423 
  1424 static void
  1425 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
  1426 {
           /*
            * 16-bit -> 16-bit blend at 50%, processing two pixels per 32-bit
            * access where possible.  `mask` is handed to BLEND16_50 /
            * BLEND2x16_50.  Each row takes one of two paths depending on whether
            * the source and destination pointers can be 32-bit aligned together.
            */
  1427     int width = info->dst_w;
  1428     int height = info->dst_h;
  1429     Uint16 *srcp = (Uint16 *) info->src;
           /* skips are given in bytes; >> 1 converts them to 16-bit pixels */
  1430     int srcskip = info->src_skip >> 1;
  1431     Uint16 *dstp = (Uint16 *) info->dst;
  1432     int dstskip = info->dst_skip >> 1;
  1433 
  1434     while (height--) {
               /* true when the two addresses differ in bit 1 */
  1435         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
  1436             /*
  1437              * Source and destination not aligned, pipeline it.
  1438              * This is mostly a win for big blits but no loss for
  1439              * small ones
  1440              */
  1441             Uint32 prev_sw;
  1442             int w = width;
  1443 
  1444             /* handle odd destination */
  1445             if ((uintptr_t) dstp & 2) {
  1446                 Uint16 d = *dstp, s = *srcp;
  1447                 *dstp = BLEND16_50(d, s, mask);
  1448                 dstp++;
  1449                 srcp++;
  1450                 w--;
  1451             }
                   /*
                    * srcp now runs one pixel ahead of the pixel being blended;
                    * the row-end adjustment below (srcskip - 1) undoes this.
                    */
  1452             srcp++;             /* srcp is now 32-bit aligned */
  1453 
  1454             /* bootstrap pipeline with first halfword */
  1455             prev_sw = ((Uint32 *) srcp)[-1];
  1456 
  1457             while (w > 1) {
  1458                 Uint32 sw, dw, s;
  1459                 sw = *(Uint32 *) srcp;
  1460                 dw = *(Uint32 *) dstp;
                       /* build the source pair from the previous word and the current one */
  1461 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1462                 s = (prev_sw << 16) + (sw >> 16);
  1463 #else
  1464                 s = (prev_sw >> 16) + (sw << 16);
  1465 #endif
  1466                 prev_sw = sw;
  1467                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
  1468                 dstp += 2;
  1469                 srcp += 2;
  1470                 w -= 2;
  1471             }
  1472 
  1473             /* final pixel if any */
  1474             if (w) {
  1475                 Uint16 d = *dstp, s;
                       /* the pending source pixel is the halfword still held in prev_sw */
  1476 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1477                 s = (Uint16) prev_sw;
  1478 #else
  1479                 s = (Uint16) (prev_sw >> 16);
  1480 #endif
  1481                 *dstp = BLEND16_50(d, s, mask);
  1482                 srcp++;
  1483                 dstp++;
  1484             }
                   /* - 1 compensates for the extra srcp++ done to align the source */
  1485             srcp += srcskip - 1;
  1486             dstp += dstskip;
  1487         } else {
  1488             /* source and destination are aligned */
  1489             int w = width;
  1490 
  1491             /* first odd pixel? */
  1492             if ((uintptr_t) srcp & 2) {
  1493                 Uint16 d = *dstp, s = *srcp;
  1494                 *dstp = BLEND16_50(d, s, mask);
  1495                 srcp++;
  1496                 dstp++;
  1497                 w--;
  1498             }
  1499             /* srcp and dstp are now 32-bit aligned */
  1500 
                   /* two pixels per iteration through 32-bit loads and stores */
  1501             while (w > 1) {
  1502                 Uint32 sw = *(Uint32 *) srcp;
  1503                 Uint32 dw = *(Uint32 *) dstp;
  1504                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
  1505                 srcp += 2;
  1506                 dstp += 2;
  1507                 w -= 2;
  1508             }
  1509 
  1510             /* last odd pixel? */
  1511             if (w) {
  1512                 Uint16 d = *dstp, s = *srcp;
  1513                 *dstp = BLEND16_50(d, s, mask);
  1514                 srcp++;
  1515                 dstp++;
  1516             }
  1517             srcp += srcskip;
  1518             dstp += dstskip;
  1519         }
  1520     }
  1521 }
  1522 
  1523 #ifdef __MMX__
  1524 
  1525 /* fast RGB565->RGB565 blending with surface alpha */
  1526 static void
  1527 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  1528 {
  1529     unsigned alpha = info->a;
  1530     if (alpha == 128) {
  1531         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1532     } else {
  1533         int width = info->dst_w;
  1534         int height = info->dst_h;
  1535         Uint16 *srcp = (Uint16 *) info->src;
  1536         int srcskip = info->src_skip >> 1;
  1537         Uint16 *dstp = (Uint16 *) info->dst;
  1538         int dstskip = info->dst_skip >> 1;
  1539         Uint32 s, d;
  1540 
  1541         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  1542 
  1543         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  1544         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  1545         alpha >>= 3;            /* downscale alpha to 5 bits */
  1546 
  1547         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  1548         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  1549         /* position alpha to allow for mullo and mulhi on diff channels
  1550            to reduce the number of operations */
  1551         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  1552 
  1553         /* Setup the 565 color channel masks */
  1554         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
  1555         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  1556 
  1557         while (height--) {
  1558 			/* *INDENT-OFF* */
  1559 			DUFFS_LOOP_124(
  1560 			{
  1561 				s = *srcp++;
  1562 				d = *dstp;
  1563 				/*
  1564 				 * shift out the middle component (green) to
  1565 				 * the high 16 bits, and process all three RGB
  1566 				 * components at the same time.
  1567 				 */
  1568 				s = (s | s << 16) & 0x07e0f81f;
  1569 				d = (d | d << 16) & 0x07e0f81f;
  1570 				d += (s - d) * alpha >> 5;
  1571 				d &= 0x07e0f81f;
  1572 				*dstp++ = (Uint16)(d | d >> 16);
  1573 			},{
  1574 				s = *srcp++;
  1575 				d = *dstp;
  1576 				/*
  1577 				 * shift out the middle component (green) to
  1578 				 * the high 16 bits, and process all three RGB
  1579 				 * components at the same time.
  1580 				 */
  1581 				s = (s | s << 16) & 0x07e0f81f;
  1582 				d = (d | d << 16) & 0x07e0f81f;
  1583 				d += (s - d) * alpha >> 5;
  1584 				d &= 0x07e0f81f;
  1585 				*dstp++ = (Uint16)(d | d >> 16);
  1586 				s = *srcp++;
  1587 				d = *dstp;
  1588 				/*
  1589 				 * shift out the middle component (green) to
  1590 				 * the high 16 bits, and process all three RGB
  1591 				 * components at the same time.
  1592 				 */
  1593 				s = (s | s << 16) & 0x07e0f81f;
  1594 				d = (d | d << 16) & 0x07e0f81f;
  1595 				d += (s - d) * alpha >> 5;
  1596 				d &= 0x07e0f81f;
  1597 				*dstp++ = (Uint16)(d | d >> 16);
  1598 			},{
  1599 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  1600 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  1601 
  1602 				/* red */
  1603 				src2 = src1;
  1604 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
  1605 
  1606 				dst2 = dst1;
  1607 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
  1608 
  1609 				/* blend */
  1610 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1611 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1612 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1613 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1614 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
  1615 
  1616 				mm_res = dst2; /* RED -> mm_res */
  1617 
  1618 				/* green -- process the bits in place */
  1619 				src2 = src1;
  1620 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  1621 
  1622 				dst2 = dst1;
  1623 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  1624 
  1625 				/* blend */
  1626 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1627 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1628 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1629 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1630 
  1631 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  1632 
  1633 				/* blue */
  1634 				src2 = src1;
  1635 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  1636 
  1637 				dst2 = dst1;
  1638 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  1639 
  1640 				/* blend */
  1641 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1642 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1643 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1644 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1645 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  1646 
  1647 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  1648 
  1649 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  1650 
  1651 				srcp += 4;
  1652 				dstp += 4;
  1653 			}, width);
  1654 			/* *INDENT-ON* */
  1655             srcp += srcskip;
  1656             dstp += dstskip;
  1657         }
  1658         _mm_empty();
  1659     }
  1660 }
  1661 
  1662 /* fast RGB555->RGB555 blending with surface alpha */
  1663 static void
  1664 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  1665 {
  1666     unsigned alpha = info->a;
  1667     if (alpha == 128) {
  1668         Blit16to16SurfaceAlpha128(info, 0xfbde);
  1669     } else {
  1670         int width = info->dst_w;
  1671         int height = info->dst_h;
  1672         Uint16 *srcp = (Uint16 *) info->src;
  1673         int srcskip = info->src_skip >> 1;
  1674         Uint16 *dstp = (Uint16 *) info->dst;
  1675         int dstskip = info->dst_skip >> 1;
  1676         Uint32 s, d;
  1677 
  1678         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  1679 
  1680         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  1681         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  1682         alpha >>= 3;            /* downscale alpha to 5 bits */
  1683 
  1684         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  1685         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  1686         /* position alpha to allow for mullo and mulhi on diff channels
  1687            to reduce the number of operations */
  1688         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  1689 
  1690         /* Setup the 555 color channel masks */
  1691         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
  1692         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
  1693         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  1694 
  1695         while (height--) {
  1696 			/* *INDENT-OFF* */
  1697 			DUFFS_LOOP_124(
  1698 			{
  1699 				s = *srcp++;
  1700 				d = *dstp;
  1701 				/*
  1702 				 * shift out the middle component (green) to
  1703 				 * the high 16 bits, and process all three RGB
  1704 				 * components at the same time.
  1705 				 */
  1706 				s = (s | s << 16) & 0x03e07c1f;
  1707 				d = (d | d << 16) & 0x03e07c1f;
  1708 				d += (s - d) * alpha >> 5;
  1709 				d &= 0x03e07c1f;
  1710 				*dstp++ = (Uint16)(d | d >> 16);
  1711 			},{
  1712 				s = *srcp++;
  1713 				d = *dstp;
  1714 				/*
  1715 				 * shift out the middle component (green) to
  1716 				 * the high 16 bits, and process all three RGB
  1717 				 * components at the same time.
  1718 				 */
  1719 				s = (s | s << 16) & 0x03e07c1f;
  1720 				d = (d | d << 16) & 0x03e07c1f;
  1721 				d += (s - d) * alpha >> 5;
  1722 				d &= 0x03e07c1f;
  1723 				*dstp++ = (Uint16)(d | d >> 16);
  1724 			        s = *srcp++;
  1725 				d = *dstp;
  1726 				/*
  1727 				 * shift out the middle component (green) to
  1728 				 * the high 16 bits, and process all three RGB
  1729 				 * components at the same time.
  1730 				 */
  1731 				s = (s | s << 16) & 0x03e07c1f;
  1732 				d = (d | d << 16) & 0x03e07c1f;
  1733 				d += (s - d) * alpha >> 5;
  1734 				d &= 0x03e07c1f;
  1735 				*dstp++ = (Uint16)(d | d >> 16);
  1736 			},{
  1737 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  1738 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  1739 
  1740 				/* red -- process the bits in place */
  1741 				src2 = src1;
  1742 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
  1743 
  1744 				dst2 = dst1;
  1745 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
  1746 
  1747 				/* blend */
  1748 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1749 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1750 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1751 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1752 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
  1753 
  1754 				mm_res = dst2; /* RED -> mm_res */
  1755 				
  1756 				/* green -- process the bits in place */
  1757 				src2 = src1;
  1758 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  1759 
  1760 				dst2 = dst1;
  1761 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  1762 
  1763 				/* blend */
  1764 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1765 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1766 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1767 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1768 
  1769 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  1770 
  1771 				/* blue */
  1772 				src2 = src1; /* src -> src2 */
  1773 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  1774 
  1775 				dst2 = dst1; /* dst -> dst2 */
  1776 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  1777 
  1778 				/* blend */
  1779 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1780 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1781 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1782 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1783 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  1784 
  1785 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  1786 
  1787 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  1788 
  1789 				srcp += 4;
  1790 				dstp += 4;
  1791 			}, width);
  1792 			/* *INDENT-ON* */
  1793             srcp += srcskip;
  1794             dstp += dstskip;
  1795         }
  1796         _mm_empty();
  1797     }
  1798 }
  1799 
  1800 #endif /* __MMX__ */
  1801 
  1802 /* fast RGB565->RGB565 blending with surface alpha */
  1803 static void
  1804 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
  1805 {
  1806     unsigned alpha = info->a;
  1807     if (alpha == 128) {
  1808         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1809     } else {
  1810         int width = info->dst_w;
  1811         int height = info->dst_h;
  1812         Uint16 *srcp = (Uint16 *) info->src;
  1813         int srcskip = info->src_skip >> 1;
  1814         Uint16 *dstp = (Uint16 *) info->dst;
  1815         int dstskip = info->dst_skip >> 1;
  1816         alpha >>= 3;            /* downscale alpha to 5 bits */
  1817 
  1818         while (height--) {
  1819 			/* *INDENT-OFF* */
  1820 			DUFFS_LOOP4({
  1821 				Uint32 s = *srcp++;
  1822 				Uint32 d = *dstp;
  1823 				/*
  1824 				 * shift out the middle component (green) to
  1825 				 * the high 16 bits, and process all three RGB
  1826 				 * components at the same time.
  1827 				 */
  1828 				s = (s | s << 16) & 0x07e0f81f;
  1829 				d = (d | d << 16) & 0x07e0f81f;
  1830 				d += (s - d) * alpha >> 5;
  1831 				d &= 0x07e0f81f;
  1832 				*dstp++ = (Uint16)(d | d >> 16);
  1833 			}, width);
  1834 			/* *INDENT-ON* */
  1835             srcp += srcskip;
  1836             dstp += dstskip;
  1837         }
  1838     }
  1839 }
  1840 
  1841 /* fast RGB555->RGB555 blending with surface alpha */
  1842 static void
  1843 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  1844 {
  1845     unsigned alpha = info->a;   /* downscale alpha to 5 bits */
  1846     if (alpha == 128) {
  1847         Blit16to16SurfaceAlpha128(info, 0xfbde);
  1848     } else {
  1849         int width = info->dst_w;
  1850         int height = info->dst_h;
  1851         Uint16 *srcp = (Uint16 *) info->src;
  1852         int srcskip = info->src_skip >> 1;
  1853         Uint16 *dstp = (Uint16 *) info->dst;
  1854         int dstskip = info->dst_skip >> 1;
  1855         alpha >>= 3;            /* downscale alpha to 5 bits */
  1856 
  1857         while (height--) {
  1858 			/* *INDENT-OFF* */
  1859 			DUFFS_LOOP4({
  1860 				Uint32 s = *srcp++;
  1861 				Uint32 d = *dstp;
  1862 				/*
  1863 				 * shift out the middle component (green) to
  1864 				 * the high 16 bits, and process all three RGB
  1865 				 * components at the same time.
  1866 				 */
  1867 				s = (s | s << 16) & 0x03e07c1f;
  1868 				d = (d | d << 16) & 0x03e07c1f;
  1869 				d += (s - d) * alpha >> 5;
  1870 				d &= 0x03e07c1f;
  1871 				*dstp++ = (Uint16)(d | d >> 16);
  1872 			}, width);
  1873 			/* *INDENT-ON* */
  1874             srcp += srcskip;
  1875             dstp += dstskip;
  1876         }
  1877     }
  1878 }
  1879 
  1880 /* fast ARGB8888->RGB565 blending with pixel alpha */
  1881 static void
  1882 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  1883 {
  1884     int width = info->dst_w;
  1885     int height = info->dst_h;
  1886     Uint32 *srcp = (Uint32 *) info->src;
  1887     int srcskip = info->src_skip >> 2;
  1888     Uint16 *dstp = (Uint16 *) info->dst;
  1889     int dstskip = info->dst_skip >> 1;
  1890 
  1891     while (height--) {
  1892 	    /* *INDENT-OFF* */
  1893 	    DUFFS_LOOP4({
  1894 		Uint32 s = *srcp;
  1895 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  1896 		/* FIXME: Here we special-case opaque alpha since the
  1897 		   compositioning used (>>8 instead of /255) doesn't handle
  1898 		   it correctly. Also special-case alpha=0 for speed?
  1899 		   Benchmark this! */
  1900 		if(alpha) {   
  1901 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1902 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  1903 		  } else {
  1904 		    Uint32 d = *dstp;
  1905 		    /*
  1906 		     * convert source and destination to G0RAB65565
  1907 		     * and blend all components at the same time
  1908 		     */
  1909 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  1910 		      + (s >> 3 & 0x1f);
  1911 		    d = (d | d << 16) & 0x07e0f81f;
  1912 		    d += (s - d) * alpha >> 5;
  1913 		    d &= 0x07e0f81f;
  1914 		    *dstp = (Uint16)(d | d >> 16);
  1915 		  }
  1916 		}
  1917 		srcp++;
  1918 		dstp++;
  1919 	    }, width);
  1920 	    /* *INDENT-ON* */
  1921         srcp += srcskip;
  1922         dstp += dstskip;
  1923     }
  1924 }
  1925 
  1926 /* fast ARGB8888->RGB555 blending with pixel alpha */
  1927 static void
  1928 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  1929 {
  1930     int width = info->dst_w;
  1931     int height = info->dst_h;
  1932     Uint32 *srcp = (Uint32 *) info->src;
  1933     int srcskip = info->src_skip >> 2;
  1934     Uint16 *dstp = (Uint16 *) info->dst;
  1935     int dstskip = info->dst_skip >> 1;
  1936 
  1937     while (height--) {
  1938 	    /* *INDENT-OFF* */
  1939 	    DUFFS_LOOP4({
  1940 		unsigned alpha;
  1941 		Uint32 s = *srcp;
  1942 		alpha = s >> 27; /* downscale alpha to 5 bits */
  1943 		/* FIXME: Here we special-case opaque alpha since the
  1944 		   compositioning used (>>8 instead of /255) doesn't handle
  1945 		   it correctly. Also special-case alpha=0 for speed?
  1946 		   Benchmark this! */
  1947 		if(alpha) {   
  1948 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1949 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  1950 		  } else {
  1951 		    Uint32 d = *dstp;
  1952 		    /*
  1953 		     * convert source and destination to G0RAB65565
  1954 		     * and blend all components at the same time
  1955 		     */
  1956 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  1957 		      + (s >> 3 & 0x1f);
  1958 		    d = (d | d << 16) & 0x03e07c1f;
  1959 		    d += (s - d) * alpha >> 5;
  1960 		    d &= 0x03e07c1f;
  1961 		    *dstp = (Uint16)(d | d >> 16);
  1962 		  }
  1963 		}
  1964 		srcp++;
  1965 		dstp++;
  1966 	    }, width);
  1967 	    /* *INDENT-ON* */
  1968         srcp += srcskip;
  1969         dstp += dstskip;
  1970     }
  1971 }
  1972 
  1973 /* General (slow) N->N blending with per-surface alpha */
  1974 static void
  1975 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  1976 {
  1977     int width = info->dst_w;
  1978     int height = info->dst_h;
  1979     Uint8 *src = info->src;
  1980     int srcskip = info->src_skip;
  1981     Uint8 *dst = info->dst;
  1982     int dstskip = info->dst_skip;
  1983     SDL_PixelFormat *srcfmt = info->src_fmt;
  1984     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1985     int srcbpp = srcfmt->BytesPerPixel;
  1986     int dstbpp = dstfmt->BytesPerPixel;
  1987     unsigned sA = info->a;
  1988     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1989 
  1990     if (sA) {
  1991         while (height--) {
  1992 	    /* *INDENT-OFF* */
  1993 	    DUFFS_LOOP4(
  1994 	    {
  1995 		Uint32 Pixel;
  1996 		unsigned sR;
  1997 		unsigned sG;
  1998 		unsigned sB;
  1999 		unsigned dR;
  2000 		unsigned dG;
  2001 		unsigned dB;
  2002 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2003 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2004 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2005 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2006 		src += srcbpp;
  2007 		dst += dstbpp;
  2008 	    },
  2009 	    width);
  2010 	    /* *INDENT-ON* */
  2011             src += srcskip;
  2012             dst += dstskip;
  2013         }
  2014     }
  2015 }
  2016 
  2017 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2018 static void
  2019 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  2020 {
  2021     int width = info->dst_w;
  2022     int height = info->dst_h;
  2023     Uint8 *src = info->src;
  2024     int srcskip = info->src_skip;
  2025     Uint8 *dst = info->dst;
  2026     int dstskip = info->dst_skip;
  2027     SDL_PixelFormat *srcfmt = info->src_fmt;
  2028     SDL_PixelFormat *dstfmt = info->dst_fmt;
  2029     Uint32 ckey = info->colorkey;
  2030     int srcbpp = srcfmt->BytesPerPixel;
  2031     int dstbpp = dstfmt->BytesPerPixel;
  2032     unsigned sA = info->a;
  2033     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2034 
  2035     while (height--) {
  2036 	    /* *INDENT-OFF* */
  2037 	    DUFFS_LOOP4(
  2038 	    {
  2039 		Uint32 Pixel;
  2040 		unsigned sR;
  2041 		unsigned sG;
  2042 		unsigned sB;
  2043 		unsigned dR;
  2044 		unsigned dG;
  2045 		unsigned dB;
  2046 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2047 		if(sA && Pixel != ckey) {
  2048 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2049 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2050 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2051 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2052 		}
  2053 		src += srcbpp;
  2054 		dst += dstbpp;
  2055 	    },
  2056 	    width);
  2057 	    /* *INDENT-ON* */
  2058         src += srcskip;
  2059         dst += dstskip;
  2060     }
  2061 }
  2062 
  2063 /* General (slow) N->N blending with pixel alpha */
  2064 static void
  2065 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  2066 {
  2067     int width = info->dst_w;
  2068     int height = info->dst_h;
  2069     Uint8 *src = info->src;
  2070     int srcskip = info->src_skip;
  2071     Uint8 *dst = info->dst;
  2072     int dstskip = info->dst_skip;
  2073     SDL_PixelFormat *srcfmt = info->src_fmt;
  2074     SDL_PixelFormat *dstfmt = info->dst_fmt;
  2075 
  2076     int srcbpp;
  2077     int dstbpp;
  2078 
  2079     /* Set up some basic variables */
  2080     srcbpp = srcfmt->BytesPerPixel;
  2081     dstbpp = dstfmt->BytesPerPixel;
  2082 
  2083     while (height--) {
  2084 	    /* *INDENT-OFF* */
  2085 	    DUFFS_LOOP4(
  2086 	    {
  2087 		Uint32 Pixel;
  2088 		unsigned sR;
  2089 		unsigned sG;
  2090 		unsigned sB;
  2091 		unsigned dR;
  2092 		unsigned dG;
  2093 		unsigned dB;
  2094 		unsigned sA;
  2095 		unsigned dA;
  2096 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  2097 		if(sA) {
  2098 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  2099 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2100 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2101 		}
  2102 		src += srcbpp;
  2103 		dst += dstbpp;
  2104 	    },
  2105 	    width);
  2106 	    /* *INDENT-ON* */
  2107         src += srcskip;
  2108         dst += dstskip;
  2109     }
  2110 }
  2111 
  2112 
  2113 SDL_BlitFunc
  2114 SDL_CalculateBlitA(SDL_Surface * surface)
  2115 {
  2116     SDL_PixelFormat *sf = surface->format;
  2117     SDL_PixelFormat *df = surface->map->dst->format;
  2118 
  2119     switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
  2120     case SDL_COPY_BLEND:
  2121         /* Per-pixel alpha blits */
  2122         switch (df->BytesPerPixel) {
  2123         case 1:
  2124             return BlitNto1PixelAlpha;
  2125 
  2126         case 2:
  2127 #if SDL_ALTIVEC_BLITTERS
  2128             if (sf->BytesPerPixel == 4
  2129                 && df->Gmask == 0x7e0 && df->Bmask == 0x1f
  2130                 && SDL_HasAltiVec())
  2131                 return Blit32to565PixelAlphaAltivec;
  2132             else
  2133 #endif
  2134                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2135                     && sf->Gmask == 0xff00
  2136                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2137                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  2138                 if (df->Gmask == 0x7e0)
  2139                     return BlitARGBto565PixelAlpha;
  2140                 else if (df->Gmask == 0x3e0)
  2141                     return BlitARGBto555PixelAlpha;
  2142             }
  2143             return BlitNtoNPixelAlpha;
  2144 
  2145         case 4:
  2146             if (sf->Rmask == df->Rmask
  2147                 && sf->Gmask == df->Gmask
  2148                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2149 #if defined(__MMX__) || defined(__3dNOW__)
  2150                 if (sf->Rshift % 8 == 0
  2151                     && sf->Gshift % 8 == 0
  2152                     && sf->Bshift % 8 == 0
  2153                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  2154 #ifdef __3dNOW__
  2155                     if (SDL_Has3DNow())
  2156                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2157 #endif
  2158 #ifdef __MMX__
  2159                     if (SDL_HasMMX())
  2160                         return BlitRGBtoRGBPixelAlphaMMX;
  2161 #endif
  2162                 }
  2163 #endif /* __MMX__ || __3dNOW__ */
  2164                 if (sf->Amask == 0xff000000) {
  2165 #if SDL_ALTIVEC_BLITTERS
  2166                     if (SDL_HasAltiVec())
  2167                         return BlitRGBtoRGBPixelAlphaAltivec;
  2168 #endif
  2169                     return BlitRGBtoRGBPixelAlpha;
  2170                 }
  2171             }
  2172 #if SDL_ALTIVEC_BLITTERS
  2173             if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec())
  2174                 return Blit32to32PixelAlphaAltivec;
  2175             else
  2176 #endif
  2177                 return BlitNtoNPixelAlpha;
  2178 
  2179         case 3:
  2180         default:
  2181             return BlitNtoNPixelAlpha;
  2182         }
  2183         break;
  2184 
  2185     case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  2186         if (sf->Amask == 0) {
  2187             /* Per-surface alpha blits */
  2188             switch (df->BytesPerPixel) {
  2189             case 1:
  2190                 return BlitNto1SurfaceAlpha;
  2191 
  2192             case 2:
  2193                 if (surface->map->identity) {
  2194                     if (df->Gmask == 0x7e0) {
  2195 #ifdef __MMX__
  2196                         if (SDL_HasMMX())
  2197                             return Blit565to565SurfaceAlphaMMX;
  2198                         else
  2199 #endif
  2200                             return Blit565to565SurfaceAlpha;
  2201                     } else if (df->Gmask == 0x3e0) {
  2202 #ifdef __MMX__
  2203                         if (SDL_HasMMX())
  2204                             return Blit555to555SurfaceAlphaMMX;
  2205                         else
  2206 #endif
  2207                             return Blit555to555SurfaceAlpha;
  2208                     }
  2209                 }
  2210                 return BlitNtoNSurfaceAlpha;
  2211 
  2212             case 4:
  2213                 if (sf->Rmask == df->Rmask
  2214                     && sf->Gmask == df->Gmask
  2215                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2216 #ifdef __MMX__
  2217                     if (sf->Rshift % 8 == 0
  2218                         && sf->Gshift % 8 == 0
  2219                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  2220                         return BlitRGBtoRGBSurfaceAlphaMMX;
  2221 #endif
  2222                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  2223 #if SDL_ALTIVEC_BLITTERS
  2224                         if (SDL_HasAltiVec())
  2225                             return BlitRGBtoRGBSurfaceAlphaAltivec;
  2226 #endif
  2227                         return BlitRGBtoRGBSurfaceAlpha;
  2228                     }
  2229                 }
  2230 #if SDL_ALTIVEC_BLITTERS
  2231                 if ((sf->BytesPerPixel == 4) && SDL_HasAltiVec())
  2232                     return Blit32to32SurfaceAlphaAltivec;
  2233                 else
  2234 #endif
  2235                     return BlitNtoNSurfaceAlpha;
  2236 
  2237             case 3:
  2238             default:
  2239                 return BlitNtoNSurfaceAlpha;
  2240             }
  2241         }
  2242         break;
  2243 
  2244     case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  2245         if (sf->Amask == 0) {
  2246             if (df->BytesPerPixel == 1)
  2247                 return BlitNto1SurfaceAlphaKey;
  2248             else
  2249 #if SDL_ALTIVEC_BLITTERS
  2250             if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2251                     SDL_HasAltiVec())
  2252                 return Blit32to32SurfaceAlphaKeyAltivec;
  2253             else
  2254 #endif
  2255                 return BlitNtoNSurfaceAlphaKey;
  2256         }
  2257         break;
  2258     }
  2259 
  2260     return NULL;
  2261 }
  2262 
  2263 /* vi: set ts=4 sw=4 expandtab: */