src/video/SDL_blit_A.c
author Sam Lantinga <slouken@libsdl.org>
Tue, 13 Jan 2009 07:20:55 +0000
changeset 3035 ff602fdfdedc
parent 3013 8cc00819c8d6
child 3363 90aec03bf9fd
permissions -rw-r--r--
Removed Rafal Bursig's MMX RLE code, at his request.
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2009 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 /* Functions to perform alpha blended blitting */
    28 
    29 /* N->1 blending with per-surface alpha */
    30 static void
    31 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    32 {
    33     int width = info->dst_w;
    34     int height = info->dst_h;
    35     Uint8 *src = info->src;
    36     int srcskip = info->src_skip;
    37     Uint8 *dst = info->dst;
    38     int dstskip = info->dst_skip;
    39     Uint8 *palmap = info->table;
    40     SDL_PixelFormat *srcfmt = info->src_fmt;
    41     SDL_PixelFormat *dstfmt = info->dst_fmt;
    42     int srcbpp = srcfmt->BytesPerPixel;
    43 
    44     const unsigned A = info->a;
    45 
    46     while (height--) {
    47 	    /* *INDENT-OFF* */
    48 	    DUFFS_LOOP4(
    49 	    {
    50 		Uint32 Pixel;
    51 		unsigned sR;
    52 		unsigned sG;
    53 		unsigned sB;
    54 		unsigned dR;
    55 		unsigned dG;
    56 		unsigned dB;
    57 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    58 		dR = dstfmt->palette->colors[*dst].r;
    59 		dG = dstfmt->palette->colors[*dst].g;
    60 		dB = dstfmt->palette->colors[*dst].b;
    61 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    62 		dR &= 0xff;
    63 		dG &= 0xff;
    64 		dB &= 0xff;
    65 		/* Pack RGB into 8bit pixel */
    66 		if ( palmap == NULL ) {
    67 		    *dst =((dR>>5)<<(3+2))|
    68 			  ((dG>>5)<<(2))|
    69 			  ((dB>>6)<<(0));
    70 		} else {
    71 		    *dst = palmap[((dR>>5)<<(3+2))|
    72 				  ((dG>>5)<<(2))  |
    73 				  ((dB>>6)<<(0))];
    74 		}
    75 		dst++;
    76 		src += srcbpp;
    77 	    },
    78 	    width);
    79 	    /* *INDENT-ON* */
    80         src += srcskip;
    81         dst += dstskip;
    82     }
    83 }
    84 
    85 /* N->1 blending with pixel alpha */
    86 static void
    87 BlitNto1PixelAlpha(SDL_BlitInfo * info)
    88 {
    89     int width = info->dst_w;
    90     int height = info->dst_h;
    91     Uint8 *src = info->src;
    92     int srcskip = info->src_skip;
    93     Uint8 *dst = info->dst;
    94     int dstskip = info->dst_skip;
    95     Uint8 *palmap = info->table;
    96     SDL_PixelFormat *srcfmt = info->src_fmt;
    97     SDL_PixelFormat *dstfmt = info->dst_fmt;
    98     int srcbpp = srcfmt->BytesPerPixel;
    99 
   100     /* FIXME: fix alpha bit field expansion here too? */
   101     while (height--) {
   102 	    /* *INDENT-OFF* */
   103 	    DUFFS_LOOP4(
   104 	    {
   105 		Uint32 Pixel;
   106 		unsigned sR;
   107 		unsigned sG;
   108 		unsigned sB;
   109 		unsigned sA;
   110 		unsigned dR;
   111 		unsigned dG;
   112 		unsigned dB;
   113 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   114 		dR = dstfmt->palette->colors[*dst].r;
   115 		dG = dstfmt->palette->colors[*dst].g;
   116 		dB = dstfmt->palette->colors[*dst].b;
   117 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   118 		dR &= 0xff;
   119 		dG &= 0xff;
   120 		dB &= 0xff;
   121 		/* Pack RGB into 8bit pixel */
   122 		if ( palmap == NULL ) {
   123 		    *dst =((dR>>5)<<(3+2))|
   124 			  ((dG>>5)<<(2))|
   125 			  ((dB>>6)<<(0));
   126 		} else {
   127 		    *dst = palmap[((dR>>5)<<(3+2))|
   128 				  ((dG>>5)<<(2))  |
   129 				  ((dB>>6)<<(0))  ];
   130 		}
   131 		dst++;
   132 		src += srcbpp;
   133 	    },
   134 	    width);
   135 	    /* *INDENT-ON* */
   136         src += srcskip;
   137         dst += dstskip;
   138     }
   139 }
   140 
   141 /* colorkeyed N->1 blending with per-surface alpha */
   142 static void
   143 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   144 {
   145     int width = info->dst_w;
   146     int height = info->dst_h;
   147     Uint8 *src = info->src;
   148     int srcskip = info->src_skip;
   149     Uint8 *dst = info->dst;
   150     int dstskip = info->dst_skip;
   151     Uint8 *palmap = info->table;
   152     SDL_PixelFormat *srcfmt = info->src_fmt;
   153     SDL_PixelFormat *dstfmt = info->dst_fmt;
   154     int srcbpp = srcfmt->BytesPerPixel;
   155     Uint32 ckey = info->colorkey;
   156 
   157     const int A = info->a;
   158 
   159     while (height--) {
   160 	    /* *INDENT-OFF* */
   161 	    DUFFS_LOOP(
   162 	    {
   163 		Uint32 Pixel;
   164 		unsigned sR;
   165 		unsigned sG;
   166 		unsigned sB;
   167 		unsigned dR;
   168 		unsigned dG;
   169 		unsigned dB;
   170 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   171 		if ( Pixel != ckey ) {
   172 		    dR = dstfmt->palette->colors[*dst].r;
   173 		    dG = dstfmt->palette->colors[*dst].g;
   174 		    dB = dstfmt->palette->colors[*dst].b;
   175 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   176 		    dR &= 0xff;
   177 		    dG &= 0xff;
   178 		    dB &= 0xff;
   179 		    /* Pack RGB into 8bit pixel */
   180 		    if ( palmap == NULL ) {
   181 			*dst =((dR>>5)<<(3+2))|
   182 			      ((dG>>5)<<(2)) |
   183 			      ((dB>>6)<<(0));
   184 		    } else {
   185 			*dst = palmap[((dR>>5)<<(3+2))|
   186 				      ((dG>>5)<<(2))  |
   187 				      ((dB>>6)<<(0))  ];
   188 		    }
   189 		}
   190 		dst++;
   191 		src += srcbpp;
   192 	    },
   193 	    width);
   194 	    /* *INDENT-ON* */
   195         src += srcskip;
   196         dst += dstskip;
   197     }
   198 }
   199 
   200 #ifdef __MMX__
   201 
   202 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   203 static void
   204 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   205 {
   206     int width = info->dst_w;
   207     int height = info->dst_h;
   208     Uint32 *srcp = (Uint32 *) info->src;
   209     int srcskip = info->src_skip >> 2;
   210     Uint32 *dstp = (Uint32 *) info->dst;
   211     int dstskip = info->dst_skip >> 2;
   212     Uint32 dalpha = info->dst_fmt->Amask;
   213 
   214     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   215 
   216     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   217     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   218     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   219 
   220     while (height--) {
   221         int n = width;
   222         if (n & 1) {
   223             Uint32 s = *srcp++;
   224             Uint32 d = *dstp;
   225             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   226                        + (s & d & 0x00010101)) | dalpha;
   227             n--;
   228         }
   229 
   230         for (n >>= 1; n > 0; --n) {
   231             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   232             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   233 
   234             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   235             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   236 
   237             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   238             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   239             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   240             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   241 
   242             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   243             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   244             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   245             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   246 
   247             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   248             dstp += 2;
   249             srcp += 2;
   250         }
   251 
   252         srcp += srcskip;
   253         dstp += dstskip;
   254     }
   255     _mm_empty();
   256 }
   257 
   258 /* fast RGB888->(A)RGB888 blending with surface alpha */
   259 static void
   260 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   261 {
   262     SDL_PixelFormat *df = info->dst_fmt;
   263     Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   264     unsigned alpha = info->a;
   265 
   266     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   267         /* only call a128 version when R,G,B occupy lower bits */
   268         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   269     } else {
   270         int width = info->dst_w;
   271         int height = info->dst_h;
   272         Uint32 *srcp = (Uint32 *) info->src;
   273         int srcskip = info->src_skip >> 2;
   274         Uint32 *dstp = (Uint32 *) info->dst;
   275         int dstskip = info->dst_skip >> 2;
   276         Uint32 dalpha = df->Amask;
   277         Uint32 amult;
   278 
   279         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   280 
   281         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   282         /* form the alpha mult */
   283         amult = alpha | (alpha << 8);
   284         amult = amult | (amult << 16);
   285         chanmask =
   286             (0xff << df->Rshift) | (0xff << df->
   287                                     Gshift) | (0xff << df->Bshift);
   288         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   289         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   290         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   291         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   292 
   293         while (height--) {
   294             int n = width;
   295             if (n & 1) {
   296                 /* One Pixel Blend */
   297                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   298                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   299 
   300                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   301                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   302 
   303                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
   304                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   305                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   306                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   307 
   308                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   309                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   310                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   311 
   312                 ++srcp;
   313                 ++dstp;
   314 
   315                 n--;
   316             }
   317 
   318             for (n >>= 1; n > 0; --n) {
   319                 /* Two Pixels Blend */
   320                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   321                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   322                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   323                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   324 
   325                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   326                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   327                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   328                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   329 
   330                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   331                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   332                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   333                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   334 
   335                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   336                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   337                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   338                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   339 
   340                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   341                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   342 
   343                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   344 
   345                 srcp += 2;
   346                 dstp += 2;
   347             }
   348             srcp += srcskip;
   349             dstp += dstskip;
   350         }
   351         _mm_empty();
   352     }
   353 }
   354 
   355 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   356 static void
   357 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   358 {
   359     int width = info->dst_w;
   360     int height = info->dst_h;
   361     Uint32 *srcp = (Uint32 *) info->src;
   362     int srcskip = info->src_skip >> 2;
   363     Uint32 *dstp = (Uint32 *) info->dst;
   364     int dstskip = info->dst_skip >> 2;
   365     SDL_PixelFormat *sf = info->src_fmt;
   366     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   367     Uint32 amask = sf->Amask;
   368     Uint32 ashift = sf->Ashift;
   369     Uint64 multmask;
   370 
   371     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   372 
   373     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   374     multmask = 0xFFFF;
   375     multmask <<= (ashift * 2);
   376     multmask = ~multmask;
   377     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   378 
   379     while (height--) {
   380 		/* *INDENT-OFF* */
   381 		DUFFS_LOOP4({
   382 		Uint32 alpha = *srcp & amask;
   383 		if (alpha == 0) {
   384 			/* do nothing */
   385 		} else if (alpha == amask) {
   386 			/* opaque alpha -- copy RGB, keep dst alpha */
   387 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   388 		} else {
   389 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   390 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   391 
   392 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   393 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   394 
   395 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   396 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   397 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   398 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   399 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   400 
   401 			/* blend */		    
   402 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   403 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   404 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   405 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   406 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   407 			
   408 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   409 		}
   410 		++srcp;
   411 		++dstp;
   412 	    }, width);
   413 		/* *INDENT-ON* */
   414         srcp += srcskip;
   415         dstp += dstskip;
   416     }
   417     _mm_empty();
   418 }
   419 
   420 #endif /* __MMX__ */
   421 
   422 #if SDL_ALTIVEC_BLITTERS
   423 #if __MWERKS__
   424 #pragma altivec_model on
   425 #endif
   426 #if HAVE_ALTIVEC_H
   427 #include <altivec.h>
   428 #endif
   429 #include <assert.h>
   430 
    /*
     * Helpers for writing AltiVec vector literals.  GCC older than
     * version 4 on Mac OS X gets the parenthesized element list; every
     * other configuration gets the brace-enclosed form.
     *   VECUINT8_LITERAL  - 16 elements of a vector unsigned char
     *   VECUINT16_LITERAL -  8 elements of a vector unsigned short
     */
    431 #if (defined(__MACOSX__) && (__GNUC__ < 4))
    432 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
    433         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
    434 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
    435         (vector unsigned short) ( a,b,c,d,e,f,g,h )
    436 #else
    437 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
    438         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
    439 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
    440         (vector unsigned short) { a,b,c,d,e,f,g,h }
    441 #endif
   442 
   443 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
   444 #define VECPRINT(msg, v) do { \
   445     vector unsigned int tmpvec = (vector unsigned int)(v); \
   446     unsigned int *vp = (unsigned int *)&tmpvec; \
   447     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   448 } while (0)
   449 
    450 /* the permutation vector that takes the high bytes out of all the appropriate shorts
    451     (vector unsigned char)(
    452         0x00, 0x10, 0x02, 0x12,
    453         0x04, 0x14, 0x06, 0x16,
    454         0x08, 0x18, 0x0A, 0x1A,
    455         0x0C, 0x1C, 0x0E, 0x1E );
    456    Built at run time as the byte sequence 0..15 from vec_lvsl(0, NULL)
           plus 0x0F added to every odd byte.
        */
    457 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
    /* Four 32-bit words each holding 24, formed as 12 + 12. */
    458 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
    /* All-ones words shifted left by 24: 0xFF000000 in each 32-bit word. */
    459 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
    /* Permute vector for realigning loads from src: vec_lvsl(0, src) when
       src is not 16-byte aligned, otherwise vec_lvsl(8, src) with 8 added
       to every byte.  NOTE(review): the aligned form presumably makes
       vec_perm() return its second operand unchanged -- confirm. */
    460 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    461     ? vec_lvsl(0, src) \
    462     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
   463 
   464 
    /*
     * Blend the bytes of vs over the bytes of vd with per-byte weights
     * valpha, and store the result back into vd.
     *
     * Each byte is computed as t = vs * valpha + vd * ~valpha in 16 bits,
     * then t = (t + 1) + ((t + 1) >> 8), and mergePermute picks one byte
     * of every 16-bit result.  The callers in this file pass
     * VEC_MERGE_PERMUTE() for mergePermute and vectors of 16-bit 1 and 8
     * for v1_16 and v8_16.
     */
    465 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    466     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
    467     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
    468     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
    469     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
    470     /* valpha2 is 255-alpha */ \
    471     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
    472     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
    473     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
    474     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
    475     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
    476     /* add source and dest */ \
    477     vtemp1 = vec_add(vtemp1, vtemp3); \
    478     vtemp2 = vec_add(vtemp2, vtemp4); \
    479     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
    480     vtemp1 = vec_add(vtemp1, v1_16); \
    481     vtemp3 = vec_sr(vtemp1, v8_16); \
    482     vtemp1 = vec_add(vtemp1, vtemp3); \
    483     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
    484     vtemp2 = vec_add(vtemp2, v1_16); \
    485     vtemp4 = vec_sr(vtemp2, v8_16); \
    486     vtemp2 = vec_add(vtemp2, vtemp4); \
    487     /* (>>8) and get ARGBARGBARGBARGB */ \
    488     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
    489 } while (0)
   490 
   491 /* Calculate the permute vector used for 32->32 swizzling */
   492 static vector unsigned char
   493 calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
   494 {
   495     /*
   496      * We have to assume that the bits that aren't used by other
   497      *  colors is alpha, and it's one complete byte, since some formats
   498      *  leave alpha with a zero mask, but we should still swizzle the bits.
   499      */
   500     /* ARGB */
   501     const static struct SDL_PixelFormat default_pixel_format = {
   502         NULL, 0, 0,
   503         0, 0, 0, 0,
   504         16, 8, 0, 24,
   505         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
   506         0, 0
   507     };
   508     if (!srcfmt) {
   509         srcfmt = &default_pixel_format;
   510     }
   511     if (!dstfmt) {
   512         dstfmt = &default_pixel_format;
   513     }
   514     const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
   515                                                        0x04, 0x04, 0x04, 0x04,
   516                                                        0x08, 0x08, 0x08, 0x08,
   517                                                        0x0C, 0x0C, 0x0C,
   518                                                        0x0C);
   519     vector unsigned char vswiz;
   520     vector unsigned int srcvec;
   521 #define RESHIFT(X) (3 - ((X) >> 3))
   522     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   523     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   524     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   525     Uint32 amask;
   526     /* Use zero for alpha if either surface doesn't have alpha */
   527     if (dstfmt->Amask) {
   528         amask =
   529             ((srcfmt->Amask) ? RESHIFT(srcfmt->
   530                                        Ashift) : 0x10) << (dstfmt->Ashift);
   531     } else {
   532         amask =
   533             0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
   534                           0xFFFFFFFF);
   535     }
   536 #undef RESHIFT
   537     ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
   538     vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
   539     return (vswiz);
   540 }
   541 
    /*
     * Blend a 32-bit source carrying per-pixel alpha onto a 16-bit
     * destination that is read and written as 5-6-5 RGB (see the shifts
     * and masks in ONE_PIXEL_BLEND).
     *
     * Per row: pixels are blended one at a time until dst sits on a
     * 16-byte boundary, then eight at a time with AltiVec, and finally
     * the (width % 8) leftover pixels are blended one at a time again.
     * The scalar path skips source pixels whose alpha is zero.
     */
    542 static void
    543 Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
    544 {
    545     int height = info->dst_h;
    546     Uint8 *src = (Uint8 *) info->src;
    547     int srcskip = info->src_skip;
    548     Uint8 *dst = (Uint8 *) info->dst;
    549     int dstskip = info->dst_skip;
    550     SDL_PixelFormat *srcfmt = info->src_fmt;
    551 
    552     vector unsigned char v0 = vec_splat_u8(0);
    553     vector unsigned short v8_16 = vec_splat_u16(8);
    554     vector unsigned short v1_16 = vec_splat_u16(1);
    555     vector unsigned short v2_16 = vec_splat_u16(2);
    556     vector unsigned short v3_16 = vec_splat_u16(3);
    557     vector unsigned int v8_32 = vec_splat_u32(8);
    558     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
    /* NOTE(review): v3f keeps six low bits of the packed pixel although
       blue is five bits wide in 565 -- confirm the extra bit is harmless. */
    559     vector unsigned short v3f =
    560         VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
    561                           0x003f, 0x003f, 0x003f, 0x003f);
    562     vector unsigned short vfc =
    563         VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
    564                           0x00fc, 0x00fc, 0x00fc, 0x00fc);
    565 
    566     /* 
    567        0x10 - 0x1f is the alpha
    568        0x00 - 0x0e evens are the red
    569        0x01 - 0x0f odds are zero
    570      */
    571     vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
    572                                                        0x10, 0x02, 0x01, 0x01,
    573                                                        0x10, 0x04, 0x01, 0x01,
    574                                                        0x10, 0x06, 0x01,
    575                                                        0x01);
    576     vector unsigned char vredalpha2 =
    577         (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
    578                                         vec_sl(v8_32, v16_32))
    579         );
    580     /*
    581        0x00 - 0x0f is ARxx ARxx ARxx ARxx
    582        0x11 - 0x0f odds are blue
    583      */
    584     vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
    585                                                    0x04, 0x05, 0x06, 0x13,
    586                                                    0x08, 0x09, 0x0a, 0x15,
    587                                                    0x0c, 0x0d, 0x0e, 0x17);
    588     vector unsigned char vblue2 =
    589         (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
    590         );
    591     /*
    592        0x00 - 0x0f is ARxB ARxB ARxB ARxB
    593        0x10 - 0x0e evens are green
    594      */
    595     vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
    596                                                     0x04, 0x05, 0x12, 0x07,
    597                                                     0x08, 0x09, 0x14, 0x0b,
    598                                                     0x0c, 0x0d, 0x16, 0x0f);
    599     vector unsigned char vgreen2 =
    600         (vector unsigned
    601          char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8_32, v8_32))
    602         );
    603     vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
    604                                                     0x00, 0x0a, 0x00, 0x0e,
    605                                                     0x00, 0x12, 0x00, 0x16,
    606                                                     0x00, 0x1a, 0x00, 0x1e);
    607     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
    608     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
    609     vector unsigned char valphaPermute =
    610         vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
    611 
    /* NOTE(review): vec_splat_u8(-7) should splat 0xF9, which would make
       this mask 0xF900 rather than the 0xF800 its name suggests (the
       scalar path masks red with 0xf8) -- confirm whether -8 was meant. */
    612     vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
    613     vf800 = vec_sl(vf800, vec_splat_u16(8));
    614 
    615     while (height--) {
    616         int extrawidth;
    617         vector unsigned char valigner;
    618         vector unsigned char vsrc;
    619         vector unsigned char voverflow;
    620         int width = info->dst_w;
    621 
    /* Scalar blend of one 32-bit source pixel onto one 16-bit destination
       pixel, repeated while `condition` holds; widthvar is decremented
       once per pixel. */
    622 #define ONE_PIXEL_BLEND(condition, widthvar) \
    623         while (condition) { \
    624             Uint32 Pixel; \
    625             unsigned sR, sG, sB, dR, dG, dB, sA; \
    626             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
    627             if(sA) { \
    628                 unsigned short dstpixel = *((unsigned short *)dst); \
    629                 dR = (dstpixel >> 8) & 0xf8; \
    630                 dG = (dstpixel >> 3) & 0xfc; \
    631                 dB = (dstpixel << 3) & 0xf8; \
    632                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
    633                 *((unsigned short *)dst) = ( \
    634                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
    635                 ); \
    636             } \
    637             src += 4; \
    638             dst += 2; \
    639             widthvar--; \
    640         }
            /* Scalar pixels until dst is 16-byte aligned or the row runs out */
    641         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
            /* Pixels that do not fill a group of eight are blended after
               the vector loop */
    642         extrawidth = (width % 8);
    643         valigner = VEC_ALIGNER(src);
    644         vsrc = (vector unsigned char) vec_ld(0, src);
    645         width -= extrawidth;
    646         while (width) {
    647             vector unsigned char valpha;
    648             vector unsigned char vsrc1, vsrc2;
    649             vector unsigned char vdst1, vdst2;
    650             vector unsigned short vR, vG, vB;
    651             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
    652 
    653             /* Load 8 pixels from src as ARGB */
    654             voverflow = (vector unsigned char) vec_ld(15, src);
    655             vsrc = vec_perm(vsrc, voverflow, valigner);
    656             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
    657             src += 16;
    658             vsrc = (vector unsigned char) vec_ld(15, src);
    659             voverflow = vec_perm(voverflow, vsrc, valigner);
    660             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
    661             src += 16;
    662 
    663             /* Load 8 pixels from dst as XRGB */
    664             voverflow = vec_ld(0, dst);
    665             vR = vec_and((vector unsigned short) voverflow, vf800);
    666             vB = vec_sl((vector unsigned short) voverflow, v3_16);
    667             vG = vec_sl(vB, v2_16);
    668             vdst1 =
    669                 (vector unsigned char) vec_perm((vector unsigned char) vR,
    670                                                 (vector unsigned char) vR,
    671                                                 vredalpha1);
    672             vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
    673             vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
    674             vdst2 =
    675                 (vector unsigned char) vec_perm((vector unsigned char) vR,
    676                                                 (vector unsigned char) vR,
    677                                                 vredalpha2);
    678             vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
    679             vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
    680 
    681             /* Alpha blend 8 pixels as ARGB */
    682             valpha = vec_perm(vsrc1, v0, valphaPermute);
    683             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
    684                                v8_16);
    685             valpha = vec_perm(vsrc2, v0, valphaPermute);
    686             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
    687                                v8_16);
    688 
    689             /* Convert 8 pixels to 565 */
    690             vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
    691                                                         vdst1,
    692                                                         (vector unsigned int)
    693                                                         vdst2);
    694             vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
    695             vgpixel = vec_and(vgpixel, vfc);
    696             vgpixel = vec_sl(vgpixel, v3_16);
    697             vrpixel = vec_sl(vpixel, v1_16);
    698             vrpixel = vec_and(vrpixel, vf800);
    699             vbpixel = vec_and(vpixel, v3f);
    700             vdst1 =
    701                 vec_or((vector unsigned char) vrpixel,
    702                        (vector unsigned char) vgpixel);
    703             vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);
    704 
    705             /* Store 8 pixels */
    706             vec_st(vdst1, 0, dst);
    707 
    708             width -= 8;
    709             dst += 16;
    710         }
            /* Scalar blend of the leftover (width % 8) pixels */
    711         ONE_PIXEL_BLEND((extrawidth), extrawidth);
    712 #undef ONE_PIXEL_BLEND
    713         src += srcskip;
    714         dst += dstskip;
    715     }
    716 }
   717 
   718 static void
   719 Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
   720 {
           /*
            * 32-bit -> 32-bit blit with per-surface alpha (info->a) and a source
            * colorkey (info->colorkey), using AltiVec for the bulk of each row.
            *
            * Each row is processed in three phases: single pixels while
            * UNALIGNED_PTR(dstp) holds, then four pixels per iteration in the
            * vector loop, then the remaining (width % 4) pixels one at a time.
            * Source pixels that match the colorkey keep the destination colour.
            */
   721     int height = info->dst_h;
   722     Uint32 *srcp = (Uint32 *) info->src;
           /* skips are divided by 4 because srcp/dstp advance in Uint32 units */
   723     int srcskip = info->src_skip >> 2;
   724     Uint32 *dstp = (Uint32 *) info->dst;
   725     int dstskip = info->dst_skip >> 2;
   726     SDL_PixelFormat *srcfmt = info->src_fmt;
   727     SDL_PixelFormat *dstfmt = info->dst_fmt;
   728     unsigned sA = info->a;
           /* alpha written by the scalar path: opaque if dst has an alpha mask */
   729     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   730     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   731     Uint32 ckey = info->colorkey;
   732     vector unsigned char mergePermute;
   733     vector unsigned char vsrcPermute;
   734     vector unsigned char vdstPermute;
   735     vector unsigned char vsdstPermute;
   736     vector unsigned char valpha;
   737     vector unsigned char valphamask;
   738     vector unsigned char vbits;
   739     vector unsigned char v0;
   740     vector unsigned short v1;
   741     vector unsigned short v8;
   742     vector unsigned int vckey;
   743     vector unsigned int vrgbmask;
   744 
   745     mergePermute = VEC_MERGE_PERMUTE();
   746     v0 = vec_splat_u8(0);
   747     v1 = vec_splat_u16(1);
   748     v8 = vec_splat_u16(8);
   749 
   750     /* set the alpha to 255 on the destination surf */
   751     valphamask = VEC_ALPHA_MASK();
   752 
           /*
            * Swizzle vectors come from calc_swizzle32 (defined elsewhere):
            * presumably vsrcPermute/vsdstPermute bring source/destination
            * pixels into a common working layout and vdstPermute converts the
            * result back to the destination layout -- TODO confirm.
            */
   753     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   754     vdstPermute = calc_swizzle32(NULL, dstfmt);
   755     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   756 
   757     /* splat the surface alpha into every byte of valpha */
   758     ((unsigned char *) &valpha)[0] = sA;
   759     valpha = vec_splat(valpha, 0);
   760     vbits = (vector unsigned char) vec_splat_s8(-1);
   761 
           /* only the RGB bits of the key take part in the vector comparison */
   762     ckey &= rgbmask;
   763     ((unsigned int *) (char *) &vckey)[0] = ckey;
   764     vckey = vec_splat(vckey, 0);
   765     ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
   766     vrgbmask = vec_splat(vrgbmask, 0);
   767 
   768     while (height--) {
   769         int width = info->dst_w;
               /*
                * Scalar fallback: blends one pixel per iteration, advancing
                * srcp/dstp and decrementing widthvar, while `condition` holds.
                * NOTE(review): this path compares the pixel returned by
                * RETRIEVE_RGB_PIXEL against the masked key without applying
                * rgbmask, whereas the vector loop masks first; confirm both
                * agree for sources carrying bits outside rgbmask.
                */
   770 #define ONE_PIXEL_BLEND(condition, widthvar) \
   771         while (condition) { \
   772             Uint32 Pixel; \
   773             unsigned sR, sG, sB, dR, dG, dB; \
   774             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
   775             if(sA && Pixel != ckey) { \
   776                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
   777                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
   778                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   779                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
   780             } \
   781             dstp++; \
   782             srcp++; \
   783             widthvar--; \
   784         }
               /* phase 1: leading pixels until dstp stops being "unaligned" */
   785         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   786         if (width > 0) {
   787             int extrawidth = (width % 4);
                   /*
                    * vec_ld ignores the low address bits, so srcp is read as
                    * two neighbouring 16-byte lines and recombined with
                    * vec_perm; VEC_ALIGNER (defined elsewhere) presumably
                    * yields the permute for srcp's misalignment.
                    */
   788             vector unsigned char valigner = VEC_ALIGNER(srcp);
   789             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
   790             width -= extrawidth;
                   /* phase 2: four pixels (16 bytes) per iteration */
   791             while (width) {
   792                 vector unsigned char vsel;
   793                 vector unsigned char voverflow;
   794                 vector unsigned char vd;
   795                 vector unsigned char vd_orig;
   796 
   797                 /* s = *srcp */
   798                 voverflow = (vector unsigned char) vec_ld(15, srcp);
   799                 vs = vec_perm(vs, voverflow, valigner);
   800 
   801                 /* vsel is set for items that match the key */
   802                 vsel =
   803                     (vector unsigned char) vec_and((vector unsigned int) vs,
   804                                                    vrgbmask);
   805                 vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
   806                                                         vsel, vckey);
   807 
   808                 /* permute to source format */
   809                 vs = vec_perm(vs, valpha, vsrcPermute);
   810 
   811                 /* d = *dstp */
   812                 vd = (vector unsigned char) vec_ld(0, dstp);
   813                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
   814 
   815                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   816 
   817                 /* set the alpha channel to full on */
   818                 vd = vec_or(vd, valphamask);
   819 
   820                 /* mask out color key: keyed pixels take vd_orig instead */
   821                 vd = vec_sel(vd, vd_orig, vsel);
   822 
   823                 /* permute to dest format */
   824                 vd = vec_perm(vd, vbits, vdstPermute);
   825 
   826                 /* *dstp = res */
   827                 vec_st((vector unsigned int) vd, 0, dstp);
   828 
   829                 srcp += 4;
   830                 dstp += 4;
   831                 width -= 4;
                       /* this iteration's second load is the next one's first */
   832                 vs = voverflow;
   833             }
                   /* phase 3: trailing pixels that do not fill a vector */
   834             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   835         }
   836 #undef ONE_PIXEL_BLEND
   837 
   838         srcp += srcskip;
   839         dstp += dstskip;
   840     }
   841 }
   842 
   843 
   844 static void
   845 Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)
   846 {
           /*
            * 32-bit -> 32-bit blit using the per-pixel alpha of the source,
            * with AltiVec for the bulk of each row.  The destination alpha is
            * preserved: the scalar path writes back the dA it read, and the
            * vector path re-inserts the saved vdstalpha after blending.
            *
            * Each row: single pixels while UNALIGNED_PTR(dstp) holds, then
            * four pixels per vector iteration, then (width % 4) single pixels.
            */
   847     int width = info->dst_w;
   848     int height = info->dst_h;
   849     Uint32 *srcp = (Uint32 *) info->src;
           /* skips are divided by 4 because srcp/dstp advance in Uint32 units */
   850     int srcskip = info->src_skip >> 2;
   851     Uint32 *dstp = (Uint32 *) info->dst;
   852     int dstskip = info->dst_skip >> 2;
   853     SDL_PixelFormat *srcfmt = info->src_fmt;
   854     SDL_PixelFormat *dstfmt = info->dst_fmt;
   855     vector unsigned char mergePermute;
   856     vector unsigned char valphaPermute;
   857     vector unsigned char vsrcPermute;
   858     vector unsigned char vdstPermute;
   859     vector unsigned char vsdstPermute;
   860     vector unsigned char valphamask;
   861     vector unsigned char vpixelmask;
   862     vector unsigned char v0;
   863     vector unsigned short v1;
   864     vector unsigned short v8;
   865 
   866     v0 = vec_splat_u8(0);
   867     v1 = vec_splat_u16(1);
   868     v8 = vec_splat_u16(8);
   869     mergePermute = VEC_MERGE_PERMUTE();
   870     valphamask = VEC_ALPHA_MASK();
           /*
            * Byte indices 0..15 ANDed with 0x0C give 0,0,0,0,4,4,4,4,...: a
            * permute that replicates the first byte of every 4-byte pixel.
            * Presumably that byte is the alpha after the vsrcPermute swizzle
            * -- TODO confirm against calc_swizzle32.
            */
   871     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
           /* complement of the alpha mask: selects the colour bytes */
   872     vpixelmask = vec_nor(valphamask, v0);
   873     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   874     vdstPermute = calc_swizzle32(NULL, dstfmt);
   875     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   876 
   877     while (height--) {
   878         width = info->dst_w;
               /*
                * Scalar fallback: blends one pixel per iteration using the
                * source pixel's own alpha; fully transparent pixels (sA == 0)
                * are skipped.  Advances srcp/dstp and decrements widthvar.
                */
   879 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   880             Uint32 Pixel; \
   881             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
   882             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   883             if(sA) { \
   884               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
   885               ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   886               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
   887             } \
   888             ++srcp; \
   889             ++dstp; \
   890             widthvar--; \
   891         }
               /* phase 1: leading pixels until dstp stops being "unaligned" */
   892         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   893         if (width > 0) {
   894             /* vsrcPermute */
   895             /* vdstPermute */
   896             int extrawidth = (width % 4);
                   /*
                    * srcp is read as two neighbouring 16-byte lines that are
                    * recombined with vec_perm; VEC_ALIGNER (defined elsewhere)
                    * presumably yields the permute for srcp's misalignment.
                    */
   897             vector unsigned char valigner = VEC_ALIGNER(srcp);
   898             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
   899             width -= extrawidth;
                   /* phase 2: four pixels (16 bytes) per iteration */
   900             while (width) {
   901                 vector unsigned char voverflow;
   902                 vector unsigned char vd;
   903                 vector unsigned char valpha;
   904                 vector unsigned char vdstalpha;
   905                 /* s = *srcp */
   906                 voverflow = (vector unsigned char) vec_ld(15, srcp);
   907                 vs = vec_perm(vs, voverflow, valigner);
   908                 vs = vec_perm(vs, v0, vsrcPermute);
   909 
                       /* per-pixel alpha replicated across each pixel's bytes */
   910                 valpha = vec_perm(vs, v0, valphaPermute);
   911 
   912                 /* d = *dstp */
   913                 vd = (vector unsigned char) vec_ld(0, dstp);
   914                 vd = vec_perm(vd, v0, vsdstPermute);
                       /* remember the destination alpha so it survives the blend */
   915                 vdstalpha = vec_and(vd, valphamask);
   916 
   917                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   918 
   919                 /* set the alpha to the dest alpha */
   920                 vd = vec_and(vd, vpixelmask);
   921                 vd = vec_or(vd, vdstalpha);
   922                 vd = vec_perm(vd, v0, vdstPermute);
   923 
   924                 /* *dstp = res */
   925                 vec_st((vector unsigned int) vd, 0, dstp);
   926 
   927                 srcp += 4;
   928                 dstp += 4;
   929                 width -= 4;
                       /* this iteration's second load is the next one's first */
   930                 vs = voverflow;
   931 
   932             }
                   /* phase 3: trailing pixels that do not fill a vector */
   933             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   934         }
   935         srcp += srcskip;
   936         dstp += dstskip;
   937 #undef ONE_PIXEL_BLEND
   938     }
   939 }
   940 
   941 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
       /*
        * AltiVec variant.  No format swizzling is done: the scalar path takes
        * alpha from bits 24-31 of the source pixel and treats 0xff00ff / 0xff00
        * as the red+blue / green channels.  The destination alpha is preserved
        * in both the scalar and the vector path.
        *
        * Each row: single pixels while UNALIGNED_PTR(dstp) holds, then four
        * pixels per vector iteration, then (width % 4) single pixels.
        */
   942 static void
   943 BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
   944 {
   945     int width = info->dst_w;
   946     int height = info->dst_h;
   947     Uint32 *srcp = (Uint32 *) info->src;
           /* skips are divided by 4 because srcp/dstp advance in Uint32 units */
   948     int srcskip = info->src_skip >> 2;
   949     Uint32 *dstp = (Uint32 *) info->dst;
   950     int dstskip = info->dst_skip >> 2;
   951     vector unsigned char mergePermute;
   952     vector unsigned char valphaPermute;
   953     vector unsigned char valphamask;
   954     vector unsigned char vpixelmask;
   955     vector unsigned char v0;
   956     vector unsigned short v1;
   957     vector unsigned short v8;
   958     v0 = vec_splat_u8(0);
   959     v1 = vec_splat_u16(1);
   960     v8 = vec_splat_u16(8);
   961     mergePermute = VEC_MERGE_PERMUTE();
   962     valphamask = VEC_ALPHA_MASK();
           /*
            * Byte indices 0..15 ANDed with 0x0C give 0,0,0,0,4,4,4,4,...: a
            * permute that replicates the first byte of every 4-byte pixel
            * (presumably the alpha byte of a big-endian ARGB pixel).
            */
   963     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   964 
   965 
           /* complement of the alpha mask: selects the colour bytes */
   966     vpixelmask = vec_nor(valphamask, v0);
   967     while (height--) {
   968         width = info->dst_w;
               /*
                * Scalar fallback, one pixel per iteration:
                *  - alpha == 0: destination left alone;
                *  - alpha == SDL_ALPHA_OPAQUE: source RGB copied, dst alpha kept;
                *  - otherwise red+blue are blended together through the
                *    0xff00ff mask, green separately, and dst alpha is kept.
                */
   969 #define ONE_PIXEL_BLEND(condition, widthvar) \
   970         while ((condition)) { \
   971             Uint32 dalpha; \
   972             Uint32 d; \
   973             Uint32 s1; \
   974             Uint32 d1; \
   975             Uint32 s = *srcp; \
   976             Uint32 alpha = s >> 24; \
   977             if(alpha) { \
   978               if(alpha == SDL_ALPHA_OPAQUE) { \
   979                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
   980               } else { \
   981                 d = *dstp; \
   982                 dalpha = d & 0xff000000; \
   983                 s1 = s & 0xff00ff; \
   984                 d1 = d & 0xff00ff; \
   985                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
   986                 s &= 0xff00; \
   987                 d &= 0xff00; \
   988                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
   989                 *dstp = d1 | d | dalpha; \
   990               } \
   991             } \
   992             ++srcp; \
   993             ++dstp; \
   994             widthvar--; \
   995 	    }
               /* phase 1: leading pixels until dstp stops being "unaligned" */
   996         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   997         if (width > 0) {
   998             int extrawidth = (width % 4);
                   /*
                    * srcp is read as two neighbouring 16-byte lines that are
                    * recombined with vec_perm; VEC_ALIGNER (defined elsewhere)
                    * presumably yields the permute for srcp's misalignment.
                    */
   999             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1000             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1001             width -= extrawidth;
                   /* phase 2: four pixels (16 bytes) per iteration */
  1002             while (width) {
  1003                 vector unsigned char voverflow;
  1004                 vector unsigned char vd;
  1005                 vector unsigned char valpha;
  1006                 vector unsigned char vdstalpha;
  1007                 /* s = *srcp */
  1008                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1009                 vs = vec_perm(vs, voverflow, valigner);
  1010 
                       /* per-pixel alpha replicated across each pixel's bytes */
  1011                 valpha = vec_perm(vs, v0, valphaPermute);
  1012 
  1013                 /* d = *dstp */
  1014                 vd = (vector unsigned char) vec_ld(0, dstp);
                       /* remember the destination alpha so it survives the blend */
  1015                 vdstalpha = vec_and(vd, valphamask);
  1016 
  1017                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1018 
  1019                 /* set the alpha to the dest alpha */
  1020                 vd = vec_and(vd, vpixelmask);
  1021                 vd = vec_or(vd, vdstalpha);
  1022 
  1023                 /* *dstp = res */
  1024                 vec_st((vector unsigned int) vd, 0, dstp);
  1025 
  1026                 srcp += 4;
  1027                 dstp += 4;
  1028                 width -= 4;
                       /* this iteration's second load is the next one's first */
  1029                 vs = voverflow;
  1030             }
                   /* phase 3: trailing pixels that do not fill a vector */
  1031             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1032         }
  1033         srcp += srcskip;
  1034         dstp += dstskip;
  1035     }
  1036 #undef ONE_PIXEL_BLEND
  1037 }
  1038 
  1039 static void
  1040 Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info)
  1041 {
           /*
            * 32-bit -> 32-bit blit with a single per-surface alpha (info->a),
            * using AltiVec for the bulk of each row.  The scalar path writes
            * dA as the destination alpha; the vector path ORs in valphamask.
            *
            * Each row: single pixels while UNALIGNED_PTR(dstp) holds, then
            * four pixels per vector iteration, then (width % 4) single pixels.
            */
  1042     /* XXX : 6 */
  1043     int height = info->dst_h;
  1044     Uint32 *srcp = (Uint32 *) info->src;
           /* skips are divided by 4 because srcp/dstp advance in Uint32 units */
  1045     int srcskip = info->src_skip >> 2;
  1046     Uint32 *dstp = (Uint32 *) info->dst;
  1047     int dstskip = info->dst_skip >> 2;
  1048     SDL_PixelFormat *srcfmt = info->src_fmt;
  1049     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1050     unsigned sA = info->a;
           /* alpha written by the scalar path: opaque if dst has an alpha mask */
  1051     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1052     vector unsigned char mergePermute;
  1053     vector unsigned char vsrcPermute;
  1054     vector unsigned char vdstPermute;
  1055     vector unsigned char vsdstPermute;
  1056     vector unsigned char valpha;
  1057     vector unsigned char valphamask;
  1058     vector unsigned char vbits;
  1059     vector unsigned short v1;
  1060     vector unsigned short v8;
  1061 
  1062     mergePermute = VEC_MERGE_PERMUTE();
  1063     v1 = vec_splat_u16(1);
  1064     v8 = vec_splat_u16(8);
  1065 
  1066     /* set the alpha to 255 on the destination surf */
  1067     valphamask = VEC_ALPHA_MASK();
  1068 
           /*
            * Swizzle vectors come from calc_swizzle32 (defined elsewhere):
            * presumably vsrcPermute/vsdstPermute bring source/destination
            * pixels into a common working layout and vdstPermute converts the
            * result back to the destination layout -- TODO confirm.
            */
  1069     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1070     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1071     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1072 
  1073     /* splat the surface alpha into every byte of valpha */
  1074     ((unsigned char *) &valpha)[0] = sA;
  1075     valpha = vec_splat(valpha, 0);
  1076     vbits = (vector unsigned char) vec_splat_s8(-1);
  1077 
  1078     while (height--) {
  1079         int width = info->dst_w;
               /*
                * Scalar fallback: blends one pixel per iteration with the
                * surface alpha sA, advancing srcp/dstp and decrementing
                * widthvar, while `condition` holds.
                */
  1080 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1081             Uint32 Pixel; \
  1082             unsigned sR, sG, sB, dR, dG, dB; \
  1083             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  1084             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1085             ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1086             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1087             ++srcp; \
  1088             ++dstp; \
  1089             widthvar--; \
  1090         }
               /* phase 1: leading pixels until dstp stops being "unaligned" */
  1091         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1092         if (width > 0) {
  1093             int extrawidth = (width % 4);
                   /*
                    * srcp is read as two neighbouring 16-byte lines that are
                    * recombined with vec_perm; VEC_ALIGNER (defined elsewhere)
                    * presumably yields the permute for srcp's misalignment.
                    */
  1094             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1095             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1096             width -= extrawidth;
                   /* phase 2: four pixels (16 bytes) per iteration */
  1097             while (width) {
  1098                 vector unsigned char voverflow;
  1099                 vector unsigned char vd;
  1100 
  1101                 /* s = *srcp */
  1102                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1103                 vs = vec_perm(vs, voverflow, valigner);
  1104                 vs = vec_perm(vs, valpha, vsrcPermute);
  1105 
  1106                 /* d = *dstp */
  1107                 vd = (vector unsigned char) vec_ld(0, dstp);
  1108                 vd = vec_perm(vd, vd, vsdstPermute);
  1109 
  1110                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1111 
  1112                 /* set the alpha channel to full on */
  1113                 vd = vec_or(vd, valphamask);
  1114                 vd = vec_perm(vd, vbits, vdstPermute);
  1115 
  1116                 /* *dstp = res */
  1117                 vec_st((vector unsigned int) vd, 0, dstp);
  1118 
  1119                 srcp += 4;
  1120                 dstp += 4;
  1121                 width -= 4;
                       /* this iteration's second load is the next one's first */
  1122                 vs = voverflow;
  1123             }
                   /* phase 3: trailing pixels that do not fill a vector */
  1124             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1125         }
  1126 #undef ONE_PIXEL_BLEND
  1127 
  1128         srcp += srcskip;
  1129         dstp += dstskip;
  1130     }
  1131 
  1132 }
  1133 
  1134 
  1135 /* fast RGB888->(A)RGB888 blending */
       /*
        * AltiVec variant using a single per-surface alpha (info->a).  No format
        * swizzling is done; the scalar path treats 0xff00ff / 0xff00 as the
        * red+blue / green channels and always writes 0xff000000 as alpha, and
        * the vector path ORs valphamask into the result.
        *
        * Each row: single pixels while UNALIGNED_PTR(dstp) holds, then four
        * pixels per vector iteration, then (width % 4) single pixels.
        */
  1136 static void
  1137 BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
  1138 {
  1139     unsigned alpha = info->a;
  1140     int height = info->dst_h;
  1141     Uint32 *srcp = (Uint32 *) info->src;
           /* skips are divided by 4 because srcp/dstp advance in Uint32 units */
  1142     int srcskip = info->src_skip >> 2;
  1143     Uint32 *dstp = (Uint32 *) info->dst;
  1144     int dstskip = info->dst_skip >> 2;
  1145     vector unsigned char mergePermute;
  1146     vector unsigned char valpha;
  1147     vector unsigned char valphamask;
  1148     vector unsigned short v1;
  1149     vector unsigned short v8;
  1150 
  1151     mergePermute = VEC_MERGE_PERMUTE();
  1152     v1 = vec_splat_u16(1);
  1153     v8 = vec_splat_u16(8);
  1154 
  1155     /* set the alpha to 255 on the destination surf */
  1156     valphamask = VEC_ALPHA_MASK();
  1157 
  1158     /* splat the surface alpha into every byte of valpha */
  1159     ((unsigned char *) &valpha)[0] = alpha;
  1160     valpha = vec_splat(valpha, 0);
  1161 
  1162     while (height--) {
  1163         int width = info->dst_w;
               /*
                * Scalar fallback, one pixel per iteration: red+blue are
                * blended together through the 0xff00ff mask, green separately,
                * and the result alpha is forced to 0xff.
                */
  1164 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1165             Uint32 s = *srcp; \
  1166             Uint32 d = *dstp; \
  1167             Uint32 s1 = s & 0xff00ff; \
  1168             Uint32 d1 = d & 0xff00ff; \
  1169             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  1170                  & 0xff00ff; \
  1171             s &= 0xff00; \
  1172             d &= 0xff00; \
  1173             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1174             *dstp = d1 | d | 0xff000000; \
  1175             ++srcp; \
  1176             ++dstp; \
  1177             widthvar--; \
  1178         }
               /* phase 1: leading pixels until dstp stops being "unaligned" */
  1179         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1180         if (width > 0) {
  1181             int extrawidth = (width % 4);
                   /*
                    * srcp is read as two neighbouring 16-byte lines that are
                    * recombined with vec_perm; VEC_ALIGNER (defined elsewhere)
                    * presumably yields the permute for srcp's misalignment.
                    */
  1182             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1183             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1184             width -= extrawidth;
                   /* phase 2: four pixels (16 bytes) per iteration */
  1185             while (width) {
  1186                 vector unsigned char voverflow;
  1187                 vector unsigned char vd;
  1188 
  1189                 /* s = *srcp */
  1190                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1191                 vs = vec_perm(vs, voverflow, valigner);
  1192 
  1193                 /* d = *dstp */
  1194                 vd = (vector unsigned char) vec_ld(0, dstp);
  1195 
  1196                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1197 
  1198                 /* set the alpha channel to full on */
  1199                 vd = vec_or(vd, valphamask);
  1200 
  1201                 /* *dstp = res */
  1202                 vec_st((vector unsigned int) vd, 0, dstp);
  1203 
  1204                 srcp += 4;
  1205                 dstp += 4;
  1206                 width -= 4;
                       /* this iteration's second load is the next one's first */
  1207                 vs = voverflow;
  1208             }
                   /* phase 3: trailing pixels that do not fill a vector */
  1209             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1210         }
  1211 #undef ONE_PIXEL_BLEND
  1212 
  1213         srcp += srcskip;
  1214         dstp += dstskip;
  1215     }
  1216 }
  1217 
  1218 #if __MWERKS__
  1219 #pragma altivec_model off
  1220 #endif
  1221 #endif /* SDL_ALTIVEC_BLITTERS */
  1222 
  1223 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1224 static void
  1225 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
  1226 {
  1227     int width = info->dst_w;
  1228     int height = info->dst_h;
  1229     Uint32 *srcp = (Uint32 *) info->src;
  1230     int srcskip = info->src_skip >> 2;
  1231     Uint32 *dstp = (Uint32 *) info->dst;
  1232     int dstskip = info->dst_skip >> 2;
  1233 
  1234     while (height--) {
  1235 	    /* *INDENT-OFF* */
  1236 	    DUFFS_LOOP4({
  1237 		    Uint32 s = *srcp++;
  1238 		    Uint32 d = *dstp;
  1239 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1240 			       + (s & d & 0x00010101)) | 0xff000000;
  1241 	    }, width);
  1242 	    /* *INDENT-ON* */
  1243         srcp += srcskip;
  1244         dstp += dstskip;
  1245     }
  1246 }
  1247 
  1248 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1249 static void
  1250 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
  1251 {
  1252     unsigned alpha = info->a;
  1253     if (alpha == 128) {
  1254         BlitRGBtoRGBSurfaceAlpha128(info);
  1255     } else {
  1256         int width = info->dst_w;
  1257         int height = info->dst_h;
  1258         Uint32 *srcp = (Uint32 *) info->src;
  1259         int srcskip = info->src_skip >> 2;
  1260         Uint32 *dstp = (Uint32 *) info->dst;
  1261         int dstskip = info->dst_skip >> 2;
  1262         Uint32 s;
  1263         Uint32 d;
  1264         Uint32 s1;
  1265         Uint32 d1;
  1266 
  1267         while (height--) {
  1268 			/* *INDENT-OFF* */
  1269 			DUFFS_LOOP4({
  1270 				s = *srcp;
  1271 				d = *dstp;
  1272 				s1 = s & 0xff00ff;
  1273 				d1 = d & 0xff00ff;
  1274 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
  1275 				     & 0xff00ff;
  1276 				s &= 0xff00;
  1277 				d &= 0xff00;
  1278 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1279 				*dstp = d1 | d | 0xff000000;
  1280 				++srcp;
  1281 				++dstp;
  1282 			}, width);
  1283 			/* *INDENT-ON* */
  1284             srcp += srcskip;
  1285             dstp += dstskip;
  1286         }
  1287     }
  1288 }
  1289 
  1290 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1291 static void
  1292 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
  1293 {
  1294     int width = info->dst_w;
  1295     int height = info->dst_h;
  1296     Uint32 *srcp = (Uint32 *) info->src;
  1297     int srcskip = info->src_skip >> 2;
  1298     Uint32 *dstp = (Uint32 *) info->dst;
  1299     int dstskip = info->dst_skip >> 2;
  1300 
  1301     while (height--) {
  1302 	    /* *INDENT-OFF* */
  1303 	    DUFFS_LOOP4({
  1304 		Uint32 dalpha;
  1305 		Uint32 d;
  1306 		Uint32 s1;
  1307 		Uint32 d1;
  1308 		Uint32 s = *srcp;
  1309 		Uint32 alpha = s >> 24;
  1310 		/* FIXME: Here we special-case opaque alpha since the
  1311 		   compositioning used (>>8 instead of /255) doesn't handle
  1312 		   it correctly. Also special-case alpha=0 for speed?
  1313 		   Benchmark this! */
  1314 		if(alpha) {   
  1315 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1316 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1317 		  } else {
  1318 		    /*
  1319 		     * take out the middle component (green), and process
  1320 		     * the other two in parallel. One multiply less.
  1321 		     */
  1322 		    d = *dstp;
  1323 		    dalpha = d & 0xff000000;
  1324 		    s1 = s & 0xff00ff;
  1325 		    d1 = d & 0xff00ff;
  1326 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1327 		    s &= 0xff00;
  1328 		    d &= 0xff00;
  1329 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1330 		    *dstp = d1 | d | dalpha;
  1331 		  }
  1332 		}
  1333 		++srcp;
  1334 		++dstp;
  1335 	    }, width);
  1336 	    /* *INDENT-ON* */
  1337         srcp += srcskip;
  1338         dstp += dstskip;
  1339     }
  1340 }
  1341 
  1342 #ifdef __3dNOW__
  1343 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
       /*
        * One pixel is handled per loop body: fully transparent source pixels
        * are skipped, fully opaque ones copy the colour channels and keep the
        * destination's non-colour bits, and everything else is blended with
        * MMX arithmetic on 16-bit lanes.  _m_prefetch is issued for srcp + 16
        * and dstp + 16 on every pixel.
        */
  1344 static void
  1345 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1346 {
  1347     int width = info->dst_w;
  1348     int height = info->dst_h;
  1349     Uint32 *srcp = (Uint32 *) info->src;
           /* skips are divided by 4 because srcp/dstp advance in Uint32 units */
  1350     int srcskip = info->src_skip >> 2;
  1351     Uint32 *dstp = (Uint32 *) info->dst;
  1352     int dstskip = info->dst_skip >> 2;
  1353     SDL_PixelFormat *sf = info->src_fmt;
  1354     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  1355     Uint32 amask = sf->Amask;
  1356     Uint32 ashift = sf->Ashift;
  1357     Uint64 multmask;
  1358 
  1359     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1360 
  1361     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
           /*
            * After unpacking, every 8-bit channel occupies a 16-bit lane, so
            * the alpha channel sits at bit (ashift * 2).  multmask clears
            * exactly that lane; ANDing it into mm_alpha makes the alpha
            * lane's multiplier zero, leaving the destination alpha unchanged.
            */
  1362     multmask = 0xFFFF;
  1363     multmask <<= (ashift * 2);
  1364     multmask = ~multmask;
  1365     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
  1366 
  1367     while (height--) {
  1368 	    /* *INDENT-OFF* */
  1369 	    DUFFS_LOOP4({
  1370 		Uint32 alpha;
  1371 
  1372 		_m_prefetch(srcp + 16);
  1373 		_m_prefetch(dstp + 16);
  1374 
       		/* alpha is still in place (not shifted down) at this point */
  1375 		alpha = *srcp & amask;
  1376 		if (alpha == 0) {
  1377 			/* do nothing */
  1378 		} else if (alpha == amask) {
  1379 			/* copy RGB, keep dst alpha */
  1380 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  1381 		} else {
  1382 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  1383 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  1384 
  1385 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  1386 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  1387 
  1388 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  1389 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  1390 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  1391 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  1392 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  1393 
  1394 			/* blend */		    
  1395 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  1396 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  1397 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  1398 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  1399 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  1400 			
  1401 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  1402 		}
  1403 		++srcp;
  1404 		++dstp;
  1405 	    }, width);
  1406 	    /* *INDENT-ON* */
  1407         srcp += srcskip;
  1408         dstp += dstskip;
  1409     }
           /* leave MMX state so that later x87 floating-point code works */
  1410     _mm_empty();
  1411 }
  1412 
  1413 #endif /* __3dNOW__ */
  1414 
  1415 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  1416 
  1417 /* blend a single 16 bit pixel at 50% */
  1418 #define BLEND16_50(d, s, mask)						\
  1419 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
  1420 
  1421 /* blend two 16 bit pixels at 50% */
  1422 #define BLEND2x16_50(d, s, mask)					     \
  1423 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1424 	 + (s & d & (~(mask | mask << 16))))
  1425 
  1426 static void
  1427 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
  1428 {
  1429     int width = info->dst_w;
  1430     int height = info->dst_h;
  1431     Uint16 *srcp = (Uint16 *) info->src;
  1432     int srcskip = info->src_skip >> 1;
  1433     Uint16 *dstp = (Uint16 *) info->dst;
  1434     int dstskip = info->dst_skip >> 1;
  1435 
  1436     while (height--) {
  1437         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
  1438             /*
  1439              * Source and destination not aligned, pipeline it.
  1440              * This is mostly a win for big blits but no loss for
  1441              * small ones
  1442              */
  1443             Uint32 prev_sw;
  1444             int w = width;
  1445 
  1446             /* handle odd destination */
  1447             if ((uintptr_t) dstp & 2) {
  1448                 Uint16 d = *dstp, s = *srcp;
  1449                 *dstp = BLEND16_50(d, s, mask);
  1450                 dstp++;
  1451                 srcp++;
  1452                 w--;
  1453             }
  1454             srcp++;             /* srcp is now 32-bit aligned */
  1455 
  1456             /* bootstrap pipeline with first halfword */
  1457             prev_sw = ((Uint32 *) srcp)[-1];
  1458 
  1459             while (w > 1) {
  1460                 Uint32 sw, dw, s;
  1461                 sw = *(Uint32 *) srcp;
  1462                 dw = *(Uint32 *) dstp;
  1463 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1464                 s = (prev_sw << 16) + (sw >> 16);
  1465 #else
  1466                 s = (prev_sw >> 16) + (sw << 16);
  1467 #endif
  1468                 prev_sw = sw;
  1469                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
  1470                 dstp += 2;
  1471                 srcp += 2;
  1472                 w -= 2;
  1473             }
  1474 
  1475             /* final pixel if any */
  1476             if (w) {
  1477                 Uint16 d = *dstp, s;
  1478 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1479                 s = (Uint16) prev_sw;
  1480 #else
  1481                 s = (Uint16) (prev_sw >> 16);
  1482 #endif
  1483                 *dstp = BLEND16_50(d, s, mask);
  1484                 srcp++;
  1485                 dstp++;
  1486             }
  1487             srcp += srcskip - 1;
  1488             dstp += dstskip;
  1489         } else {
  1490             /* source and destination are aligned */
  1491             int w = width;
  1492 
  1493             /* first odd pixel? */
  1494             if ((uintptr_t) srcp & 2) {
  1495                 Uint16 d = *dstp, s = *srcp;
  1496                 *dstp = BLEND16_50(d, s, mask);
  1497                 srcp++;
  1498                 dstp++;
  1499                 w--;
  1500             }
  1501             /* srcp and dstp are now 32-bit aligned */
  1502 
  1503             while (w > 1) {
  1504                 Uint32 sw = *(Uint32 *) srcp;
  1505                 Uint32 dw = *(Uint32 *) dstp;
  1506                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
  1507                 srcp += 2;
  1508                 dstp += 2;
  1509                 w -= 2;
  1510             }
  1511 
  1512             /* last odd pixel? */
  1513             if (w) {
  1514                 Uint16 d = *dstp, s = *srcp;
  1515                 *dstp = BLEND16_50(d, s, mask);
  1516                 srcp++;
  1517                 dstp++;
  1518             }
  1519             srcp += srcskip;
  1520             dstp += dstskip;
  1521         }
  1522     }
  1523 }
  1524 
  1525 #ifdef __MMX__
  1526 
  1527 /* fast RGB565->RGB565 blending with surface alpha */
  1528 static void
  1529 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  1530 {
  1531     unsigned alpha = info->a;
  1532     if (alpha == 128) {
  1533         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1534     } else {
  1535         int width = info->dst_w;
  1536         int height = info->dst_h;
  1537         Uint16 *srcp = (Uint16 *) info->src;
  1538         int srcskip = info->src_skip >> 1;
  1539         Uint16 *dstp = (Uint16 *) info->dst;
  1540         int dstskip = info->dst_skip >> 1;
  1541         Uint32 s, d;
  1542 
  1543         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  1544 
  1545         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  1546         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  1547         alpha >>= 3;            /* downscale alpha to 5 bits */
  1548 
  1549         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  1550         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  1551         /* position alpha to allow for mullo and mulhi on diff channels
  1552            to reduce the number of operations */
  1553         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  1554 
  1555         /* Setup the 565 color channel masks */
  1556         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
  1557         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  1558 
  1559         while (height--) {
  1560 			/* *INDENT-OFF* */
  1561 			DUFFS_LOOP_124(
  1562 			{
  1563 				s = *srcp++;
  1564 				d = *dstp;
  1565 				/*
  1566 				 * shift out the middle component (green) to
  1567 				 * the high 16 bits, and process all three RGB
  1568 				 * components at the same time.
  1569 				 */
  1570 				s = (s | s << 16) & 0x07e0f81f;
  1571 				d = (d | d << 16) & 0x07e0f81f;
  1572 				d += (s - d) * alpha >> 5;
  1573 				d &= 0x07e0f81f;
  1574 				*dstp++ = (Uint16)(d | d >> 16);
  1575 			},{
  1576 				s = *srcp++;
  1577 				d = *dstp;
  1578 				/*
  1579 				 * shift out the middle component (green) to
  1580 				 * the high 16 bits, and process all three RGB
  1581 				 * components at the same time.
  1582 				 */
  1583 				s = (s | s << 16) & 0x07e0f81f;
  1584 				d = (d | d << 16) & 0x07e0f81f;
  1585 				d += (s - d) * alpha >> 5;
  1586 				d &= 0x07e0f81f;
  1587 				*dstp++ = (Uint16)(d | d >> 16);
  1588 				s = *srcp++;
  1589 				d = *dstp;
  1590 				/*
  1591 				 * shift out the middle component (green) to
  1592 				 * the high 16 bits, and process all three RGB
  1593 				 * components at the same time.
  1594 				 */
  1595 				s = (s | s << 16) & 0x07e0f81f;
  1596 				d = (d | d << 16) & 0x07e0f81f;
  1597 				d += (s - d) * alpha >> 5;
  1598 				d &= 0x07e0f81f;
  1599 				*dstp++ = (Uint16)(d | d >> 16);
  1600 			},{
  1601 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  1602 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  1603 
  1604 				/* red */
  1605 				src2 = src1;
  1606 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
  1607 
  1608 				dst2 = dst1;
  1609 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
  1610 
  1611 				/* blend */
  1612 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1613 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1614 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1615 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1616 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
  1617 
  1618 				mm_res = dst2; /* RED -> mm_res */
  1619 
  1620 				/* green -- process the bits in place */
  1621 				src2 = src1;
  1622 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  1623 
  1624 				dst2 = dst1;
  1625 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  1626 
  1627 				/* blend */
  1628 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1629 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1630 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1631 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1632 
  1633 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  1634 
  1635 				/* blue */
  1636 				src2 = src1;
  1637 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  1638 
  1639 				dst2 = dst1;
  1640 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  1641 
  1642 				/* blend */
  1643 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1644 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1645 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1646 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1647 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  1648 
  1649 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  1650 
  1651 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  1652 
  1653 				srcp += 4;
  1654 				dstp += 4;
  1655 			}, width);
  1656 			/* *INDENT-ON* */
  1657             srcp += srcskip;
  1658             dstp += dstskip;
  1659         }
  1660         _mm_empty();
  1661     }
  1662 }
  1663 
  1664 /* fast RGB555->RGB555 blending with surface alpha */
  1665 static void
  1666 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  1667 {
  1668     unsigned alpha = info->a;
  1669     if (alpha == 128) {
  1670         Blit16to16SurfaceAlpha128(info, 0xfbde);
  1671     } else {
  1672         int width = info->dst_w;
  1673         int height = info->dst_h;
  1674         Uint16 *srcp = (Uint16 *) info->src;
  1675         int srcskip = info->src_skip >> 1;
  1676         Uint16 *dstp = (Uint16 *) info->dst;
  1677         int dstskip = info->dst_skip >> 1;
  1678         Uint32 s, d;
  1679 
  1680         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  1681 
  1682         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  1683         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  1684         alpha >>= 3;            /* downscale alpha to 5 bits */
  1685 
  1686         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  1687         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  1688         /* position alpha to allow for mullo and mulhi on diff channels
  1689            to reduce the number of operations */
  1690         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  1691 
  1692         /* Setup the 555 color channel masks */
  1693         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
  1694         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
  1695         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  1696 
  1697         while (height--) {
  1698 			/* *INDENT-OFF* */
  1699 			DUFFS_LOOP_124(
  1700 			{
  1701 				s = *srcp++;
  1702 				d = *dstp;
  1703 				/*
  1704 				 * shift out the middle component (green) to
  1705 				 * the high 16 bits, and process all three RGB
  1706 				 * components at the same time.
  1707 				 */
  1708 				s = (s | s << 16) & 0x03e07c1f;
  1709 				d = (d | d << 16) & 0x03e07c1f;
  1710 				d += (s - d) * alpha >> 5;
  1711 				d &= 0x03e07c1f;
  1712 				*dstp++ = (Uint16)(d | d >> 16);
  1713 			},{
  1714 				s = *srcp++;
  1715 				d = *dstp;
  1716 				/*
  1717 				 * shift out the middle component (green) to
  1718 				 * the high 16 bits, and process all three RGB
  1719 				 * components at the same time.
  1720 				 */
  1721 				s = (s | s << 16) & 0x03e07c1f;
  1722 				d = (d | d << 16) & 0x03e07c1f;
  1723 				d += (s - d) * alpha >> 5;
  1724 				d &= 0x03e07c1f;
  1725 				*dstp++ = (Uint16)(d | d >> 16);
  1726 			        s = *srcp++;
  1727 				d = *dstp;
  1728 				/*
  1729 				 * shift out the middle component (green) to
  1730 				 * the high 16 bits, and process all three RGB
  1731 				 * components at the same time.
  1732 				 */
  1733 				s = (s | s << 16) & 0x03e07c1f;
  1734 				d = (d | d << 16) & 0x03e07c1f;
  1735 				d += (s - d) * alpha >> 5;
  1736 				d &= 0x03e07c1f;
  1737 				*dstp++ = (Uint16)(d | d >> 16);
  1738 			},{
  1739 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  1740 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  1741 
  1742 				/* red -- process the bits in place */
  1743 				src2 = src1;
  1744 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
  1745 
  1746 				dst2 = dst1;
  1747 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
  1748 
  1749 				/* blend */
  1750 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1751 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1752 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1753 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1754 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
  1755 
  1756 				mm_res = dst2; /* RED -> mm_res */
  1757 				
  1758 				/* green -- process the bits in place */
  1759 				src2 = src1;
  1760 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  1761 
  1762 				dst2 = dst1;
  1763 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  1764 
  1765 				/* blend */
  1766 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1767 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1768 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1769 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1770 
  1771 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  1772 
  1773 				/* blue */
  1774 				src2 = src1; /* src -> src2 */
  1775 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  1776 
  1777 				dst2 = dst1; /* dst -> dst2 */
  1778 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  1779 
  1780 				/* blend */
  1781 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1782 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1783 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1784 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1785 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  1786 
  1787 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  1788 
  1789 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  1790 
  1791 				srcp += 4;
  1792 				dstp += 4;
  1793 			}, width);
  1794 			/* *INDENT-ON* */
  1795             srcp += srcskip;
  1796             dstp += dstskip;
  1797         }
  1798         _mm_empty();
  1799     }
  1800 }
  1801 
  1802 #endif /* __MMX__ */
  1803 
  1804 /* fast RGB565->RGB565 blending with surface alpha */
  1805 static void
  1806 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
  1807 {
  1808     unsigned alpha = info->a;
  1809     if (alpha == 128) {
  1810         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1811     } else {
  1812         int width = info->dst_w;
  1813         int height = info->dst_h;
  1814         Uint16 *srcp = (Uint16 *) info->src;
  1815         int srcskip = info->src_skip >> 1;
  1816         Uint16 *dstp = (Uint16 *) info->dst;
  1817         int dstskip = info->dst_skip >> 1;
  1818         alpha >>= 3;            /* downscale alpha to 5 bits */
  1819 
  1820         while (height--) {
  1821 			/* *INDENT-OFF* */
  1822 			DUFFS_LOOP4({
  1823 				Uint32 s = *srcp++;
  1824 				Uint32 d = *dstp;
  1825 				/*
  1826 				 * shift out the middle component (green) to
  1827 				 * the high 16 bits, and process all three RGB
  1828 				 * components at the same time.
  1829 				 */
  1830 				s = (s | s << 16) & 0x07e0f81f;
  1831 				d = (d | d << 16) & 0x07e0f81f;
  1832 				d += (s - d) * alpha >> 5;
  1833 				d &= 0x07e0f81f;
  1834 				*dstp++ = (Uint16)(d | d >> 16);
  1835 			}, width);
  1836 			/* *INDENT-ON* */
  1837             srcp += srcskip;
  1838             dstp += dstskip;
  1839         }
  1840     }
  1841 }
  1842 
  1843 /* fast RGB555->RGB555 blending with surface alpha */
  1844 static void
  1845 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  1846 {
  1847     unsigned alpha = info->a;   /* downscale alpha to 5 bits */
  1848     if (alpha == 128) {
  1849         Blit16to16SurfaceAlpha128(info, 0xfbde);
  1850     } else {
  1851         int width = info->dst_w;
  1852         int height = info->dst_h;
  1853         Uint16 *srcp = (Uint16 *) info->src;
  1854         int srcskip = info->src_skip >> 1;
  1855         Uint16 *dstp = (Uint16 *) info->dst;
  1856         int dstskip = info->dst_skip >> 1;
  1857         alpha >>= 3;            /* downscale alpha to 5 bits */
  1858 
  1859         while (height--) {
  1860 			/* *INDENT-OFF* */
  1861 			DUFFS_LOOP4({
  1862 				Uint32 s = *srcp++;
  1863 				Uint32 d = *dstp;
  1864 				/*
  1865 				 * shift out the middle component (green) to
  1866 				 * the high 16 bits, and process all three RGB
  1867 				 * components at the same time.
  1868 				 */
  1869 				s = (s | s << 16) & 0x03e07c1f;
  1870 				d = (d | d << 16) & 0x03e07c1f;
  1871 				d += (s - d) * alpha >> 5;
  1872 				d &= 0x03e07c1f;
  1873 				*dstp++ = (Uint16)(d | d >> 16);
  1874 			}, width);
  1875 			/* *INDENT-ON* */
  1876             srcp += srcskip;
  1877             dstp += dstskip;
  1878         }
  1879     }
  1880 }
  1881 
  1882 /* fast ARGB8888->RGB565 blending with pixel alpha */
  1883 static void
  1884 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  1885 {
  1886     int width = info->dst_w;
  1887     int height = info->dst_h;
  1888     Uint32 *srcp = (Uint32 *) info->src;
  1889     int srcskip = info->src_skip >> 2;
  1890     Uint16 *dstp = (Uint16 *) info->dst;
  1891     int dstskip = info->dst_skip >> 1;
  1892 
  1893     while (height--) {
  1894 	    /* *INDENT-OFF* */
  1895 	    DUFFS_LOOP4({
  1896 		Uint32 s = *srcp;
  1897 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  1898 		/* FIXME: Here we special-case opaque alpha since the
  1899 		   compositioning used (>>8 instead of /255) doesn't handle
  1900 		   it correctly. Also special-case alpha=0 for speed?
  1901 		   Benchmark this! */
  1902 		if(alpha) {   
  1903 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1904 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  1905 		  } else {
  1906 		    Uint32 d = *dstp;
  1907 		    /*
  1908 		     * convert source and destination to G0RAB65565
  1909 		     * and blend all components at the same time
  1910 		     */
  1911 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  1912 		      + (s >> 3 & 0x1f);
  1913 		    d = (d | d << 16) & 0x07e0f81f;
  1914 		    d += (s - d) * alpha >> 5;
  1915 		    d &= 0x07e0f81f;
  1916 		    *dstp = (Uint16)(d | d >> 16);
  1917 		  }
  1918 		}
  1919 		srcp++;
  1920 		dstp++;
  1921 	    }, width);
  1922 	    /* *INDENT-ON* */
  1923         srcp += srcskip;
  1924         dstp += dstskip;
  1925     }
  1926 }
  1927 
  1928 /* fast ARGB8888->RGB555 blending with pixel alpha */
  1929 static void
  1930 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  1931 {
  1932     int width = info->dst_w;
  1933     int height = info->dst_h;
  1934     Uint32 *srcp = (Uint32 *) info->src;
  1935     int srcskip = info->src_skip >> 2;
  1936     Uint16 *dstp = (Uint16 *) info->dst;
  1937     int dstskip = info->dst_skip >> 1;
  1938 
  1939     while (height--) {
  1940 	    /* *INDENT-OFF* */
  1941 	    DUFFS_LOOP4({
  1942 		unsigned alpha;
  1943 		Uint32 s = *srcp;
  1944 		alpha = s >> 27; /* downscale alpha to 5 bits */
  1945 		/* FIXME: Here we special-case opaque alpha since the
  1946 		   compositioning used (>>8 instead of /255) doesn't handle
  1947 		   it correctly. Also special-case alpha=0 for speed?
  1948 		   Benchmark this! */
  1949 		if(alpha) {   
  1950 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1951 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  1952 		  } else {
  1953 		    Uint32 d = *dstp;
  1954 		    /*
  1955 		     * convert source and destination to G0RAB65565
  1956 		     * and blend all components at the same time
  1957 		     */
  1958 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  1959 		      + (s >> 3 & 0x1f);
  1960 		    d = (d | d << 16) & 0x03e07c1f;
  1961 		    d += (s - d) * alpha >> 5;
  1962 		    d &= 0x03e07c1f;
  1963 		    *dstp = (Uint16)(d | d >> 16);
  1964 		  }
  1965 		}
  1966 		srcp++;
  1967 		dstp++;
  1968 	    }, width);
  1969 	    /* *INDENT-ON* */
  1970         srcp += srcskip;
  1971         dstp += dstskip;
  1972     }
  1973 }
  1974 
  1975 /* General (slow) N->N blending with per-surface alpha */
  1976 static void
  1977 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  1978 {
  1979     int width = info->dst_w;
  1980     int height = info->dst_h;
  1981     Uint8 *src = info->src;
  1982     int srcskip = info->src_skip;
  1983     Uint8 *dst = info->dst;
  1984     int dstskip = info->dst_skip;
  1985     SDL_PixelFormat *srcfmt = info->src_fmt;
  1986     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1987     int srcbpp = srcfmt->BytesPerPixel;
  1988     int dstbpp = dstfmt->BytesPerPixel;
  1989     unsigned sA = info->a;
  1990     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1991 
  1992     if (sA) {
  1993         while (height--) {
  1994 	    /* *INDENT-OFF* */
  1995 	    DUFFS_LOOP4(
  1996 	    {
  1997 		Uint32 Pixel;
  1998 		unsigned sR;
  1999 		unsigned sG;
  2000 		unsigned sB;
  2001 		unsigned dR;
  2002 		unsigned dG;
  2003 		unsigned dB;
  2004 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2005 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2006 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2007 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2008 		src += srcbpp;
  2009 		dst += dstbpp;
  2010 	    },
  2011 	    width);
  2012 	    /* *INDENT-ON* */
  2013             src += srcskip;
  2014             dst += dstskip;
  2015         }
  2016     }
  2017 }
  2018 
  2019 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2020 static void
  2021 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  2022 {
  2023     int width = info->dst_w;
  2024     int height = info->dst_h;
  2025     Uint8 *src = info->src;
  2026     int srcskip = info->src_skip;
  2027     Uint8 *dst = info->dst;
  2028     int dstskip = info->dst_skip;
  2029     SDL_PixelFormat *srcfmt = info->src_fmt;
  2030     SDL_PixelFormat *dstfmt = info->dst_fmt;
  2031     Uint32 ckey = info->colorkey;
  2032     int srcbpp = srcfmt->BytesPerPixel;
  2033     int dstbpp = dstfmt->BytesPerPixel;
  2034     unsigned sA = info->a;
  2035     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2036 
  2037     while (height--) {
  2038 	    /* *INDENT-OFF* */
  2039 	    DUFFS_LOOP4(
  2040 	    {
  2041 		Uint32 Pixel;
  2042 		unsigned sR;
  2043 		unsigned sG;
  2044 		unsigned sB;
  2045 		unsigned dR;
  2046 		unsigned dG;
  2047 		unsigned dB;
  2048 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2049 		if(sA && Pixel != ckey) {
  2050 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2051 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2052 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2053 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2054 		}
  2055 		src += srcbpp;
  2056 		dst += dstbpp;
  2057 	    },
  2058 	    width);
  2059 	    /* *INDENT-ON* */
  2060         src += srcskip;
  2061         dst += dstskip;
  2062     }
  2063 }
  2064 
  2065 /* General (slow) N->N blending with pixel alpha */
  2066 static void
  2067 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  2068 {
  2069     int width = info->dst_w;
  2070     int height = info->dst_h;
  2071     Uint8 *src = info->src;
  2072     int srcskip = info->src_skip;
  2073     Uint8 *dst = info->dst;
  2074     int dstskip = info->dst_skip;
  2075     SDL_PixelFormat *srcfmt = info->src_fmt;
  2076     SDL_PixelFormat *dstfmt = info->dst_fmt;
  2077 
  2078     int srcbpp;
  2079     int dstbpp;
  2080 
  2081     /* Set up some basic variables */
  2082     srcbpp = srcfmt->BytesPerPixel;
  2083     dstbpp = dstfmt->BytesPerPixel;
  2084 
  2085     /* FIXME: for 8bpp source alpha, this doesn't get opaque values
  2086        quite right. for <8bpp source alpha, it gets them very wrong
  2087        (check all macros!)
  2088        It is unclear whether there is a good general solution that doesn't
  2089        need a branch (or a divide). */
  2090     while (height--) {
  2091 	    /* *INDENT-OFF* */
  2092 	    DUFFS_LOOP4(
  2093 	    {
  2094 		Uint32 Pixel;
  2095 		unsigned sR;
  2096 		unsigned sG;
  2097 		unsigned sB;
  2098 		unsigned dR;
  2099 		unsigned dG;
  2100 		unsigned dB;
  2101 		unsigned sA;
  2102 		unsigned dA;
  2103 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  2104 		if(sA) {
  2105 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  2106 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2107 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2108 		}
  2109 		src += srcbpp;
  2110 		dst += dstbpp;
  2111 	    },
  2112 	    width);
  2113 	    /* *INDENT-ON* */
  2114         src += srcskip;
  2115         dst += dstskip;
  2116     }
  2117 }
  2118 
  2119 
  2120 SDL_BlitFunc
  2121 SDL_CalculateBlitA(SDL_Surface * surface)
  2122 {
  2123     SDL_PixelFormat *sf = surface->format;
  2124     SDL_PixelFormat *df = surface->map->dst->format;
  2125 
  2126     switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
  2127     case SDL_COPY_BLEND:
  2128         /* Per-pixel alpha blits */
  2129         switch (df->BytesPerPixel) {
  2130         case 1:
  2131             return BlitNto1PixelAlpha;
  2132 
  2133         case 2:
  2134 #if SDL_ALTIVEC_BLITTERS
  2135             if (sf->BytesPerPixel == 4
  2136                 && df->Gmask == 0x7e0 && df->Bmask == 0x1f
  2137                 && SDL_HasAltiVec())
  2138                 return Blit32to565PixelAlphaAltivec;
  2139             else
  2140 #endif
  2141                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2142                     && sf->Gmask == 0xff00
  2143                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2144                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  2145                 if (df->Gmask == 0x7e0)
  2146                     return BlitARGBto565PixelAlpha;
  2147                 else if (df->Gmask == 0x3e0)
  2148                     return BlitARGBto555PixelAlpha;
  2149             }
  2150             return BlitNtoNPixelAlpha;
  2151 
  2152         case 4:
  2153             if (sf->Rmask == df->Rmask
  2154                 && sf->Gmask == df->Gmask
  2155                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2156 #if defined(__MMX__) || defined(__3dNOW__)
  2157                 if (sf->Rshift % 8 == 0
  2158                     && sf->Gshift % 8 == 0
  2159                     && sf->Bshift % 8 == 0
  2160                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  2161 #ifdef __3dNOW__
  2162                     if (SDL_Has3DNow())
  2163                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2164 #endif
  2165 #ifdef __MMX__
  2166                     if (SDL_HasMMX())
  2167                         return BlitRGBtoRGBPixelAlphaMMX;
  2168 #endif
  2169                 }
  2170 #endif /* __MMX__ || __3dNOW__ */
  2171                 if (sf->Amask == 0xff000000) {
  2172 #if SDL_ALTIVEC_BLITTERS
  2173                     if (SDL_HasAltiVec())
  2174                         return BlitRGBtoRGBPixelAlphaAltivec;
  2175 #endif
  2176                     return BlitRGBtoRGBPixelAlpha;
  2177                 }
  2178             }
  2179 #if SDL_ALTIVEC_BLITTERS
  2180             if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec())
  2181                 return Blit32to32PixelAlphaAltivec;
  2182             else
  2183 #endif
  2184                 return BlitNtoNPixelAlpha;
  2185 
  2186         case 3:
  2187         default:
  2188             return BlitNtoNPixelAlpha;
  2189         }
  2190         break;
  2191 
  2192     case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  2193         if (sf->Amask == 0) {
  2194             /* Per-surface alpha blits */
  2195             switch (df->BytesPerPixel) {
  2196             case 1:
  2197                 return BlitNto1SurfaceAlpha;
  2198 
  2199             case 2:
  2200                 if (surface->map->identity) {
  2201                     if (df->Gmask == 0x7e0) {
  2202 #ifdef __MMX__
  2203                         if (SDL_HasMMX())
  2204                             return Blit565to565SurfaceAlphaMMX;
  2205                         else
  2206 #endif
  2207                             return Blit565to565SurfaceAlpha;
  2208                     } else if (df->Gmask == 0x3e0) {
  2209 #ifdef __MMX__
  2210                         if (SDL_HasMMX())
  2211                             return Blit555to555SurfaceAlphaMMX;
  2212                         else
  2213 #endif
  2214                             return Blit555to555SurfaceAlpha;
  2215                     }
  2216                 }
  2217                 return BlitNtoNSurfaceAlpha;
  2218 
  2219             case 4:
  2220                 if (sf->Rmask == df->Rmask
  2221                     && sf->Gmask == df->Gmask
  2222                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2223 #ifdef __MMX__
  2224                     if (sf->Rshift % 8 == 0
  2225                         && sf->Gshift % 8 == 0
  2226                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  2227                         return BlitRGBtoRGBSurfaceAlphaMMX;
  2228 #endif
  2229                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  2230 #if SDL_ALTIVEC_BLITTERS
  2231                         if (SDL_HasAltiVec())
  2232                             return BlitRGBtoRGBSurfaceAlphaAltivec;
  2233 #endif
  2234                         return BlitRGBtoRGBSurfaceAlpha;
  2235                     }
  2236                 }
  2237 #if SDL_ALTIVEC_BLITTERS
  2238                 if ((sf->BytesPerPixel == 4) && SDL_HasAltiVec())
  2239                     return Blit32to32SurfaceAlphaAltivec;
  2240                 else
  2241 #endif
  2242                     return BlitNtoNSurfaceAlpha;
  2243 
  2244             case 3:
  2245             default:
  2246                 return BlitNtoNSurfaceAlpha;
  2247             }
  2248         }
  2249         break;
  2250 
  2251     case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  2252         if (sf->Amask == 0) {
  2253             if (df->BytesPerPixel == 1)
  2254                 return BlitNto1SurfaceAlphaKey;
  2255             else
  2256 #if SDL_ALTIVEC_BLITTERS
  2257             if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2258                     SDL_HasAltiVec())
  2259                 return Blit32to32SurfaceAlphaKeyAltivec;
  2260             else
  2261 #endif
  2262                 return BlitNtoNSurfaceAlphaKey;
  2263         }
  2264         break;
  2265     }
  2266 
  2267     return NULL;
  2268 }
  2269 
  2270 /* vi: set ts=4 sw=4 expandtab: */