src/video/SDL_blit_A.c
author Sam Lantinga <slouken@libsdl.org>
Tue, 15 Dec 2009 08:11:06 +0000
changeset 3565 f43c8f688f77
parent 3430 baeff5f3203b
child 3697 f7b03b6838cb
permissions -rw-r--r--
Fixed bug #906

Added better error reporting for OpenGL context creation failing.
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2009 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 /* Functions to perform alpha blended blitting */
    28 
     29 /* N->1 blending with per-surface alpha.
         *
         * Walks info->dst_h rows of info->dst_w pixels.  For every pixel the
         * source colour is extracted with DISEMBLE_RGB, the destination colour is
         * looked up in the destination palette via the current 8-bit index, and
         * the two are combined with ALPHA_BLEND using the single alpha value
         * info->a.  The result is packed into a 3-3-2 (RRRGGGBB) index; when
         * info->table is non-NULL that index is translated through the table
         * before being stored.
         *
         * The loop body is passed as one macro argument to DUFFS_LOOP4, so it
         * must not contain a comma outside parentheses.
         */
     30 static void
     31 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
     32 {
     33     int width = info->dst_w;
     34     int height = info->dst_h;
     35     Uint8 *src = info->src;
     36     int srcskip = info->src_skip;
     37     Uint8 *dst = info->dst;
     38     int dstskip = info->dst_skip;
     39     Uint8 *palmap = info->table;
     40     SDL_PixelFormat *srcfmt = info->src_fmt;
     41     SDL_PixelFormat *dstfmt = info->dst_fmt;
     42     int srcbpp = srcfmt->BytesPerPixel;
     43 
     44     const unsigned A = info->a;     /* one alpha value for the whole blit */
     45 
     46     while (height--) {
     47 	    /* *INDENT-OFF* */
     48 	    DUFFS_LOOP4(
     49 	    {
     50 		Uint32 Pixel;
     51 		unsigned sR;
     52 		unsigned sG;
     53 		unsigned sB;
     54 		unsigned dR;
     55 		unsigned dG;
     56 		unsigned dB;
     57 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
     58 		dR = dstfmt->palette->colors[*dst].r;
     59 		dG = dstfmt->palette->colors[*dst].g;
     60 		dB = dstfmt->palette->colors[*dst].b;
     61 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
     62 		dR &= 0xff;	/* keep only the low 8 bits of each blended channel */
     63 		dG &= 0xff;
     64 		dB &= 0xff;
     65 		/* Pack RGB into 8bit pixel */
     66 		if ( palmap == NULL ) {
     67 		    *dst =((dR>>5)<<(3+2))|
     68 			  ((dG>>5)<<(2))|
     69 			  ((dB>>6)<<(0));
     70 		} else {
     71 		    *dst = palmap[((dR>>5)<<(3+2))|
     72 				  ((dG>>5)<<(2))  |
     73 				  ((dB>>6)<<(0))];
     74 		}
     75 		dst++;
     76 		src += srcbpp;
     77 	    },
     78 	    width);
     79 	    /* *INDENT-ON* */
     80         src += srcskip;     /* apply the per-row skips to reach the next row */
     81         dst += dstskip;
     82     }
     83 }
    84 
     85 /* N->1 blending with pixel alpha.
         *
         * Walks info->dst_h rows of info->dst_w pixels.  Each source pixel is
         * split into colour and alpha with DISEMBLE_RGBA; the destination colour
         * comes from the destination palette entry selected by the current 8-bit
         * index.  ALPHA_BLEND combines them using that pixel's own alpha, and the
         * result is packed into a 3-3-2 (RRRGGGBB) index, translated through
         * info->table when that pointer is non-NULL.
         *
         * The loop body is passed as one macro argument to DUFFS_LOOP4, so it
         * must not contain a comma outside parentheses.
         */
     86 static void
     87 BlitNto1PixelAlpha(SDL_BlitInfo * info)
     88 {
     89     int width = info->dst_w;
     90     int height = info->dst_h;
     91     Uint8 *src = info->src;
     92     int srcskip = info->src_skip;
     93     Uint8 *dst = info->dst;
     94     int dstskip = info->dst_skip;
     95     Uint8 *palmap = info->table;
     96     SDL_PixelFormat *srcfmt = info->src_fmt;
     97     SDL_PixelFormat *dstfmt = info->dst_fmt;
     98     int srcbpp = srcfmt->BytesPerPixel;
     99 
    100     /* FIXME: fix alpha bit field expansion here too? */
    101     while (height--) {
    102 	    /* *INDENT-OFF* */
    103 	    DUFFS_LOOP4(
    104 	    {
    105 		Uint32 Pixel;
    106 		unsigned sR;
    107 		unsigned sG;
    108 		unsigned sB;
    109 		unsigned sA;
    110 		unsigned dR;
    111 		unsigned dG;
    112 		unsigned dB;
    113 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
    114 		dR = dstfmt->palette->colors[*dst].r;
    115 		dG = dstfmt->palette->colors[*dst].g;
    116 		dB = dstfmt->palette->colors[*dst].b;
    117 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
    118 		dR &= 0xff;	/* keep only the low 8 bits of each blended channel */
    119 		dG &= 0xff;
    120 		dB &= 0xff;
    121 		/* Pack RGB into 8bit pixel */
    122 		if ( palmap == NULL ) {
    123 		    *dst =((dR>>5)<<(3+2))|
    124 			  ((dG>>5)<<(2))|
    125 			  ((dB>>6)<<(0));
    126 		} else {
    127 		    *dst = palmap[((dR>>5)<<(3+2))|
    128 				  ((dG>>5)<<(2))  |
    129 				  ((dB>>6)<<(0))  ];
    130 		}
    131 		dst++;
    132 		src += srcbpp;
    133 	    },
    134 	    width);
    135 	    /* *INDENT-ON* */
    136         src += srcskip;     /* apply the per-row skips to reach the next row */
    137         dst += dstskip;
    138     }
    139 }
   140 
   141 /* colorkeyed N->1 blending with per-surface alpha */
   142 static void
   143 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   144 {
   145     int width = info->dst_w;
   146     int height = info->dst_h;
   147     Uint8 *src = info->src;
   148     int srcskip = info->src_skip;
   149     Uint8 *dst = info->dst;
   150     int dstskip = info->dst_skip;
   151     Uint8 *palmap = info->table;
   152     SDL_PixelFormat *srcfmt = info->src_fmt;
   153     SDL_PixelFormat *dstfmt = info->dst_fmt;
   154     int srcbpp = srcfmt->BytesPerPixel;
   155     Uint32 ckey = info->colorkey;
   156 
   157     const int A = info->a;
   158 
   159     while (height--) {
   160 	    /* *INDENT-OFF* */
   161 	    DUFFS_LOOP(
   162 	    {
   163 		Uint32 Pixel;
   164 		unsigned sR;
   165 		unsigned sG;
   166 		unsigned sB;
   167 		unsigned dR;
   168 		unsigned dG;
   169 		unsigned dB;
   170 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   171 		if ( Pixel != ckey ) {
   172 		    dR = dstfmt->palette->colors[*dst].r;
   173 		    dG = dstfmt->palette->colors[*dst].g;
   174 		    dB = dstfmt->palette->colors[*dst].b;
   175 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   176 		    dR &= 0xff;
   177 		    dG &= 0xff;
   178 		    dB &= 0xff;
   179 		    /* Pack RGB into 8bit pixel */
   180 		    if ( palmap == NULL ) {
   181 			*dst =((dR>>5)<<(3+2))|
   182 			      ((dG>>5)<<(2)) |
   183 			      ((dB>>6)<<(0));
   184 		    } else {
   185 			*dst = palmap[((dR>>5)<<(3+2))|
   186 				      ((dG>>5)<<(2))  |
   187 				      ((dB>>6)<<(0))  ];
   188 		    }
   189 		}
   190 		dst++;
   191 		src += srcbpp;
   192 	    },
   193 	    width);
   194 	    /* *INDENT-ON* */
   195         src += srcskip;
   196         dst += dstskip;
   197     }
   198 }
   199 
   200 #ifdef __MMX__
   201 
    202 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
         *
         * Each output channel is the average of the source and destination
         * channel: the low bit of every channel is masked off (0x00fefefe), the
         * two values are added and halved, and the bit that is set in both low
         * bits (s & d & 0x00010101) is added back.  The destination format's
         * Amask is then OR-ed into every written pixel.
         *
         * When the row width is odd the first pixel is blended with scalar
         * code; the remaining pixels are processed two at a time through 64-bit
         * MMX loads and stores.  The skips are converted from bytes to 32-bit
         * pixels (>> 2).  _mm_empty() is issued once after all rows.
         */
    203 static void
    204 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
    205 {
    206     int width = info->dst_w;
    207     int height = info->dst_h;
    208     Uint32 *srcp = (Uint32 *) info->src;
    209     int srcskip = info->src_skip >> 2;
    210     Uint32 *dstp = (Uint32 *) info->dst;
    211     int dstskip = info->dst_skip >> 2;
    212     Uint32 dalpha = info->dst_fmt->Amask;
    213 
    214     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
    215 
    216     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
    217     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
    218     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
    219 
    220     while (height--) {
    221         int n = width;
    222         if (n & 1) {    /* odd width: blend one pixel with scalar code first */
    223             Uint32 s = *srcp++;
    224             Uint32 d = *dstp;
    225             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
    226                        + (s & d & 0x00010101)) | dalpha;
    227             n--;
    228         }
    229 
    230         for (n >>= 1; n > 0; --n) {     /* remaining pixels, two per iteration */
    231             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
    232             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
    233 
    234             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
    235             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
    236 
    237             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
    238             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
    239             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
    240             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
    241 
    242             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
    243             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
    244             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
    245             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
    246 
    247             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
    248             dstp += 2;
    249             srcp += 2;
    250         }
    251 
    252         srcp += srcskip;
    253         dstp += dstskip;
    254     }
    255     _mm_empty();
    256 }
   257 
   258 /* fast RGB888->(A)RGB888 blending with surface alpha */
   259 static void
   260 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   261 {
   262     SDL_PixelFormat *df = info->dst_fmt;
   263     Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   264     unsigned alpha = info->a;
   265 
   266     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   267         /* only call a128 version when R,G,B occupy lower bits */
   268         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   269     } else {
   270         int width = info->dst_w;
   271         int height = info->dst_h;
   272         Uint32 *srcp = (Uint32 *) info->src;
   273         int srcskip = info->src_skip >> 2;
   274         Uint32 *dstp = (Uint32 *) info->dst;
   275         int dstskip = info->dst_skip >> 2;
   276         Uint32 dalpha = df->Amask;
   277         Uint32 amult;
   278 
   279         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   280 
   281         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   282         /* form the alpha mult */
   283         amult = alpha | (alpha << 8);
   284         amult = amult | (amult << 16);
   285         chanmask =
   286             (0xff << df->Rshift) | (0xff << df->
   287                                     Gshift) | (0xff << df->Bshift);
   288         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   289         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   290         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   291         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   292 
   293         while (height--) {
   294             int n = width;
   295             if (n & 1) {
   296                 /* One Pixel Blend */
   297                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   298                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   299 
   300                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   301                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   302 
   303                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
   304                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   305                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   306                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   307 
   308                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   309                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   310                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   311 
   312                 ++srcp;
   313                 ++dstp;
   314 
   315                 n--;
   316             }
   317 
   318             for (n >>= 1; n > 0; --n) {
   319                 /* Two Pixels Blend */
   320                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   321                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   322                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   323                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   324 
   325                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   326                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   327                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   328                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   329 
   330                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   331                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   332                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   333                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   334 
   335                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   336                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   337                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   338                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   339 
   340                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   341                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   342 
   343                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   344 
   345                 srcp += 2;
   346                 dstp += 2;
   347             }
   348             srcp += srcskip;
   349             dstp += dstskip;
   350         }
   351         _mm_empty();
   352     }
   353 }
   354 
    355 /* fast ARGB888->(A)RGB888 blending with pixel alpha.
         *
         * For every pixel the source alpha bits (*srcp & amask) select one of
         * three paths:
         *   - zero:            the destination pixel is left unchanged;
         *   - equal to amask:  the source RGB bits replace the destination RGB
         *                      bits, all other destination bits are kept;
         *   - anything else:   each channel becomes
         *                      dst + (((src - dst) * alpha) >> 8) on 16-bit lanes.
         *
         * dmask is all ones except for the 16-bit lane that the alpha byte lands
         * in after bytes are unpacked to 16-bit lanes (bit offset ashift * 2).
         * Masking the multiplier with it makes that lane's product zero, so the
         * destination's alpha byte passes through the add unchanged.
         *
         * The skips are converted from bytes to 32-bit pixels (>> 2).
         * _mm_empty() is issued once after all rows.
         */
    356 static void
    357 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
    358 {
    359     int width = info->dst_w;
    360     int height = info->dst_h;
    361     Uint32 *srcp = (Uint32 *) info->src;
    362     int srcskip = info->src_skip >> 2;
    363     Uint32 *dstp = (Uint32 *) info->dst;
    364     int dstskip = info->dst_skip >> 2;
    365     SDL_PixelFormat *sf = info->src_fmt;
    366     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
    367     Uint32 amask = sf->Amask;
    368     Uint32 ashift = sf->Ashift;
    369     Uint64 multmask;
    370 
    371     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
    372 
    373     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
    374     multmask = 0xFFFF;
    375     multmask <<= (ashift * 2);  /* 16-bit lane of the alpha byte after unpacking */
    376     multmask = ~multmask;
    377     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
    378 
    379     while (height--) {
    380 		/* *INDENT-OFF* */
    381 		DUFFS_LOOP4({
    382 		Uint32 alpha = *srcp & amask;
    383 		if (alpha == 0) {
    384 			/* do nothing */
    385 		} else if (alpha == amask) {
    386 			/* opaque alpha -- copy RGB, keep dst alpha */
    387 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
    388 		} else {
    389 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
    390 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
    391 
    392 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
    393 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
    394 
    395 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
    396 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
    397 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
    398 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
    399 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
    400 
    401 			/* blend */		    
    402 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
    403 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
    404 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
    405 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
    406 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
    407 			
    408 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
    409 		}
    410 		++srcp;
    411 		++dstp;
    412 	    }, width);
    413 		/* *INDENT-ON* */
    414         srcp += srcskip;
    415         dstp += dstskip;
    416     }
    417     _mm_empty();
    418 }
   419 
   420 #endif /* __MMX__ */
   421 
   422 #if SDL_ALTIVEC_BLITTERS
   423 #if __MWERKS__
   424 #pragma altivec_model on
   425 #endif
   426 #if HAVE_ALTIVEC_H
   427 #include <altivec.h>
   428 #endif
   429 #include <assert.h>
   430 
    431 #if (defined(__MACOSX__) && (__GNUC__ < 4))     /* parenthesized vector-literal form */
    432 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
    433         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
    434 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
    435         (vector unsigned short) ( a,b,c,d,e,f,g,h )
    436 #else   /* brace-initializer vector-literal form */
    437 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
    438         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
    439 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
    440         (vector unsigned short) { a,b,c,d,e,f,g,h }
    441 #endif
    442 
         /* Low four bits of the address x: non-zero when x is not a multiple of 16. */
    443 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
         /* Debug helper: prints msg followed by the four 32-bit words of v in hex. */
    444 #define VECPRINT(msg, v) do { \
    445     vector unsigned int tmpvec = (vector unsigned int)(v); \
    446     unsigned int *vp = (unsigned int *)&tmpvec; \
    447     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
    448 } while (0)
    449 
    450 /* the permutation vector that takes the high bytes out of all the appropriate shorts 
    451     (vector unsigned char)(
    452         0x00, 0x10, 0x02, 0x12,
    453         0x04, 0x14, 0x06, 0x16,
    454         0x08, 0x18, 0x0A, 0x1A,
    455         0x0C, 0x1C, 0x0E, 0x1E );
         It is built without a literal: the byte indices 0..15 produced by
         vec_lvsl(0, NULL) plus the halfword 0x000F viewed as bytes (0x00, 0x0F).
    456 */
    457 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
         /* Four 32-bit words each holding 24 (12 + 12). */
    458 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
         /* All-ones words shifted left by 24, i.e. 0xFF000000 in every 32-bit word. */
    459 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
         /* Permute control used to realign data loaded from src: vec_lvsl(0, src)
            when src is not 16-byte aligned, otherwise vec_lvsl(8, src) with 8 added
            to every byte. */
    460 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    461     ? vec_lvsl(0, src) \
    462     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
    463 
    464 
         /* Blend vs into vd byte by byte: t = vs * valpha + vd * (255 - valpha) is
            formed as 16-bit products (even and odd bytes separately), then
            (t + 1) + ((t + 1) >> 8) is computed and its high byte is taken through
            mergePermute.  v1_16 and v8_16 are halfword vectors holding 1 and 8.
            vd is overwritten with the result. */
    465 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    466     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
    467     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
    468     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
    469     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
    470     /* valpha2 is 255-alpha */ \
    471     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
    472     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
    473     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
    474     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
    475     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
    476     /* add source and dest */ \
    477     vtemp1 = vec_add(vtemp1, vtemp3); \
    478     vtemp2 = vec_add(vtemp2, vtemp4); \
    479     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
    480     vtemp1 = vec_add(vtemp1, v1_16); \
    481     vtemp3 = vec_sr(vtemp1, v8_16); \
    482     vtemp1 = vec_add(vtemp1, vtemp3); \
    483     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
    484     vtemp2 = vec_add(vtemp2, v1_16); \
    485     vtemp4 = vec_sr(vtemp2, v8_16); \
    486     vtemp2 = vec_add(vtemp2, vtemp4); \
    487     /* (>>8) and get ARGBARGBARGBARGB */ \
    488     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
    489 } while (0)
   490 
    491 /* Calculate the permute vector used for 32->32 swizzling.
         *
         * Returns a 16-byte permute control that rearranges the bytes of four
         * 32-bit pixels from srcfmt's channel order into dstfmt's.  A NULL
         * srcfmt or dstfmt stands for the built-in ARGB layout below (shifts
         * 16, 8, 0, 24).
         *
         * For each channel the source byte index is 3 - (shift / 8), so a shift
         * of 24 maps to byte 0, and that index is placed at the destination
         * channel's bit position.  The resulting 32-bit word is splatted across
         * the vector and 0, 4, 8, 12 is added per pixel.
         */
    492 static vector unsigned char
    493 calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
    494 {
    495     /*
    496      * We have to assume that the bits that aren't used by other
    497      *  colors is alpha, and it's one complete byte, since some formats
    498      *  leave alpha with a zero mask, but we should still swizzle the bits.
    499      */
    500     /* ARGB */
         /* NOTE(review): positional initializer - the values must line up with
            SDL_PixelFormat's field order, which is not visible here. */
    501     const static struct SDL_PixelFormat default_pixel_format = {
    502         NULL, 0, 0,
    503         0, 0, 0, 0,
    504         16, 8, 0, 24,
    505         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000
    506     };
    507     if (!srcfmt) {
    508         srcfmt = &default_pixel_format;
    509     }
    510     if (!dstfmt) {
    511         dstfmt = &default_pixel_format;
    512     }
         /* per-pixel byte offsets: 0 for pixel 0, 4 for pixel 1, 8, 12 */
    513     const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
    514                                                        0x04, 0x04, 0x04, 0x04,
    515                                                        0x08, 0x08, 0x08, 0x08,
    516                                                        0x0C, 0x0C, 0x0C,
    517                                                        0x0C);
    518     vector unsigned char vswiz;
    519     vector unsigned int srcvec;
    520 #define RESHIFT(X) (3 - ((X) >> 3))     /* bit shift -> byte index, shift 24 -> 0 */
    521     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
    522     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
    523     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
    524     Uint32 amask;
    525     /* Use zero for alpha if either surface doesn't have alpha */
         /* Index 0x10 presumably selects from the second vec_perm operand, which
            callers would supply as zero - confirm against the call sites. */
    526     if (dstfmt->Amask) {
    527         amask =
    528             ((srcfmt->Amask) ? RESHIFT(srcfmt->
    529                                        Ashift) : 0x10) << (dstfmt->Ashift);
    530     } else {
         /* no destination alpha mask: put 0x10 in every byte not covered by the
            destination's R, G or B mask */
    531         amask =
    532             0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
    533                           0xFFFFFFFF);
    534     }
    535 #undef RESHIFT
         /* only word 0 of srcvec is written; vec_splat() below copies it to all four */
    536     ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
    537     vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
    538     return (vswiz);
    539 }
   540 
   541 static void
   542 Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
   543 {
   544     int height = info->dst_h;
   545     Uint8 *src = (Uint8 *) info->src;
   546     int srcskip = info->src_skip;
   547     Uint8 *dst = (Uint8 *) info->dst;
   548     int dstskip = info->dst_skip;
   549     SDL_PixelFormat *srcfmt = info->src_fmt;
   550 
   551     vector unsigned char v0 = vec_splat_u8(0);
   552     vector unsigned short v8_16 = vec_splat_u16(8);
   553     vector unsigned short v1_16 = vec_splat_u16(1);
   554     vector unsigned short v2_16 = vec_splat_u16(2);
   555     vector unsigned short v3_16 = vec_splat_u16(3);
   556     vector unsigned int v8_32 = vec_splat_u32(8);
   557     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
   558     vector unsigned short v3f =
   559         VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
   560                           0x003f, 0x003f, 0x003f, 0x003f);
   561     vector unsigned short vfc =
   562         VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
   563                           0x00fc, 0x00fc, 0x00fc, 0x00fc);
   564 
   565     /* 
   566        0x10 - 0x1f is the alpha
   567        0x00 - 0x0e evens are the red
   568        0x01 - 0x0f odds are zero
   569      */
   570     vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
   571                                                        0x10, 0x02, 0x01, 0x01,
   572                                                        0x10, 0x04, 0x01, 0x01,
   573                                                        0x10, 0x06, 0x01,
   574                                                        0x01);
   575     vector unsigned char vredalpha2 =
   576         (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
   577                                         vec_sl(v8_32, v16_32))
   578         );
   579     /*
   580        0x00 - 0x0f is ARxx ARxx ARxx ARxx
   581        0x11 - 0x0f odds are blue
   582      */
   583     vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
   584                                                    0x04, 0x05, 0x06, 0x13,
   585                                                    0x08, 0x09, 0x0a, 0x15,
   586                                                    0x0c, 0x0d, 0x0e, 0x17);
   587     vector unsigned char vblue2 =
   588         (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
   589         );
   590     /*
   591        0x00 - 0x0f is ARxB ARxB ARxB ARxB
   592        0x10 - 0x0e evens are green
   593      */
   594     vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
   595                                                     0x04, 0x05, 0x12, 0x07,
   596                                                     0x08, 0x09, 0x14, 0x0b,
   597                                                     0x0c, 0x0d, 0x16, 0x0f);
   598     vector unsigned char vgreen2 =
   599         (vector unsigned
   600          char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8_32, v8_32))
   601         );
   602     vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
   603                                                     0x00, 0x0a, 0x00, 0x0e,
   604                                                     0x00, 0x12, 0x00, 0x16,
   605                                                     0x00, 0x1a, 0x00, 0x1e);
   606     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
   607     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
   608     vector unsigned char valphaPermute =
   609         vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   610 
   611     vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
   612     vf800 = vec_sl(vf800, vec_splat_u16(8));
   613 
   614     while (height--) {
   615         int extrawidth;
   616         vector unsigned char valigner;
   617         vector unsigned char vsrc;
   618         vector unsigned char voverflow;
   619         int width = info->dst_w;
   620 
   621 #define ONE_PIXEL_BLEND(condition, widthvar) \
   622         while (condition) { \
   623             Uint32 Pixel; \
   624             unsigned sR, sG, sB, dR, dG, dB, sA; \
   625             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   626             if(sA) { \
   627                 unsigned short dstpixel = *((unsigned short *)dst); \
   628                 dR = (dstpixel >> 8) & 0xf8; \
   629                 dG = (dstpixel >> 3) & 0xfc; \
   630                 dB = (dstpixel << 3) & 0xf8; \
   631                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   632                 *((unsigned short *)dst) = ( \
   633                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
   634                 ); \
   635             } \
   636             src += 4; \
   637             dst += 2; \
   638             widthvar--; \
   639         }
   640         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
   641         extrawidth = (width % 8);
   642         valigner = VEC_ALIGNER(src);
   643         vsrc = (vector unsigned char) vec_ld(0, src);
   644         width -= extrawidth;
   645         while (width) {
   646             vector unsigned char valpha;
   647             vector unsigned char vsrc1, vsrc2;
   648             vector unsigned char vdst1, vdst2;
   649             vector unsigned short vR, vG, vB;
   650             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   651 
   652             /* Load 8 pixels from src as ARGB */
   653             voverflow = (vector unsigned char) vec_ld(15, src);
   654             vsrc = vec_perm(vsrc, voverflow, valigner);
   655             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
   656             src += 16;
   657             vsrc = (vector unsigned char) vec_ld(15, src);
   658             voverflow = vec_perm(voverflow, vsrc, valigner);
   659             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
   660             src += 16;
   661 
   662             /* Load 8 pixels from dst as XRGB */
   663             voverflow = vec_ld(0, dst);
   664             vR = vec_and((vector unsigned short) voverflow, vf800);
   665             vB = vec_sl((vector unsigned short) voverflow, v3_16);
   666             vG = vec_sl(vB, v2_16);
   667             vdst1 =
   668                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   669                                                 (vector unsigned char) vR,
   670                                                 vredalpha1);
   671             vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
   672             vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
   673             vdst2 =
   674                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   675                                                 (vector unsigned char) vR,
   676                                                 vredalpha2);
   677             vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
   678             vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
   679 
   680             /* Alpha blend 8 pixels as ARGB */
   681             valpha = vec_perm(vsrc1, v0, valphaPermute);
   682             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
   683                                v8_16);
   684             valpha = vec_perm(vsrc2, v0, valphaPermute);
   685             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
   686                                v8_16);
   687 
   688             /* Convert 8 pixels to 565 */
   689             vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
   690                                                         vdst1,
   691                                                         (vector unsigned int)
   692                                                         vdst2);
   693             vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
   694             vgpixel = vec_and(vgpixel, vfc);
   695             vgpixel = vec_sl(vgpixel, v3_16);
   696             vrpixel = vec_sl(vpixel, v1_16);
   697             vrpixel = vec_and(vrpixel, vf800);
   698             vbpixel = vec_and(vpixel, v3f);
   699             vdst1 =
   700                 vec_or((vector unsigned char) vrpixel,
   701                        (vector unsigned char) vgpixel);
   702             vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);
   703 
   704             /* Store 8 pixels */
   705             vec_st(vdst1, 0, dst);
   706 
   707             width -= 8;
   708             dst += 16;
   709         }
   710         ONE_PIXEL_BLEND((extrawidth), extrawidth);
   711 #undef ONE_PIXEL_BLEND
   712         src += srcskip;
   713         dst += dstskip;
   714     }
   715 }
   716 
   717 static void
   718 Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
   719 {
   720     int height = info->dst_h;
   721     Uint32 *srcp = (Uint32 *) info->src;
   722     int srcskip = info->src_skip >> 2;
   723     Uint32 *dstp = (Uint32 *) info->dst;
   724     int dstskip = info->dst_skip >> 2;
   725     SDL_PixelFormat *srcfmt = info->src_fmt;
   726     SDL_PixelFormat *dstfmt = info->dst_fmt;
   727     unsigned sA = info->a;
   728     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   729     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   730     Uint32 ckey = info->colorkey;
   731     vector unsigned char mergePermute;
   732     vector unsigned char vsrcPermute;
   733     vector unsigned char vdstPermute;
   734     vector unsigned char vsdstPermute;
   735     vector unsigned char valpha;
   736     vector unsigned char valphamask;
   737     vector unsigned char vbits;
   738     vector unsigned char v0;
   739     vector unsigned short v1;
   740     vector unsigned short v8;
   741     vector unsigned int vckey;
   742     vector unsigned int vrgbmask;
   743 
   744     mergePermute = VEC_MERGE_PERMUTE();
   745     v0 = vec_splat_u8(0);
   746     v1 = vec_splat_u16(1);
   747     v8 = vec_splat_u16(8);
   748 
   749     /* set the alpha to 255 on the destination surf */
   750     valphamask = VEC_ALPHA_MASK();
   751 
   752     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   753     vdstPermute = calc_swizzle32(NULL, dstfmt);
   754     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   755 
   756     /* set a vector full of alpha and 255-alpha */
   757     ((unsigned char *) &valpha)[0] = sA;
   758     valpha = vec_splat(valpha, 0);
   759     vbits = (vector unsigned char) vec_splat_s8(-1);
   760 
   761     ckey &= rgbmask;
   762     ((unsigned int *) (char *) &vckey)[0] = ckey;
   763     vckey = vec_splat(vckey, 0);
   764     ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
   765     vrgbmask = vec_splat(vrgbmask, 0);
   766 
   767     while (height--) {
   768         int width = info->dst_w;
   769 #define ONE_PIXEL_BLEND(condition, widthvar) \
   770         while (condition) { \
   771             Uint32 Pixel; \
   772             unsigned sR, sG, sB, dR, dG, dB; \
   773             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
   774             if(sA && Pixel != ckey) { \
   775                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
   776                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
   777                 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   778                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
   779             } \
   780             dstp++; \
   781             srcp++; \
   782             widthvar--; \
   783         }
   784         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   785         if (width > 0) {
   786             int extrawidth = (width % 4);
   787             vector unsigned char valigner = VEC_ALIGNER(srcp);
   788             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
   789             width -= extrawidth;
   790             while (width) {
   791                 vector unsigned char vsel;
   792                 vector unsigned char voverflow;
   793                 vector unsigned char vd;
   794                 vector unsigned char vd_orig;
   795 
   796                 /* s = *srcp */
   797                 voverflow = (vector unsigned char) vec_ld(15, srcp);
   798                 vs = vec_perm(vs, voverflow, valigner);
   799 
   800                 /* vsel is set for items that match the key */
   801                 vsel =
   802                     (vector unsigned char) vec_and((vector unsigned int) vs,
   803                                                    vrgbmask);
   804                 vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
   805                                                         vsel, vckey);
   806 
   807                 /* permute to source format */
   808                 vs = vec_perm(vs, valpha, vsrcPermute);
   809 
   810                 /* d = *dstp */
   811                 vd = (vector unsigned char) vec_ld(0, dstp);
   812                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
   813 
   814                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   815 
   816                 /* set the alpha channel to full on */
   817                 vd = vec_or(vd, valphamask);
   818 
   819                 /* mask out color key */
   820                 vd = vec_sel(vd, vd_orig, vsel);
   821 
   822                 /* permute to dest format */
   823                 vd = vec_perm(vd, vbits, vdstPermute);
   824 
   825                 /* *dstp = res */
   826                 vec_st((vector unsigned int) vd, 0, dstp);
   827 
   828                 srcp += 4;
   829                 dstp += 4;
   830                 width -= 4;
   831                 vs = voverflow;
   832             }
   833             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   834         }
   835 #undef ONE_PIXEL_BLEND
   836 
   837         srcp += srcskip;
   838         dstp += dstskip;
   839     }
   840 }
   841 
   842 
   843 static void
   844 Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)
   845 {
   846     int width = info->dst_w;
   847     int height = info->dst_h;
   848     Uint32 *srcp = (Uint32 *) info->src;
   849     int srcskip = info->src_skip >> 2;
   850     Uint32 *dstp = (Uint32 *) info->dst;
   851     int dstskip = info->dst_skip >> 2;
   852     SDL_PixelFormat *srcfmt = info->src_fmt;
   853     SDL_PixelFormat *dstfmt = info->dst_fmt;
   854     vector unsigned char mergePermute;
   855     vector unsigned char valphaPermute;
   856     vector unsigned char vsrcPermute;
   857     vector unsigned char vdstPermute;
   858     vector unsigned char vsdstPermute;
   859     vector unsigned char valphamask;
   860     vector unsigned char vpixelmask;
   861     vector unsigned char v0;
   862     vector unsigned short v1;
   863     vector unsigned short v8;
   864 
   865     v0 = vec_splat_u8(0);
   866     v1 = vec_splat_u16(1);
   867     v8 = vec_splat_u16(8);
   868     mergePermute = VEC_MERGE_PERMUTE();
   869     valphamask = VEC_ALPHA_MASK();
   870     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   871     vpixelmask = vec_nor(valphamask, v0);
   872     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   873     vdstPermute = calc_swizzle32(NULL, dstfmt);
   874     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   875 
   876     while (height--) {
   877         width = info->dst_w;
   878 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   879             Uint32 Pixel; \
   880             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
   881             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   882             if(sA) { \
   883               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
   884               ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   885               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
   886             } \
   887             ++srcp; \
   888             ++dstp; \
   889             widthvar--; \
   890         }
   891         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   892         if (width > 0) {
   893             /* vsrcPermute */
   894             /* vdstPermute */
   895             int extrawidth = (width % 4);
   896             vector unsigned char valigner = VEC_ALIGNER(srcp);
   897             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
   898             width -= extrawidth;
   899             while (width) {
   900                 vector unsigned char voverflow;
   901                 vector unsigned char vd;
   902                 vector unsigned char valpha;
   903                 vector unsigned char vdstalpha;
   904                 /* s = *srcp */
   905                 voverflow = (vector unsigned char) vec_ld(15, srcp);
   906                 vs = vec_perm(vs, voverflow, valigner);
   907                 vs = vec_perm(vs, v0, vsrcPermute);
   908 
   909                 valpha = vec_perm(vs, v0, valphaPermute);
   910 
   911                 /* d = *dstp */
   912                 vd = (vector unsigned char) vec_ld(0, dstp);
   913                 vd = vec_perm(vd, v0, vsdstPermute);
   914                 vdstalpha = vec_and(vd, valphamask);
   915 
   916                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   917 
   918                 /* set the alpha to the dest alpha */
   919                 vd = vec_and(vd, vpixelmask);
   920                 vd = vec_or(vd, vdstalpha);
   921                 vd = vec_perm(vd, v0, vdstPermute);
   922 
   923                 /* *dstp = res */
   924                 vec_st((vector unsigned int) vd, 0, dstp);
   925 
   926                 srcp += 4;
   927                 dstp += 4;
   928                 width -= 4;
   929                 vs = voverflow;
   930 
   931             }
   932             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   933         }
   934         srcp += srcskip;
   935         dstp += dstskip;
   936 #undef ONE_PIXEL_BLEND
   937     }
   938 }
   939 
   940 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   941 static void
   942 BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
   943 {
   944     int width = info->dst_w;
   945     int height = info->dst_h;
   946     Uint32 *srcp = (Uint32 *) info->src;
   947     int srcskip = info->src_skip >> 2;
   948     Uint32 *dstp = (Uint32 *) info->dst;
   949     int dstskip = info->dst_skip >> 2;
   950     vector unsigned char mergePermute;
   951     vector unsigned char valphaPermute;
   952     vector unsigned char valphamask;
   953     vector unsigned char vpixelmask;
   954     vector unsigned char v0;
   955     vector unsigned short v1;
   956     vector unsigned short v8;
   957     v0 = vec_splat_u8(0);
   958     v1 = vec_splat_u16(1);
   959     v8 = vec_splat_u16(8);
   960     mergePermute = VEC_MERGE_PERMUTE();
   961     valphamask = VEC_ALPHA_MASK();
   962     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   963 
   964 
   965     vpixelmask = vec_nor(valphamask, v0);
   966     while (height--) {
   967         width = info->dst_w;
   968 #define ONE_PIXEL_BLEND(condition, widthvar) \
   969         while ((condition)) { \
   970             Uint32 dalpha; \
   971             Uint32 d; \
   972             Uint32 s1; \
   973             Uint32 d1; \
   974             Uint32 s = *srcp; \
   975             Uint32 alpha = s >> 24; \
   976             if(alpha) { \
   977               if(alpha == SDL_ALPHA_OPAQUE) { \
   978                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
   979               } else { \
   980                 d = *dstp; \
   981                 dalpha = d & 0xff000000; \
   982                 s1 = s & 0xff00ff; \
   983                 d1 = d & 0xff00ff; \
   984                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
   985                 s &= 0xff00; \
   986                 d &= 0xff00; \
   987                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
   988                 *dstp = d1 | d | dalpha; \
   989               } \
   990             } \
   991             ++srcp; \
   992             ++dstp; \
   993             widthvar--; \
   994 	    }
   995         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   996         if (width > 0) {
   997             int extrawidth = (width % 4);
   998             vector unsigned char valigner = VEC_ALIGNER(srcp);
   999             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1000             width -= extrawidth;
  1001             while (width) {
  1002                 vector unsigned char voverflow;
  1003                 vector unsigned char vd;
  1004                 vector unsigned char valpha;
  1005                 vector unsigned char vdstalpha;
  1006                 /* s = *srcp */
  1007                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1008                 vs = vec_perm(vs, voverflow, valigner);
  1009 
  1010                 valpha = vec_perm(vs, v0, valphaPermute);
  1011 
  1012                 /* d = *dstp */
  1013                 vd = (vector unsigned char) vec_ld(0, dstp);
  1014                 vdstalpha = vec_and(vd, valphamask);
  1015 
  1016                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1017 
  1018                 /* set the alpha to the dest alpha */
  1019                 vd = vec_and(vd, vpixelmask);
  1020                 vd = vec_or(vd, vdstalpha);
  1021 
  1022                 /* *dstp = res */
  1023                 vec_st((vector unsigned int) vd, 0, dstp);
  1024 
  1025                 srcp += 4;
  1026                 dstp += 4;
  1027                 width -= 4;
  1028                 vs = voverflow;
  1029             }
  1030             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1031         }
  1032         srcp += srcskip;
  1033         dstp += dstskip;
  1034     }
  1035 #undef ONE_PIXEL_BLEND
  1036 }
  1037 
  1038 static void
  1039 Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info)
  1040 {
  1041     /* XXX : 6 */
  1042     int height = info->dst_h;
  1043     Uint32 *srcp = (Uint32 *) info->src;
  1044     int srcskip = info->src_skip >> 2;
  1045     Uint32 *dstp = (Uint32 *) info->dst;
  1046     int dstskip = info->dst_skip >> 2;
  1047     SDL_PixelFormat *srcfmt = info->src_fmt;
  1048     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1049     unsigned sA = info->a;
  1050     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1051     vector unsigned char mergePermute;
  1052     vector unsigned char vsrcPermute;
  1053     vector unsigned char vdstPermute;
  1054     vector unsigned char vsdstPermute;
  1055     vector unsigned char valpha;
  1056     vector unsigned char valphamask;
  1057     vector unsigned char vbits;
  1058     vector unsigned short v1;
  1059     vector unsigned short v8;
  1060 
  1061     mergePermute = VEC_MERGE_PERMUTE();
  1062     v1 = vec_splat_u16(1);
  1063     v8 = vec_splat_u16(8);
  1064 
  1065     /* set the alpha to 255 on the destination surf */
  1066     valphamask = VEC_ALPHA_MASK();
  1067 
  1068     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1069     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1070     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1071 
  1072     /* set a vector full of alpha and 255-alpha */
  1073     ((unsigned char *) &valpha)[0] = sA;
  1074     valpha = vec_splat(valpha, 0);
  1075     vbits = (vector unsigned char) vec_splat_s8(-1);
  1076 
  1077     while (height--) {
  1078         int width = info->dst_w;
  1079 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1080             Uint32 Pixel; \
  1081             unsigned sR, sG, sB, dR, dG, dB; \
  1082             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  1083             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1084             ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1085             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1086             ++srcp; \
  1087             ++dstp; \
  1088             widthvar--; \
  1089         }
  1090         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1091         if (width > 0) {
  1092             int extrawidth = (width % 4);
  1093             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1094             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1095             width -= extrawidth;
  1096             while (width) {
  1097                 vector unsigned char voverflow;
  1098                 vector unsigned char vd;
  1099 
  1100                 /* s = *srcp */
  1101                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1102                 vs = vec_perm(vs, voverflow, valigner);
  1103                 vs = vec_perm(vs, valpha, vsrcPermute);
  1104 
  1105                 /* d = *dstp */
  1106                 vd = (vector unsigned char) vec_ld(0, dstp);
  1107                 vd = vec_perm(vd, vd, vsdstPermute);
  1108 
  1109                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1110 
  1111                 /* set the alpha channel to full on */
  1112                 vd = vec_or(vd, valphamask);
  1113                 vd = vec_perm(vd, vbits, vdstPermute);
  1114 
  1115                 /* *dstp = res */
  1116                 vec_st((vector unsigned int) vd, 0, dstp);
  1117 
  1118                 srcp += 4;
  1119                 dstp += 4;
  1120                 width -= 4;
  1121                 vs = voverflow;
  1122             }
  1123             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1124         }
  1125 #undef ONE_PIXEL_BLEND
  1126 
  1127         srcp += srcskip;
  1128         dstp += dstskip;
  1129     }
  1130 
  1131 }
  1132 
  1133 
  1134 /* fast RGB888->(A)RGB888 blending */
  1135 static void
  1136 BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
  1137 {
  1138     unsigned alpha = info->a;
  1139     int height = info->dst_h;
  1140     Uint32 *srcp = (Uint32 *) info->src;
  1141     int srcskip = info->src_skip >> 2;
  1142     Uint32 *dstp = (Uint32 *) info->dst;
  1143     int dstskip = info->dst_skip >> 2;
  1144     vector unsigned char mergePermute;
  1145     vector unsigned char valpha;
  1146     vector unsigned char valphamask;
  1147     vector unsigned short v1;
  1148     vector unsigned short v8;
  1149 
  1150     mergePermute = VEC_MERGE_PERMUTE();
  1151     v1 = vec_splat_u16(1);
  1152     v8 = vec_splat_u16(8);
  1153 
  1154     /* set the alpha to 255 on the destination surf */
  1155     valphamask = VEC_ALPHA_MASK();
  1156 
  1157     /* set a vector full of alpha and 255-alpha */
  1158     ((unsigned char *) &valpha)[0] = alpha;
  1159     valpha = vec_splat(valpha, 0);
  1160 
  1161     while (height--) {
  1162         int width = info->dst_w;
  1163 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1164             Uint32 s = *srcp; \
  1165             Uint32 d = *dstp; \
  1166             Uint32 s1 = s & 0xff00ff; \
  1167             Uint32 d1 = d & 0xff00ff; \
  1168             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  1169                  & 0xff00ff; \
  1170             s &= 0xff00; \
  1171             d &= 0xff00; \
  1172             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1173             *dstp = d1 | d | 0xff000000; \
  1174             ++srcp; \
  1175             ++dstp; \
  1176             widthvar--; \
  1177         }
  1178         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1179         if (width > 0) {
  1180             int extrawidth = (width % 4);
  1181             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1182             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1183             width -= extrawidth;
  1184             while (width) {
  1185                 vector unsigned char voverflow;
  1186                 vector unsigned char vd;
  1187 
  1188                 /* s = *srcp */
  1189                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1190                 vs = vec_perm(vs, voverflow, valigner);
  1191 
  1192                 /* d = *dstp */
  1193                 vd = (vector unsigned char) vec_ld(0, dstp);
  1194 
  1195                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1196 
  1197                 /* set the alpha channel to full on */
  1198                 vd = vec_or(vd, valphamask);
  1199 
  1200                 /* *dstp = res */
  1201                 vec_st((vector unsigned int) vd, 0, dstp);
  1202 
  1203                 srcp += 4;
  1204                 dstp += 4;
  1205                 width -= 4;
  1206                 vs = voverflow;
  1207             }
  1208             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1209         }
  1210 #undef ONE_PIXEL_BLEND
  1211 
  1212         srcp += srcskip;
  1213         dstp += dstskip;
  1214     }
  1215 }
  1216 
  1217 #if __MWERKS__
  1218 #pragma altivec_model off
  1219 #endif
  1220 #endif /* SDL_ALTIVEC_BLITTERS */
  1221 
  1222 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1223 static void
  1224 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
  1225 {
  1226     int width = info->dst_w;
  1227     int height = info->dst_h;
  1228     Uint32 *srcp = (Uint32 *) info->src;
  1229     int srcskip = info->src_skip >> 2;
  1230     Uint32 *dstp = (Uint32 *) info->dst;
  1231     int dstskip = info->dst_skip >> 2;
  1232 
  1233     while (height--) {
  1234 	    /* *INDENT-OFF* */
  1235 	    DUFFS_LOOP4({
  1236 		    Uint32 s = *srcp++;
  1237 		    Uint32 d = *dstp;
  1238 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1239 			       + (s & d & 0x00010101)) | 0xff000000;
  1240 	    }, width);
  1241 	    /* *INDENT-ON* */
  1242         srcp += srcskip;
  1243         dstp += dstskip;
  1244     }
  1245 }
  1246 
  1247 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1248 static void
  1249 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
  1250 {
  1251     unsigned alpha = info->a;
  1252     if (alpha == 128) {
  1253         BlitRGBtoRGBSurfaceAlpha128(info);
  1254     } else {
  1255         int width = info->dst_w;
  1256         int height = info->dst_h;
  1257         Uint32 *srcp = (Uint32 *) info->src;
  1258         int srcskip = info->src_skip >> 2;
  1259         Uint32 *dstp = (Uint32 *) info->dst;
  1260         int dstskip = info->dst_skip >> 2;
  1261         Uint32 s;
  1262         Uint32 d;
  1263         Uint32 s1;
  1264         Uint32 d1;
  1265 
  1266         while (height--) {
  1267 			/* *INDENT-OFF* */
  1268 			DUFFS_LOOP4({
  1269 				s = *srcp;
  1270 				d = *dstp;
  1271 				s1 = s & 0xff00ff;
  1272 				d1 = d & 0xff00ff;
  1273 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
  1274 				     & 0xff00ff;
  1275 				s &= 0xff00;
  1276 				d &= 0xff00;
  1277 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1278 				*dstp = d1 | d | 0xff000000;
  1279 				++srcp;
  1280 				++dstp;
  1281 			}, width);
  1282 			/* *INDENT-ON* */
  1283             srcp += srcskip;
  1284             dstp += dstskip;
  1285         }
  1286     }
  1287 }
  1288 
  1289 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1290 static void
  1291 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
  1292 {
  1293     int width = info->dst_w;
  1294     int height = info->dst_h;
  1295     Uint32 *srcp = (Uint32 *) info->src;
  1296     int srcskip = info->src_skip >> 2;
  1297     Uint32 *dstp = (Uint32 *) info->dst;
  1298     int dstskip = info->dst_skip >> 2;
  1299 
  1300     while (height--) {
  1301 	    /* *INDENT-OFF* */
  1302 	    DUFFS_LOOP4({
  1303 		Uint32 dalpha;
  1304 		Uint32 d;
  1305 		Uint32 s1;
  1306 		Uint32 d1;
  1307 		Uint32 s = *srcp;
  1308 		Uint32 alpha = s >> 24;
  1309 		/* FIXME: Here we special-case opaque alpha since the
  1310 		   compositioning used (>>8 instead of /255) doesn't handle
  1311 		   it correctly. Also special-case alpha=0 for speed?
  1312 		   Benchmark this! */
  1313 		if(alpha) {   
  1314 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1315 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1316 		  } else {
  1317 		    /*
  1318 		     * take out the middle component (green), and process
  1319 		     * the other two in parallel. One multiply less.
  1320 		     */
  1321 		    d = *dstp;
  1322 		    dalpha = d & 0xff000000;
  1323 		    s1 = s & 0xff00ff;
  1324 		    d1 = d & 0xff00ff;
  1325 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1326 		    s &= 0xff00;
  1327 		    d &= 0xff00;
  1328 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1329 		    *dstp = d1 | d | dalpha;
  1330 		  }
  1331 		}
  1332 		++srcp;
  1333 		++dstp;
  1334 	    }, width);
  1335 	    /* *INDENT-ON* */
  1336         srcp += srcskip;
  1337         dstp += dstskip;
  1338     }
  1339 }
  1340 
  1341 #ifdef __3dNOW__
  1342 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1343 static void
  1344 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1345 {
  1346     int width = info->dst_w;
  1347     int height = info->dst_h;
  1348     Uint32 *srcp = (Uint32 *) info->src;
  1349     int srcskip = info->src_skip >> 2;
  1350     Uint32 *dstp = (Uint32 *) info->dst;
  1351     int dstskip = info->dst_skip >> 2;
  1352     SDL_PixelFormat *sf = info->src_fmt;
  1353     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  1354     Uint32 amask = sf->Amask;
  1355     Uint32 ashift = sf->Ashift;
  1356     Uint64 multmask;
  1357 
  1358     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1359 
  1360     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
  1361     multmask = 0xFFFF;
  1362     multmask <<= (ashift * 2);
  1363     multmask = ~multmask;
  1364     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
  1365 
  1366     while (height--) {
  1367 	    /* *INDENT-OFF* */
  1368 	    DUFFS_LOOP4({
  1369 		Uint32 alpha;
  1370 
  1371 		_m_prefetch(srcp + 16);
  1372 		_m_prefetch(dstp + 16);
  1373 
  1374 		alpha = *srcp & amask;
  1375 		if (alpha == 0) {
  1376 			/* do nothing */
  1377 		} else if (alpha == amask) {
  1378 			/* copy RGB, keep dst alpha */
  1379 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  1380 		} else {
  1381 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  1382 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  1383 
  1384 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  1385 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  1386 
  1387 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  1388 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  1389 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  1390 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  1391 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  1392 
  1393 			/* blend */		    
  1394 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  1395 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  1396 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  1397 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  1398 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  1399 			
  1400 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  1401 		}
  1402 		++srcp;
  1403 		++dstp;
  1404 	    }, width);
  1405 	    /* *INDENT-ON* */
  1406         srcp += srcskip;
  1407         dstp += dstskip;
  1408     }
  1409     _mm_empty();
  1410 }
  1411 
#endif /* __3dNOW__ */
  1413 
  1414 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  1415 
  1416 /* blend a single 16 bit pixel at 50% */
  1417 #define BLEND16_50(d, s, mask)						\
  1418 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
  1419 
  1420 /* blend two 16 bit pixels at 50% */
  1421 #define BLEND2x16_50(d, s, mask)					     \
  1422 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1423 	 + (s & d & (~(mask | mask << 16))))
  1424 
  1425 static void
  1426 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
  1427 {
  1428     int width = info->dst_w;
  1429     int height = info->dst_h;
  1430     Uint16 *srcp = (Uint16 *) info->src;
  1431     int srcskip = info->src_skip >> 1;
  1432     Uint16 *dstp = (Uint16 *) info->dst;
  1433     int dstskip = info->dst_skip >> 1;
  1434 
  1435     while (height--) {
  1436         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
  1437             /*
  1438              * Source and destination not aligned, pipeline it.
  1439              * This is mostly a win for big blits but no loss for
  1440              * small ones
  1441              */
  1442             Uint32 prev_sw;
  1443             int w = width;
  1444 
  1445             /* handle odd destination */
  1446             if ((uintptr_t) dstp & 2) {
  1447                 Uint16 d = *dstp, s = *srcp;
  1448                 *dstp = BLEND16_50(d, s, mask);
  1449                 dstp++;
  1450                 srcp++;
  1451                 w--;
  1452             }
  1453             srcp++;             /* srcp is now 32-bit aligned */
  1454 
  1455             /* bootstrap pipeline with first halfword */
  1456             prev_sw = ((Uint32 *) srcp)[-1];
  1457 
  1458             while (w > 1) {
  1459                 Uint32 sw, dw, s;
  1460                 sw = *(Uint32 *) srcp;
  1461                 dw = *(Uint32 *) dstp;
  1462 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1463                 s = (prev_sw << 16) + (sw >> 16);
  1464 #else
  1465                 s = (prev_sw >> 16) + (sw << 16);
  1466 #endif
  1467                 prev_sw = sw;
  1468                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
  1469                 dstp += 2;
  1470                 srcp += 2;
  1471                 w -= 2;
  1472             }
  1473 
  1474             /* final pixel if any */
  1475             if (w) {
  1476                 Uint16 d = *dstp, s;
  1477 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1478                 s = (Uint16) prev_sw;
  1479 #else
  1480                 s = (Uint16) (prev_sw >> 16);
  1481 #endif
  1482                 *dstp = BLEND16_50(d, s, mask);
  1483                 srcp++;
  1484                 dstp++;
  1485             }
  1486             srcp += srcskip - 1;
  1487             dstp += dstskip;
  1488         } else {
  1489             /* source and destination are aligned */
  1490             int w = width;
  1491 
  1492             /* first odd pixel? */
  1493             if ((uintptr_t) srcp & 2) {
  1494                 Uint16 d = *dstp, s = *srcp;
  1495                 *dstp = BLEND16_50(d, s, mask);
  1496                 srcp++;
  1497                 dstp++;
  1498                 w--;
  1499             }
  1500             /* srcp and dstp are now 32-bit aligned */
  1501 
  1502             while (w > 1) {
  1503                 Uint32 sw = *(Uint32 *) srcp;
  1504                 Uint32 dw = *(Uint32 *) dstp;
  1505                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
  1506                 srcp += 2;
  1507                 dstp += 2;
  1508                 w -= 2;
  1509             }
  1510 
  1511             /* last odd pixel? */
  1512             if (w) {
  1513                 Uint16 d = *dstp, s = *srcp;
  1514                 *dstp = BLEND16_50(d, s, mask);
  1515                 srcp++;
  1516                 dstp++;
  1517             }
  1518             srcp += srcskip;
  1519             dstp += dstskip;
  1520         }
  1521     }
  1522 }
  1523 
  1524 #ifdef __MMX__
  1525 
  1526 /* fast RGB565->RGB565 blending with surface alpha */
  1527 static void
  1528 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  1529 {
  1530     unsigned alpha = info->a;
  1531     if (alpha == 128) {
  1532         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1533     } else {
  1534         int width = info->dst_w;
  1535         int height = info->dst_h;
  1536         Uint16 *srcp = (Uint16 *) info->src;
  1537         int srcskip = info->src_skip >> 1;
  1538         Uint16 *dstp = (Uint16 *) info->dst;
  1539         int dstskip = info->dst_skip >> 1;
  1540         Uint32 s, d;
  1541 
  1542         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  1543 
  1544         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  1545         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  1546         alpha >>= 3;            /* downscale alpha to 5 bits */
  1547 
  1548         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  1549         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  1550         /* position alpha to allow for mullo and mulhi on diff channels
  1551            to reduce the number of operations */
  1552         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  1553 
  1554         /* Setup the 565 color channel masks */
  1555         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
  1556         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  1557 
  1558         while (height--) {
  1559 			/* *INDENT-OFF* */
  1560 			DUFFS_LOOP_124(
  1561 			{
  1562 				s = *srcp++;
  1563 				d = *dstp;
  1564 				/*
  1565 				 * shift out the middle component (green) to
  1566 				 * the high 16 bits, and process all three RGB
  1567 				 * components at the same time.
  1568 				 */
  1569 				s = (s | s << 16) & 0x07e0f81f;
  1570 				d = (d | d << 16) & 0x07e0f81f;
  1571 				d += (s - d) * alpha >> 5;
  1572 				d &= 0x07e0f81f;
  1573 				*dstp++ = (Uint16)(d | d >> 16);
  1574 			},{
  1575 				s = *srcp++;
  1576 				d = *dstp;
  1577 				/*
  1578 				 * shift out the middle component (green) to
  1579 				 * the high 16 bits, and process all three RGB
  1580 				 * components at the same time.
  1581 				 */
  1582 				s = (s | s << 16) & 0x07e0f81f;
  1583 				d = (d | d << 16) & 0x07e0f81f;
  1584 				d += (s - d) * alpha >> 5;
  1585 				d &= 0x07e0f81f;
  1586 				*dstp++ = (Uint16)(d | d >> 16);
  1587 				s = *srcp++;
  1588 				d = *dstp;
  1589 				/*
  1590 				 * shift out the middle component (green) to
  1591 				 * the high 16 bits, and process all three RGB
  1592 				 * components at the same time.
  1593 				 */
  1594 				s = (s | s << 16) & 0x07e0f81f;
  1595 				d = (d | d << 16) & 0x07e0f81f;
  1596 				d += (s - d) * alpha >> 5;
  1597 				d &= 0x07e0f81f;
  1598 				*dstp++ = (Uint16)(d | d >> 16);
  1599 			},{
  1600 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  1601 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  1602 
  1603 				/* red */
  1604 				src2 = src1;
  1605 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
  1606 
  1607 				dst2 = dst1;
  1608 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
  1609 
  1610 				/* blend */
  1611 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1612 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1613 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1614 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1615 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
  1616 
  1617 				mm_res = dst2; /* RED -> mm_res */
  1618 
  1619 				/* green -- process the bits in place */
  1620 				src2 = src1;
  1621 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  1622 
  1623 				dst2 = dst1;
  1624 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  1625 
  1626 				/* blend */
  1627 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1628 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1629 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1630 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1631 
  1632 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  1633 
  1634 				/* blue */
  1635 				src2 = src1;
  1636 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  1637 
  1638 				dst2 = dst1;
  1639 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  1640 
  1641 				/* blend */
  1642 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1643 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1644 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1645 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1646 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  1647 
  1648 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  1649 
  1650 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  1651 
  1652 				srcp += 4;
  1653 				dstp += 4;
  1654 			}, width);
  1655 			/* *INDENT-ON* */
  1656             srcp += srcskip;
  1657             dstp += dstskip;
  1658         }
  1659         _mm_empty();
  1660     }
  1661 }
  1662 
  1663 /* fast RGB555->RGB555 blending with surface alpha */
  1664 static void
  1665 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  1666 {
  1667     unsigned alpha = info->a;
  1668     if (alpha == 128) {
  1669         Blit16to16SurfaceAlpha128(info, 0xfbde);
  1670     } else {
  1671         int width = info->dst_w;
  1672         int height = info->dst_h;
  1673         Uint16 *srcp = (Uint16 *) info->src;
  1674         int srcskip = info->src_skip >> 1;
  1675         Uint16 *dstp = (Uint16 *) info->dst;
  1676         int dstskip = info->dst_skip >> 1;
  1677         Uint32 s, d;
  1678 
  1679         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  1680 
  1681         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  1682         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  1683         alpha >>= 3;            /* downscale alpha to 5 bits */
  1684 
  1685         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  1686         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  1687         /* position alpha to allow for mullo and mulhi on diff channels
  1688            to reduce the number of operations */
  1689         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  1690 
  1691         /* Setup the 555 color channel masks */
  1692         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
  1693         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
  1694         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  1695 
  1696         while (height--) {
  1697 			/* *INDENT-OFF* */
  1698 			DUFFS_LOOP_124(
  1699 			{
  1700 				s = *srcp++;
  1701 				d = *dstp;
  1702 				/*
  1703 				 * shift out the middle component (green) to
  1704 				 * the high 16 bits, and process all three RGB
  1705 				 * components at the same time.
  1706 				 */
  1707 				s = (s | s << 16) & 0x03e07c1f;
  1708 				d = (d | d << 16) & 0x03e07c1f;
  1709 				d += (s - d) * alpha >> 5;
  1710 				d &= 0x03e07c1f;
  1711 				*dstp++ = (Uint16)(d | d >> 16);
  1712 			},{
  1713 				s = *srcp++;
  1714 				d = *dstp;
  1715 				/*
  1716 				 * shift out the middle component (green) to
  1717 				 * the high 16 bits, and process all three RGB
  1718 				 * components at the same time.
  1719 				 */
  1720 				s = (s | s << 16) & 0x03e07c1f;
  1721 				d = (d | d << 16) & 0x03e07c1f;
  1722 				d += (s - d) * alpha >> 5;
  1723 				d &= 0x03e07c1f;
  1724 				*dstp++ = (Uint16)(d | d >> 16);
  1725 			        s = *srcp++;
  1726 				d = *dstp;
  1727 				/*
  1728 				 * shift out the middle component (green) to
  1729 				 * the high 16 bits, and process all three RGB
  1730 				 * components at the same time.
  1731 				 */
  1732 				s = (s | s << 16) & 0x03e07c1f;
  1733 				d = (d | d << 16) & 0x03e07c1f;
  1734 				d += (s - d) * alpha >> 5;
  1735 				d &= 0x03e07c1f;
  1736 				*dstp++ = (Uint16)(d | d >> 16);
  1737 			},{
  1738 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  1739 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  1740 
  1741 				/* red -- process the bits in place */
  1742 				src2 = src1;
  1743 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
  1744 
  1745 				dst2 = dst1;
  1746 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
  1747 
  1748 				/* blend */
  1749 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1750 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1751 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1752 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1753 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
  1754 
  1755 				mm_res = dst2; /* RED -> mm_res */
  1756 				
  1757 				/* green -- process the bits in place */
  1758 				src2 = src1;
  1759 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  1760 
  1761 				dst2 = dst1;
  1762 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  1763 
  1764 				/* blend */
  1765 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1766 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1767 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1768 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1769 
  1770 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  1771 
  1772 				/* blue */
  1773 				src2 = src1; /* src -> src2 */
  1774 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  1775 
  1776 				dst2 = dst1; /* dst -> dst2 */
  1777 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  1778 
  1779 				/* blend */
  1780 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1781 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1782 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1783 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1784 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  1785 
  1786 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  1787 
  1788 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  1789 
  1790 				srcp += 4;
  1791 				dstp += 4;
  1792 			}, width);
  1793 			/* *INDENT-ON* */
  1794             srcp += srcskip;
  1795             dstp += dstskip;
  1796         }
  1797         _mm_empty();
  1798     }
  1799 }
  1800 
  1801 #endif /* __MMX__ */
  1802 
  1803 /* fast RGB565->RGB565 blending with surface alpha */
  1804 static void
  1805 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
  1806 {
  1807     unsigned alpha = info->a;
  1808     if (alpha == 128) {
  1809         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1810     } else {
  1811         int width = info->dst_w;
  1812         int height = info->dst_h;
  1813         Uint16 *srcp = (Uint16 *) info->src;
  1814         int srcskip = info->src_skip >> 1;
  1815         Uint16 *dstp = (Uint16 *) info->dst;
  1816         int dstskip = info->dst_skip >> 1;
  1817         alpha >>= 3;            /* downscale alpha to 5 bits */
  1818 
  1819         while (height--) {
  1820 			/* *INDENT-OFF* */
  1821 			DUFFS_LOOP4({
  1822 				Uint32 s = *srcp++;
  1823 				Uint32 d = *dstp;
  1824 				/*
  1825 				 * shift out the middle component (green) to
  1826 				 * the high 16 bits, and process all three RGB
  1827 				 * components at the same time.
  1828 				 */
  1829 				s = (s | s << 16) & 0x07e0f81f;
  1830 				d = (d | d << 16) & 0x07e0f81f;
  1831 				d += (s - d) * alpha >> 5;
  1832 				d &= 0x07e0f81f;
  1833 				*dstp++ = (Uint16)(d | d >> 16);
  1834 			}, width);
  1835 			/* *INDENT-ON* */
  1836             srcp += srcskip;
  1837             dstp += dstskip;
  1838         }
  1839     }
  1840 }
  1841 
  1842 /* fast RGB555->RGB555 blending with surface alpha */
  1843 static void
  1844 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  1845 {
  1846     unsigned alpha = info->a;   /* downscale alpha to 5 bits */
  1847     if (alpha == 128) {
  1848         Blit16to16SurfaceAlpha128(info, 0xfbde);
  1849     } else {
  1850         int width = info->dst_w;
  1851         int height = info->dst_h;
  1852         Uint16 *srcp = (Uint16 *) info->src;
  1853         int srcskip = info->src_skip >> 1;
  1854         Uint16 *dstp = (Uint16 *) info->dst;
  1855         int dstskip = info->dst_skip >> 1;
  1856         alpha >>= 3;            /* downscale alpha to 5 bits */
  1857 
  1858         while (height--) {
  1859 			/* *INDENT-OFF* */
  1860 			DUFFS_LOOP4({
  1861 				Uint32 s = *srcp++;
  1862 				Uint32 d = *dstp;
  1863 				/*
  1864 				 * shift out the middle component (green) to
  1865 				 * the high 16 bits, and process all three RGB
  1866 				 * components at the same time.
  1867 				 */
  1868 				s = (s | s << 16) & 0x03e07c1f;
  1869 				d = (d | d << 16) & 0x03e07c1f;
  1870 				d += (s - d) * alpha >> 5;
  1871 				d &= 0x03e07c1f;
  1872 				*dstp++ = (Uint16)(d | d >> 16);
  1873 			}, width);
  1874 			/* *INDENT-ON* */
  1875             srcp += srcskip;
  1876             dstp += dstskip;
  1877         }
  1878     }
  1879 }
  1880 
  1881 /* fast ARGB8888->RGB565 blending with pixel alpha */
  1882 static void
  1883 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  1884 {
  1885     int width = info->dst_w;
  1886     int height = info->dst_h;
  1887     Uint32 *srcp = (Uint32 *) info->src;
  1888     int srcskip = info->src_skip >> 2;
  1889     Uint16 *dstp = (Uint16 *) info->dst;
  1890     int dstskip = info->dst_skip >> 1;
  1891 
  1892     while (height--) {
  1893 	    /* *INDENT-OFF* */
  1894 	    DUFFS_LOOP4({
  1895 		Uint32 s = *srcp;
  1896 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  1897 		/* FIXME: Here we special-case opaque alpha since the
  1898 		   compositioning used (>>8 instead of /255) doesn't handle
  1899 		   it correctly. Also special-case alpha=0 for speed?
  1900 		   Benchmark this! */
  1901 		if(alpha) {   
  1902 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1903 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  1904 		  } else {
  1905 		    Uint32 d = *dstp;
  1906 		    /*
  1907 		     * convert source and destination to G0RAB65565
  1908 		     * and blend all components at the same time
  1909 		     */
  1910 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  1911 		      + (s >> 3 & 0x1f);
  1912 		    d = (d | d << 16) & 0x07e0f81f;
  1913 		    d += (s - d) * alpha >> 5;
  1914 		    d &= 0x07e0f81f;
  1915 		    *dstp = (Uint16)(d | d >> 16);
  1916 		  }
  1917 		}
  1918 		srcp++;
  1919 		dstp++;
  1920 	    }, width);
  1921 	    /* *INDENT-ON* */
  1922         srcp += srcskip;
  1923         dstp += dstskip;
  1924     }
  1925 }
  1926 
  1927 /* fast ARGB8888->RGB555 blending with pixel alpha */
  1928 static void
  1929 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  1930 {
  1931     int width = info->dst_w;
  1932     int height = info->dst_h;
  1933     Uint32 *srcp = (Uint32 *) info->src;
  1934     int srcskip = info->src_skip >> 2;
  1935     Uint16 *dstp = (Uint16 *) info->dst;
  1936     int dstskip = info->dst_skip >> 1;
  1937 
  1938     while (height--) {
  1939 	    /* *INDENT-OFF* */
  1940 	    DUFFS_LOOP4({
  1941 		unsigned alpha;
  1942 		Uint32 s = *srcp;
  1943 		alpha = s >> 27; /* downscale alpha to 5 bits */
  1944 		/* FIXME: Here we special-case opaque alpha since the
  1945 		   compositioning used (>>8 instead of /255) doesn't handle
  1946 		   it correctly. Also special-case alpha=0 for speed?
  1947 		   Benchmark this! */
  1948 		if(alpha) {   
  1949 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1950 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  1951 		  } else {
  1952 		    Uint32 d = *dstp;
  1953 		    /*
  1954 		     * convert source and destination to G0RAB65565
  1955 		     * and blend all components at the same time
  1956 		     */
  1957 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  1958 		      + (s >> 3 & 0x1f);
  1959 		    d = (d | d << 16) & 0x03e07c1f;
  1960 		    d += (s - d) * alpha >> 5;
  1961 		    d &= 0x03e07c1f;
  1962 		    *dstp = (Uint16)(d | d >> 16);
  1963 		  }
  1964 		}
  1965 		srcp++;
  1966 		dstp++;
  1967 	    }, width);
  1968 	    /* *INDENT-ON* */
  1969         srcp += srcskip;
  1970         dstp += dstskip;
  1971     }
  1972 }
  1973 
  1974 /* General (slow) N->N blending with per-surface alpha */
  1975 static void
  1976 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  1977 {
  1978     int width = info->dst_w;
  1979     int height = info->dst_h;
  1980     Uint8 *src = info->src;
  1981     int srcskip = info->src_skip;
  1982     Uint8 *dst = info->dst;
  1983     int dstskip = info->dst_skip;
  1984     SDL_PixelFormat *srcfmt = info->src_fmt;
  1985     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1986     int srcbpp = srcfmt->BytesPerPixel;
  1987     int dstbpp = dstfmt->BytesPerPixel;
  1988     unsigned sA = info->a;
  1989     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1990 
  1991     if (sA) {
  1992         while (height--) {
  1993 	    /* *INDENT-OFF* */
  1994 	    DUFFS_LOOP4(
  1995 	    {
  1996 		Uint32 Pixel;
  1997 		unsigned sR;
  1998 		unsigned sG;
  1999 		unsigned sB;
  2000 		unsigned dR;
  2001 		unsigned dG;
  2002 		unsigned dB;
  2003 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2004 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2005 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2006 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2007 		src += srcbpp;
  2008 		dst += dstbpp;
  2009 	    },
  2010 	    width);
  2011 	    /* *INDENT-ON* */
  2012             src += srcskip;
  2013             dst += dstskip;
  2014         }
  2015     }
  2016 }
  2017 
  2018 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2019 static void
  2020 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  2021 {
  2022     int width = info->dst_w;
  2023     int height = info->dst_h;
  2024     Uint8 *src = info->src;
  2025     int srcskip = info->src_skip;
  2026     Uint8 *dst = info->dst;
  2027     int dstskip = info->dst_skip;
  2028     SDL_PixelFormat *srcfmt = info->src_fmt;
  2029     SDL_PixelFormat *dstfmt = info->dst_fmt;
  2030     Uint32 ckey = info->colorkey;
  2031     int srcbpp = srcfmt->BytesPerPixel;
  2032     int dstbpp = dstfmt->BytesPerPixel;
  2033     unsigned sA = info->a;
  2034     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2035 
  2036     while (height--) {
  2037 	    /* *INDENT-OFF* */
  2038 	    DUFFS_LOOP4(
  2039 	    {
  2040 		Uint32 Pixel;
  2041 		unsigned sR;
  2042 		unsigned sG;
  2043 		unsigned sB;
  2044 		unsigned dR;
  2045 		unsigned dG;
  2046 		unsigned dB;
  2047 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2048 		if(sA && Pixel != ckey) {
  2049 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2050 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2051 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2052 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2053 		}
  2054 		src += srcbpp;
  2055 		dst += dstbpp;
  2056 	    },
  2057 	    width);
  2058 	    /* *INDENT-ON* */
  2059         src += srcskip;
  2060         dst += dstskip;
  2061     }
  2062 }
  2063 
  2064 /* General (slow) N->N blending with pixel alpha */
  2065 static void
  2066 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  2067 {
  2068     int width = info->dst_w;
  2069     int height = info->dst_h;
  2070     Uint8 *src = info->src;
  2071     int srcskip = info->src_skip;
  2072     Uint8 *dst = info->dst;
  2073     int dstskip = info->dst_skip;
  2074     SDL_PixelFormat *srcfmt = info->src_fmt;
  2075     SDL_PixelFormat *dstfmt = info->dst_fmt;
  2076 
  2077     int srcbpp;
  2078     int dstbpp;
  2079 
  2080     /* Set up some basic variables */
  2081     srcbpp = srcfmt->BytesPerPixel;
  2082     dstbpp = dstfmt->BytesPerPixel;
  2083 
  2084     /* FIXME: for 8bpp source alpha, this doesn't get opaque values
  2085        quite right. for <8bpp source alpha, it gets them very wrong
  2086        (check all macros!)
  2087        It is unclear whether there is a good general solution that doesn't
  2088        need a branch (or a divide). */
  2089     while (height--) {
  2090 	    /* *INDENT-OFF* */
  2091 	    DUFFS_LOOP4(
  2092 	    {
  2093 		Uint32 Pixel;
  2094 		unsigned sR;
  2095 		unsigned sG;
  2096 		unsigned sB;
  2097 		unsigned dR;
  2098 		unsigned dG;
  2099 		unsigned dB;
  2100 		unsigned sA;
  2101 		unsigned dA;
  2102 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  2103 		if(sA) {
  2104 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  2105 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2106 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2107 		}
  2108 		src += srcbpp;
  2109 		dst += dstbpp;
  2110 	    },
  2111 	    width);
  2112 	    /* *INDENT-ON* */
  2113         src += srcskip;
  2114         dst += dstskip;
  2115     }
  2116 }
  2117 
  2118 
  2119 SDL_BlitFunc
  2120 SDL_CalculateBlitA(SDL_Surface * surface)
  2121 {
  2122     SDL_PixelFormat *sf = surface->format;
  2123     SDL_PixelFormat *df = surface->map->dst->format;
  2124 
  2125     switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
  2126     case SDL_COPY_BLEND:
  2127         /* Per-pixel alpha blits */
  2128         switch (df->BytesPerPixel) {
  2129         case 1:
  2130             return BlitNto1PixelAlpha;
  2131 
  2132         case 2:
  2133 #if SDL_ALTIVEC_BLITTERS
  2134             if (sf->BytesPerPixel == 4
  2135                 && df->Gmask == 0x7e0 && df->Bmask == 0x1f
  2136                 && SDL_HasAltiVec())
  2137                 return Blit32to565PixelAlphaAltivec;
  2138             else
  2139 #endif
  2140                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2141                     && sf->Gmask == 0xff00
  2142                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2143                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  2144                 if (df->Gmask == 0x7e0)
  2145                     return BlitARGBto565PixelAlpha;
  2146                 else if (df->Gmask == 0x3e0)
  2147                     return BlitARGBto555PixelAlpha;
  2148             }
  2149             return BlitNtoNPixelAlpha;
  2150 
  2151         case 4:
  2152             if (sf->Rmask == df->Rmask
  2153                 && sf->Gmask == df->Gmask
  2154                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2155 #if defined(__MMX__) || defined(__3dNOW__)
  2156                 if (sf->Rshift % 8 == 0
  2157                     && sf->Gshift % 8 == 0
  2158                     && sf->Bshift % 8 == 0
  2159                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  2160 #ifdef __3dNOW__
  2161                     if (SDL_Has3DNow())
  2162                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2163 #endif
  2164 #ifdef __MMX__
  2165                     if (SDL_HasMMX())
  2166                         return BlitRGBtoRGBPixelAlphaMMX;
  2167 #endif
  2168                 }
  2169 #endif /* __MMX__ || __3dNOW__ */
  2170                 if (sf->Amask == 0xff000000) {
  2171 #if SDL_ALTIVEC_BLITTERS
  2172                     if (SDL_HasAltiVec())
  2173                         return BlitRGBtoRGBPixelAlphaAltivec;
  2174 #endif
  2175                     return BlitRGBtoRGBPixelAlpha;
  2176                 }
  2177             }
  2178 #if SDL_ALTIVEC_BLITTERS
  2179             if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec())
  2180                 return Blit32to32PixelAlphaAltivec;
  2181             else
  2182 #endif
  2183                 return BlitNtoNPixelAlpha;
  2184 
  2185         case 3:
  2186         default:
  2187             return BlitNtoNPixelAlpha;
  2188         }
  2189         break;
  2190 
  2191     case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  2192         if (sf->Amask == 0) {
  2193             /* Per-surface alpha blits */
  2194             switch (df->BytesPerPixel) {
  2195             case 1:
  2196                 return BlitNto1SurfaceAlpha;
  2197 
  2198             case 2:
  2199                 if (surface->map->identity) {
  2200                     if (df->Gmask == 0x7e0) {
  2201 #ifdef __MMX__
  2202                         if (SDL_HasMMX())
  2203                             return Blit565to565SurfaceAlphaMMX;
  2204                         else
  2205 #endif
  2206                             return Blit565to565SurfaceAlpha;
  2207                     } else if (df->Gmask == 0x3e0) {
  2208 #ifdef __MMX__
  2209                         if (SDL_HasMMX())
  2210                             return Blit555to555SurfaceAlphaMMX;
  2211                         else
  2212 #endif
  2213                             return Blit555to555SurfaceAlpha;
  2214                     }
  2215                 }
  2216                 return BlitNtoNSurfaceAlpha;
  2217 
  2218             case 4:
  2219                 if (sf->Rmask == df->Rmask
  2220                     && sf->Gmask == df->Gmask
  2221                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2222 #ifdef __MMX__
  2223                     if (sf->Rshift % 8 == 0
  2224                         && sf->Gshift % 8 == 0
  2225                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  2226                         return BlitRGBtoRGBSurfaceAlphaMMX;
  2227 #endif
  2228                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  2229 #if SDL_ALTIVEC_BLITTERS
  2230                         if (SDL_HasAltiVec())
  2231                             return BlitRGBtoRGBSurfaceAlphaAltivec;
  2232 #endif
  2233                         return BlitRGBtoRGBSurfaceAlpha;
  2234                     }
  2235                 }
  2236 #if SDL_ALTIVEC_BLITTERS
  2237                 if ((sf->BytesPerPixel == 4) && SDL_HasAltiVec())
  2238                     return Blit32to32SurfaceAlphaAltivec;
  2239                 else
  2240 #endif
  2241                     return BlitNtoNSurfaceAlpha;
  2242 
  2243             case 3:
  2244             default:
  2245                 return BlitNtoNSurfaceAlpha;
  2246             }
  2247         }
  2248         break;
  2249 
  2250     case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  2251         if (sf->Amask == 0) {
  2252             if (df->BytesPerPixel == 1)
  2253                 return BlitNto1SurfaceAlphaKey;
  2254             else
  2255 #if SDL_ALTIVEC_BLITTERS
  2256             if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2257                     SDL_HasAltiVec())
  2258                 return Blit32to32SurfaceAlphaKeyAltivec;
  2259             else
  2260 #endif
  2261                 return BlitNtoNSurfaceAlphaKey;
  2262         }
  2263         break;
  2264     }
  2265 
  2266     return NULL;
  2267 }
  2268 
  2269 /* vi: set ts=4 sw=4 expandtab: */