src/video/SDL_blit_A.c
author Darren Alton <dalton@stevens.edu>
Wed, 27 Aug 2008 04:23:38 +0000
branch gsoc2008_nds
changeset 2698 e1da92da346c
parent 2291 dc3dd3a0bf02
child 2853 6258fa7cd300
permissions -rw-r--r--
Clean up.
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2006 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 /* Functions to perform alpha blended blitting */
    28 
    29 /* N->1 blending with per-surface alpha */
    30 static void
    31 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    32 {
    33     int width = info->dst_w;
    34     int height = info->dst_h;
    35     Uint8 *src = info->src;
    36     int srcskip = info->src_skip;
    37     Uint8 *dst = info->dst;
    38     int dstskip = info->dst_skip;
    39     Uint8 *palmap = info->table;
    40     SDL_PixelFormat *srcfmt = info->src_fmt;
    41     SDL_PixelFormat *dstfmt = info->dst_fmt;
    42     int srcbpp = srcfmt->BytesPerPixel;
    43 
    44     const unsigned A = info->a;
    45 
    46     while (height--) {
    47 	    /* *INDENT-OFF* */
    48 	    DUFFS_LOOP4(
    49 	    {
    50 		Uint32 Pixel;
    51 		unsigned sR;
    52 		unsigned sG;
    53 		unsigned sB;
    54 		unsigned dR;
    55 		unsigned dG;
    56 		unsigned dB;
    57 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    58 		dR = dstfmt->palette->colors[*dst].r;
    59 		dG = dstfmt->palette->colors[*dst].g;
    60 		dB = dstfmt->palette->colors[*dst].b;
    61 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    62 		dR &= 0xff;
    63 		dG &= 0xff;
    64 		dB &= 0xff;
    65 		/* Pack RGB into 8bit pixel */
    66 		if ( palmap == NULL ) {
    67 		    *dst =((dR>>5)<<(3+2))|
    68 			  ((dG>>5)<<(2))|
    69 			  ((dB>>6)<<(0));
    70 		} else {
    71 		    *dst = palmap[((dR>>5)<<(3+2))|
    72 				  ((dG>>5)<<(2))  |
    73 				  ((dB>>6)<<(0))];
    74 		}
    75 		dst++;
    76 		src += srcbpp;
    77 	    },
    78 	    width);
    79 	    /* *INDENT-ON* */
    80         src += srcskip;
    81         dst += dstskip;
    82     }
    83 }
    84 
    85 /* N->1 blending with pixel alpha */
    86 static void
    87 BlitNto1PixelAlpha(SDL_BlitInfo * info)
    88 {
    89     int width = info->dst_w;
    90     int height = info->dst_h;
    91     Uint8 *src = info->src;
    92     int srcskip = info->src_skip;
    93     Uint8 *dst = info->dst;
    94     int dstskip = info->dst_skip;
    95     Uint8 *palmap = info->table;
    96     SDL_PixelFormat *srcfmt = info->src_fmt;
    97     SDL_PixelFormat *dstfmt = info->dst_fmt;
    98     int srcbpp = srcfmt->BytesPerPixel;
    99 
   100     /* FIXME: fix alpha bit field expansion here too? */
   101     while (height--) {
   102 	    /* *INDENT-OFF* */
   103 	    DUFFS_LOOP4(
   104 	    {
   105 		Uint32 Pixel;
   106 		unsigned sR;
   107 		unsigned sG;
   108 		unsigned sB;
   109 		unsigned sA;
   110 		unsigned dR;
   111 		unsigned dG;
   112 		unsigned dB;
   113 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   114 		dR = dstfmt->palette->colors[*dst].r;
   115 		dG = dstfmt->palette->colors[*dst].g;
   116 		dB = dstfmt->palette->colors[*dst].b;
   117 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   118 		dR &= 0xff;
   119 		dG &= 0xff;
   120 		dB &= 0xff;
   121 		/* Pack RGB into 8bit pixel */
   122 		if ( palmap == NULL ) {
   123 		    *dst =((dR>>5)<<(3+2))|
   124 			  ((dG>>5)<<(2))|
   125 			  ((dB>>6)<<(0));
   126 		} else {
   127 		    *dst = palmap[((dR>>5)<<(3+2))|
   128 				  ((dG>>5)<<(2))  |
   129 				  ((dB>>6)<<(0))  ];
   130 		}
   131 		dst++;
   132 		src += srcbpp;
   133 	    },
   134 	    width);
   135 	    /* *INDENT-ON* */
   136         src += srcskip;
   137         dst += dstskip;
   138     }
   139 }
   140 
   141 /* colorkeyed N->1 blending with per-surface alpha */
   142 static void
   143 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   144 {
   145     int width = info->dst_w;
   146     int height = info->dst_h;
   147     Uint8 *src = info->src;
   148     int srcskip = info->src_skip;
   149     Uint8 *dst = info->dst;
   150     int dstskip = info->dst_skip;
   151     Uint8 *palmap = info->table;
   152     SDL_PixelFormat *srcfmt = info->src_fmt;
   153     SDL_PixelFormat *dstfmt = info->dst_fmt;
   154     int srcbpp = srcfmt->BytesPerPixel;
   155     Uint32 ckey = info->colorkey;
   156 
   157     const int A = info->a;
   158 
   159     while (height--) {
   160 	    /* *INDENT-OFF* */
   161 	    DUFFS_LOOP(
   162 	    {
   163 		Uint32 Pixel;
   164 		unsigned sR;
   165 		unsigned sG;
   166 		unsigned sB;
   167 		unsigned dR;
   168 		unsigned dG;
   169 		unsigned dB;
   170 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   171 		if ( Pixel != ckey ) {
   172 		    dR = dstfmt->palette->colors[*dst].r;
   173 		    dG = dstfmt->palette->colors[*dst].g;
   174 		    dB = dstfmt->palette->colors[*dst].b;
   175 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   176 		    dR &= 0xff;
   177 		    dG &= 0xff;
   178 		    dB &= 0xff;
   179 		    /* Pack RGB into 8bit pixel */
   180 		    if ( palmap == NULL ) {
   181 			*dst =((dR>>5)<<(3+2))|
   182 			      ((dG>>5)<<(2)) |
   183 			      ((dB>>6)<<(0));
   184 		    } else {
   185 			*dst = palmap[((dR>>5)<<(3+2))|
   186 				      ((dG>>5)<<(2))  |
   187 				      ((dB>>6)<<(0))  ];
   188 		    }
   189 		}
   190 		dst++;
   191 		src += srcbpp;
   192 	    },
   193 	    width);
   194 	    /* *INDENT-ON* */
   195         src += srcskip;
   196         dst += dstskip;
   197     }
   198 }
   199 
   200 #ifdef __MMX__
   201 
   202 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   203 static void
   204 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   205 {
   206     int width = info->dst_w;
   207     int height = info->dst_h;
   208     Uint32 *srcp = (Uint32 *) info->src;
   209     int srcskip = info->src_skip >> 2;
   210     Uint32 *dstp = (Uint32 *) info->dst;
   211     int dstskip = info->dst_skip >> 2;
   212     Uint32 dalpha = info->dst_fmt->Amask;
   213 
   214     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   215 
   216     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   217     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   218     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   219 
   220     while (height--) {
   221         int n = width;
   222         if (n & 1) {
   223             Uint32 s = *srcp++;
   224             Uint32 d = *dstp;
   225             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   226                        + (s & d & 0x00010101)) | dalpha;
   227             n--;
   228         }
   229 
   230         for (n >>= 1; n > 0; --n) {
   231             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   232             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   233 
   234             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   235             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   236 
   237             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   238             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   239             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   240             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   241 
   242             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   243             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   244             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   245             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   246 
   247             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   248             dstp += 2;
   249             srcp += 2;
   250         }
   251 
   252         srcp += srcskip;
   253         dstp += dstskip;
   254     }
   255     _mm_empty();
   256 }
   257 
   258 /* fast RGB888->(A)RGB888 blending with surface alpha */
   259 static void
   260 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   261 {
   262     SDL_PixelFormat *df = info->dst_fmt;
   263     Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   264     unsigned alpha = info->a;
   265 
   266     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   267         /* only call a128 version when R,G,B occupy lower bits */
   268         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   269     } else {
   270         int width = info->dst_w;
   271         int height = info->dst_h;
   272         Uint32 *srcp = (Uint32 *) info->src;
   273         int srcskip = info->src_skip >> 2;
   274         Uint32 *dstp = (Uint32 *) info->dst;
   275         int dstskip = info->dst_skip >> 2;
   276         Uint32 dalpha = df->Amask;
   277         Uint32 amult;
   278 
   279         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   280 
   281         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   282         /* form the alpha mult */
   283         amult = alpha | (alpha << 8);
   284         amult = amult | (amult << 16);
   285         chanmask =
   286             (0xff << df->Rshift) | (0xff << df->
   287                                     Gshift) | (0xff << df->Bshift);
   288         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   289         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   290         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   291         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   292 
   293         while (height--) {
   294             int n = width;
   295             if (n & 1) {
   296                 /* One Pixel Blend */
   297                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   298                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   299 
   300                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   301                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   302 
   303                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
   304                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   305                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   306                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   307 
   308                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   309                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   310                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   311 
   312                 ++srcp;
   313                 ++dstp;
   314 
   315                 n--;
   316             }
   317 
   318             for (n >>= 1; n > 0; --n) {
   319                 /* Two Pixels Blend */
   320                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   321                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   322                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   323                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   324 
   325                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   326                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   327                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   328                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   329 
   330                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   331                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   332                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   333                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   334 
   335                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   336                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   337                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   338                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   339 
   340                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   341                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   342 
   343                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   344 
   345                 srcp += 2;
   346                 dstp += 2;
   347             }
   348             srcp += srcskip;
   349             dstp += dstskip;
   350         }
   351         _mm_empty();
   352     }
   353 }
   354 
   355 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   356 static void
   357 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   358 {
   359     int width = info->dst_w;
   360     int height = info->dst_h;
   361     Uint32 *srcp = (Uint32 *) info->src;
   362     int srcskip = info->src_skip >> 2;
   363     Uint32 *dstp = (Uint32 *) info->dst;
   364     int dstskip = info->dst_skip >> 2;
   365     SDL_PixelFormat *sf = info->src_fmt;
   366     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   367     Uint32 amask = sf->Amask;
   368     Uint32 ashift = sf->Ashift;
   369     Uint64 multmask;
   370 
   371     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   372 
   373     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   374     multmask = 0xFFFF;
   375     multmask <<= (ashift * 2);
   376     multmask = ~multmask;
   377     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   378 
   379     while (height--) {
   380 		/* *INDENT-OFF* */
   381 		DUFFS_LOOP4({
   382 		Uint32 alpha = *srcp & amask;
   383 		if (alpha == 0) {
   384 			/* do nothing */
   385 		} else if (alpha == amask) {
   386 			/* opaque alpha -- copy RGB, keep dst alpha */
   387 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   388 		} else {
   389 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   390 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   391 
   392 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   393 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   394 
   395 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   396 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   397 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   398 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   399 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   400 
   401 			/* blend */		    
   402 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   403 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   404 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   405 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   406 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   407 			
   408 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   409 		}
   410 		++srcp;
   411 		++dstp;
   412 	    }, width);
   413 		/* *INDENT-ON* */
   414         srcp += srcskip;
   415         dstp += dstskip;
   416     }
   417     _mm_empty();
   418 }
   419 
   420 #endif /* __MMX__ */
   421 
   422 #if SDL_ALTIVEC_BLITTERS
   423 #if __MWERKS__
   424 #pragma altivec_model on
   425 #endif
   426 #if HAVE_ALTIVEC_H
   427 #include <altivec.h>
   428 #endif
   429 #include <assert.h>
   430 
   /*
    * Vector constant builders: VECUINT8_LITERAL takes 16 byte values,
    * VECUINT16_LITERAL takes 8 short values.  On Mac OS X with GCC older
    * than 4 the values are wrapped in parentheses after the vector cast;
    * every other compiler gets the brace-initializer form.
    */
   431 #if (defined(__MACOSX__) && (__GNUC__ < 4))
   432 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   433         (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
   434 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   435         (vector unsigned short) ( a,b,c,d,e,f,g,h )
   436 #else
   437 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
   438         (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
   439 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
   440         (vector unsigned short) { a,b,c,d,e,f,g,h }
   441 #endif
   442 
   /* Low four address bits of x: non-zero when x is not 16-byte aligned. */
   443 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
   /* Debug helper: prints msg followed by v as four 32-bit hex words. */
   444 #define VECPRINT(msg, v) do { \
   445     vector unsigned int tmpvec = (vector unsigned int)(v); \
   446     unsigned int *vp = (unsigned int *)&tmpvec; \
   447     printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
   448 } while (0)
   449 
   450 /* the permutation vector that takes the high bytes out of all the appropriate shorts 
   451     (vector unsigned char)(
   452         0x00, 0x10, 0x02, 0x12,
   453         0x04, 0x14, 0x06, 0x16,
   454         0x08, 0x18, 0x0A, 0x1A,
   455         0x0C, 0x1C, 0x0E, 0x1E );
   456 */
   457 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
   /* 12 + 12 in every 32-bit lane, i.e. a vector of 24s used as a shift count. */
   458 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
   /* All-ones shifted left by 24 in every 32-bit lane (0xFF000000 per lane). */
   459 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
   /*
    * Permute control used to realign source loads: vec_lvsl(0, src) when src
    * is not 16-byte aligned, otherwise vec_lvsl(8, src) with 8 added to every
    * byte.  NOTE(review): presumably the aligned form makes the following
    * vec_perm pick its second operand unchanged -- confirm.
    */
   460 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
   461     ? vec_lvsl(0, src) \
   462     : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
   463 
   464 
   /*
    * Blend the bytes of vs over the bytes of vd and store the result in vd
    * (so vd must be an lvalue).  For every byte the 16-bit value
    *     x = vs * valpha + vd * (~valpha)
    * is formed, replaced by (x + 1) + ((x + 1) >> 8), and mergePermute then
    * picks one byte out of each 16-bit result to rebuild the pixel vector.
    * Even and odd bytes are multiplied separately (vec_mule / vec_mulo).
    * v1_16 is added to and v8_16 used as the shift count for every 16-bit
    * lane; the callers in this file pass vectors of 1s and 8s.
    * NOTE(review): this looks like the usual divide-by-255 approximation
    * -- confirm.
    */
   465 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
   466     /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
   467     vector unsigned short vtemp1 = vec_mule(vs, valpha); \
   468     /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
   469     vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
   470     /* valpha2 is 255-alpha */ \
   471     vector unsigned char valpha2 = vec_nor(valpha, valpha); \
   472     /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
   473     vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
   474     /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
   475     vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
   476     /* add source and dest */ \
   477     vtemp1 = vec_add(vtemp1, vtemp3); \
   478     vtemp2 = vec_add(vtemp2, vtemp4); \
   479     /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
   480     vtemp1 = vec_add(vtemp1, v1_16); \
   481     vtemp3 = vec_sr(vtemp1, v8_16); \
   482     vtemp1 = vec_add(vtemp1, vtemp3); \
   483     /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
   484     vtemp2 = vec_add(vtemp2, v1_16); \
   485     vtemp4 = vec_sr(vtemp2, v8_16); \
   486     vtemp2 = vec_add(vtemp2, vtemp4); \
   487     /* (>>8) and get ARGBARGBARGBARGB */ \
   488     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   489 } while (0)
   490 
   491 /* Calculate the permute vector used for 32->32 swizzling */
   492 static vector unsigned char
   493 calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
   494 {
   495     /*
   496      * We have to assume that the bits that aren't used by other
   497      *  colors is alpha, and it's one complete byte, since some formats
   498      *  leave alpha with a zero mask, but we should still swizzle the bits.
   499      */
   500     /* ARGB */
   501     const static struct SDL_PixelFormat default_pixel_format = {
   502         NULL, 0, 0,
   503         0, 0, 0, 0,
   504         16, 8, 0, 24,
   505         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
   506         0, 0
   507     };
   508     if (!srcfmt) {
   509         srcfmt = &default_pixel_format;
   510     }
   511     if (!dstfmt) {
   512         dstfmt = &default_pixel_format;
   513     }
   514     const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
   515                                                        0x04, 0x04, 0x04, 0x04,
   516                                                        0x08, 0x08, 0x08, 0x08,
   517                                                        0x0C, 0x0C, 0x0C,
   518                                                        0x0C);
   519     vector unsigned char vswiz;
   520     vector unsigned int srcvec;
   521 #define RESHIFT(X) (3 - ((X) >> 3))
   522     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   523     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   524     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   525     Uint32 amask;
   526     /* Use zero for alpha if either surface doesn't have alpha */
   527     if (dstfmt->Amask) {
   528         amask =
   529             ((srcfmt->Amask) ? RESHIFT(srcfmt->
   530                                        Ashift) : 0x10) << (dstfmt->Ashift);
   531     } else {
   532         amask =
   533             0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
   534                           0xFFFFFFFF);
   535     }
   536 #undef RESHIFT
   537     ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
   538     vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
   539     return (vswiz);
   540 }
   541 
   /*
    * Per-pixel-alpha blit from a 4-byte-per-pixel source onto a 16-bit
    * 5-6-5 destination, using AltiVec for the bulk of each row.
    *
    * Each row is handled in three steps:
    *   1. pixels are blended one at a time until dst is 16-byte aligned
    *      (or the row runs out);
    *   2. the remaining pixels are blended eight at a time with vector code;
    *   3. the last (width % 8) pixels are blended one at a time.
    * The source channel order comes from info->src_fmt through
    * calc_swizzle32(srcfmt, NULL).
    */
   542 static void
   543 Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
   544 {
   545     int height = info->dst_h;
   546     Uint8 *src = (Uint8 *) info->src;
   547     int srcskip = info->src_skip;
   548     Uint8 *dst = (Uint8 *) info->dst;
   549     int dstskip = info->dst_skip;
   550     SDL_PixelFormat *srcfmt = info->src_fmt;
   551 
           /* constant vectors used as shift counts, rounding terms and masks */
   552     vector unsigned char v0 = vec_splat_u8(0);
   553     vector unsigned short v8_16 = vec_splat_u16(8);
   554     vector unsigned short v1_16 = vec_splat_u16(1);
   555     vector unsigned short v2_16 = vec_splat_u16(2);
   556     vector unsigned short v3_16 = vec_splat_u16(3);
   557     vector unsigned int v8_32 = vec_splat_u32(8);
   558     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
   559     vector unsigned short v3f =
   560         VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
   561                           0x003f, 0x003f, 0x003f, 0x003f);
   562     vector unsigned short vfc =
   563         VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
   564                           0x00fc, 0x00fc, 0x00fc, 0x00fc);
   565 
   566     /* 
   567        0x10 - 0x1f is the alpha
   568        0x00 - 0x0e evens are the red
   569        0x01 - 0x0f odds are zero
   570      */
   571     vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
   572                                                        0x10, 0x02, 0x01, 0x01,
   573                                                        0x10, 0x04, 0x01, 0x01,
   574                                                        0x10, 0x06, 0x01,
   575                                                        0x01);
           /* same selector with 8 << 16 added to every 32-bit lane */
   576     vector unsigned char vredalpha2 =
   577         (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
   578                                         vec_sl(v8_32, v16_32))
   579         );
   580     /*
   581        0x00 - 0x0f is ARxx ARxx ARxx ARxx
   582        0x11 - 0x17 odds are blue
   583      */
   584     vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
   585                                                    0x04, 0x05, 0x06, 0x13,
   586                                                    0x08, 0x09, 0x0a, 0x15,
   587                                                    0x0c, 0x0d, 0x0e, 0x17);
           /* same selector with 8 added to every 32-bit lane */
   588     vector unsigned char vblue2 =
   589         (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
   590         );
   591     /*
   592        0x00 - 0x0f is ARxB ARxB ARxB ARxB
   593        0x10 - 0x16 evens are green
   594      */
   595     vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
   596                                                     0x04, 0x05, 0x12, 0x07,
   597                                                     0x08, 0x09, 0x14, 0x0b,
   598                                                     0x0c, 0x0d, 0x16, 0x0f);
           /* same selector with 8 << 8 added to every 32-bit lane */
   599     vector unsigned char vgreen2 =
   600         (vector unsigned
   601          char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8_32, v8_32))
   602         );
           /* selects bytes 0x02, 0x06, ... 0x1e (one per pixel) into the odd
              byte of each output short; used below to collect green */
   603     vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
   604                                                     0x00, 0x0a, 0x00, 0x0e,
   605                                                     0x00, 0x12, 0x00, 0x16,
   606                                                     0x00, 0x1a, 0x00, 0x1e);
   607     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
   608     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
           /* used to spread each source pixel's alpha byte over that pixel */
   609     vector unsigned char valphaPermute =
   610         vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   611 
           /* NOTE(review): vec_splat_u8(-7) appears to give 0xF9 bytes, which
              would make this mask 0xF900 rather than the 0xF800 the name
              suggests (-8 would give 0xF8) -- confirm which was intended */
   612     vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
   613     vf800 = vec_sl(vf800, vec_splat_u16(8));
   614 
   615     while (height--) {
   616         int extrawidth;
   617         vector unsigned char valigner;
   618         vector unsigned char vsrc;
   619         vector unsigned char voverflow;
   620         int width = info->dst_w;
   621 
               /* Scalar path: while `condition` holds, decode one source
                  pixel; if its alpha is non-zero, expand the 565 destination
                  pixel to 8-bit channels, blend and repack it.  Advances src
                  by 4 bytes, dst by 2 bytes and decrements widthvar. */
   622 #define ONE_PIXEL_BLEND(condition, widthvar) \
   623         while (condition) { \
   624             Uint32 Pixel; \
   625             unsigned sR, sG, sB, dR, dG, dB, sA; \
   626             DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   627             if(sA) { \
   628                 unsigned short dstpixel = *((unsigned short *)dst); \
   629                 dR = (dstpixel >> 8) & 0xf8; \
   630                 dG = (dstpixel >> 3) & 0xfc; \
   631                 dB = (dstpixel << 3) & 0xf8; \
   632                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   633                 *((unsigned short *)dst) = ( \
   634                     ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
   635                 ); \
   636             } \
   637             src += 4; \
   638             dst += 2; \
   639             widthvar--; \
   640         }
               /* leading pixels, until dst is 16-byte aligned or the row ends */
   641         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
               /* pixels left over once whole groups of 8 have been taken */
   642         extrawidth = (width % 8);
   643         valigner = VEC_ALIGNER(src);
   644         vsrc = (vector unsigned char) vec_ld(0, src);
   645         width -= extrawidth;
               /* vector path: 8 pixels (32 source bytes, 16 dest bytes) per pass */
   646         while (width) {
   647             vector unsigned char valpha;
   648             vector unsigned char vsrc1, vsrc2;
   649             vector unsigned char vdst1, vdst2;
   650             vector unsigned short vR, vG, vB;
   651             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   652 
   653             /* Load 8 pixels from src as ARGB */
   654             voverflow = (vector unsigned char) vec_ld(15, src);
   655             vsrc = vec_perm(vsrc, voverflow, valigner);
   656             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
   657             src += 16;
   658             vsrc = (vector unsigned char) vec_ld(15, src);
   659             voverflow = vec_perm(voverflow, vsrc, valigner);
   660             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
   661             src += 16;
   662 
   663             /* Load 8 pixels from dst as XRGB */
   664             voverflow = vec_ld(0, dst);
   665             vR = vec_and((vector unsigned short) voverflow, vf800);
   666             vB = vec_sl((vector unsigned short) voverflow, v3_16);
   667             vG = vec_sl(vB, v2_16);
   668             vdst1 =
   669                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   670                                                 (vector unsigned char) vR,
   671                                                 vredalpha1);
   672             vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
   673             vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
   674             vdst2 =
   675                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   676                                                 (vector unsigned char) vR,
   677                                                 vredalpha2);
   678             vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
   679             vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
   680 
   681             /* Alpha blend 8 pixels as ARGB */
   682             valpha = vec_perm(vsrc1, v0, valphaPermute);
   683             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
   684                                v8_16);
   685             valpha = vec_perm(vsrc2, v0, valphaPermute);
   686             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
   687                                v8_16);
   688 
   689             /* Convert 8 pixels to 565 */
   690             vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
   691                                                         vdst1,
   692                                                         (vector unsigned int)
   693                                                         vdst2);
   694             vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
   695             vgpixel = vec_and(vgpixel, vfc);
   696             vgpixel = vec_sl(vgpixel, v3_16);
   697             vrpixel = vec_sl(vpixel, v1_16);
   698             vrpixel = vec_and(vrpixel, vf800);
   699             vbpixel = vec_and(vpixel, v3f);
   700             vdst1 =
   701                 vec_or((vector unsigned char) vrpixel,
   702                        (vector unsigned char) vgpixel);
   703             vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);
   704 
   705             /* Store 8 pixels */
   706             vec_st(vdst1, 0, dst);
   707 
   708             width -= 8;
   709             dst += 16;
   710         }
               /* trailing pixels that did not fill a group of 8 */
   711         ONE_PIXEL_BLEND((extrawidth), extrawidth);
   712 #undef ONE_PIXEL_BLEND
   713         src += srcskip;
   714         dst += dstskip;
   715     }
   716 }
   717 
   718 static void
   719 Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
   720 {
   721     int height = info->dst_h;
   722     Uint32 *srcp = (Uint32 *) info->src;
   723     int srcskip = info->src_skip >> 2;
   724     Uint32 *dstp = (Uint32 *) info->dst;
   725     int dstskip = info->dst_skip >> 2;
   726     SDL_PixelFormat *srcfmt = info->src_fmt;
   727     SDL_PixelFormat *dstfmt = info->dst_fmt;
   728     unsigned sA = info->a;
   729     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   730     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   731     Uint32 ckey = info->colorkey;
   732     vector unsigned char mergePermute;
   733     vector unsigned char vsrcPermute;
   734     vector unsigned char vdstPermute;
   735     vector unsigned char vsdstPermute;
   736     vector unsigned char valpha;
   737     vector unsigned char valphamask;
   738     vector unsigned char vbits;
   739     vector unsigned char v0;
   740     vector unsigned short v1;
   741     vector unsigned short v8;
   742     vector unsigned int vckey;
   743     vector unsigned int vrgbmask;
   744 
   745     mergePermute = VEC_MERGE_PERMUTE();
   746     v0 = vec_splat_u8(0);
   747     v1 = vec_splat_u16(1);
   748     v8 = vec_splat_u16(8);
   749 
   750     /* set the alpha to 255 on the destination surf */
   751     valphamask = VEC_ALPHA_MASK();
   752 
   753     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   754     vdstPermute = calc_swizzle32(NULL, dstfmt);
   755     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   756 
   757     /* set a vector full of alpha and 255-alpha */
   758     ((unsigned char *) &valpha)[0] = sA;
   759     valpha = vec_splat(valpha, 0);
   760     vbits = (vector unsigned char) vec_splat_s8(-1);
   761 
   762     ckey &= rgbmask;
   763     ((unsigned int *) (char *) &vckey)[0] = ckey;
   764     vckey = vec_splat(vckey, 0);
   765     ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
   766     vrgbmask = vec_splat(vrgbmask, 0);
   767 
   768     while (height--) {
   769         int width = info->dst_w;
   770 #define ONE_PIXEL_BLEND(condition, widthvar) \
   771         while (condition) { \
   772             Uint32 Pixel; \
   773             unsigned sR, sG, sB, dR, dG, dB; \
   774             RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
   775             if(sA && Pixel != ckey) { \
   776                 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
   777                 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
   778                 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   779                 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
   780             } \
   781             dstp++; \
   782             srcp++; \
   783             widthvar--; \
   784         }
   785         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   786         if (width > 0) {
   787             int extrawidth = (width % 4);
   788             vector unsigned char valigner = VEC_ALIGNER(srcp);
   789             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
   790             width -= extrawidth;
   791             while (width) {
   792                 vector unsigned char vsel;
   793                 vector unsigned char voverflow;
   794                 vector unsigned char vd;
   795                 vector unsigned char vd_orig;
   796 
   797                 /* s = *srcp */
   798                 voverflow = (vector unsigned char) vec_ld(15, srcp);
   799                 vs = vec_perm(vs, voverflow, valigner);
   800 
   801                 /* vsel is set for items that match the key */
   802                 vsel =
   803                     (vector unsigned char) vec_and((vector unsigned int) vs,
   804                                                    vrgbmask);
   805                 vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
   806                                                         vsel, vckey);
   807 
   808                 /* permute to source format */
   809                 vs = vec_perm(vs, valpha, vsrcPermute);
   810 
   811                 /* d = *dstp */
   812                 vd = (vector unsigned char) vec_ld(0, dstp);
   813                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
   814 
   815                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   816 
   817                 /* set the alpha channel to full on */
   818                 vd = vec_or(vd, valphamask);
   819 
   820                 /* mask out color key */
   821                 vd = vec_sel(vd, vd_orig, vsel);
   822 
   823                 /* permute to dest format */
   824                 vd = vec_perm(vd, vbits, vdstPermute);
   825 
   826                 /* *dstp = res */
   827                 vec_st((vector unsigned int) vd, 0, dstp);
   828 
   829                 srcp += 4;
   830                 dstp += 4;
   831                 width -= 4;
   832                 vs = voverflow;
   833             }
   834             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   835         }
   836 #undef ONE_PIXEL_BLEND
   837 
   838         srcp += srcskip;
   839         dstp += dstskip;
   840     }
   841 }
   842 
   843 
   844 static void
   845 Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)
   846 {
   847     int width = info->dst_w;
   848     int height = info->dst_h;
   849     Uint32 *srcp = (Uint32 *) info->src;
   850     int srcskip = info->src_skip >> 2;
   851     Uint32 *dstp = (Uint32 *) info->dst;
   852     int dstskip = info->dst_skip >> 2;
   853     SDL_PixelFormat *srcfmt = info->src_fmt;
   854     SDL_PixelFormat *dstfmt = info->dst_fmt;
   855     vector unsigned char mergePermute;
   856     vector unsigned char valphaPermute;
   857     vector unsigned char vsrcPermute;
   858     vector unsigned char vdstPermute;
   859     vector unsigned char vsdstPermute;
   860     vector unsigned char valphamask;
   861     vector unsigned char vpixelmask;
   862     vector unsigned char v0;
   863     vector unsigned short v1;
   864     vector unsigned short v8;
   865 
   866     v0 = vec_splat_u8(0);
   867     v1 = vec_splat_u16(1);
   868     v8 = vec_splat_u16(8);
   869     mergePermute = VEC_MERGE_PERMUTE();
   870     valphamask = VEC_ALPHA_MASK();
   871     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   872     vpixelmask = vec_nor(valphamask, v0);
   873     vsrcPermute = calc_swizzle32(srcfmt, NULL);
   874     vdstPermute = calc_swizzle32(NULL, dstfmt);
   875     vsdstPermute = calc_swizzle32(dstfmt, NULL);
   876 
   877     while (height--) {
   878         width = info->dst_w;
   879 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   880             Uint32 Pixel; \
   881             unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
   882             DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
   883             if(sA) { \
   884               DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
   885               ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   886               ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
   887             } \
   888             ++srcp; \
   889             ++dstp; \
   890             widthvar--; \
   891         }
   892         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   893         if (width > 0) {
   894             /* vsrcPermute */
   895             /* vdstPermute */
   896             int extrawidth = (width % 4);
   897             vector unsigned char valigner = VEC_ALIGNER(srcp);
   898             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
   899             width -= extrawidth;
   900             while (width) {
   901                 vector unsigned char voverflow;
   902                 vector unsigned char vd;
   903                 vector unsigned char valpha;
   904                 vector unsigned char vdstalpha;
   905                 /* s = *srcp */
   906                 voverflow = (vector unsigned char) vec_ld(15, srcp);
   907                 vs = vec_perm(vs, voverflow, valigner);
   908                 vs = vec_perm(vs, v0, vsrcPermute);
   909 
   910                 valpha = vec_perm(vs, v0, valphaPermute);
   911 
   912                 /* d = *dstp */
   913                 vd = (vector unsigned char) vec_ld(0, dstp);
   914                 vd = vec_perm(vd, v0, vsdstPermute);
   915                 vdstalpha = vec_and(vd, valphamask);
   916 
   917                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   918 
   919                 /* set the alpha to the dest alpha */
   920                 vd = vec_and(vd, vpixelmask);
   921                 vd = vec_or(vd, vdstalpha);
   922                 vd = vec_perm(vd, v0, vdstPermute);
   923 
   924                 /* *dstp = res */
   925                 vec_st((vector unsigned int) vd, 0, dstp);
   926 
   927                 srcp += 4;
   928                 dstp += 4;
   929                 width -= 4;
   930                 vs = voverflow;
   931 
   932             }
   933             ONE_PIXEL_BLEND((extrawidth), extrawidth);
   934         }
   935         srcp += srcskip;
   936         dstp += dstskip;
   937 #undef ONE_PIXEL_BLEND
   938     }
   939 }
   940 
   941 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   942 static void
   943 BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
   944 {
   945     int width = info->dst_w;
   946     int height = info->dst_h;
   947     Uint32 *srcp = (Uint32 *) info->src;
   948     int srcskip = info->src_skip >> 2;
   949     Uint32 *dstp = (Uint32 *) info->dst;
   950     int dstskip = info->dst_skip >> 2;
   951     vector unsigned char mergePermute;
   952     vector unsigned char valphaPermute;
   953     vector unsigned char valphamask;
   954     vector unsigned char vpixelmask;
   955     vector unsigned char v0;
   956     vector unsigned short v1;
   957     vector unsigned short v8;
   958     v0 = vec_splat_u8(0);
   959     v1 = vec_splat_u16(1);
   960     v8 = vec_splat_u16(8);
   961     mergePermute = VEC_MERGE_PERMUTE();
   962     valphamask = VEC_ALPHA_MASK();
   963     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   964 
   965 
   966     vpixelmask = vec_nor(valphamask, v0);
   967     while (height--) {
   968         width = info->dst_w;
   969 #define ONE_PIXEL_BLEND(condition, widthvar) \
   970         while ((condition)) { \
   971             Uint32 dalpha; \
   972             Uint32 d; \
   973             Uint32 s1; \
   974             Uint32 d1; \
   975             Uint32 s = *srcp; \
   976             Uint32 alpha = s >> 24; \
   977             if(alpha) { \
   978               if(alpha == SDL_ALPHA_OPAQUE) { \
   979                 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
   980               } else { \
   981                 d = *dstp; \
   982                 dalpha = d & 0xff000000; \
   983                 s1 = s & 0xff00ff; \
   984                 d1 = d & 0xff00ff; \
   985                 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
   986                 s &= 0xff00; \
   987                 d &= 0xff00; \
   988                 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
   989                 *dstp = d1 | d | dalpha; \
   990               } \
   991             } \
   992             ++srcp; \
   993             ++dstp; \
   994             widthvar--; \
   995 	    }
   996         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   997         if (width > 0) {
   998             int extrawidth = (width % 4);
   999             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1000             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1001             width -= extrawidth;
  1002             while (width) {
  1003                 vector unsigned char voverflow;
  1004                 vector unsigned char vd;
  1005                 vector unsigned char valpha;
  1006                 vector unsigned char vdstalpha;
  1007                 /* s = *srcp */
  1008                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1009                 vs = vec_perm(vs, voverflow, valigner);
  1010 
  1011                 valpha = vec_perm(vs, v0, valphaPermute);
  1012 
  1013                 /* d = *dstp */
  1014                 vd = (vector unsigned char) vec_ld(0, dstp);
  1015                 vdstalpha = vec_and(vd, valphamask);
  1016 
  1017                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1018 
  1019                 /* set the alpha to the dest alpha */
  1020                 vd = vec_and(vd, vpixelmask);
  1021                 vd = vec_or(vd, vdstalpha);
  1022 
  1023                 /* *dstp = res */
  1024                 vec_st((vector unsigned int) vd, 0, dstp);
  1025 
  1026                 srcp += 4;
  1027                 dstp += 4;
  1028                 width -= 4;
  1029                 vs = voverflow;
  1030             }
  1031             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1032         }
  1033         srcp += srcskip;
  1034         dstp += dstskip;
  1035     }
  1036 #undef ONE_PIXEL_BLEND
  1037 }
  1038 
  1039 static void
  1040 Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info)
  1041 {
  1042     /* XXX : 6 */
  1043     int height = info->dst_h;
  1044     Uint32 *srcp = (Uint32 *) info->src;
  1045     int srcskip = info->src_skip >> 2;
  1046     Uint32 *dstp = (Uint32 *) info->dst;
  1047     int dstskip = info->dst_skip >> 2;
  1048     SDL_PixelFormat *srcfmt = info->src_fmt;
  1049     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1050     unsigned sA = info->a;
  1051     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1052     vector unsigned char mergePermute;
  1053     vector unsigned char vsrcPermute;
  1054     vector unsigned char vdstPermute;
  1055     vector unsigned char vsdstPermute;
  1056     vector unsigned char valpha;
  1057     vector unsigned char valphamask;
  1058     vector unsigned char vbits;
  1059     vector unsigned short v1;
  1060     vector unsigned short v8;
  1061 
  1062     mergePermute = VEC_MERGE_PERMUTE();
  1063     v1 = vec_splat_u16(1);
  1064     v8 = vec_splat_u16(8);
  1065 
  1066     /* set the alpha to 255 on the destination surf */
  1067     valphamask = VEC_ALPHA_MASK();
  1068 
  1069     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1070     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1071     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1072 
  1073     /* set a vector full of alpha and 255-alpha */
  1074     ((unsigned char *) &valpha)[0] = sA;
  1075     valpha = vec_splat(valpha, 0);
  1076     vbits = (vector unsigned char) vec_splat_s8(-1);
  1077 
  1078     while (height--) {
  1079         int width = info->dst_w;
  1080 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1081             Uint32 Pixel; \
  1082             unsigned sR, sG, sB, dR, dG, dB; \
  1083             DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
  1084             DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
  1085             ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
  1086             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1087             ++srcp; \
  1088             ++dstp; \
  1089             widthvar--; \
  1090         }
  1091         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1092         if (width > 0) {
  1093             int extrawidth = (width % 4);
  1094             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1095             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1096             width -= extrawidth;
  1097             while (width) {
  1098                 vector unsigned char voverflow;
  1099                 vector unsigned char vd;
  1100 
  1101                 /* s = *srcp */
  1102                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1103                 vs = vec_perm(vs, voverflow, valigner);
  1104                 vs = vec_perm(vs, valpha, vsrcPermute);
  1105 
  1106                 /* d = *dstp */
  1107                 vd = (vector unsigned char) vec_ld(0, dstp);
  1108                 vd = vec_perm(vd, vd, vsdstPermute);
  1109 
  1110                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1111 
  1112                 /* set the alpha channel to full on */
  1113                 vd = vec_or(vd, valphamask);
  1114                 vd = vec_perm(vd, vbits, vdstPermute);
  1115 
  1116                 /* *dstp = res */
  1117                 vec_st((vector unsigned int) vd, 0, dstp);
  1118 
  1119                 srcp += 4;
  1120                 dstp += 4;
  1121                 width -= 4;
  1122                 vs = voverflow;
  1123             }
  1124             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1125         }
  1126 #undef ONE_PIXEL_BLEND
  1127 
  1128         srcp += srcskip;
  1129         dstp += dstskip;
  1130     }
  1131 
  1132 }
  1133 
  1134 
  1135 /* fast RGB888->(A)RGB888 blending */
  1136 static void
  1137 BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
  1138 {
  1139     unsigned alpha = info->a;
  1140     int height = info->dst_h;
  1141     Uint32 *srcp = (Uint32 *) info->src;
  1142     int srcskip = info->src_skip >> 2;
  1143     Uint32 *dstp = (Uint32 *) info->dst;
  1144     int dstskip = info->dst_skip >> 2;
  1145     vector unsigned char mergePermute;
  1146     vector unsigned char valpha;
  1147     vector unsigned char valphamask;
  1148     vector unsigned short v1;
  1149     vector unsigned short v8;
  1150 
  1151     mergePermute = VEC_MERGE_PERMUTE();
  1152     v1 = vec_splat_u16(1);
  1153     v8 = vec_splat_u16(8);
  1154 
  1155     /* set the alpha to 255 on the destination surf */
  1156     valphamask = VEC_ALPHA_MASK();
  1157 
  1158     /* set a vector full of alpha and 255-alpha */
  1159     ((unsigned char *) &valpha)[0] = alpha;
  1160     valpha = vec_splat(valpha, 0);
  1161 
  1162     while (height--) {
  1163         int width = info->dst_w;
  1164 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1165             Uint32 s = *srcp; \
  1166             Uint32 d = *dstp; \
  1167             Uint32 s1 = s & 0xff00ff; \
  1168             Uint32 d1 = d & 0xff00ff; \
  1169             d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
  1170                  & 0xff00ff; \
  1171             s &= 0xff00; \
  1172             d &= 0xff00; \
  1173             d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
  1174             *dstp = d1 | d | 0xff000000; \
  1175             ++srcp; \
  1176             ++dstp; \
  1177             widthvar--; \
  1178         }
  1179         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1180         if (width > 0) {
  1181             int extrawidth = (width % 4);
  1182             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1183             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1184             width -= extrawidth;
  1185             while (width) {
  1186                 vector unsigned char voverflow;
  1187                 vector unsigned char vd;
  1188 
  1189                 /* s = *srcp */
  1190                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1191                 vs = vec_perm(vs, voverflow, valigner);
  1192 
  1193                 /* d = *dstp */
  1194                 vd = (vector unsigned char) vec_ld(0, dstp);
  1195 
  1196                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1197 
  1198                 /* set the alpha channel to full on */
  1199                 vd = vec_or(vd, valphamask);
  1200 
  1201                 /* *dstp = res */
  1202                 vec_st((vector unsigned int) vd, 0, dstp);
  1203 
  1204                 srcp += 4;
  1205                 dstp += 4;
  1206                 width -= 4;
  1207                 vs = voverflow;
  1208             }
  1209             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1210         }
  1211 #undef ONE_PIXEL_BLEND
  1212 
  1213         srcp += srcskip;
  1214         dstp += dstskip;
  1215     }
  1216 }
  1217 
  1218 #if __MWERKS__
  1219 #pragma altivec_model off
  1220 #endif
  1221 #endif /* SDL_ALTIVEC_BLITTERS */
  1222 
  1223 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1224 static void
  1225 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
  1226 {
  1227     int width = info->dst_w;
  1228     int height = info->dst_h;
  1229     Uint32 *srcp = (Uint32 *) info->src;
  1230     int srcskip = info->src_skip >> 2;
  1231     Uint32 *dstp = (Uint32 *) info->dst;
  1232     int dstskip = info->dst_skip >> 2;
  1233 
  1234     while (height--) {
  1235 	    /* *INDENT-OFF* */
  1236 	    DUFFS_LOOP4({
  1237 		    Uint32 s = *srcp++;
  1238 		    Uint32 d = *dstp;
  1239 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
  1240 			       + (s & d & 0x00010101)) | 0xff000000;
  1241 	    }, width);
  1242 	    /* *INDENT-ON* */
  1243         srcp += srcskip;
  1244         dstp += dstskip;
  1245     }
  1246 }
  1247 
  1248 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1249 static void
  1250 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
  1251 {
  1252     unsigned alpha = info->a;
  1253     if (alpha == 128) {
  1254         BlitRGBtoRGBSurfaceAlpha128(info);
  1255     } else {
  1256         int width = info->dst_w;
  1257         int height = info->dst_h;
  1258         Uint32 *srcp = (Uint32 *) info->src;
  1259         int srcskip = info->src_skip >> 2;
  1260         Uint32 *dstp = (Uint32 *) info->dst;
  1261         int dstskip = info->dst_skip >> 2;
  1262         Uint32 s;
  1263         Uint32 d;
  1264         Uint32 s1;
  1265         Uint32 d1;
  1266 
  1267         while (height--) {
  1268 			/* *INDENT-OFF* */
  1269 			DUFFS_LOOP_DOUBLE2({
  1270 				/* One Pixel Blend */
  1271 				s = *srcp;
  1272 				d = *dstp;
  1273 				s1 = s & 0xff00ff;
  1274 				d1 = d & 0xff00ff;
  1275 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
  1276 				     & 0xff00ff;
  1277 				s &= 0xff00;
  1278 				d &= 0xff00;
  1279 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1280 				*dstp = d1 | d | 0xff000000;
  1281 				++srcp;
  1282 				++dstp;
  1283 			},{
  1284 			        /* Two Pixels Blend */
  1285 				s = *srcp;
  1286 				d = *dstp;
  1287 				s1 = s & 0xff00ff;
  1288 				d1 = d & 0xff00ff;
  1289 				d1 += (s1 - d1) * alpha >> 8;
  1290 				d1 &= 0xff00ff;
  1291 				     
  1292 				s = ((s & 0xff00) >> 8) | 
  1293 					((srcp[1] & 0xff00) << 8);
  1294 				d = ((d & 0xff00) >> 8) |
  1295 					((dstp[1] & 0xff00) << 8);
  1296 				d += (s - d) * alpha >> 8;
  1297 				d &= 0x00ff00ff;
  1298 				
  1299 				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
  1300 				++srcp;
  1301 				
  1302 			        s1 = *srcp;
  1303 				d1 = *dstp;
  1304 				s1 &= 0xff00ff;
  1305 				d1 &= 0xff00ff;
  1306 				d1 += (s1 - d1) * alpha >> 8;
  1307 				d1 &= 0xff00ff;
  1308 				
  1309 				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
  1310 				++srcp;
  1311 				++dstp;
  1312 			}, width);
  1313 			/* *INDENT-ON* */
  1314             srcp += srcskip;
  1315             dstp += dstskip;
  1316         }
  1317     }
  1318 }
  1319 
  1320 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1321 static void
  1322 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
  1323 {
  1324     int width = info->dst_w;
  1325     int height = info->dst_h;
  1326     Uint32 *srcp = (Uint32 *) info->src;
  1327     int srcskip = info->src_skip >> 2;
  1328     Uint32 *dstp = (Uint32 *) info->dst;
  1329     int dstskip = info->dst_skip >> 2;
  1330 
  1331     while (height--) {
  1332 	    /* *INDENT-OFF* */
  1333 	    DUFFS_LOOP4({
  1334 		Uint32 dalpha;
  1335 		Uint32 d;
  1336 		Uint32 s1;
  1337 		Uint32 d1;
  1338 		Uint32 s = *srcp;
  1339 		Uint32 alpha = s >> 24;
  1340 		/* FIXME: Here we special-case opaque alpha since the
  1341 		   compositioning used (>>8 instead of /255) doesn't handle
  1342 		   it correctly. Also special-case alpha=0 for speed?
  1343 		   Benchmark this! */
  1344 		if(alpha) {   
  1345 		  if(alpha == SDL_ALPHA_OPAQUE) {
  1346 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
  1347 		  } else {
  1348 		    /*
  1349 		     * take out the middle component (green), and process
  1350 		     * the other two in parallel. One multiply less.
  1351 		     */
  1352 		    d = *dstp;
  1353 		    dalpha = d & 0xff000000;
  1354 		    s1 = s & 0xff00ff;
  1355 		    d1 = d & 0xff00ff;
  1356 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
  1357 		    s &= 0xff00;
  1358 		    d &= 0xff00;
  1359 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
  1360 		    *dstp = d1 | d | dalpha;
  1361 		  }
  1362 		}
  1363 		++srcp;
  1364 		++dstp;
  1365 	    }, width);
  1366 	    /* *INDENT-ON* */
  1367         srcp += srcskip;
  1368         dstp += dstskip;
  1369     }
  1370 }
  1371 
  1372 #ifdef __3dNOW__
  1373 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1374 static void
  1375 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1376 {
  1377     int width = info->dst_w;
  1378     int height = info->dst_h;
  1379     Uint32 *srcp = (Uint32 *) info->src;
  1380     int srcskip = info->src_skip >> 2;
  1381     Uint32 *dstp = (Uint32 *) info->dst;
  1382     int dstskip = info->dst_skip >> 2;
  1383     SDL_PixelFormat *sf = info->src_fmt;
  1384     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
  1385     Uint32 amask = sf->Amask;
  1386     Uint32 ashift = sf->Ashift;
  1387     Uint64 multmask;
  1388 
  1389     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1390 
  1391     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
  1392     multmask = 0xFFFF;
  1393     multmask <<= (ashift * 2);
  1394     multmask = ~multmask;
  1395     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
  1396 
  1397     while (height--) {
  1398 	    /* *INDENT-OFF* */
  1399 	    DUFFS_LOOP4({
  1400 		Uint32 alpha;
  1401 
  1402 		_m_prefetch(srcp + 16);
  1403 		_m_prefetch(dstp + 16);
  1404 
  1405 		alpha = *srcp & amask;
  1406 		if (alpha == 0) {
  1407 			/* do nothing */
  1408 		} else if (alpha == amask) {
  1409 			/* copy RGB, keep dst alpha */
  1410 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
  1411 		} else {
  1412 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
  1413 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
  1414 
  1415 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
  1416 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
  1417 
  1418 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
  1419 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
  1420 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
  1421 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
  1422 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
  1423 
  1424 			/* blend */		    
  1425 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
  1426 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
  1427 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
  1428 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
  1429 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
  1430 			
  1431 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
  1432 		}
  1433 		++srcp;
  1434 		++dstp;
  1435 	    }, width);
  1436 	    /* *INDENT-ON* */
  1437         srcp += srcskip;
  1438         dstp += dstskip;
  1439     }
  1440     _mm_empty();
  1441 }
  1442 
  1443 #endif /* __MMX__ */
  1444 
  1445 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
  1446 
  1447 /* blend a single 16 bit pixel at 50% */
  1448 #define BLEND16_50(d, s, mask)						\
  1449 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
  1450 
  1451 /* blend two 16 bit pixels at 50% */
  1452 #define BLEND2x16_50(d, s, mask)					     \
  1453 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1454 	 + (s & d & (~(mask | mask << 16))))
  1455 
  1456 static void
  1457 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
  1458 {
  1459     int width = info->dst_w;
  1460     int height = info->dst_h;
  1461     Uint16 *srcp = (Uint16 *) info->src;
  1462     int srcskip = info->src_skip >> 1;
  1463     Uint16 *dstp = (Uint16 *) info->dst;
  1464     int dstskip = info->dst_skip >> 1;
  1465 
  1466     while (height--) {
  1467         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
  1468             /*
  1469              * Source and destination not aligned, pipeline it.
  1470              * This is mostly a win for big blits but no loss for
  1471              * small ones
  1472              */
  1473             Uint32 prev_sw;
  1474             int w = width;
  1475 
  1476             /* handle odd destination */
  1477             if ((uintptr_t) dstp & 2) {
  1478                 Uint16 d = *dstp, s = *srcp;
  1479                 *dstp = BLEND16_50(d, s, mask);
  1480                 dstp++;
  1481                 srcp++;
  1482                 w--;
  1483             }
  1484             srcp++;             /* srcp is now 32-bit aligned */
  1485 
  1486             /* bootstrap pipeline with first halfword */
  1487             prev_sw = ((Uint32 *) srcp)[-1];
  1488 
  1489             while (w > 1) {
  1490                 Uint32 sw, dw, s;
  1491                 sw = *(Uint32 *) srcp;
  1492                 dw = *(Uint32 *) dstp;
  1493 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1494                 s = (prev_sw << 16) + (sw >> 16);
  1495 #else
  1496                 s = (prev_sw >> 16) + (sw << 16);
  1497 #endif
  1498                 prev_sw = sw;
  1499                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
  1500                 dstp += 2;
  1501                 srcp += 2;
  1502                 w -= 2;
  1503             }
  1504 
  1505             /* final pixel if any */
  1506             if (w) {
  1507                 Uint16 d = *dstp, s;
  1508 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1509                 s = (Uint16) prev_sw;
  1510 #else
  1511                 s = (Uint16) (prev_sw >> 16);
  1512 #endif
  1513                 *dstp = BLEND16_50(d, s, mask);
  1514                 srcp++;
  1515                 dstp++;
  1516             }
  1517             srcp += srcskip - 1;
  1518             dstp += dstskip;
  1519         } else {
  1520             /* source and destination are aligned */
  1521             int w = width;
  1522 
  1523             /* first odd pixel? */
  1524             if ((uintptr_t) srcp & 2) {
  1525                 Uint16 d = *dstp, s = *srcp;
  1526                 *dstp = BLEND16_50(d, s, mask);
  1527                 srcp++;
  1528                 dstp++;
  1529                 w--;
  1530             }
  1531             /* srcp and dstp are now 32-bit aligned */
  1532 
  1533             while (w > 1) {
  1534                 Uint32 sw = *(Uint32 *) srcp;
  1535                 Uint32 dw = *(Uint32 *) dstp;
  1536                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
  1537                 srcp += 2;
  1538                 dstp += 2;
  1539                 w -= 2;
  1540             }
  1541 
  1542             /* last odd pixel? */
  1543             if (w) {
  1544                 Uint16 d = *dstp, s = *srcp;
  1545                 *dstp = BLEND16_50(d, s, mask);
  1546                 srcp++;
  1547                 dstp++;
  1548             }
  1549             srcp += srcskip;
  1550             dstp += dstskip;
  1551         }
  1552     }
  1553 }
  1554 
  1555 #ifdef __MMX__
  1556 
  1557 /* fast RGB565->RGB565 blending with surface alpha */
  1558 static void
  1559 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  1560 {
  1561     unsigned alpha = info->a;
  1562     if (alpha == 128) {
  1563         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1564     } else {
  1565         int width = info->dst_w;
  1566         int height = info->dst_h;
  1567         Uint16 *srcp = (Uint16 *) info->src;
  1568         int srcskip = info->src_skip >> 1;
  1569         Uint16 *dstp = (Uint16 *) info->dst;
  1570         int dstskip = info->dst_skip >> 1;
  1571         Uint32 s, d;
  1572 
  1573         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  1574 
  1575         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  1576         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  1577         alpha >>= 3;            /* downscale alpha to 5 bits */
  1578 
  1579         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  1580         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  1581         /* position alpha to allow for mullo and mulhi on diff channels
  1582            to reduce the number of operations */
  1583         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  1584 
  1585         /* Setup the 565 color channel masks */
  1586         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
  1587         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  1588 
  1589         while (height--) {
  1590 			/* *INDENT-OFF* */
  1591 			DUFFS_LOOP_QUATRO2(
  1592 			{
  1593 				s = *srcp++;
  1594 				d = *dstp;
  1595 				/*
  1596 				 * shift out the middle component (green) to
  1597 				 * the high 16 bits, and process all three RGB
  1598 				 * components at the same time.
  1599 				 */
  1600 				s = (s | s << 16) & 0x07e0f81f;
  1601 				d = (d | d << 16) & 0x07e0f81f;
  1602 				d += (s - d) * alpha >> 5;
  1603 				d &= 0x07e0f81f;
  1604 				*dstp++ = (Uint16)(d | d >> 16);
  1605 			},{
  1606 				s = *srcp++;
  1607 				d = *dstp;
  1608 				/*
  1609 				 * shift out the middle component (green) to
  1610 				 * the high 16 bits, and process all three RGB
  1611 				 * components at the same time.
  1612 				 */
  1613 				s = (s | s << 16) & 0x07e0f81f;
  1614 				d = (d | d << 16) & 0x07e0f81f;
  1615 				d += (s - d) * alpha >> 5;
  1616 				d &= 0x07e0f81f;
  1617 				*dstp++ = (Uint16)(d | d >> 16);
  1618 				s = *srcp++;
  1619 				d = *dstp;
  1620 				/*
  1621 				 * shift out the middle component (green) to
  1622 				 * the high 16 bits, and process all three RGB
  1623 				 * components at the same time.
  1624 				 */
  1625 				s = (s | s << 16) & 0x07e0f81f;
  1626 				d = (d | d << 16) & 0x07e0f81f;
  1627 				d += (s - d) * alpha >> 5;
  1628 				d &= 0x07e0f81f;
  1629 				*dstp++ = (Uint16)(d | d >> 16);
  1630 			},{
  1631 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  1632 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  1633 
  1634 				/* red */
  1635 				src2 = src1;
  1636 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
  1637 
  1638 				dst2 = dst1;
  1639 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
  1640 
  1641 				/* blend */
  1642 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1643 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1644 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1645 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1646 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
  1647 
  1648 				mm_res = dst2; /* RED -> mm_res */
  1649 
  1650 				/* green -- process the bits in place */
  1651 				src2 = src1;
  1652 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  1653 
  1654 				dst2 = dst1;
  1655 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  1656 
  1657 				/* blend */
  1658 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1659 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1660 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1661 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1662 
  1663 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  1664 
  1665 				/* blue */
  1666 				src2 = src1;
  1667 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  1668 
  1669 				dst2 = dst1;
  1670 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  1671 
  1672 				/* blend */
  1673 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1674 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1675 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1676 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1677 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  1678 
  1679 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  1680 
  1681 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  1682 
  1683 				srcp += 4;
  1684 				dstp += 4;
  1685 			}, width);
  1686 			/* *INDENT-ON* */
  1687             srcp += srcskip;
  1688             dstp += dstskip;
  1689         }
  1690         _mm_empty();
  1691     }
  1692 }
  1693 
  1694 /* fast RGB555->RGB555 blending with surface alpha */
  1695 static void
  1696 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  1697 {
  1698     unsigned alpha = info->a;
  1699     if (alpha == 128) {
  1700         Blit16to16SurfaceAlpha128(info, 0xfbde);
  1701     } else {
  1702         int width = info->dst_w;
  1703         int height = info->dst_h;
  1704         Uint16 *srcp = (Uint16 *) info->src;
  1705         int srcskip = info->src_skip >> 1;
  1706         Uint16 *dstp = (Uint16 *) info->dst;
  1707         int dstskip = info->dst_skip >> 1;
  1708         Uint32 s, d;
  1709 
  1710         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  1711 
  1712         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  1713         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  1714         alpha >>= 3;            /* downscale alpha to 5 bits */
  1715 
  1716         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  1717         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  1718         /* position alpha to allow for mullo and mulhi on diff channels
  1719            to reduce the number of operations */
  1720         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  1721 
  1722         /* Setup the 555 color channel masks */
  1723         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
  1724         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
  1725         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  1726 
  1727         while (height--) {
  1728 			/* *INDENT-OFF* */
  1729 			DUFFS_LOOP_QUATRO2(
  1730 			{
  1731 				s = *srcp++;
  1732 				d = *dstp;
  1733 				/*
  1734 				 * shift out the middle component (green) to
  1735 				 * the high 16 bits, and process all three RGB
  1736 				 * components at the same time.
  1737 				 */
  1738 				s = (s | s << 16) & 0x03e07c1f;
  1739 				d = (d | d << 16) & 0x03e07c1f;
  1740 				d += (s - d) * alpha >> 5;
  1741 				d &= 0x03e07c1f;
  1742 				*dstp++ = (Uint16)(d | d >> 16);
  1743 			},{
  1744 				s = *srcp++;
  1745 				d = *dstp;
  1746 				/*
  1747 				 * shift out the middle component (green) to
  1748 				 * the high 16 bits, and process all three RGB
  1749 				 * components at the same time.
  1750 				 */
  1751 				s = (s | s << 16) & 0x03e07c1f;
  1752 				d = (d | d << 16) & 0x03e07c1f;
  1753 				d += (s - d) * alpha >> 5;
  1754 				d &= 0x03e07c1f;
  1755 				*dstp++ = (Uint16)(d | d >> 16);
  1756 			        s = *srcp++;
  1757 				d = *dstp;
  1758 				/*
  1759 				 * shift out the middle component (green) to
  1760 				 * the high 16 bits, and process all three RGB
  1761 				 * components at the same time.
  1762 				 */
  1763 				s = (s | s << 16) & 0x03e07c1f;
  1764 				d = (d | d << 16) & 0x03e07c1f;
  1765 				d += (s - d) * alpha >> 5;
  1766 				d &= 0x03e07c1f;
  1767 				*dstp++ = (Uint16)(d | d >> 16);
  1768 			},{
  1769 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
  1770 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
  1771 
  1772 				/* red -- process the bits in place */
  1773 				src2 = src1;
  1774 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
  1775 
  1776 				dst2 = dst1;
  1777 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
  1778 
  1779 				/* blend */
  1780 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1781 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1782 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1783 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1784 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
  1785 
  1786 				mm_res = dst2; /* RED -> mm_res */
  1787 				
  1788 				/* green -- process the bits in place */
  1789 				src2 = src1;
  1790 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
  1791 
  1792 				dst2 = dst1;
  1793 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  1794 
  1795 				/* blend */
  1796 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1797 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1798 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1799 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1800 
  1801 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  1802 
  1803 				/* blue */
  1804 				src2 = src1; /* src -> src2 */
  1805 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  1806 
  1807 				dst2 = dst1; /* dst -> dst2 */
  1808 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  1809 
  1810 				/* blend */
  1811 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1812 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1813 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1814 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1815 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  1816 
  1817 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  1818 
  1819 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  1820 
  1821 				srcp += 4;
  1822 				dstp += 4;
  1823 			}, width);
  1824 			/* *INDENT-ON* */
  1825             srcp += srcskip;
  1826             dstp += dstskip;
  1827         }
  1828         _mm_empty();
  1829     }
  1830 }
  1831 
  1832 #endif /* __MMX__ */
  1833 
  1834 /* fast RGB565->RGB565 blending with surface alpha */
  1835 static void
  1836 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
  1837 {
  1838     unsigned alpha = info->a;
  1839     if (alpha == 128) {
  1840         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1841     } else {
  1842         int width = info->dst_w;
  1843         int height = info->dst_h;
  1844         Uint16 *srcp = (Uint16 *) info->src;
  1845         int srcskip = info->src_skip >> 1;
  1846         Uint16 *dstp = (Uint16 *) info->dst;
  1847         int dstskip = info->dst_skip >> 1;
  1848         alpha >>= 3;            /* downscale alpha to 5 bits */
  1849 
  1850         while (height--) {
  1851 			/* *INDENT-OFF* */
  1852 			DUFFS_LOOP4({
  1853 				Uint32 s = *srcp++;
  1854 				Uint32 d = *dstp;
  1855 				/*
  1856 				 * shift out the middle component (green) to
  1857 				 * the high 16 bits, and process all three RGB
  1858 				 * components at the same time.
  1859 				 */
  1860 				s = (s | s << 16) & 0x07e0f81f;
  1861 				d = (d | d << 16) & 0x07e0f81f;
  1862 				d += (s - d) * alpha >> 5;
  1863 				d &= 0x07e0f81f;
  1864 				*dstp++ = (Uint16)(d | d >> 16);
  1865 			}, width);
  1866 			/* *INDENT-ON* */
  1867             srcp += srcskip;
  1868             dstp += dstskip;
  1869         }
  1870     }
  1871 }
  1872 
  1873 /* fast RGB555->RGB555 blending with surface alpha */
  1874 static void
  1875 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  1876 {
  1877     unsigned alpha = info->a;   /* downscale alpha to 5 bits */
  1878     if (alpha == 128) {
  1879         Blit16to16SurfaceAlpha128(info, 0xfbde);
  1880     } else {
  1881         int width = info->dst_w;
  1882         int height = info->dst_h;
  1883         Uint16 *srcp = (Uint16 *) info->src;
  1884         int srcskip = info->src_skip >> 1;
  1885         Uint16 *dstp = (Uint16 *) info->dst;
  1886         int dstskip = info->dst_skip >> 1;
  1887         alpha >>= 3;            /* downscale alpha to 5 bits */
  1888 
  1889         while (height--) {
  1890 			/* *INDENT-OFF* */
  1891 			DUFFS_LOOP4({
  1892 				Uint32 s = *srcp++;
  1893 				Uint32 d = *dstp;
  1894 				/*
  1895 				 * shift out the middle component (green) to
  1896 				 * the high 16 bits, and process all three RGB
  1897 				 * components at the same time.
  1898 				 */
  1899 				s = (s | s << 16) & 0x03e07c1f;
  1900 				d = (d | d << 16) & 0x03e07c1f;
  1901 				d += (s - d) * alpha >> 5;
  1902 				d &= 0x03e07c1f;
  1903 				*dstp++ = (Uint16)(d | d >> 16);
  1904 			}, width);
  1905 			/* *INDENT-ON* */
  1906             srcp += srcskip;
  1907             dstp += dstskip;
  1908         }
  1909     }
  1910 }
  1911 
  1912 /* fast ARGB8888->RGB565 blending with pixel alpha */
  1913 static void
  1914 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  1915 {
  1916     int width = info->dst_w;
  1917     int height = info->dst_h;
  1918     Uint32 *srcp = (Uint32 *) info->src;
  1919     int srcskip = info->src_skip >> 2;
  1920     Uint16 *dstp = (Uint16 *) info->dst;
  1921     int dstskip = info->dst_skip >> 1;
  1922 
  1923     while (height--) {
  1924 	    /* *INDENT-OFF* */
  1925 	    DUFFS_LOOP4({
  1926 		Uint32 s = *srcp;
  1927 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  1928 		/* FIXME: Here we special-case opaque alpha since the
  1929 		   compositioning used (>>8 instead of /255) doesn't handle
  1930 		   it correctly. Also special-case alpha=0 for speed?
  1931 		   Benchmark this! */
  1932 		if(alpha) {   
  1933 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1934 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  1935 		  } else {
  1936 		    Uint32 d = *dstp;
  1937 		    /*
  1938 		     * convert source and destination to G0RAB65565
  1939 		     * and blend all components at the same time
  1940 		     */
  1941 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  1942 		      + (s >> 3 & 0x1f);
  1943 		    d = (d | d << 16) & 0x07e0f81f;
  1944 		    d += (s - d) * alpha >> 5;
  1945 		    d &= 0x07e0f81f;
  1946 		    *dstp = (Uint16)(d | d >> 16);
  1947 		  }
  1948 		}
  1949 		srcp++;
  1950 		dstp++;
  1951 	    }, width);
  1952 	    /* *INDENT-ON* */
  1953         srcp += srcskip;
  1954         dstp += dstskip;
  1955     }
  1956 }
  1957 
  1958 /* fast ARGB8888->RGB555 blending with pixel alpha */
  1959 static void
  1960 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  1961 {
  1962     int width = info->dst_w;
  1963     int height = info->dst_h;
  1964     Uint32 *srcp = (Uint32 *) info->src;
  1965     int srcskip = info->src_skip >> 2;
  1966     Uint16 *dstp = (Uint16 *) info->dst;
  1967     int dstskip = info->dst_skip >> 1;
  1968 
  1969     while (height--) {
  1970 	    /* *INDENT-OFF* */
  1971 	    DUFFS_LOOP4({
  1972 		unsigned alpha;
  1973 		Uint32 s = *srcp;
  1974 		alpha = s >> 27; /* downscale alpha to 5 bits */
  1975 		/* FIXME: Here we special-case opaque alpha since the
  1976 		   compositioning used (>>8 instead of /255) doesn't handle
  1977 		   it correctly. Also special-case alpha=0 for speed?
  1978 		   Benchmark this! */
  1979 		if(alpha) {   
  1980 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1981 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  1982 		  } else {
  1983 		    Uint32 d = *dstp;
  1984 		    /*
  1985 		     * convert source and destination to G0RAB65565
  1986 		     * and blend all components at the same time
  1987 		     */
  1988 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  1989 		      + (s >> 3 & 0x1f);
  1990 		    d = (d | d << 16) & 0x03e07c1f;
  1991 		    d += (s - d) * alpha >> 5;
  1992 		    d &= 0x03e07c1f;
  1993 		    *dstp = (Uint16)(d | d >> 16);
  1994 		  }
  1995 		}
  1996 		srcp++;
  1997 		dstp++;
  1998 	    }, width);
  1999 	    /* *INDENT-ON* */
  2000         srcp += srcskip;
  2001         dstp += dstskip;
  2002     }
  2003 }
  2004 
  2005 /* General (slow) N->N blending with per-surface alpha */
  2006 static void
  2007 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  2008 {
  2009     int width = info->dst_w;
  2010     int height = info->dst_h;
  2011     Uint8 *src = info->src;
  2012     int srcskip = info->src_skip;
  2013     Uint8 *dst = info->dst;
  2014     int dstskip = info->dst_skip;
  2015     SDL_PixelFormat *srcfmt = info->src_fmt;
  2016     SDL_PixelFormat *dstfmt = info->dst_fmt;
  2017     int srcbpp = srcfmt->BytesPerPixel;
  2018     int dstbpp = dstfmt->BytesPerPixel;
  2019     unsigned sA = info->a;
  2020     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2021 
  2022     if (sA) {
  2023         while (height--) {
  2024 	    /* *INDENT-OFF* */
  2025 	    DUFFS_LOOP4(
  2026 	    {
  2027 		Uint32 Pixel;
  2028 		unsigned sR;
  2029 		unsigned sG;
  2030 		unsigned sB;
  2031 		unsigned dR;
  2032 		unsigned dG;
  2033 		unsigned dB;
  2034 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  2035 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2036 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2037 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2038 		src += srcbpp;
  2039 		dst += dstbpp;
  2040 	    },
  2041 	    width);
  2042 	    /* *INDENT-ON* */
  2043             src += srcskip;
  2044             dst += dstskip;
  2045         }
  2046     }
  2047 }
  2048 
  2049 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2050 static void
  2051 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  2052 {
  2053     int width = info->dst_w;
  2054     int height = info->dst_h;
  2055     Uint8 *src = info->src;
  2056     int srcskip = info->src_skip;
  2057     Uint8 *dst = info->dst;
  2058     int dstskip = info->dst_skip;
  2059     SDL_PixelFormat *srcfmt = info->src_fmt;
  2060     SDL_PixelFormat *dstfmt = info->dst_fmt;
  2061     Uint32 ckey = info->colorkey;
  2062     int srcbpp = srcfmt->BytesPerPixel;
  2063     int dstbpp = dstfmt->BytesPerPixel;
  2064     unsigned sA = info->a;
  2065     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  2066 
  2067     while (height--) {
  2068 	    /* *INDENT-OFF* */
  2069 	    DUFFS_LOOP4(
  2070 	    {
  2071 		Uint32 Pixel;
  2072 		unsigned sR;
  2073 		unsigned sG;
  2074 		unsigned sB;
  2075 		unsigned dR;
  2076 		unsigned dG;
  2077 		unsigned dB;
  2078 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  2079 		if(sA && Pixel != ckey) {
  2080 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  2081 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  2082 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2083 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2084 		}
  2085 		src += srcbpp;
  2086 		dst += dstbpp;
  2087 	    },
  2088 	    width);
  2089 	    /* *INDENT-ON* */
  2090         src += srcskip;
  2091         dst += dstskip;
  2092     }
  2093 }
  2094 
  2095 /* General (slow) N->N blending with pixel alpha */
  2096 static void
  2097 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  2098 {
  2099     int width = info->dst_w;
  2100     int height = info->dst_h;
  2101     Uint8 *src = info->src;
  2102     int srcskip = info->src_skip;
  2103     Uint8 *dst = info->dst;
  2104     int dstskip = info->dst_skip;
  2105     SDL_PixelFormat *srcfmt = info->src_fmt;
  2106     SDL_PixelFormat *dstfmt = info->dst_fmt;
  2107 
  2108     int srcbpp;
  2109     int dstbpp;
  2110 
  2111     /* Set up some basic variables */
  2112     srcbpp = srcfmt->BytesPerPixel;
  2113     dstbpp = dstfmt->BytesPerPixel;
  2114 
  2115     /* FIXME: for 8bpp source alpha, this doesn't get opaque values
  2116        quite right. for <8bpp source alpha, it gets them very wrong
  2117        (check all macros!)
  2118        It is unclear whether there is a good general solution that doesn't
  2119        need a branch (or a divide). */
  2120     while (height--) {
  2121 	    /* *INDENT-OFF* */
  2122 	    DUFFS_LOOP4(
  2123 	    {
  2124 		Uint32 Pixel;
  2125 		unsigned sR;
  2126 		unsigned sG;
  2127 		unsigned sB;
  2128 		unsigned dR;
  2129 		unsigned dG;
  2130 		unsigned dB;
  2131 		unsigned sA;
  2132 		unsigned dA;
  2133 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  2134 		if(sA) {
  2135 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  2136 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  2137 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  2138 		}
  2139 		src += srcbpp;
  2140 		dst += dstbpp;
  2141 	    },
  2142 	    width);
  2143 	    /* *INDENT-ON* */
  2144         src += srcskip;
  2145         dst += dstskip;
  2146     }
  2147 }
  2148 
  2149 
  2150 SDL_BlitFunc
  2151 SDL_CalculateBlitA(SDL_Surface * surface)
  2152 {
  2153     SDL_PixelFormat *sf = surface->format;
  2154     SDL_PixelFormat *df = surface->map->dst->format;
  2155 
  2156     switch (surface->map->info.flags) {
  2157     case SDL_COPY_BLEND:
  2158         /* Per-pixel alpha blits */
  2159         switch (df->BytesPerPixel) {
  2160         case 1:
  2161             return BlitNto1PixelAlpha;
  2162 
  2163         case 2:
  2164 #if SDL_ALTIVEC_BLITTERS
  2165             if (sf->BytesPerPixel == 4
  2166                 && df->Gmask == 0x7e0 && df->Bmask == 0x1f
  2167                 && SDL_HasAltiVec())
  2168                 return Blit32to565PixelAlphaAltivec;
  2169             else
  2170 #endif
  2171                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2172                     && sf->Gmask == 0xff00
  2173                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  2174                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  2175                 if (df->Gmask == 0x7e0)
  2176                     return BlitARGBto565PixelAlpha;
  2177                 else if (df->Gmask == 0x3e0)
  2178                     return BlitARGBto555PixelAlpha;
  2179             }
  2180             return BlitNtoNPixelAlpha;
  2181 
  2182         case 4:
  2183             if (sf->Rmask == df->Rmask
  2184                 && sf->Gmask == df->Gmask
  2185                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2186 #if defined(__MMX__) || defined(__3dNOW__)
  2187                 if (sf->Rshift % 8 == 0
  2188                     && sf->Gshift % 8 == 0
  2189                     && sf->Bshift % 8 == 0
  2190                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  2191 #ifdef __3dNOW__
  2192                     if (SDL_Has3DNow())
  2193                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2194 #endif
  2195 #ifdef __MMX__
  2196                     if (SDL_HasMMX())
  2197                         return BlitRGBtoRGBPixelAlphaMMX;
  2198 #endif
  2199                 }
  2200 #endif /* __MMX__ || __3dNOW__ */
  2201                 if (sf->Amask == 0xff000000) {
  2202 #if SDL_ALTIVEC_BLITTERS
  2203                     if (SDL_HasAltiVec())
  2204                         return BlitRGBtoRGBPixelAlphaAltivec;
  2205 #endif
  2206                     return BlitRGBtoRGBPixelAlpha;
  2207                 }
  2208             }
  2209 #if SDL_ALTIVEC_BLITTERS
  2210             if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec())
  2211                 return Blit32to32PixelAlphaAltivec;
  2212             else
  2213 #endif
  2214                 return BlitNtoNPixelAlpha;
  2215 
  2216         case 3:
  2217         default:
  2218             return BlitNtoNPixelAlpha;
  2219         }
  2220         break;
  2221 
  2222     case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  2223         if (sf->Amask == 0) {
  2224             /* Per-surface alpha blits */
  2225             switch (df->BytesPerPixel) {
  2226             case 1:
  2227                 return BlitNto1SurfaceAlpha;
  2228 
  2229             case 2:
  2230                 if (surface->map->identity) {
  2231                     if (df->Gmask == 0x7e0) {
  2232 #ifdef __MMX__
  2233                         if (SDL_HasMMX())
  2234                             return Blit565to565SurfaceAlphaMMX;
  2235                         else
  2236 #endif
  2237                             return Blit565to565SurfaceAlpha;
  2238                     } else if (df->Gmask == 0x3e0) {
  2239 #ifdef __MMX__
  2240                         if (SDL_HasMMX())
  2241                             return Blit555to555SurfaceAlphaMMX;
  2242                         else
  2243 #endif
  2244                             return Blit555to555SurfaceAlpha;
  2245                     }
  2246                 }
  2247                 return BlitNtoNSurfaceAlpha;
  2248 
  2249             case 4:
  2250                 if (sf->Rmask == df->Rmask
  2251                     && sf->Gmask == df->Gmask
  2252                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2253 #ifdef __MMX__
  2254                     if (sf->Rshift % 8 == 0
  2255                         && sf->Gshift % 8 == 0
  2256                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  2257                         return BlitRGBtoRGBSurfaceAlphaMMX;
  2258 #endif
  2259                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  2260 #if SDL_ALTIVEC_BLITTERS
  2261                         if (SDL_HasAltiVec())
  2262                             return BlitRGBtoRGBSurfaceAlphaAltivec;
  2263 #endif
  2264                         return BlitRGBtoRGBSurfaceAlpha;
  2265                     }
  2266                 }
  2267 #if SDL_ALTIVEC_BLITTERS
  2268                 if ((sf->BytesPerPixel == 4) && SDL_HasAltiVec())
  2269                     return Blit32to32SurfaceAlphaAltivec;
  2270                 else
  2271 #endif
  2272                     return BlitNtoNSurfaceAlpha;
  2273 
  2274             case 3:
  2275             default:
  2276                 return BlitNtoNSurfaceAlpha;
  2277             }
  2278         }
  2279         break;
  2280 
  2281     case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  2282         if (sf->Amask == 0) {
  2283             if (df->BytesPerPixel == 1)
  2284                 return BlitNto1SurfaceAlphaKey;
  2285             else
  2286 #if SDL_ALTIVEC_BLITTERS
  2287             if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2288                     SDL_HasAltiVec())
  2289                 return Blit32to32SurfaceAlphaKeyAltivec;
  2290             else
  2291 #endif
  2292                 return BlitNtoNSurfaceAlphaKey;
  2293         }
  2294         break;
  2295     }
  2296 
  2297     return NULL;
  2298 }
  2299 
  2300 /* vi: set ts=4 sw=4 expandtab: */