src/video/SDL_blit_A.c
author Sam Lantinga <slouken@libsdl.org>
Fri, 11 Feb 2011 22:37:15 -0800
changeset 5262 b530ef003506
parent 5259 6a65c1fc07af
child 5389 24903690f48a
permissions -rw-r--r--
Happy 2011! :)
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2011 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #include "SDL_video.h"
    25 #include "SDL_blit.h"
    26 
    27 /* Functions to perform alpha blended blitting */
    28 
    29 /* N->1 blending with per-surface alpha */
    30 static void
    31 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    32 {
    33     int width = info->dst_w;
    34     int height = info->dst_h;
    35     Uint8 *src = info->src;
    36     int srcskip = info->src_skip;
    37     Uint8 *dst = info->dst;
    38     int dstskip = info->dst_skip;
    39     Uint8 *palmap = info->table;
    40     SDL_PixelFormat *srcfmt = info->src_fmt;
    41     SDL_PixelFormat *dstfmt = info->dst_fmt;
    42     int srcbpp = srcfmt->BytesPerPixel;
    43 
    44     const unsigned A = info->a;
    45 
    46     while (height--) {
    47 	    /* *INDENT-OFF* */
    48 	    DUFFS_LOOP4(
    49 	    {
    50 		Uint32 Pixel;
    51 		unsigned sR;
    52 		unsigned sG;
    53 		unsigned sB;
    54 		unsigned dR;
    55 		unsigned dG;
    56 		unsigned dB;
    57 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    58 		dR = dstfmt->palette->colors[*dst].r;
    59 		dG = dstfmt->palette->colors[*dst].g;
    60 		dB = dstfmt->palette->colors[*dst].b;
    61 		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
    62 		dR &= 0xff;
    63 		dG &= 0xff;
    64 		dB &= 0xff;
    65 		/* Pack RGB into 8bit pixel */
    66 		if ( palmap == NULL ) {
    67 		    *dst =((dR>>5)<<(3+2))|
    68 			  ((dG>>5)<<(2))|
    69 			  ((dB>>6)<<(0));
    70 		} else {
    71 		    *dst = palmap[((dR>>5)<<(3+2))|
    72 				  ((dG>>5)<<(2))  |
    73 				  ((dB>>6)<<(0))];
    74 		}
    75 		dst++;
    76 		src += srcbpp;
    77 	    },
    78 	    width);
    79 	    /* *INDENT-ON* */
    80         src += srcskip;
    81         dst += dstskip;
    82     }
    83 }
    84 
    85 /* N->1 blending with pixel alpha */
    86 static void
    87 BlitNto1PixelAlpha(SDL_BlitInfo * info)
    88 {
    89     int width = info->dst_w;
    90     int height = info->dst_h;
    91     Uint8 *src = info->src;
    92     int srcskip = info->src_skip;
    93     Uint8 *dst = info->dst;
    94     int dstskip = info->dst_skip;
    95     Uint8 *palmap = info->table;
    96     SDL_PixelFormat *srcfmt = info->src_fmt;
    97     SDL_PixelFormat *dstfmt = info->dst_fmt;
    98     int srcbpp = srcfmt->BytesPerPixel;
    99 
   100     /* FIXME: fix alpha bit field expansion here too? */
   101     while (height--) {
   102 	    /* *INDENT-OFF* */
   103 	    DUFFS_LOOP4(
   104 	    {
   105 		Uint32 Pixel;
   106 		unsigned sR;
   107 		unsigned sG;
   108 		unsigned sB;
   109 		unsigned sA;
   110 		unsigned dR;
   111 		unsigned dG;
   112 		unsigned dB;
   113 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   114 		dR = dstfmt->palette->colors[*dst].r;
   115 		dG = dstfmt->palette->colors[*dst].g;
   116 		dB = dstfmt->palette->colors[*dst].b;
   117 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
   118 		dR &= 0xff;
   119 		dG &= 0xff;
   120 		dB &= 0xff;
   121 		/* Pack RGB into 8bit pixel */
   122 		if ( palmap == NULL ) {
   123 		    *dst =((dR>>5)<<(3+2))|
   124 			  ((dG>>5)<<(2))|
   125 			  ((dB>>6)<<(0));
   126 		} else {
   127 		    *dst = palmap[((dR>>5)<<(3+2))|
   128 				  ((dG>>5)<<(2))  |
   129 				  ((dB>>6)<<(0))  ];
   130 		}
   131 		dst++;
   132 		src += srcbpp;
   133 	    },
   134 	    width);
   135 	    /* *INDENT-ON* */
   136         src += srcskip;
   137         dst += dstskip;
   138     }
   139 }
   140 
   141 /* colorkeyed N->1 blending with per-surface alpha */
   142 static void
   143 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   144 {
   145     int width = info->dst_w;
   146     int height = info->dst_h;
   147     Uint8 *src = info->src;
   148     int srcskip = info->src_skip;
   149     Uint8 *dst = info->dst;
   150     int dstskip = info->dst_skip;
   151     Uint8 *palmap = info->table;
   152     SDL_PixelFormat *srcfmt = info->src_fmt;
   153     SDL_PixelFormat *dstfmt = info->dst_fmt;
   154     int srcbpp = srcfmt->BytesPerPixel;
   155     Uint32 ckey = info->colorkey;
   156 
   157     const int A = info->a;
   158 
   159     while (height--) {
   160 	    /* *INDENT-OFF* */
   161 	    DUFFS_LOOP(
   162 	    {
   163 		Uint32 Pixel;
   164 		unsigned sR;
   165 		unsigned sG;
   166 		unsigned sB;
   167 		unsigned dR;
   168 		unsigned dG;
   169 		unsigned dB;
   170 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   171 		if ( Pixel != ckey ) {
   172 		    dR = dstfmt->palette->colors[*dst].r;
   173 		    dG = dstfmt->palette->colors[*dst].g;
   174 		    dB = dstfmt->palette->colors[*dst].b;
   175 		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
   176 		    dR &= 0xff;
   177 		    dG &= 0xff;
   178 		    dB &= 0xff;
   179 		    /* Pack RGB into 8bit pixel */
   180 		    if ( palmap == NULL ) {
   181 			*dst =((dR>>5)<<(3+2))|
   182 			      ((dG>>5)<<(2)) |
   183 			      ((dB>>6)<<(0));
   184 		    } else {
   185 			*dst = palmap[((dR>>5)<<(3+2))|
   186 				      ((dG>>5)<<(2))  |
   187 				      ((dB>>6)<<(0))  ];
   188 		    }
   189 		}
   190 		dst++;
   191 		src += srcbpp;
   192 	    },
   193 	    width);
   194 	    /* *INDENT-ON* */
   195         src += srcskip;
   196         dst += dstskip;
   197     }
   198 }
   199 
   200 #ifdef __MMX__
   201 
   202 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case.
 *
 * MMX version of the 50% blend.  Each colour channel becomes the average
 * of source and destination: the low bit of every channel is masked off
 * (0x00fefefe) before the add so that the shared >>1 cannot move a bit
 * into the neighbouring channel, and (s & d & 0x00010101) adds back the
 * low bit when it is set in both inputs.  The destination format's Amask
 * is OR'ed into every pixel written.  When the width is odd the first
 * pixel of each row is blended with scalar code; the rest of the row is
 * processed two pixels per 64-bit MMX register.
 */
   203 static void
   204 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   205 {
   206     int width = info->dst_w;
   207     int height = info->dst_h;
   208     Uint32 *srcp = (Uint32 *) info->src;
    /* skips are divided by 4 because they advance Uint32 pointers
       (presumably the info fields are byte counts -- confirm) */
   209     int srcskip = info->src_skip >> 2;
   210     Uint32 *dstp = (Uint32 *) info->dst;
   211     int dstskip = info->dst_skip >> 2;
   212     Uint32 dalpha = info->dst_fmt->Amask;
   213 
   214     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   215 
   216     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   217     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   218     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   219 
   220     while (height--) {
   221         int n = width;
        /* odd width: blend one pixel in scalar code so an even count remains */
   222         if (n & 1) {
   223             Uint32 s = *srcp++;
   224             Uint32 d = *dstp;
   225             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   226                        + (s & d & 0x00010101)) | dalpha;
   227             n--;
   228         }
   229 
        /* remaining pixels, two per iteration */
   230         for (n >>= 1; n > 0; --n) {
   231             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   232             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   233 
   234             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   235             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   236 
   237             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   238             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   239             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   240             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   241 
   242             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   243             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   244             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   245             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   246 
   247             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   248             dstp += 2;
   249             srcp += 2;
   250         }
   251 
   252         srcp += srcskip;
   253         dstp += dstskip;
   254     }
    /* leave MMX state (EMMS) before returning to code that may use the FPU */
   255     _mm_empty();
   256 }
   257 
   258 /* fast RGB888->(A)RGB888 blending with surface alpha */
   259 static void
   260 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   261 {
   262     SDL_PixelFormat *df = info->dst_fmt;
   263     Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   264     unsigned alpha = info->a;
   265 
   266     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   267         /* only call a128 version when R,G,B occupy lower bits */
   268         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   269     } else {
   270         int width = info->dst_w;
   271         int height = info->dst_h;
   272         Uint32 *srcp = (Uint32 *) info->src;
   273         int srcskip = info->src_skip >> 2;
   274         Uint32 *dstp = (Uint32 *) info->dst;
   275         int dstskip = info->dst_skip >> 2;
   276         Uint32 dalpha = df->Amask;
   277         Uint32 amult;
   278 
   279         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   280 
   281         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   282         /* form the alpha mult */
   283         amult = alpha | (alpha << 8);
   284         amult = amult | (amult << 16);
   285         chanmask =
   286             (0xff << df->Rshift) | (0xff << df->
   287                                     Gshift) | (0xff << df->Bshift);
   288         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   289         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   290         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   291         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   292 
   293         while (height--) {
   294             int n = width;
   295             if (n & 1) {
   296                 /* One Pixel Blend */
   297                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   298                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   299 
   300                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   301                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   302 
   303                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
   304                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   305                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   306                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   307 
   308                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   309                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   310                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   311 
   312                 ++srcp;
   313                 ++dstp;
   314 
   315                 n--;
   316             }
   317 
   318             for (n >>= 1; n > 0; --n) {
   319                 /* Two Pixels Blend */
   320                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   321                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   322                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   323                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   324 
   325                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   326                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   327                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   328                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   329 
   330                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   331                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   332                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   333                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   334 
   335                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   336                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   337                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   338                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   339 
   340                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   341                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   342 
   343                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   344 
   345                 srcp += 2;
   346                 dstp += 2;
   347             }
   348             srcp += srcskip;
   349             dstp += dstskip;
   350         }
   351         _mm_empty();
   352     }
   353 }
   354 
   355 /* fast ARGB888->(A)RGB888 blending with pixel alpha.
 *
 * MMX version.  For every pixel the source alpha bits (src & Amask of the
 * source format) select one of three paths:
 *   - alpha == 0:      destination is left untouched;
 *   - alpha == Amask:  the source colour bits replace the destination's,
 *                      the destination's remaining bits are kept;
 *   - otherwise:       dst + (((src - dst) * alpha) >> 8) per channel,
 *                      with the multiplier of the alpha lane forced to 0
 *                      so the destination's alpha byte is not changed.
 */
   356 static void
   357 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   358 {
   359     int width = info->dst_w;
   360     int height = info->dst_h;
   361     Uint32 *srcp = (Uint32 *) info->src;
   362     int srcskip = info->src_skip >> 2;
   363     Uint32 *dstp = (Uint32 *) info->dst;
   364     int dstskip = info->dst_skip >> 2;
   365     SDL_PixelFormat *sf = info->src_fmt;
   366     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   367     Uint32 amask = sf->Amask;
   368     Uint32 ashift = sf->Ashift;
   369     Uint64 multmask;
   370 
   371     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   372 
   373     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
    /* Once a pixel's bytes are unpacked to 16-bit lanes, the byte at bit
       offset ashift sits in the lane at bit offset ashift * 2.  multmask
       has all bits set except that lane. */
   374     multmask = 0xFFFF;
   375     multmask <<= (ashift * 2);
   376     multmask = ~multmask;
   377     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   378 
   379     while (height--) {
   380 		/* *INDENT-OFF* */
   381 		DUFFS_LOOP4({
   382 		Uint32 alpha = *srcp & amask;
   383 		if (alpha == 0) {
   384 			/* do nothing */
   385 		} else if (alpha == amask) {
   386 			/* opaque alpha -- copy RGB, keep dst alpha */
   387 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   388 		} else {
   389 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   390 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   391 
   392 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   393 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   394 
   395 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   396 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   397 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   398 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   399 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   400 
   401 			/* blend */		    
   402 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   403 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   404 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   405 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   406 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   407 			
   408 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   409 		}
   410 		++srcp;
   411 		++dstp;
   412 	    }, width);
   413 		/* *INDENT-ON* */
   414         srcp += srcskip;
   415         dstp += dstskip;
   416     }
    /* leave MMX state (EMMS) before returning */
   417     _mm_empty();
   418 }
   419 
   420 #endif /* __MMX__ */
   421 
   422 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   423 static void
   424 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
   425 {
   426     int width = info->dst_w;
   427     int height = info->dst_h;
   428     Uint32 *srcp = (Uint32 *) info->src;
   429     int srcskip = info->src_skip >> 2;
   430     Uint32 *dstp = (Uint32 *) info->dst;
   431     int dstskip = info->dst_skip >> 2;
   432 
   433     while (height--) {
   434 	    /* *INDENT-OFF* */
   435 	    DUFFS_LOOP4({
   436 		    Uint32 s = *srcp++;
   437 		    Uint32 d = *dstp;
   438 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   439 			       + (s & d & 0x00010101)) | 0xff000000;
   440 	    }, width);
   441 	    /* *INDENT-ON* */
   442         srcp += srcskip;
   443         dstp += dstskip;
   444     }
   445 }
   446 
   447 /* fast RGB888->(A)RGB888 blending with surface alpha */
   448 static void
   449 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
   450 {
   451     unsigned alpha = info->a;
   452     if (alpha == 128) {
   453         BlitRGBtoRGBSurfaceAlpha128(info);
   454     } else {
   455         int width = info->dst_w;
   456         int height = info->dst_h;
   457         Uint32 *srcp = (Uint32 *) info->src;
   458         int srcskip = info->src_skip >> 2;
   459         Uint32 *dstp = (Uint32 *) info->dst;
   460         int dstskip = info->dst_skip >> 2;
   461         Uint32 s;
   462         Uint32 d;
   463         Uint32 s1;
   464         Uint32 d1;
   465 
   466         while (height--) {
   467 			/* *INDENT-OFF* */
   468 			DUFFS_LOOP4({
   469 				s = *srcp;
   470 				d = *dstp;
   471 				s1 = s & 0xff00ff;
   472 				d1 = d & 0xff00ff;
   473 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
   474 				     & 0xff00ff;
   475 				s &= 0xff00;
   476 				d &= 0xff00;
   477 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   478 				*dstp = d1 | d | 0xff000000;
   479 				++srcp;
   480 				++dstp;
   481 			}, width);
   482 			/* *INDENT-ON* */
   483             srcp += srcskip;
   484             dstp += dstskip;
   485         }
   486     }
   487 }
   488 
   489 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   490 static void
   491 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
   492 {
   493     int width = info->dst_w;
   494     int height = info->dst_h;
   495     Uint32 *srcp = (Uint32 *) info->src;
   496     int srcskip = info->src_skip >> 2;
   497     Uint32 *dstp = (Uint32 *) info->dst;
   498     int dstskip = info->dst_skip >> 2;
   499 
   500     while (height--) {
   501 	    /* *INDENT-OFF* */
   502 	    DUFFS_LOOP4({
   503 		Uint32 dalpha;
   504 		Uint32 d;
   505 		Uint32 s1;
   506 		Uint32 d1;
   507 		Uint32 s = *srcp;
   508 		Uint32 alpha = s >> 24;
   509 		/* FIXME: Here we special-case opaque alpha since the
   510 		   compositioning used (>>8 instead of /255) doesn't handle
   511 		   it correctly. Also special-case alpha=0 for speed?
   512 		   Benchmark this! */
   513 		if(alpha) {   
   514 		  if(alpha == SDL_ALPHA_OPAQUE) {
   515 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
   516 		  } else {
   517 		    /*
   518 		     * take out the middle component (green), and process
   519 		     * the other two in parallel. One multiply less.
   520 		     */
   521 		    d = *dstp;
   522 		    dalpha = d & 0xff000000;
   523 		    s1 = s & 0xff00ff;
   524 		    d1 = d & 0xff00ff;
   525 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
   526 		    s &= 0xff00;
   527 		    d &= 0xff00;
   528 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   529 		    *dstp = d1 | d | dalpha;
   530 		  }
   531 		}
   532 		++srcp;
   533 		++dstp;
   534 	    }, width);
   535 	    /* *INDENT-ON* */
   536         srcp += srcskip;
   537         dstp += dstskip;
   538     }
   539 }
   540 
   541 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
   542 
   543 /* blend a single 16 bit pixel at 50% */
   544 #define BLEND16_50(d, s, mask)						\
   545 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
   546 
   547 /* blend two 16 bit pixels at 50% */
   548 #define BLEND2x16_50(d, s, mask)					     \
   549 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
   550 	 + (s & d & (~(mask | mask << 16))))
   551 /*
 * Blend 16-bit source pixels onto 16-bit destination pixels at 50%.
 *
 * mask is handed to BLEND16_50 / BLEND2x16_50; the callers visible in this
 * file pass 0xf7de and 0xfbde.
 *
 * Pixels are blended two at a time through 32-bit loads and stores.  Per
 * row one of two paths is taken:
 *   - source and destination addresses agree in bit 1: after an optional
 *     leading pixel both pointers are 32-bit aligned and pixel pairs are
 *     read and written directly;
 *   - they differ in bit 1: the destination is aligned first, the source
 *     is then read in aligned 32-bit words, and every pixel pair is
 *     assembled from the previous word and the current word.
 */
   552 static void
   553 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
   554 {
   555     int width = info->dst_w;
   556     int height = info->dst_h;
   557     Uint16 *srcp = (Uint16 *) info->src;
   558     int srcskip = info->src_skip >> 1;
   559     Uint16 *dstp = (Uint16 *) info->dst;
   560     int dstskip = info->dst_skip >> 1;
   561 
   562     while (height--) {
   563         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
   564             /*
   565              * Source and destination not aligned, pipeline it.
   566              * This is mostly a win for big blits but no loss for
   567              * small ones
   568              */
   569             Uint32 prev_sw;
   570             int w = width;
   571 
   572             /* handle odd destination */
   573             if ((uintptr_t) dstp & 2) {
   574                 Uint16 d = *dstp, s = *srcp;
   575                 *dstp = BLEND16_50(d, s, mask);
   576                 dstp++;
   577                 srcp++;
   578                 w--;
   579             }
   580             srcp++;             /* srcp is now 32-bit aligned */
   581 
   582             /* bootstrap pipeline with first halfword */
            /* NOTE(review): this loads the aligned word that ends with the
               next pixel to blend; its other halfword is the pixel just
               before it, which lies in front of the row when the odd
               destination branch above was not taken. */
   583             prev_sw = ((Uint32 *) srcp)[-1];
   584 
   585             while (w > 1) {
   586                 Uint32 sw, dw, s;
   587                 sw = *(Uint32 *) srcp;
   588                 dw = *(Uint32 *) dstp;
                /* join the last halfword of the previous word with the
                   first halfword of the current one */
   589 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   590                 s = (prev_sw << 16) + (sw >> 16);
   591 #else
   592                 s = (prev_sw >> 16) + (sw << 16);
   593 #endif
   594                 prev_sw = sw;
   595                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
   596                 dstp += 2;
   597                 srcp += 2;
   598                 w -= 2;
   599             }
   600 
   601             /* final pixel if any */
   602             if (w) {
   603                 Uint16 d = *dstp, s;
   604 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   605                 s = (Uint16) prev_sw;
   606 #else
   607                 s = (Uint16) (prev_sw >> 16);
   608 #endif
   609                 *dstp = BLEND16_50(d, s, mask);
   610                 srcp++;
   611                 dstp++;
   612             }
            /* the -1 undoes the extra srcp++ made to align the source */
   613             srcp += srcskip - 1;
   614             dstp += dstskip;
   615         } else {
   616             /* source and destination are aligned */
   617             int w = width;
   618 
   619             /* first odd pixel? */
   620             if ((uintptr_t) srcp & 2) {
   621                 Uint16 d = *dstp, s = *srcp;
   622                 *dstp = BLEND16_50(d, s, mask);
   623                 srcp++;
   624                 dstp++;
   625                 w--;
   626             }
   627             /* srcp and dstp are now 32-bit aligned */
   628 
   629             while (w > 1) {
   630                 Uint32 sw = *(Uint32 *) srcp;
   631                 Uint32 dw = *(Uint32 *) dstp;
   632                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
   633                 srcp += 2;
   634                 dstp += 2;
   635                 w -= 2;
   636             }
   637 
   638             /* last odd pixel? */
   639             if (w) {
   640                 Uint16 d = *dstp, s = *srcp;
   641                 *dstp = BLEND16_50(d, s, mask);
   642                 srcp++;
   643                 dstp++;
   644             }
   645             srcp += srcskip;
   646             dstp += dstskip;
   647         }
   648     }
   649 }
   650 
   651 #ifdef __MMX__
   652 
   653 /* fast RGB565->RGB565 blending with surface alpha.
 *
 * alpha == 128 is forwarded to Blit16to16SurfaceAlpha128() with the mask
 * 0xf7de.  For any other value the low three bits of alpha are cleared,
 * so the scalar code (5-bit alpha) and the MMX code use the same alpha.
 * DUFFS_LOOP_124 is given three bodies, which blend one, two and four
 * pixels respectively:
 *   - scalar bodies: green is moved to the high half of a 32-bit word so
 *     all three channels are blended by a single multiply;
 *   - MMX body: red, green and blue of four pixels are blended one
 *     channel at a time and OR'ed back together.
 */
   654 static void
   655 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
   656 {
   657     unsigned alpha = info->a;
   658     if (alpha == 128) {
   659         Blit16to16SurfaceAlpha128(info, 0xf7de);
   660     } else {
   661         int width = info->dst_w;
   662         int height = info->dst_h;
   663         Uint16 *srcp = (Uint16 *) info->src;
   664         int srcskip = info->src_skip >> 1;
   665         Uint16 *dstp = (Uint16 *) info->dst;
   666         int dstskip = info->dst_skip >> 1;
   667         Uint32 s, d;
   668 
   669         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
   670 
   671         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   672         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
   673         alpha >>= 3;            /* downscale alpha to 5 bits */
   674 
   675         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
   676         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
   677         /* position alpha to allow for mullo and mulhi on diff channels
   678            to reduce the number of operations */
        /* every 16-bit lane now holds (5-bit alpha) << 6 */
   679         mm_alpha = _mm_slli_si64(mm_alpha, 3);
   680 
   681         /* Setup the 565 color channel masks */
   682         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
   683         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
   684 
   685         while (height--) {
   686 			/* *INDENT-OFF* */
   687 			DUFFS_LOOP_124(
   688 			{
   689 				s = *srcp++;
   690 				d = *dstp;
   691 				/*
   692 				 * shift out the middle component (green) to
   693 				 * the high 16 bits, and process all three RGB
   694 				 * components at the same time.
   695 				 */
   696 				s = (s | s << 16) & 0x07e0f81f;
   697 				d = (d | d << 16) & 0x07e0f81f;
   698 				d += (s - d) * alpha >> 5;
   699 				d &= 0x07e0f81f;
   700 				*dstp++ = (Uint16)(d | d >> 16);
   701 			},{
   702 				s = *srcp++;
   703 				d = *dstp;
   704 				/*
   705 				 * shift out the middle component (green) to
   706 				 * the high 16 bits, and process all three RGB
   707 				 * components at the same time.
   708 				 */
   709 				s = (s | s << 16) & 0x07e0f81f;
   710 				d = (d | d << 16) & 0x07e0f81f;
   711 				d += (s - d) * alpha >> 5;
   712 				d &= 0x07e0f81f;
   713 				*dstp++ = (Uint16)(d | d >> 16);
   714 				s = *srcp++;
   715 				d = *dstp;
   716 				/*
   717 				 * shift out the middle component (green) to
   718 				 * the high 16 bits, and process all three RGB
   719 				 * components at the same time.
   720 				 */
   721 				s = (s | s << 16) & 0x07e0f81f;
   722 				d = (d | d << 16) & 0x07e0f81f;
   723 				d += (s - d) * alpha >> 5;
   724 				d &= 0x07e0f81f;
   725 				*dstp++ = (Uint16)(d | d >> 16);
   726 			},{
   727 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   728 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   729 
   730 				/* red */
   731 				src2 = src1;
   732 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
   733 
   734 				dst2 = dst1;
   735 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
   736 
   737 				/* blend */
   738 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   739 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   740 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   741 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   742 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
   743 
   744 				mm_res = dst2; /* RED -> mm_res */
   745 
   746 				/* green -- process the bits in place */
   747 				src2 = src1;
   748 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   749 
   750 				dst2 = dst1;
   751 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   752 
   753 				/* blend */
				/* green is still at bit 5, so mulhi (upper 16 bits of
				   the product) followed by << 5 puts the scaled
				   difference back at the green position */
   754 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   755 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   756 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   757 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   758 
   759 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
   760 
   761 				/* blue */
   762 				src2 = src1;
   763 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
   764 
   765 				dst2 = dst1;
   766 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
   767 
   768 				/* blend */
   769 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   770 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   771 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   772 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   773 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
   774 
   775 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
   776 
   777 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
   778 
   779 				srcp += 4;
   780 				dstp += 4;
   781 			}, width);
   782 			/* *INDENT-ON* */
   783             srcp += srcskip;
   784             dstp += dstskip;
   785         }
        /* leave MMX state (EMMS) before returning */
   786         _mm_empty();
   787     }
   788 }
   789 
   790 /* fast RGB555->RGB555 blending with surface alpha */
   791 static void
   792 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
   793 {
   794     unsigned alpha = info->a;
   795     if (alpha == 128) {
   796         Blit16to16SurfaceAlpha128(info, 0xfbde);
   797     } else {
   798         int width = info->dst_w;
   799         int height = info->dst_h;
   800         Uint16 *srcp = (Uint16 *) info->src;
   801         int srcskip = info->src_skip >> 1;
   802         Uint16 *dstp = (Uint16 *) info->dst;
   803         int dstskip = info->dst_skip >> 1;
   804         Uint32 s, d;
   805 
   806         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
   807 
   808         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   809         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
   810         alpha >>= 3;            /* downscale alpha to 5 bits */
   811 
   812         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
   813         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
   814         /* position alpha to allow for mullo and mulhi on diff channels
   815            to reduce the number of operations */
   816         mm_alpha = _mm_slli_si64(mm_alpha, 3);
   817 
   818         /* Setup the 555 color channel masks */
   819         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
   820         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
   821         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
   822 
   823         while (height--) {
   824 			/* *INDENT-OFF* */
   825 			DUFFS_LOOP_124(
   826 			{
   827 				s = *srcp++;
   828 				d = *dstp;
   829 				/*
   830 				 * shift out the middle component (green) to
   831 				 * the high 16 bits, and process all three RGB
   832 				 * components at the same time.
   833 				 */
   834 				s = (s | s << 16) & 0x03e07c1f;
   835 				d = (d | d << 16) & 0x03e07c1f;
   836 				d += (s - d) * alpha >> 5;
   837 				d &= 0x03e07c1f;
   838 				*dstp++ = (Uint16)(d | d >> 16);
   839 			},{
   840 				s = *srcp++;
   841 				d = *dstp;
   842 				/*
   843 				 * shift out the middle component (green) to
   844 				 * the high 16 bits, and process all three RGB
   845 				 * components at the same time.
   846 				 */
   847 				s = (s | s << 16) & 0x03e07c1f;
   848 				d = (d | d << 16) & 0x03e07c1f;
   849 				d += (s - d) * alpha >> 5;
   850 				d &= 0x03e07c1f;
   851 				*dstp++ = (Uint16)(d | d >> 16);
   852 			        s = *srcp++;
   853 				d = *dstp;
   854 				/*
   855 				 * shift out the middle component (green) to
   856 				 * the high 16 bits, and process all three RGB
   857 				 * components at the same time.
   858 				 */
   859 				s = (s | s << 16) & 0x03e07c1f;
   860 				d = (d | d << 16) & 0x03e07c1f;
   861 				d += (s - d) * alpha >> 5;
   862 				d &= 0x03e07c1f;
   863 				*dstp++ = (Uint16)(d | d >> 16);
   864 			},{
   865 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   866 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   867 
   868 				/* red -- process the bits in place */
   869 				src2 = src1;
   870 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
   871 
   872 				dst2 = dst1;
   873 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
   874 
   875 				/* blend */
   876 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   877 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   878 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   879 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   880 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
   881 
   882 				mm_res = dst2; /* RED -> mm_res */
   883 				
   884 				/* green -- process the bits in place */
   885 				src2 = src1;
   886 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   887 
   888 				dst2 = dst1;
   889 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   890 
   891 				/* blend */
   892 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   893 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   894 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   895 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   896 
   897 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
   898 
   899 				/* blue */
   900 				src2 = src1; /* src -> src2 */
   901 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
   902 
   903 				dst2 = dst1; /* dst -> dst2 */
   904 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
   905 
   906 				/* blend */
   907 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   908 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   909 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   910 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   911 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
   912 
   913 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
   914 
   915 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
   916 
   917 				srcp += 4;
   918 				dstp += 4;
   919 			}, width);
   920 			/* *INDENT-ON* */
   921             srcp += srcskip;
   922             dstp += dstskip;
   923         }
   924         _mm_empty();
   925     }
   926 }
   927 
   928 #endif /* __MMX__ */
   929 
   930 /* fast RGB565->RGB565 blending with surface alpha */
   931 static void
   932 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
   933 {
   934     unsigned alpha = info->a;
   935     if (alpha == 128) {
   936         Blit16to16SurfaceAlpha128(info, 0xf7de);
   937     } else {
   938         int width = info->dst_w;
   939         int height = info->dst_h;
   940         Uint16 *srcp = (Uint16 *) info->src;
   941         int srcskip = info->src_skip >> 1;
   942         Uint16 *dstp = (Uint16 *) info->dst;
   943         int dstskip = info->dst_skip >> 1;
   944         alpha >>= 3;            /* downscale alpha to 5 bits */
   945 
   946         while (height--) {
   947 			/* *INDENT-OFF* */
   948 			DUFFS_LOOP4({
   949 				Uint32 s = *srcp++;
   950 				Uint32 d = *dstp;
   951 				/*
   952 				 * shift out the middle component (green) to
   953 				 * the high 16 bits, and process all three RGB
   954 				 * components at the same time.
   955 				 */
   956 				s = (s | s << 16) & 0x07e0f81f;
   957 				d = (d | d << 16) & 0x07e0f81f;
   958 				d += (s - d) * alpha >> 5;
   959 				d &= 0x07e0f81f;
   960 				*dstp++ = (Uint16)(d | d >> 16);
   961 			}, width);
   962 			/* *INDENT-ON* */
   963             srcp += srcskip;
   964             dstp += dstskip;
   965         }
   966     }
   967 }
   968 
   969 /* fast RGB555->RGB555 blending with surface alpha */
   970 static void
   971 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
   972 {
   973     unsigned alpha = info->a;   /* downscale alpha to 5 bits */
   974     if (alpha == 128) {
   975         Blit16to16SurfaceAlpha128(info, 0xfbde);
   976     } else {
   977         int width = info->dst_w;
   978         int height = info->dst_h;
   979         Uint16 *srcp = (Uint16 *) info->src;
   980         int srcskip = info->src_skip >> 1;
   981         Uint16 *dstp = (Uint16 *) info->dst;
   982         int dstskip = info->dst_skip >> 1;
   983         alpha >>= 3;            /* downscale alpha to 5 bits */
   984 
   985         while (height--) {
   986 			/* *INDENT-OFF* */
   987 			DUFFS_LOOP4({
   988 				Uint32 s = *srcp++;
   989 				Uint32 d = *dstp;
   990 				/*
   991 				 * shift out the middle component (green) to
   992 				 * the high 16 bits, and process all three RGB
   993 				 * components at the same time.
   994 				 */
   995 				s = (s | s << 16) & 0x03e07c1f;
   996 				d = (d | d << 16) & 0x03e07c1f;
   997 				d += (s - d) * alpha >> 5;
   998 				d &= 0x03e07c1f;
   999 				*dstp++ = (Uint16)(d | d >> 16);
  1000 			}, width);
  1001 			/* *INDENT-ON* */
  1002             srcp += srcskip;
  1003             dstp += dstskip;
  1004         }
  1005     }
  1006 }
  1007 
  1008 /* fast ARGB8888->RGB565 blending with pixel alpha */
  1009 static void
  1010 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  1011 {
  1012     int width = info->dst_w;
  1013     int height = info->dst_h;
  1014     Uint32 *srcp = (Uint32 *) info->src;
  1015     int srcskip = info->src_skip >> 2;
  1016     Uint16 *dstp = (Uint16 *) info->dst;
  1017     int dstskip = info->dst_skip >> 1;
  1018 
  1019     while (height--) {
  1020 	    /* *INDENT-OFF* */
  1021 	    DUFFS_LOOP4({
  1022 		Uint32 s = *srcp;
  1023 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  1024 		/* FIXME: Here we special-case opaque alpha since the
  1025 		   compositioning used (>>8 instead of /255) doesn't handle
  1026 		   it correctly. Also special-case alpha=0 for speed?
  1027 		   Benchmark this! */
  1028 		if(alpha) {   
  1029 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1030 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  1031 		  } else {
  1032 		    Uint32 d = *dstp;
  1033 		    /*
  1034 		     * convert source and destination to G0RAB65565
  1035 		     * and blend all components at the same time
  1036 		     */
  1037 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  1038 		      + (s >> 3 & 0x1f);
  1039 		    d = (d | d << 16) & 0x07e0f81f;
  1040 		    d += (s - d) * alpha >> 5;
  1041 		    d &= 0x07e0f81f;
  1042 		    *dstp = (Uint16)(d | d >> 16);
  1043 		  }
  1044 		}
  1045 		srcp++;
  1046 		dstp++;
  1047 	    }, width);
  1048 	    /* *INDENT-ON* */
  1049         srcp += srcskip;
  1050         dstp += dstskip;
  1051     }
  1052 }
  1053 
  1054 /* fast ARGB8888->RGB555 blending with pixel alpha */
  1055 static void
  1056 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  1057 {
  1058     int width = info->dst_w;
  1059     int height = info->dst_h;
  1060     Uint32 *srcp = (Uint32 *) info->src;
  1061     int srcskip = info->src_skip >> 2;
  1062     Uint16 *dstp = (Uint16 *) info->dst;
  1063     int dstskip = info->dst_skip >> 1;
  1064 
  1065     while (height--) {
  1066 	    /* *INDENT-OFF* */
  1067 	    DUFFS_LOOP4({
  1068 		unsigned alpha;
  1069 		Uint32 s = *srcp;
  1070 		alpha = s >> 27; /* downscale alpha to 5 bits */
  1071 		/* FIXME: Here we special-case opaque alpha since the
  1072 		   compositioning used (>>8 instead of /255) doesn't handle
  1073 		   it correctly. Also special-case alpha=0 for speed?
  1074 		   Benchmark this! */
  1075 		if(alpha) {   
  1076 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1077 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  1078 		  } else {
  1079 		    Uint32 d = *dstp;
  1080 		    /*
  1081 		     * convert source and destination to G0RAB65565
  1082 		     * and blend all components at the same time
  1083 		     */
  1084 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  1085 		      + (s >> 3 & 0x1f);
  1086 		    d = (d | d << 16) & 0x03e07c1f;
  1087 		    d += (s - d) * alpha >> 5;
  1088 		    d &= 0x03e07c1f;
  1089 		    *dstp = (Uint16)(d | d >> 16);
  1090 		  }
  1091 		}
  1092 		srcp++;
  1093 		dstp++;
  1094 	    }, width);
  1095 	    /* *INDENT-ON* */
  1096         srcp += srcskip;
  1097         dstp += dstskip;
  1098     }
  1099 }
  1100 
  1101 /* General (slow) N->N blending with per-surface alpha */
  1102 static void
  1103 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  1104 {
  1105     int width = info->dst_w;
  1106     int height = info->dst_h;
  1107     Uint8 *src = info->src;
  1108     int srcskip = info->src_skip;
  1109     Uint8 *dst = info->dst;
  1110     int dstskip = info->dst_skip;
  1111     SDL_PixelFormat *srcfmt = info->src_fmt;
  1112     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1113     int srcbpp = srcfmt->BytesPerPixel;
  1114     int dstbpp = dstfmt->BytesPerPixel;
  1115     unsigned sA = info->a;
  1116     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1117 
  1118     if (sA) {
  1119         while (height--) {
  1120 	    /* *INDENT-OFF* */
  1121 	    DUFFS_LOOP4(
  1122 	    {
  1123 		Uint32 Pixel;
  1124 		unsigned sR;
  1125 		unsigned sG;
  1126 		unsigned sB;
  1127 		unsigned dR;
  1128 		unsigned dG;
  1129 		unsigned dB;
  1130 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  1131 		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  1132 		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  1133 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1134 		src += srcbpp;
  1135 		dst += dstbpp;
  1136 	    },
  1137 	    width);
  1138 	    /* *INDENT-ON* */
  1139             src += srcskip;
  1140             dst += dstskip;
  1141         }
  1142     }
  1143 }
  1144 
  1145 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  1146 static void
  1147 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  1148 {
  1149     int width = info->dst_w;
  1150     int height = info->dst_h;
  1151     Uint8 *src = info->src;
  1152     int srcskip = info->src_skip;
  1153     Uint8 *dst = info->dst;
  1154     int dstskip = info->dst_skip;
  1155     SDL_PixelFormat *srcfmt = info->src_fmt;
  1156     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1157     Uint32 ckey = info->colorkey;
  1158     int srcbpp = srcfmt->BytesPerPixel;
  1159     int dstbpp = dstfmt->BytesPerPixel;
  1160     unsigned sA = info->a;
  1161     unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
  1162 
  1163     while (height--) {
  1164 	    /* *INDENT-OFF* */
  1165 	    DUFFS_LOOP4(
  1166 	    {
  1167 		Uint32 Pixel;
  1168 		unsigned sR;
  1169 		unsigned sG;
  1170 		unsigned sB;
  1171 		unsigned dR;
  1172 		unsigned dG;
  1173 		unsigned dB;
  1174 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  1175 		if(sA && Pixel != ckey) {
  1176 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  1177 		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
  1178 		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  1179 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1180 		}
  1181 		src += srcbpp;
  1182 		dst += dstbpp;
  1183 	    },
  1184 	    width);
  1185 	    /* *INDENT-ON* */
  1186         src += srcskip;
  1187         dst += dstskip;
  1188     }
  1189 }
  1190 
  1191 /* General (slow) N->N blending with pixel alpha */
  1192 static void
  1193 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  1194 {
  1195     int width = info->dst_w;
  1196     int height = info->dst_h;
  1197     Uint8 *src = info->src;
  1198     int srcskip = info->src_skip;
  1199     Uint8 *dst = info->dst;
  1200     int dstskip = info->dst_skip;
  1201     SDL_PixelFormat *srcfmt = info->src_fmt;
  1202     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1203 
  1204     int srcbpp;
  1205     int dstbpp;
  1206 
  1207     /* Set up some basic variables */
  1208     srcbpp = srcfmt->BytesPerPixel;
  1209     dstbpp = dstfmt->BytesPerPixel;
  1210 
  1211     /* FIXME: for 8bpp source alpha, this doesn't get opaque values
  1212        quite right. for <8bpp source alpha, it gets them very wrong
  1213        (check all macros!)
  1214        It is unclear whether there is a good general solution that doesn't
  1215        need a branch (or a divide). */
  1216     while (height--) {
  1217 	    /* *INDENT-OFF* */
  1218 	    DUFFS_LOOP4(
  1219 	    {
  1220 		Uint32 Pixel;
  1221 		unsigned sR;
  1222 		unsigned sG;
  1223 		unsigned sB;
  1224 		unsigned dR;
  1225 		unsigned dG;
  1226 		unsigned dB;
  1227 		unsigned sA;
  1228 		unsigned dA;
  1229 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  1230 		if(sA) {
  1231 		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1232 		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
  1233 		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1234 		}
  1235 		src += srcbpp;
  1236 		dst += dstbpp;
  1237 	    },
  1238 	    width);
  1239 	    /* *INDENT-ON* */
  1240         src += srcskip;
  1241         dst += dstskip;
  1242     }
  1243 }
  1244 
  1245 
  1246 SDL_BlitFunc
  1247 SDL_CalculateBlitA(SDL_Surface * surface)
  1248 {
  1249     SDL_PixelFormat *sf = surface->format;
  1250     SDL_PixelFormat *df = surface->map->dst->format;
  1251 
  1252     switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
  1253     case SDL_COPY_BLEND:
  1254         /* Per-pixel alpha blits */
  1255         switch (df->BytesPerPixel) {
  1256         case 1:
  1257             return BlitNto1PixelAlpha;
  1258 
  1259         case 2:
  1260             if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  1261                 && sf->Gmask == 0xff00
  1262                 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  1263                     || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  1264                 if (df->Gmask == 0x7e0)
  1265                     return BlitARGBto565PixelAlpha;
  1266                 else if (df->Gmask == 0x3e0)
  1267                     return BlitARGBto555PixelAlpha;
  1268             }
  1269             return BlitNtoNPixelAlpha;
  1270 
  1271         case 4:
  1272             if (sf->Rmask == df->Rmask
  1273                 && sf->Gmask == df->Gmask
  1274                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  1275 #if defined(__MMX__)
  1276                 if (sf->Rshift % 8 == 0
  1277                     && sf->Gshift % 8 == 0
  1278                     && sf->Bshift % 8 == 0
  1279                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  1280                     if (SDL_HasMMX())
  1281                         return BlitRGBtoRGBPixelAlphaMMX;
  1282                 }
  1283 #endif /* __MMX__ */
  1284                 if (sf->Amask == 0xff000000) {
  1285                     return BlitRGBtoRGBPixelAlpha;
  1286                 }
  1287             }
  1288             return BlitNtoNPixelAlpha;
  1289 
  1290         case 3:
  1291         default:
  1292             return BlitNtoNPixelAlpha;
  1293         }
  1294         break;
  1295 
  1296     case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  1297         if (sf->Amask == 0) {
  1298             /* Per-surface alpha blits */
  1299             switch (df->BytesPerPixel) {
  1300             case 1:
  1301                 return BlitNto1SurfaceAlpha;
  1302 
  1303             case 2:
  1304                 if (surface->map->identity) {
  1305                     if (df->Gmask == 0x7e0) {
  1306 #ifdef __MMX__
  1307                         if (SDL_HasMMX())
  1308                             return Blit565to565SurfaceAlphaMMX;
  1309                         else
  1310 #endif
  1311                             return Blit565to565SurfaceAlpha;
  1312                     } else if (df->Gmask == 0x3e0) {
  1313 #ifdef __MMX__
  1314                         if (SDL_HasMMX())
  1315                             return Blit555to555SurfaceAlphaMMX;
  1316                         else
  1317 #endif
  1318                             return Blit555to555SurfaceAlpha;
  1319                     }
  1320                 }
  1321                 return BlitNtoNSurfaceAlpha;
  1322 
  1323             case 4:
  1324                 if (sf->Rmask == df->Rmask
  1325                     && sf->Gmask == df->Gmask
  1326                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  1327 #ifdef __MMX__
  1328                     if (sf->Rshift % 8 == 0
  1329                         && sf->Gshift % 8 == 0
  1330                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  1331                         return BlitRGBtoRGBSurfaceAlphaMMX;
  1332 #endif
  1333                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  1334                         return BlitRGBtoRGBSurfaceAlpha;
  1335                     }
  1336                 }
  1337                 return BlitNtoNSurfaceAlpha;
  1338 
  1339             case 3:
  1340             default:
  1341                 return BlitNtoNSurfaceAlpha;
  1342             }
  1343         }
  1344         break;
  1345 
  1346     case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  1347         if (sf->Amask == 0) {
  1348             if (df->BytesPerPixel == 1)
  1349                 return BlitNto1SurfaceAlphaKey;
  1350             else
  1351                 return BlitNtoNSurfaceAlphaKey;
  1352         }
  1353         break;
  1354     }
  1355 
  1356     return NULL;
  1357 }
  1358 
  1359 /* vi: set ts=4 sw=4 expandtab: */