src/video/SDL_blit_A.c
author Sam Lantinga <slouken@libsdl.org>
Sun, 14 Jul 2013 11:28:44 -0700
changeset 7776 d4a39491577f
parent 7677 871d43c6968a
child 7790 8136ce6b3950
permissions -rw-r--r--
Added the platform specific messagebox function to the video function list
     1 /*
     2   Simple DirectMedia Layer
     3   Copyright (C) 1997-2013 Sam Lantinga <slouken@libsdl.org>
     4 
     5   This software is provided 'as-is', without any express or implied
     6   warranty.  In no event will the authors be held liable for any damages
     7   arising from the use of this software.
     8 
     9   Permission is granted to anyone to use this software for any purpose,
    10   including commercial applications, and to alter it and redistribute it
    11   freely, subject to the following restrictions:
    12 
    13   1. The origin of this software must not be misrepresented; you must not
    14      claim that you wrote the original software. If you use this software
    15      in a product, an acknowledgment in the product documentation would be
    16      appreciated but is not required.
    17   2. Altered source versions must be plainly marked as such, and must not be
    18      misrepresented as being the original software.
    19   3. This notice may not be removed or altered from any source distribution.
    20 */
    21 #include "SDL_config.h"
    22 
    23 #include "SDL_video.h"
    24 #include "SDL_blit.h"
    25 
    26 /* Functions to perform alpha blended blitting */
    27 
    28 /* N->1 blending with per-surface alpha */
    29 static void
    30 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    31 {
    32     int width = info->dst_w;
    33     int height = info->dst_h;
    34     Uint8 *src = info->src;
    35     int srcskip = info->src_skip;
    36     Uint8 *dst = info->dst;
    37     int dstskip = info->dst_skip;
    38     Uint8 *palmap = info->table;
    39     SDL_PixelFormat *srcfmt = info->src_fmt;
    40     SDL_PixelFormat *dstfmt = info->dst_fmt;
    41     int srcbpp = srcfmt->BytesPerPixel;
    42     Uint32 Pixel;
    43     unsigned sR, sG, sB;
    44     unsigned dR, dG, dB;
    45     const unsigned A = info->a;
    46 
    47     while (height--) {
    48 	    /* *INDENT-OFF* */
    49 	    DUFFS_LOOP4(
    50 	    {
    51 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    52 		dR = dstfmt->palette->colors[*dst].r;
    53 		dG = dstfmt->palette->colors[*dst].g;
    54 		dB = dstfmt->palette->colors[*dst].b;
    55 		ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
    56 		dR &= 0xff;
    57 		dG &= 0xff;
    58 		dB &= 0xff;
    59 		/* Pack RGB into 8bit pixel */
    60 		if ( palmap == NULL ) {
    61 		    *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
    62 		} else {
    63 		    *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
    64 		}
    65 		dst++;
    66 		src += srcbpp;
    67 	    },
    68 	    width);
    69 	    /* *INDENT-ON* */
    70         src += srcskip;
    71         dst += dstskip;
    72     }
    73 }
    74 
    75 /* N->1 blending with pixel alpha */
    76 static void
    77 BlitNto1PixelAlpha(SDL_BlitInfo * info)
    78 {
    79     int width = info->dst_w;
    80     int height = info->dst_h;
    81     Uint8 *src = info->src;
    82     int srcskip = info->src_skip;
    83     Uint8 *dst = info->dst;
    84     int dstskip = info->dst_skip;
    85     Uint8 *palmap = info->table;
    86     SDL_PixelFormat *srcfmt = info->src_fmt;
    87     SDL_PixelFormat *dstfmt = info->dst_fmt;
    88     int srcbpp = srcfmt->BytesPerPixel;
    89     Uint32 Pixel;
    90     unsigned sR, sG, sB, sA;
    91     unsigned dR, dG, dB;
    92 
    93     while (height--) {
    94 	    /* *INDENT-OFF* */
    95 	    DUFFS_LOOP4(
    96 	    {
    97 		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
    98 		dR = dstfmt->palette->colors[*dst].r;
    99 		dG = dstfmt->palette->colors[*dst].g;
   100 		dB = dstfmt->palette->colors[*dst].b;
   101 		ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
   102 		dR &= 0xff;
   103 		dG &= 0xff;
   104 		dB &= 0xff;
   105 		/* Pack RGB into 8bit pixel */
   106 		if ( palmap == NULL ) {
   107 		    *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
   108 		} else {
   109 		    *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
   110 		}
   111 		dst++;
   112 		src += srcbpp;
   113 	    },
   114 	    width);
   115 	    /* *INDENT-ON* */
   116         src += srcskip;
   117         dst += dstskip;
   118     }
   119 }
   120 
   121 /* colorkeyed N->1 blending with per-surface alpha */
   122 static void
   123 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   124 {
   125     int width = info->dst_w;
   126     int height = info->dst_h;
   127     Uint8 *src = info->src;
   128     int srcskip = info->src_skip;
   129     Uint8 *dst = info->dst;
   130     int dstskip = info->dst_skip;
   131     Uint8 *palmap = info->table;
   132     SDL_PixelFormat *srcfmt = info->src_fmt;
   133     SDL_PixelFormat *dstfmt = info->dst_fmt;
   134     int srcbpp = srcfmt->BytesPerPixel;
   135     Uint32 ckey = info->colorkey;
   136     Uint32 Pixel;
   137     unsigned sR, sG, sB;
   138     unsigned dR, dG, dB;
   139     const unsigned A = info->a;
   140 
   141     while (height--) {
   142 	    /* *INDENT-OFF* */
   143 	    DUFFS_LOOP(
   144 	    {
   145 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   146 		if ( Pixel != ckey ) {
   147 		    dR = dstfmt->palette->colors[*dst].r;
   148 		    dG = dstfmt->palette->colors[*dst].g;
   149 		    dB = dstfmt->palette->colors[*dst].b;
   150 		    ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
   151 		    dR &= 0xff;
   152 		    dG &= 0xff;
   153 		    dB &= 0xff;
   154 		    /* Pack RGB into 8bit pixel */
   155 		    if ( palmap == NULL ) {
   156                 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
   157 		    } else {
   158                 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
   159 		    }
   160 		}
   161 		dst++;
   162 		src += srcbpp;
   163 	    },
   164 	    width);
   165 	    /* *INDENT-ON* */
   166         src += srcskip;
   167         dst += dstskip;
   168     }
   169 }
   170 
   171 #ifdef __MMX__
   172 
   173 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   174 static void
   175 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   176 {
   177     int width = info->dst_w;
   178     int height = info->dst_h;
   179     Uint32 *srcp = (Uint32 *) info->src;
   180     int srcskip = info->src_skip >> 2;
   181     Uint32 *dstp = (Uint32 *) info->dst;
   182     int dstskip = info->dst_skip >> 2;
   183     Uint32 dalpha = info->dst_fmt->Amask;
   184 
   185     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   186 
   187     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   188     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   189     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   190 
   191     while (height--) {
   192         int n = width;
   193         if (n & 1) {
   194             Uint32 s = *srcp++;
   195             Uint32 d = *dstp;
   196             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   197                        + (s & d & 0x00010101)) | dalpha;
   198             n--;
   199         }
   200 
   201         for (n >>= 1; n > 0; --n) {
   202             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   203             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   204 
   205             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   206             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   207 
   208             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   209             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   210             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   211             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   212 
   213             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   214             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   215             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   216             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   217 
   218             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   219             dstp += 2;
   220             srcp += 2;
   221         }
   222 
   223         srcp += srcskip;
   224         dstp += dstskip;
   225     }
   226     _mm_empty();
   227 }
   228 
   229 /* fast RGB888->(A)RGB888 blending with surface alpha */
   230 static void
   231 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   232 {
   233     SDL_PixelFormat *df = info->dst_fmt;
   234     Uint32 chanmask;
   235     unsigned alpha = info->a;
   236 
   237     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   238         /* only call a128 version when R,G,B occupy lower bits */
   239         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   240     } else {
   241         int width = info->dst_w;
   242         int height = info->dst_h;
   243         Uint32 *srcp = (Uint32 *) info->src;
   244         int srcskip = info->src_skip >> 2;
   245         Uint32 *dstp = (Uint32 *) info->dst;
   246         int dstskip = info->dst_skip >> 2;
   247         Uint32 dalpha = df->Amask;
   248         Uint32 amult;
   249 
   250         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   251 
   252         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   253         /* form the alpha mult */
   254         amult = alpha | (alpha << 8);
   255         amult = amult | (amult << 16);
   256         chanmask =
   257             (0xff << df->Rshift) | (0xff << df->
   258                                     Gshift) | (0xff << df->Bshift);
   259         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   260         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   261         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   262         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   263 
   264         while (height--) {
   265             int n = width;
   266             if (n & 1) {
   267                 /* One Pixel Blend */
   268                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   269                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   270 
   271                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   272                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   273 
   274                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
   275                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   276                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   277                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   278 
   279                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   280                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   281                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   282 
   283                 ++srcp;
   284                 ++dstp;
   285 
   286                 n--;
   287             }
   288 
   289             for (n >>= 1; n > 0; --n) {
   290                 /* Two Pixels Blend */
   291                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   292                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   293                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   294                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   295 
   296                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   297                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   298                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   299                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   300 
   301                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   302                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   303                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   304                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   305 
   306                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   307                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   308                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   309                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   310 
   311                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   312                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   313 
   314                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   315 
   316                 srcp += 2;
   317                 dstp += 2;
   318             }
   319             srcp += srcskip;
   320             dstp += dstskip;
   321         }
   322         _mm_empty();
   323     }
   324 }
   325 
   326 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   327 static void
   328 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   329 {
   330     int width = info->dst_w;
   331     int height = info->dst_h;
   332     Uint32 *srcp = (Uint32 *) info->src;
   333     int srcskip = info->src_skip >> 2;
   334     Uint32 *dstp = (Uint32 *) info->dst;
   335     int dstskip = info->dst_skip >> 2;
   336     SDL_PixelFormat *sf = info->src_fmt;
   337     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   338     Uint32 amask = sf->Amask;
   339     Uint32 ashift = sf->Ashift;
   340     Uint64 multmask, multmask2;
   341 
   342     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
   343 
   344     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   345     multmask = 0x00FF;
   346 	multmask <<= (ashift * 2);
   347 	multmask2 = 0x00FF00FF00FF00FF;
   348 
   349     while (height--) {
   350 		/* *INDENT-OFF* */
   351 		DUFFS_LOOP4({
   352 		Uint32 alpha = *srcp & amask;
   353 		if (alpha == 0) {
   354 			/* do nothing */
   355 		} else if (alpha == amask) {
   356 			*dstp = *srcp;
   357 		} else {
   358 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
   359 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   360 
   361 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   362 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   363 
   364 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   365 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   366 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   367 			mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
   368 			mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);	/* 0F0A0A0A -> mm_alpha */
   369 			mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);	/* 255 - mm_alpha -> mm_alpha */
   370 
   371 			/* blend */		    
   372 			src1 = _mm_mullo_pi16(src1, mm_alpha);
   373 			src1 = _mm_srli_pi16(src1, 8);
   374 			dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
   375 			dst1 = _mm_srli_pi16(dst1, 8);
   376 			dst1 = _mm_add_pi16(src1, dst1);
   377 			dst1 = _mm_packs_pu16(dst1, mm_zero);
   378 			
   379 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   380 		}
   381 		++srcp;
   382 		++dstp;
   383 	    }, width);
   384 		/* *INDENT-ON* */
   385         srcp += srcskip;
   386         dstp += dstskip;
   387     }
   388     _mm_empty();
   389 }
   390 
   391 #endif /* __MMX__ */
   392 
   393 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   394 static void
   395 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
   396 {
   397     int width = info->dst_w;
   398     int height = info->dst_h;
   399     Uint32 *srcp = (Uint32 *) info->src;
   400     int srcskip = info->src_skip >> 2;
   401     Uint32 *dstp = (Uint32 *) info->dst;
   402     int dstskip = info->dst_skip >> 2;
   403 
   404     while (height--) {
   405 	    /* *INDENT-OFF* */
   406 	    DUFFS_LOOP4({
   407 		    Uint32 s = *srcp++;
   408 		    Uint32 d = *dstp;
   409 		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   410 			       + (s & d & 0x00010101)) | 0xff000000;
   411 	    }, width);
   412 	    /* *INDENT-ON* */
   413         srcp += srcskip;
   414         dstp += dstskip;
   415     }
   416 }
   417 
   418 /* fast RGB888->(A)RGB888 blending with surface alpha */
   419 static void
   420 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
   421 {
   422     unsigned alpha = info->a;
   423     if (alpha == 128) {
   424         BlitRGBtoRGBSurfaceAlpha128(info);
   425     } else {
   426         int width = info->dst_w;
   427         int height = info->dst_h;
   428         Uint32 *srcp = (Uint32 *) info->src;
   429         int srcskip = info->src_skip >> 2;
   430         Uint32 *dstp = (Uint32 *) info->dst;
   431         int dstskip = info->dst_skip >> 2;
   432         Uint32 s;
   433         Uint32 d;
   434         Uint32 s1;
   435         Uint32 d1;
   436 
   437         while (height--) {
   438 			/* *INDENT-OFF* */
   439 			DUFFS_LOOP4({
   440 				s = *srcp;
   441 				d = *dstp;
   442 				s1 = s & 0xff00ff;
   443 				d1 = d & 0xff00ff;
   444 				d1 = (d1 + ((s1 - d1) * alpha >> 8))
   445 				     & 0xff00ff;
   446 				s &= 0xff00;
   447 				d &= 0xff00;
   448 				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   449 				*dstp = d1 | d | 0xff000000;
   450 				++srcp;
   451 				++dstp;
   452 			}, width);
   453 			/* *INDENT-ON* */
   454             srcp += srcskip;
   455             dstp += dstskip;
   456         }
   457     }
   458 }
   459 
   460 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   461 static void
   462 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
   463 {
   464     int width = info->dst_w;
   465     int height = info->dst_h;
   466     Uint32 *srcp = (Uint32 *) info->src;
   467     int srcskip = info->src_skip >> 2;
   468     Uint32 *dstp = (Uint32 *) info->dst;
   469     int dstskip = info->dst_skip >> 2;
   470 
   471     while (height--) {
   472 	    /* *INDENT-OFF* */
   473 	    DUFFS_LOOP4({
   474 		Uint32 dalpha;
   475 		Uint32 d;
   476 		Uint32 s1;
   477 		Uint32 d1;
   478 		Uint32 s = *srcp;
   479 		Uint32 alpha = s >> 24;
   480 		/* FIXME: Here we special-case opaque alpha since the
   481 		   compositioning used (>>8 instead of /255) doesn't handle
   482 		   it correctly. Also special-case alpha=0 for speed?
   483 		   Benchmark this! */
   484 		if (alpha) {
   485 		  if (alpha == SDL_ALPHA_OPAQUE) {
   486 			  *dstp = *srcp;
   487 		  } else {
   488 		    /*
   489 		     * take out the middle component (green), and process
   490 		     * the other two in parallel. One multiply less.
   491 		     */
   492 		    d = *dstp;
   493 			dalpha = d >> 24;
   494 		    s1 = s & 0xff00ff;
   495 		    d1 = d & 0xff00ff;
   496 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
   497 		    s &= 0xff00;
   498 		    d &= 0xff00;
   499 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   500 			dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
   501 		    *dstp = d1 | d | (dalpha << 24);
   502 		  }
   503 		}
   504 		++srcp;
   505 		++dstp;
   506 	    }, width);
   507 	    /* *INDENT-ON* */
   508         srcp += srcskip;
   509         dstp += dstskip;
   510     }
   511 }
   512 
   513 #ifdef __3dNOW__
   514 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
   515 static void
   516 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
   517 {
   518     int width = info->dst_w;
   519     int height = info->dst_h;
   520     Uint32 *srcp = (Uint32 *) info->src;
   521     int srcskip = info->src_skip >> 2;
   522     Uint32 *dstp = (Uint32 *) info->dst;
   523     int dstskip = info->dst_skip >> 2;
   524     SDL_PixelFormat *sf = info->src_fmt;
   525     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   526     Uint32 amask = sf->Amask;
   527     Uint32 ashift = sf->Ashift;
   528     Uint64 multmask, multmask2;
   529 
   530     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
   531 
   532     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   533     multmask = 0x00FF;
   534     multmask <<= (ashift * 2);
   535     multmask2 = 0x00FF00FF00FF00FF;
   536 
   537     while (height--) {
   538 	    /* *INDENT-OFF* */
   539 	    DUFFS_LOOP4({
   540 		Uint32 alpha;
   541 
   542 		_m_prefetch(srcp + 16);
   543 		_m_prefetch(dstp + 16);
   544 
   545 		alpha = *srcp & amask;
   546 		if (alpha == 0) {
   547 			/* do nothing */
   548 		} else if (alpha == amask) {
   549 			*dstp = *srcp;
   550 		} else {
   551 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
   552 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   553 
   554 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   555 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   556 
   557 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   558 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   559 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   560 			mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
   561 			mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);	/* 0F0A0A0A -> mm_alpha */
   562 			mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);	/* 255 - mm_alpha -> mm_alpha */
   563 
   564 
   565 			/* blend */		    
   566 			src1 = _mm_mullo_pi16(src1, mm_alpha);
   567 			src1 = _mm_srli_pi16(src1, 8);
   568 			dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
   569 			dst1 = _mm_srli_pi16(dst1, 8);
   570 			dst1 = _mm_add_pi16(src1, dst1);
   571 			dst1 = _mm_packs_pu16(dst1, mm_zero);
   572 			
   573 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   574 		}
   575 		++srcp;
   576 		++dstp;
   577 	    }, width);
   578 	    /* *INDENT-ON* */
   579         srcp += srcskip;
   580         dstp += dstskip;
   581     }
   582     _mm_empty();
   583 }
   584 
   585 #endif /* __3dNOW__ */
   586 
   587 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
   588 
   589 /* blend a single 16 bit pixel at 50% */
   590 #define BLEND16_50(d, s, mask)						\
   591 	((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
   592 
   593 /* blend two 16 bit pixels at 50% */
   594 #define BLEND2x16_50(d, s, mask)					     \
   595 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
   596 	 + (s & d & (~(mask | mask << 16))))
   597 
   598 static void
   599 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
   600 {
   601     int width = info->dst_w;
   602     int height = info->dst_h;
   603     Uint16 *srcp = (Uint16 *) info->src;
   604     int srcskip = info->src_skip >> 1;
   605     Uint16 *dstp = (Uint16 *) info->dst;
   606     int dstskip = info->dst_skip >> 1;
   607 
   608     while (height--) {
   609         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
   610             /*
   611              * Source and destination not aligned, pipeline it.
   612              * This is mostly a win for big blits but no loss for
   613              * small ones
   614              */
   615             Uint32 prev_sw;
   616             int w = width;
   617 
   618             /* handle odd destination */
   619             if ((uintptr_t) dstp & 2) {
   620                 Uint16 d = *dstp, s = *srcp;
   621                 *dstp = BLEND16_50(d, s, mask);
   622                 dstp++;
   623                 srcp++;
   624                 w--;
   625             }
   626             srcp++;             /* srcp is now 32-bit aligned */
   627 
   628             /* bootstrap pipeline with first halfword */
   629             prev_sw = ((Uint32 *) srcp)[-1];
   630 
   631             while (w > 1) {
   632                 Uint32 sw, dw, s;
   633                 sw = *(Uint32 *) srcp;
   634                 dw = *(Uint32 *) dstp;
   635 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   636                 s = (prev_sw << 16) + (sw >> 16);
   637 #else
   638                 s = (prev_sw >> 16) + (sw << 16);
   639 #endif
   640                 prev_sw = sw;
   641                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
   642                 dstp += 2;
   643                 srcp += 2;
   644                 w -= 2;
   645             }
   646 
   647             /* final pixel if any */
   648             if (w) {
   649                 Uint16 d = *dstp, s;
   650 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   651                 s = (Uint16) prev_sw;
   652 #else
   653                 s = (Uint16) (prev_sw >> 16);
   654 #endif
   655                 *dstp = BLEND16_50(d, s, mask);
   656                 srcp++;
   657                 dstp++;
   658             }
   659             srcp += srcskip - 1;
   660             dstp += dstskip;
   661         } else {
   662             /* source and destination are aligned */
   663             int w = width;
   664 
   665             /* first odd pixel? */
   666             if ((uintptr_t) srcp & 2) {
   667                 Uint16 d = *dstp, s = *srcp;
   668                 *dstp = BLEND16_50(d, s, mask);
   669                 srcp++;
   670                 dstp++;
   671                 w--;
   672             }
   673             /* srcp and dstp are now 32-bit aligned */
   674 
   675             while (w > 1) {
   676                 Uint32 sw = *(Uint32 *) srcp;
   677                 Uint32 dw = *(Uint32 *) dstp;
   678                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
   679                 srcp += 2;
   680                 dstp += 2;
   681                 w -= 2;
   682             }
   683 
   684             /* last odd pixel? */
   685             if (w) {
   686                 Uint16 d = *dstp, s = *srcp;
   687                 *dstp = BLEND16_50(d, s, mask);
   688                 srcp++;
   689                 dstp++;
   690             }
   691             srcp += srcskip;
   692             dstp += dstskip;
   693         }
   694     }
   695 }
   696 
   697 #ifdef __MMX__
   698 
   699 /* fast RGB565->RGB565 blending with surface alpha */
   700 static void
   701 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
   702 {
   703     unsigned alpha = info->a;
   704     if (alpha == 128) {
   705         Blit16to16SurfaceAlpha128(info, 0xf7de);
   706     } else {
   707         int width = info->dst_w;
   708         int height = info->dst_h;
   709         Uint16 *srcp = (Uint16 *) info->src;
   710         int srcskip = info->src_skip >> 1;
   711         Uint16 *dstp = (Uint16 *) info->dst;
   712         int dstskip = info->dst_skip >> 1;
   713         Uint32 s, d;
   714 
   715         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
   716 
   717         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   718         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
   719         alpha >>= 3;            /* downscale alpha to 5 bits */
   720 
   721         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
   722         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
   723         /* position alpha to allow for mullo and mulhi on diff channels
   724            to reduce the number of operations */
   725         mm_alpha = _mm_slli_si64(mm_alpha, 3);
   726 
   727         /* Setup the 565 color channel masks */
   728         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
   729         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
   730 
   731         while (height--) {
   732 			/* *INDENT-OFF* */
   733 			DUFFS_LOOP_124(
   734 			{
   735 				s = *srcp++;
   736 				d = *dstp;
   737 				/*
   738 				 * shift out the middle component (green) to
   739 				 * the high 16 bits, and process all three RGB
   740 				 * components at the same time.
   741 				 */
   742 				s = (s | s << 16) & 0x07e0f81f;
   743 				d = (d | d << 16) & 0x07e0f81f;
   744 				d += (s - d) * alpha >> 5;
   745 				d &= 0x07e0f81f;
   746 				*dstp++ = (Uint16)(d | d >> 16);
   747 			},{
   748 				s = *srcp++;
   749 				d = *dstp;
   750 				/*
   751 				 * shift out the middle component (green) to
   752 				 * the high 16 bits, and process all three RGB
   753 				 * components at the same time.
   754 				 */
   755 				s = (s | s << 16) & 0x07e0f81f;
   756 				d = (d | d << 16) & 0x07e0f81f;
   757 				d += (s - d) * alpha >> 5;
   758 				d &= 0x07e0f81f;
   759 				*dstp++ = (Uint16)(d | d >> 16);
   760 				s = *srcp++;
   761 				d = *dstp;
   762 				/*
   763 				 * shift out the middle component (green) to
   764 				 * the high 16 bits, and process all three RGB
   765 				 * components at the same time.
   766 				 */
   767 				s = (s | s << 16) & 0x07e0f81f;
   768 				d = (d | d << 16) & 0x07e0f81f;
   769 				d += (s - d) * alpha >> 5;
   770 				d &= 0x07e0f81f;
   771 				*dstp++ = (Uint16)(d | d >> 16);
   772 			},{
   773 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   774 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   775 
   776 				/* red */
   777 				src2 = src1;
   778 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
   779 
   780 				dst2 = dst1;
   781 				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
   782 
   783 				/* blend */
   784 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   785 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   786 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   787 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   788 				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
   789 
   790 				mm_res = dst2; /* RED -> mm_res */
   791 
   792 				/* green -- process the bits in place */
   793 				src2 = src1;
   794 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   795 
   796 				dst2 = dst1;
   797 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   798 
   799 				/* blend */
   800 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   801 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   802 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   803 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   804 
   805 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
   806 
   807 				/* blue */
   808 				src2 = src1;
   809 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
   810 
   811 				dst2 = dst1;
   812 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
   813 
   814 				/* blend */
   815 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   816 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   817 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   818 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   819 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
   820 
   821 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
   822 
   823 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
   824 
   825 				srcp += 4;
   826 				dstp += 4;
   827 			}, width);
   828 			/* *INDENT-ON* */
   829             srcp += srcskip;
   830             dstp += dstskip;
   831         }
   832         _mm_empty();
   833     }
   834 }
   835 
   836 /* fast RGB555->RGB555 blending with surface alpha */
   837 static void
   838 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
   839 {
   840     unsigned alpha = info->a;
   841     if (alpha == 128) {
   842         Blit16to16SurfaceAlpha128(info, 0xfbde);
   843     } else {
   844         int width = info->dst_w;
   845         int height = info->dst_h;
   846         Uint16 *srcp = (Uint16 *) info->src;
   847         int srcskip = info->src_skip >> 1;
   848         Uint16 *dstp = (Uint16 *) info->dst;
   849         int dstskip = info->dst_skip >> 1;
   850         Uint32 s, d;
   851 
   852         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
   853 
   854         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   855         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
   856         alpha >>= 3;            /* downscale alpha to 5 bits */
   857 
   858         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
   859         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
   860         /* position alpha to allow for mullo and mulhi on diff channels
   861            to reduce the number of operations */
   862         mm_alpha = _mm_slli_si64(mm_alpha, 3);
   863 
   864         /* Setup the 555 color channel masks */
   865         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
   866         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
   867         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
   868 
   869         while (height--) {
   870 			/* *INDENT-OFF* */
   871 			DUFFS_LOOP_124(
   872 			{
   873 				s = *srcp++;
   874 				d = *dstp;
   875 				/*
   876 				 * shift out the middle component (green) to
   877 				 * the high 16 bits, and process all three RGB
   878 				 * components at the same time.
   879 				 */
   880 				s = (s | s << 16) & 0x03e07c1f;
   881 				d = (d | d << 16) & 0x03e07c1f;
   882 				d += (s - d) * alpha >> 5;
   883 				d &= 0x03e07c1f;
   884 				*dstp++ = (Uint16)(d | d >> 16);
   885 			},{
   886 				s = *srcp++;
   887 				d = *dstp;
   888 				/*
   889 				 * shift out the middle component (green) to
   890 				 * the high 16 bits, and process all three RGB
   891 				 * components at the same time.
   892 				 */
   893 				s = (s | s << 16) & 0x03e07c1f;
   894 				d = (d | d << 16) & 0x03e07c1f;
   895 				d += (s - d) * alpha >> 5;
   896 				d &= 0x03e07c1f;
   897 				*dstp++ = (Uint16)(d | d >> 16);
   898 			        s = *srcp++;
   899 				d = *dstp;
   900 				/*
   901 				 * shift out the middle component (green) to
   902 				 * the high 16 bits, and process all three RGB
   903 				 * components at the same time.
   904 				 */
   905 				s = (s | s << 16) & 0x03e07c1f;
   906 				d = (d | d << 16) & 0x03e07c1f;
   907 				d += (s - d) * alpha >> 5;
   908 				d &= 0x03e07c1f;
   909 				*dstp++ = (Uint16)(d | d >> 16);
   910 			},{
   911 				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   912 				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   913 
   914 				/* red -- process the bits in place */
   915 				src2 = src1;
   916 				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
   917 
   918 				dst2 = dst1;
   919 				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
   920 
   921 				/* blend */
   922 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   923 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   924 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   925 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   926 				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
   927 
   928 				mm_res = dst2; /* RED -> mm_res */
   929 				
   930 				/* green -- process the bits in place */
   931 				src2 = src1;
   932 				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   933 
   934 				dst2 = dst1;
   935 				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   936 
   937 				/* blend */
   938 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   939 				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   940 				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   941 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   942 
   943 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
   944 
   945 				/* blue */
   946 				src2 = src1; /* src -> src2 */
   947 				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
   948 
   949 				dst2 = dst1; /* dst -> dst2 */
   950 				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
   951 
   952 				/* blend */
   953 				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   954 				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   955 				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   956 				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   957 				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
   958 
   959 				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
   960 
   961 				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
   962 
   963 				srcp += 4;
   964 				dstp += 4;
   965 			}, width);
   966 			/* *INDENT-ON* */
   967             srcp += srcskip;
   968             dstp += dstskip;
   969         }
   970         _mm_empty();
   971     }
   972 }
   973 
   974 #endif /* __MMX__ */
   975 
   976 /* fast RGB565->RGB565 blending with surface alpha */
   977 static void
   978 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
   979 {
   980     unsigned alpha = info->a;
   981     if (alpha == 128) {
   982         Blit16to16SurfaceAlpha128(info, 0xf7de);
   983     } else {
   984         int width = info->dst_w;
   985         int height = info->dst_h;
   986         Uint16 *srcp = (Uint16 *) info->src;
   987         int srcskip = info->src_skip >> 1;
   988         Uint16 *dstp = (Uint16 *) info->dst;
   989         int dstskip = info->dst_skip >> 1;
   990         alpha >>= 3;            /* downscale alpha to 5 bits */
   991 
   992         while (height--) {
   993 			/* *INDENT-OFF* */
   994 			DUFFS_LOOP4({
   995 				Uint32 s = *srcp++;
   996 				Uint32 d = *dstp;
   997 				/*
   998 				 * shift out the middle component (green) to
   999 				 * the high 16 bits, and process all three RGB
  1000 				 * components at the same time.
  1001 				 */
  1002 				s = (s | s << 16) & 0x07e0f81f;
  1003 				d = (d | d << 16) & 0x07e0f81f;
  1004 				d += (s - d) * alpha >> 5;
  1005 				d &= 0x07e0f81f;
  1006 				*dstp++ = (Uint16)(d | d >> 16);
  1007 			}, width);
  1008 			/* *INDENT-ON* */
  1009             srcp += srcskip;
  1010             dstp += dstskip;
  1011         }
  1012     }
  1013 }
  1014 
  1015 /* fast RGB555->RGB555 blending with surface alpha */
  1016 static void
  1017 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  1018 {
  1019     unsigned alpha = info->a;   /* downscale alpha to 5 bits */
  1020     if (alpha == 128) {
  1021         Blit16to16SurfaceAlpha128(info, 0xfbde);
  1022     } else {
  1023         int width = info->dst_w;
  1024         int height = info->dst_h;
  1025         Uint16 *srcp = (Uint16 *) info->src;
  1026         int srcskip = info->src_skip >> 1;
  1027         Uint16 *dstp = (Uint16 *) info->dst;
  1028         int dstskip = info->dst_skip >> 1;
  1029         alpha >>= 3;            /* downscale alpha to 5 bits */
  1030 
  1031         while (height--) {
  1032 			/* *INDENT-OFF* */
  1033 			DUFFS_LOOP4({
  1034 				Uint32 s = *srcp++;
  1035 				Uint32 d = *dstp;
  1036 				/*
  1037 				 * shift out the middle component (green) to
  1038 				 * the high 16 bits, and process all three RGB
  1039 				 * components at the same time.
  1040 				 */
  1041 				s = (s | s << 16) & 0x03e07c1f;
  1042 				d = (d | d << 16) & 0x03e07c1f;
  1043 				d += (s - d) * alpha >> 5;
  1044 				d &= 0x03e07c1f;
  1045 				*dstp++ = (Uint16)(d | d >> 16);
  1046 			}, width);
  1047 			/* *INDENT-ON* */
  1048             srcp += srcskip;
  1049             dstp += dstskip;
  1050         }
  1051     }
  1052 }
  1053 
  1054 /* fast ARGB8888->RGB565 blending with pixel alpha */
  1055 static void
  1056 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  1057 {
  1058     int width = info->dst_w;
  1059     int height = info->dst_h;
  1060     Uint32 *srcp = (Uint32 *) info->src;
  1061     int srcskip = info->src_skip >> 2;
  1062     Uint16 *dstp = (Uint16 *) info->dst;
  1063     int dstskip = info->dst_skip >> 1;
  1064 
  1065     while (height--) {
  1066 	    /* *INDENT-OFF* */
  1067 	    DUFFS_LOOP4({
  1068 		Uint32 s = *srcp;
  1069 		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  1070 		/* FIXME: Here we special-case opaque alpha since the
  1071 		   compositioning used (>>8 instead of /255) doesn't handle
  1072 		   it correctly. Also special-case alpha=0 for speed?
  1073 		   Benchmark this! */
  1074 		if(alpha) {   
  1075 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1076 		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  1077 		  } else {
  1078 		    Uint32 d = *dstp;
  1079 		    /*
  1080 		     * convert source and destination to G0RAB65565
  1081 		     * and blend all components at the same time
  1082 		     */
  1083 		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  1084 		      + (s >> 3 & 0x1f);
  1085 		    d = (d | d << 16) & 0x07e0f81f;
  1086 		    d += (s - d) * alpha >> 5;
  1087 		    d &= 0x07e0f81f;
  1088 		    *dstp = (Uint16)(d | d >> 16);
  1089 		  }
  1090 		}
  1091 		srcp++;
  1092 		dstp++;
  1093 	    }, width);
  1094 	    /* *INDENT-ON* */
  1095         srcp += srcskip;
  1096         dstp += dstskip;
  1097     }
  1098 }
  1099 
  1100 /* fast ARGB8888->RGB555 blending with pixel alpha */
  1101 static void
  1102 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  1103 {
  1104     int width = info->dst_w;
  1105     int height = info->dst_h;
  1106     Uint32 *srcp = (Uint32 *) info->src;
  1107     int srcskip = info->src_skip >> 2;
  1108     Uint16 *dstp = (Uint16 *) info->dst;
  1109     int dstskip = info->dst_skip >> 1;
  1110 
  1111     while (height--) {
  1112 	    /* *INDENT-OFF* */
  1113 	    DUFFS_LOOP4({
  1114 		unsigned alpha;
  1115 		Uint32 s = *srcp;
  1116 		alpha = s >> 27; /* downscale alpha to 5 bits */
  1117 		/* FIXME: Here we special-case opaque alpha since the
  1118 		   compositioning used (>>8 instead of /255) doesn't handle
  1119 		   it correctly. Also special-case alpha=0 for speed?
  1120 		   Benchmark this! */
  1121 		if(alpha) {   
  1122 		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1123 		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  1124 		  } else {
  1125 		    Uint32 d = *dstp;
  1126 		    /*
  1127 		     * convert source and destination to G0RAB65565
  1128 		     * and blend all components at the same time
  1129 		     */
  1130 		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  1131 		      + (s >> 3 & 0x1f);
  1132 		    d = (d | d << 16) & 0x03e07c1f;
  1133 		    d += (s - d) * alpha >> 5;
  1134 		    d &= 0x03e07c1f;
  1135 		    *dstp = (Uint16)(d | d >> 16);
  1136 		  }
  1137 		}
  1138 		srcp++;
  1139 		dstp++;
  1140 	    }, width);
  1141 	    /* *INDENT-ON* */
  1142         srcp += srcskip;
  1143         dstp += dstskip;
  1144     }
  1145 }
  1146 
  1147 /* General (slow) N->N blending with per-surface alpha */
  1148 static void
  1149 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  1150 {
  1151     int width = info->dst_w;
  1152     int height = info->dst_h;
  1153     Uint8 *src = info->src;
  1154     int srcskip = info->src_skip;
  1155     Uint8 *dst = info->dst;
  1156     int dstskip = info->dst_skip;
  1157     SDL_PixelFormat *srcfmt = info->src_fmt;
  1158     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1159     int srcbpp = srcfmt->BytesPerPixel;
  1160     int dstbpp = dstfmt->BytesPerPixel;
  1161     Uint32 Pixel;
  1162     unsigned sR, sG, sB;
  1163     unsigned dR, dG, dB, dA;
  1164     const unsigned sA = info->a;
  1165 
  1166     if (sA) {
  1167         while (height--) {
  1168 	    /* *INDENT-OFF* */
  1169 	    DUFFS_LOOP4(
  1170 	    {
  1171 		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  1172 		DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1173 		ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1174 		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1175 		src += srcbpp;
  1176 		dst += dstbpp;
  1177 	    },
  1178 	    width);
  1179 	    /* *INDENT-ON* */
  1180             src += srcskip;
  1181             dst += dstskip;
  1182         }
  1183     }
  1184 }
  1185 
  1186 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  1187 static void
  1188 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  1189 {
  1190     int width = info->dst_w;
  1191     int height = info->dst_h;
  1192     Uint8 *src = info->src;
  1193     int srcskip = info->src_skip;
  1194     Uint8 *dst = info->dst;
  1195     int dstskip = info->dst_skip;
  1196     SDL_PixelFormat *srcfmt = info->src_fmt;
  1197     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1198     Uint32 ckey = info->colorkey;
  1199     int srcbpp = srcfmt->BytesPerPixel;
  1200     int dstbpp = dstfmt->BytesPerPixel;
  1201     Uint32 Pixel;
  1202     unsigned sR, sG, sB;
  1203     unsigned dR, dG, dB, dA;
  1204     const unsigned sA = info->a;
  1205 
  1206     while (height--) {
  1207 	    /* *INDENT-OFF* */
  1208 	    DUFFS_LOOP4(
  1209 	    {
  1210 		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  1211 		if(sA && Pixel != ckey) {
  1212 		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  1213 		    DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1214 		    ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1215 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1216 		}
  1217 		src += srcbpp;
  1218 		dst += dstbpp;
  1219 	    },
  1220 	    width);
  1221 	    /* *INDENT-ON* */
  1222         src += srcskip;
  1223         dst += dstskip;
  1224     }
  1225 }
  1226 
  1227 /* General (slow) N->N blending with pixel alpha */
  1228 static void
  1229 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  1230 {
  1231     int width = info->dst_w;
  1232     int height = info->dst_h;
  1233     Uint8 *src = info->src;
  1234     int srcskip = info->src_skip;
  1235     Uint8 *dst = info->dst;
  1236     int dstskip = info->dst_skip;
  1237     SDL_PixelFormat *srcfmt = info->src_fmt;
  1238     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1239     int srcbpp;
  1240     int dstbpp;
  1241     Uint32 Pixel;
  1242     unsigned sR, sG, sB, sA;
  1243     unsigned dR, dG, dB, dA;
  1244 
  1245     /* Set up some basic variables */
  1246     srcbpp = srcfmt->BytesPerPixel;
  1247     dstbpp = dstfmt->BytesPerPixel;
  1248 
  1249     while (height--) {
  1250 	    /* *INDENT-OFF* */
  1251 	    DUFFS_LOOP4(
  1252 	    {
  1253 		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  1254 		if(sA) {
  1255 		    DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1256 		    ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1257 		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1258 		}
  1259 		src += srcbpp;
  1260 		dst += dstbpp;
  1261 	    },
  1262 	    width);
  1263 	    /* *INDENT-ON* */
  1264         src += srcskip;
  1265         dst += dstskip;
  1266     }
  1267 }
  1268 
  1269 
  1270 SDL_BlitFunc
  1271 SDL_CalculateBlitA(SDL_Surface * surface)
  1272 {
  1273     SDL_PixelFormat *sf = surface->format;
  1274     SDL_PixelFormat *df = surface->map->dst->format;
  1275 
  1276     switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
  1277     case SDL_COPY_BLEND:
  1278         /* Per-pixel alpha blits */
  1279         switch (df->BytesPerPixel) {
  1280         case 1:
  1281             return BlitNto1PixelAlpha;
  1282 
  1283         case 2:
  1284                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  1285                     && sf->Gmask == 0xff00
  1286                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  1287                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  1288                 if (df->Gmask == 0x7e0)
  1289                     return BlitARGBto565PixelAlpha;
  1290                 else if (df->Gmask == 0x3e0)
  1291                     return BlitARGBto555PixelAlpha;
  1292             }
  1293             return BlitNtoNPixelAlpha;
  1294 
  1295         case 4:
  1296             if (sf->Rmask == df->Rmask
  1297                 && sf->Gmask == df->Gmask
  1298                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  1299 #if defined(__MMX__) || defined(__3dNOW__)
  1300                 if (sf->Rshift % 8 == 0
  1301                     && sf->Gshift % 8 == 0
  1302                     && sf->Bshift % 8 == 0
  1303                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  1304 #ifdef __3dNOW__
  1305                     if (SDL_Has3DNow())
  1306                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  1307 #endif
  1308 #ifdef __MMX__
  1309                     if (SDL_HasMMX())
  1310                         return BlitRGBtoRGBPixelAlphaMMX;
  1311 #endif
  1312                 }
  1313 #endif /* __MMX__ || __3dNOW__ */
  1314                 if (sf->Amask == 0xff000000) {
  1315                     return BlitRGBtoRGBPixelAlpha;
  1316                 }
  1317             }
  1318             return BlitNtoNPixelAlpha;
  1319 
  1320         case 3:
  1321         default:
  1322             return BlitNtoNPixelAlpha;
  1323         }
  1324         break;
  1325 
  1326     case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  1327         if (sf->Amask == 0) {
  1328             /* Per-surface alpha blits */
  1329             switch (df->BytesPerPixel) {
  1330             case 1:
  1331                 return BlitNto1SurfaceAlpha;
  1332 
  1333             case 2:
  1334                 if (surface->map->identity) {
  1335                     if (df->Gmask == 0x7e0) {
  1336 #ifdef __MMX__
  1337                         if (SDL_HasMMX())
  1338                             return Blit565to565SurfaceAlphaMMX;
  1339                         else
  1340 #endif
  1341                             return Blit565to565SurfaceAlpha;
  1342                     } else if (df->Gmask == 0x3e0) {
  1343 #ifdef __MMX__
  1344                         if (SDL_HasMMX())
  1345                             return Blit555to555SurfaceAlphaMMX;
  1346                         else
  1347 #endif
  1348                             return Blit555to555SurfaceAlpha;
  1349                     }
  1350                 }
  1351                 return BlitNtoNSurfaceAlpha;
  1352 
  1353             case 4:
  1354                 if (sf->Rmask == df->Rmask
  1355                     && sf->Gmask == df->Gmask
  1356                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  1357 #ifdef __MMX__
  1358                     if (sf->Rshift % 8 == 0
  1359                         && sf->Gshift % 8 == 0
  1360                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  1361                         return BlitRGBtoRGBSurfaceAlphaMMX;
  1362 #endif
  1363                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  1364                         return BlitRGBtoRGBSurfaceAlpha;
  1365                     }
  1366                 }
  1367                 return BlitNtoNSurfaceAlpha;
  1368 
  1369             case 3:
  1370             default:
  1371                 return BlitNtoNSurfaceAlpha;
  1372             }
  1373         }
  1374         break;
  1375 
  1376     case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  1377         if (sf->Amask == 0) {
  1378             if (df->BytesPerPixel == 1) {
  1379                 return BlitNto1SurfaceAlphaKey;
  1380             } else {
  1381                 return BlitNtoNSurfaceAlphaKey;
  1382             }
  1383         }
  1384         break;
  1385     }
  1386 
  1387     return NULL;
  1388 }
  1389 
  1390 /* vi: set ts=4 sw=4 expandtab: */