src/video/SDL_blit_A.c
author Sam Lantinga <slouken@libsdl.org>
Thu, 12 Dec 2019 19:07:26 -0800
changeset 13347 99ecd178999f
parent 13182 1d8fafac75cc
permissions -rw-r--r--
Fixed binding D-pad on NES30 controller
     1 /*
     2   Simple DirectMedia Layer
     3   Copyright (C) 1997-2019 Sam Lantinga <slouken@libsdl.org>
     4 
     5   This software is provided 'as-is', without any express or implied
     6   warranty.  In no event will the authors be held liable for any damages
     7   arising from the use of this software.
     8 
     9   Permission is granted to anyone to use this software for any purpose,
    10   including commercial applications, and to alter it and redistribute it
    11   freely, subject to the following restrictions:
    12 
    13   1. The origin of this software must not be misrepresented; you must not
    14      claim that you wrote the original software. If you use this software
    15      in a product, an acknowledgment in the product documentation would be
    16      appreciated but is not required.
    17   2. Altered source versions must be plainly marked as such, and must not be
    18      misrepresented as being the original software.
    19   3. This notice may not be removed or altered from any source distribution.
    20 */
    21 #include "../SDL_internal.h"
    22 
    23 #include "SDL_video.h"
    24 #include "SDL_blit.h"
    25 
    26 /* Functions to perform alpha blended blitting */
    27 
    28 /* N->1 blending with per-surface alpha */
    29 static void
    30 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    31 {
    32     int width = info->dst_w;
    33     int height = info->dst_h;
    34     Uint8 *src = info->src;
    35     int srcskip = info->src_skip;
    36     Uint8 *dst = info->dst;
    37     int dstskip = info->dst_skip;
    38     Uint8 *palmap = info->table;
    39     SDL_PixelFormat *srcfmt = info->src_fmt;
    40     SDL_PixelFormat *dstfmt = info->dst_fmt;
    41     int srcbpp = srcfmt->BytesPerPixel;
    42     Uint32 Pixel;
    43     unsigned sR, sG, sB;
    44     unsigned dR, dG, dB;
    45     const unsigned A = info->a;
    46 
    47     while (height--) {
    48         /* *INDENT-OFF* */
    49         DUFFS_LOOP4(
    50         {
    51         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    52         dR = dstfmt->palette->colors[*dst].r;
    53         dG = dstfmt->palette->colors[*dst].g;
    54         dB = dstfmt->palette->colors[*dst].b;
    55         ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
    56         dR &= 0xff;
    57         dG &= 0xff;
    58         dB &= 0xff;
    59         /* Pack RGB into 8bit pixel */
    60         if ( palmap == NULL ) {
    61             *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
    62         } else {
    63             *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
    64         }
    65         dst++;
    66         src += srcbpp;
    67         },
    68         width);
    69         /* *INDENT-ON* */
    70         src += srcskip;
    71         dst += dstskip;
    72     }
    73 }
    74 
    75 /* N->1 blending with pixel alpha */
    76 static void
    77 BlitNto1PixelAlpha(SDL_BlitInfo * info)
    78 {
    79     int width = info->dst_w;
    80     int height = info->dst_h;
    81     Uint8 *src = info->src;
    82     int srcskip = info->src_skip;
    83     Uint8 *dst = info->dst;
    84     int dstskip = info->dst_skip;
    85     Uint8 *palmap = info->table;
    86     SDL_PixelFormat *srcfmt = info->src_fmt;
    87     SDL_PixelFormat *dstfmt = info->dst_fmt;
    88     int srcbpp = srcfmt->BytesPerPixel;
    89     Uint32 Pixel;
    90     unsigned sR, sG, sB, sA;
    91     unsigned dR, dG, dB;
    92 
    93     while (height--) {
    94         /* *INDENT-OFF* */
    95         DUFFS_LOOP4(
    96         {
    97         DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
    98         dR = dstfmt->palette->colors[*dst].r;
    99         dG = dstfmt->palette->colors[*dst].g;
   100         dB = dstfmt->palette->colors[*dst].b;
   101         ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
   102         dR &= 0xff;
   103         dG &= 0xff;
   104         dB &= 0xff;
   105         /* Pack RGB into 8bit pixel */
   106         if ( palmap == NULL ) {
   107             *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
   108         } else {
   109             *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
   110         }
   111         dst++;
   112         src += srcbpp;
   113         },
   114         width);
   115         /* *INDENT-ON* */
   116         src += srcskip;
   117         dst += dstskip;
   118     }
   119 }
   120 
   121 /* colorkeyed N->1 blending with per-surface alpha */
   122 static void
   123 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   124 {
   125     int width = info->dst_w;
   126     int height = info->dst_h;
   127     Uint8 *src = info->src;
   128     int srcskip = info->src_skip;
   129     Uint8 *dst = info->dst;
   130     int dstskip = info->dst_skip;
   131     Uint8 *palmap = info->table;
   132     SDL_PixelFormat *srcfmt = info->src_fmt;
   133     SDL_PixelFormat *dstfmt = info->dst_fmt;
   134     int srcbpp = srcfmt->BytesPerPixel;
   135     Uint32 ckey = info->colorkey;
   136     Uint32 Pixel;
   137     unsigned sR, sG, sB;
   138     unsigned dR, dG, dB;
   139     const unsigned A = info->a;
   140 
   141     while (height--) {
   142         /* *INDENT-OFF* */
   143         DUFFS_LOOP(
   144         {
   145         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   146         if ( Pixel != ckey ) {
   147             dR = dstfmt->palette->colors[*dst].r;
   148             dG = dstfmt->palette->colors[*dst].g;
   149             dB = dstfmt->palette->colors[*dst].b;
   150             ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
   151             dR &= 0xff;
   152             dG &= 0xff;
   153             dB &= 0xff;
   154             /* Pack RGB into 8bit pixel */
   155             if ( palmap == NULL ) {
   156                 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
   157             } else {
   158                 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
   159             }
   160         }
   161         dst++;
   162         src += srcbpp;
   163         },
   164         width);
   165         /* *INDENT-ON* */
   166         src += srcskip;
   167         dst += dstskip;
   168     }
   169 }
   170 
   171 #ifdef __MMX__
   172 
   173 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   174 static void
   175 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   176 {
   177     int width = info->dst_w;
   178     int height = info->dst_h;
   179     Uint32 *srcp = (Uint32 *) info->src;
   180     int srcskip = info->src_skip >> 2;
   181     Uint32 *dstp = (Uint32 *) info->dst;
   182     int dstskip = info->dst_skip >> 2;
   183     Uint32 dalpha = info->dst_fmt->Amask;
   184 
   185     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   186 
   187     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   188     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   189     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   190 
   191     while (height--) {
   192         int n = width;
   193         if (n & 1) {
   194             Uint32 s = *srcp++;
   195             Uint32 d = *dstp;
   196             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   197                        + (s & d & 0x00010101)) | dalpha;
   198             n--;
   199         }
   200 
   201         for (n >>= 1; n > 0; --n) {
   202             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   203             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   204 
   205             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   206             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   207 
   208             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   209             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   210             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   211             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   212 
   213             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   214             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   215             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   216             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   217 
   218             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   219             dstp += 2;
   220             srcp += 2;
   221         }
   222 
   223         srcp += srcskip;
   224         dstp += dstskip;
   225     }
   226     _mm_empty();
   227 }
   228 
   229 /* fast RGB888->(A)RGB888 blending with surface alpha */
   230 static void
   231 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   232 {
   233     SDL_PixelFormat *df = info->dst_fmt;
   234     Uint32 chanmask;
   235     unsigned alpha = info->a;
   236 
   237     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   238         /* only call a128 version when R,G,B occupy lower bits */
   239         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   240     } else {
   241         int width = info->dst_w;
   242         int height = info->dst_h;
   243         Uint32 *srcp = (Uint32 *) info->src;
   244         int srcskip = info->src_skip >> 2;
   245         Uint32 *dstp = (Uint32 *) info->dst;
   246         int dstskip = info->dst_skip >> 2;
   247         Uint32 dalpha = df->Amask;
   248         Uint32 amult;
   249 
   250         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   251 
   252         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   253         /* form the alpha mult */
   254         amult = alpha | (alpha << 8);
   255         amult = amult | (amult << 16);
   256         chanmask =
   257             (0xff << df->Rshift) | (0xff << df->
   258                                     Gshift) | (0xff << df->Bshift);
   259         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   260         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   261         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   262         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   263 
   264         while (height--) {
   265             int n = width;
   266             if (n & 1) {
   267                 /* One Pixel Blend */
   268                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   269                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   270 
   271                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   272                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   273 
   274                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
   275                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   276                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   277                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   278 
   279                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   280                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   281                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   282 
   283                 ++srcp;
   284                 ++dstp;
   285 
   286                 n--;
   287             }
   288 
   289             for (n >>= 1; n > 0; --n) {
   290                 /* Two Pixels Blend */
   291                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   292                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   293                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   294                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   295 
   296                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   297                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   298                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   299                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   300 
   301                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   302                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   303                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   304                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   305 
   306                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   307                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   308                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   309                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   310 
   311                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   312                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   313 
   314                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   315 
   316                 srcp += 2;
   317                 dstp += 2;
   318             }
   319             srcp += srcskip;
   320             dstp += dstskip;
   321         }
   322         _mm_empty();
   323     }
   324 }
   325 
   326 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   327 static void
   328 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   329 {
   330     int width = info->dst_w;
   331     int height = info->dst_h;
   332     Uint32 *srcp = (Uint32 *) info->src;
   333     int srcskip = info->src_skip >> 2;
   334     Uint32 *dstp = (Uint32 *) info->dst;
   335     int dstskip = info->dst_skip >> 2;
   336     SDL_PixelFormat *sf = info->src_fmt;
   337     Uint32 amask = sf->Amask;
   338     Uint32 ashift = sf->Ashift;
   339     Uint64 multmask, multmask2;
   340 
   341     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
   342 
   343     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   344     multmask = 0x00FF;
   345     multmask <<= (ashift * 2);
   346     multmask2 = 0x00FF00FF00FF00FFULL;
   347 
   348     while (height--) {
   349         /* *INDENT-OFF* */
   350         DUFFS_LOOP4({
   351         Uint32 alpha = *srcp & amask;
   352         if (alpha == 0) {
   353             /* do nothing */
   354         } else if (alpha == amask) {
   355             *dstp = *srcp;
   356         } else {
   357             src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
   358             src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   359 
   360             dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   361             dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   362 
   363             mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   364             mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   365             mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   366             mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
   367             mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha */
   368             mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
   369 
   370             /* blend */            
   371             src1 = _mm_mullo_pi16(src1, mm_alpha);
   372             src1 = _mm_srli_pi16(src1, 8);
   373             dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
   374             dst1 = _mm_srli_pi16(dst1, 8);
   375             dst1 = _mm_add_pi16(src1, dst1);
   376             dst1 = _mm_packs_pu16(dst1, mm_zero);
   377             
   378             *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   379         }
   380         ++srcp;
   381         ++dstp;
   382         }, width);
   383         /* *INDENT-ON* */
   384         srcp += srcskip;
   385         dstp += dstskip;
   386     }
   387     _mm_empty();
   388 }
   389 
   390 #endif /* __MMX__ */
   391 
   392 #if SDL_ARM_SIMD_BLITTERS
   393 void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
   394 
   395 static void
   396 BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info)
   397 {
   398 	int32_t width = info->dst_w;
   399 	int32_t height = info->dst_h;
   400 	uint16_t *dstp = (uint16_t *)info->dst;
   401 	int32_t dststride = width + (info->dst_skip >> 1);
   402 	uint32_t *srcp = (uint32_t *)info->src;
   403 	int32_t srcstride = width + (info->src_skip >> 2);
   404 
   405 	BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
   406 }
   407 
   408 void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
   409 
   410 static void
   411 BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
   412 {
   413     int32_t width = info->dst_w;
   414     int32_t height = info->dst_h;
   415     uint32_t *dstp = (uint32_t *)info->dst;
   416     int32_t dststride = width + (info->dst_skip >> 2);
   417     uint32_t *srcp = (uint32_t *)info->src;
   418     int32_t srcstride = width + (info->src_skip >> 2);
   419 
   420     BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
   421 }
   422 #endif
   423 
   424 #if SDL_ARM_NEON_BLITTERS
   425 void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
   426 
   427 static void
   428 BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info)
   429 {
   430     int32_t width = info->dst_w;
   431     int32_t height = info->dst_h;
   432     uint16_t *dstp = (uint16_t *)info->dst;
   433     int32_t dststride = width + (info->dst_skip >> 1);
   434     uint32_t *srcp = (uint32_t *)info->src;
   435     int32_t srcstride = width + (info->src_skip >> 2);
   436 
   437     BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
   438 }
   439 
   440 void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
   441 
   442 static void
   443 BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo * info)
   444 {
   445 	int32_t width = info->dst_w;
   446 	int32_t height = info->dst_h;
   447 	uint32_t *dstp = (uint32_t *)info->dst;
   448 	int32_t dststride = width + (info->dst_skip >> 2);
   449 	uint32_t *srcp = (uint32_t *)info->src;
   450 	int32_t srcstride = width + (info->src_skip >> 2);
   451 
   452 	BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
   453 }
   454 #endif
   455 
   456 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   457 static void
   458 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
   459 {
   460     int width = info->dst_w;
   461     int height = info->dst_h;
   462     Uint32 *srcp = (Uint32 *) info->src;
   463     int srcskip = info->src_skip >> 2;
   464     Uint32 *dstp = (Uint32 *) info->dst;
   465     int dstskip = info->dst_skip >> 2;
   466 
   467     while (height--) {
   468         /* *INDENT-OFF* */
   469         DUFFS_LOOP4({
   470             Uint32 s = *srcp++;
   471             Uint32 d = *dstp;
   472             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   473                    + (s & d & 0x00010101)) | 0xff000000;
   474         }, width);
   475         /* *INDENT-ON* */
   476         srcp += srcskip;
   477         dstp += dstskip;
   478     }
   479 }
   480 
   481 /* fast RGB888->(A)RGB888 blending with surface alpha */
   482 static void
   483 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
   484 {
   485     unsigned alpha = info->a;
   486     if (alpha == 128) {
   487         BlitRGBtoRGBSurfaceAlpha128(info);
   488     } else {
   489         int width = info->dst_w;
   490         int height = info->dst_h;
   491         Uint32 *srcp = (Uint32 *) info->src;
   492         int srcskip = info->src_skip >> 2;
   493         Uint32 *dstp = (Uint32 *) info->dst;
   494         int dstskip = info->dst_skip >> 2;
   495         Uint32 s;
   496         Uint32 d;
   497         Uint32 s1;
   498         Uint32 d1;
   499 
   500         while (height--) {
   501             /* *INDENT-OFF* */
   502             DUFFS_LOOP4({
   503                 s = *srcp;
   504                 d = *dstp;
   505                 s1 = s & 0xff00ff;
   506                 d1 = d & 0xff00ff;
   507                 d1 = (d1 + ((s1 - d1) * alpha >> 8))
   508                      & 0xff00ff;
   509                 s &= 0xff00;
   510                 d &= 0xff00;
   511                 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   512                 *dstp = d1 | d | 0xff000000;
   513                 ++srcp;
   514                 ++dstp;
   515             }, width);
   516             /* *INDENT-ON* */
   517             srcp += srcskip;
   518             dstp += dstskip;
   519         }
   520     }
   521 }
   522 
   523 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   524 static void
   525 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
   526 {
   527     int width = info->dst_w;
   528     int height = info->dst_h;
   529     Uint32 *srcp = (Uint32 *) info->src;
   530     int srcskip = info->src_skip >> 2;
   531     Uint32 *dstp = (Uint32 *) info->dst;
   532     int dstskip = info->dst_skip >> 2;
   533 
   534     while (height--) {
   535         /* *INDENT-OFF* */
   536         DUFFS_LOOP4({
   537         Uint32 dalpha;
   538         Uint32 d;
   539         Uint32 s1;
   540         Uint32 d1;
   541         Uint32 s = *srcp;
   542         Uint32 alpha = s >> 24;
   543         /* FIXME: Here we special-case opaque alpha since the
   544            compositioning used (>>8 instead of /255) doesn't handle
   545            it correctly. Also special-case alpha=0 for speed?
   546            Benchmark this! */
   547         if (alpha) {
   548           if (alpha == SDL_ALPHA_OPAQUE) {
   549               *dstp = *srcp;
   550           } else {
   551             /*
   552              * take out the middle component (green), and process
   553              * the other two in parallel. One multiply less.
   554              */
   555             d = *dstp;
   556             dalpha = d >> 24;
   557             s1 = s & 0xff00ff;
   558             d1 = d & 0xff00ff;
   559             d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
   560             s &= 0xff00;
   561             d &= 0xff00;
   562             d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   563             dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
   564             *dstp = d1 | d | (dalpha << 24);
   565           }
   566         }
   567         ++srcp;
   568         ++dstp;
   569         }, width);
   570         /* *INDENT-ON* */
   571         srcp += srcskip;
   572         dstp += dstskip;
   573     }
   574 }
   575 
   576 #ifdef __3dNOW__
   577 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
   578 static void
   579 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
   580 {
   581     int width = info->dst_w;
   582     int height = info->dst_h;
   583     Uint32 *srcp = (Uint32 *) info->src;
   584     int srcskip = info->src_skip >> 2;
   585     Uint32 *dstp = (Uint32 *) info->dst;
   586     int dstskip = info->dst_skip >> 2;
   587     SDL_PixelFormat *sf = info->src_fmt;
   588     Uint32 amask = sf->Amask;
   589     Uint32 ashift = sf->Ashift;
   590     Uint64 multmask, multmask2;
   591 
   592     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
   593 
   594     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   595     multmask = 0x00FF;
   596     multmask <<= (ashift * 2);
   597     multmask2 = 0x00FF00FF00FF00FFULL;
   598 
   599     while (height--) {
   600         /* *INDENT-OFF* */
   601         DUFFS_LOOP4({
   602         Uint32 alpha;
   603 
   604         _m_prefetch(srcp + 16);
   605         _m_prefetch(dstp + 16);
   606 
   607         alpha = *srcp & amask;
   608         if (alpha == 0) {
   609             /* do nothing */
   610         } else if (alpha == amask) {
   611             *dstp = *srcp;
   612         } else {
   613             src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
   614             src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   615 
   616             dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   617             dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   618 
   619             mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   620             mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   621             mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   622             mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
   623             mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha */
   624             mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
   625 
   626 
   627             /* blend */            
   628             src1 = _mm_mullo_pi16(src1, mm_alpha);
   629             src1 = _mm_srli_pi16(src1, 8);
   630             dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
   631             dst1 = _mm_srli_pi16(dst1, 8);
   632             dst1 = _mm_add_pi16(src1, dst1);
   633             dst1 = _mm_packs_pu16(dst1, mm_zero);
   634             
   635             *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   636         }
   637         ++srcp;
   638         ++dstp;
   639         }, width);
   640         /* *INDENT-ON* */
   641         srcp += srcskip;
   642         dstp += dstskip;
   643     }
   644     _mm_empty();
   645 }
   646 
   647 #endif /* __3dNOW__ */
   648 
   649 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
   650 
   651 /* blend a single 16 bit pixel at 50% */
   652 #define BLEND16_50(d, s, mask)                        \
   653     ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
   654 
   655 /* blend two 16 bit pixels at 50% */
   656 #define BLEND2x16_50(d, s, mask)                         \
   657     (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
   658      + (s & d & (~(mask | mask << 16))))
   659 
   660 static void
   661 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
   662 {
   663     int width = info->dst_w;
   664     int height = info->dst_h;
   665     Uint16 *srcp = (Uint16 *) info->src;
   666     int srcskip = info->src_skip >> 1;
   667     Uint16 *dstp = (Uint16 *) info->dst;
   668     int dstskip = info->dst_skip >> 1;
   669 
   670     while (height--) {
   671         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
   672             /*
   673              * Source and destination not aligned, pipeline it.
   674              * This is mostly a win for big blits but no loss for
   675              * small ones
   676              */
   677             Uint32 prev_sw;
   678             int w = width;
   679 
   680             /* handle odd destination */
   681             if ((uintptr_t) dstp & 2) {
   682                 Uint16 d = *dstp, s = *srcp;
   683                 *dstp = BLEND16_50(d, s, mask);
   684                 dstp++;
   685                 srcp++;
   686                 w--;
   687             }
   688             srcp++;             /* srcp is now 32-bit aligned */
   689 
   690             /* bootstrap pipeline with first halfword */
   691             prev_sw = ((Uint32 *) srcp)[-1];
   692 
   693             while (w > 1) {
   694                 Uint32 sw, dw, s;
   695                 sw = *(Uint32 *) srcp;
   696                 dw = *(Uint32 *) dstp;
   697 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   698                 s = (prev_sw << 16) + (sw >> 16);
   699 #else
   700                 s = (prev_sw >> 16) + (sw << 16);
   701 #endif
   702                 prev_sw = sw;
   703                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
   704                 dstp += 2;
   705                 srcp += 2;
   706                 w -= 2;
   707             }
   708 
   709             /* final pixel if any */
   710             if (w) {
   711                 Uint16 d = *dstp, s;
   712 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   713                 s = (Uint16) prev_sw;
   714 #else
   715                 s = (Uint16) (prev_sw >> 16);
   716 #endif
   717                 *dstp = BLEND16_50(d, s, mask);
   718                 srcp++;
   719                 dstp++;
   720             }
   721             srcp += srcskip - 1;
   722             dstp += dstskip;
   723         } else {
   724             /* source and destination are aligned */
   725             int w = width;
   726 
   727             /* first odd pixel? */
   728             if ((uintptr_t) srcp & 2) {
   729                 Uint16 d = *dstp, s = *srcp;
   730                 *dstp = BLEND16_50(d, s, mask);
   731                 srcp++;
   732                 dstp++;
   733                 w--;
   734             }
   735             /* srcp and dstp are now 32-bit aligned */
   736 
   737             while (w > 1) {
   738                 Uint32 sw = *(Uint32 *) srcp;
   739                 Uint32 dw = *(Uint32 *) dstp;
   740                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
   741                 srcp += 2;
   742                 dstp += 2;
   743                 w -= 2;
   744             }
   745 
   746             /* last odd pixel? */
   747             if (w) {
   748                 Uint16 d = *dstp, s = *srcp;
   749                 *dstp = BLEND16_50(d, s, mask);
   750                 srcp++;
   751                 dstp++;
   752             }
   753             srcp += srcskip;
   754             dstp += dstskip;
   755         }
   756     }
   757 }
   758 
   759 #ifdef __MMX__
   760 
   761 /* fast RGB565->RGB565 blending with surface alpha */
   762 static void
   763 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
   764 {
   765     unsigned alpha = info->a;
   766     if (alpha == 128) {
   767         Blit16to16SurfaceAlpha128(info, 0xf7de);
   768     } else {
   769         int width = info->dst_w;
   770         int height = info->dst_h;
   771         Uint16 *srcp = (Uint16 *) info->src;
   772         int srcskip = info->src_skip >> 1;
   773         Uint16 *dstp = (Uint16 *) info->dst;
   774         int dstskip = info->dst_skip >> 1;
   775         Uint32 s, d;
   776 
   777         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
   778 
   779         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   780         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
   781         alpha >>= 3;            /* downscale alpha to 5 bits */
   782 
   783         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
   784         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
   785         /* position alpha to allow for mullo and mulhi on diff channels
   786            to reduce the number of operations */
   787         mm_alpha = _mm_slli_si64(mm_alpha, 3);
   788 
   789         /* Setup the 565 color channel masks */
   790         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
   791         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
   792 
   793         while (height--) {
   794             /* *INDENT-OFF* */
   795             DUFFS_LOOP_124(
   796             {
   797                 s = *srcp++;
   798                 d = *dstp;
   799                 /*
   800                  * shift out the middle component (green) to
   801                  * the high 16 bits, and process all three RGB
   802                  * components at the same time.
   803                  */
   804                 s = (s | s << 16) & 0x07e0f81f;
   805                 d = (d | d << 16) & 0x07e0f81f;
   806                 d += (s - d) * alpha >> 5;
   807                 d &= 0x07e0f81f;
   808                 *dstp++ = (Uint16)(d | d >> 16);
   809             },{
   810                 s = *srcp++;
   811                 d = *dstp;
   812                 /*
   813                  * shift out the middle component (green) to
   814                  * the high 16 bits, and process all three RGB
   815                  * components at the same time.
   816                  */
   817                 s = (s | s << 16) & 0x07e0f81f;
   818                 d = (d | d << 16) & 0x07e0f81f;
   819                 d += (s - d) * alpha >> 5;
   820                 d &= 0x07e0f81f;
   821                 *dstp++ = (Uint16)(d | d >> 16);
   822                 s = *srcp++;
   823                 d = *dstp;
   824                 /*
   825                  * shift out the middle component (green) to
   826                  * the high 16 bits, and process all three RGB
   827                  * components at the same time.
   828                  */
   829                 s = (s | s << 16) & 0x07e0f81f;
   830                 d = (d | d << 16) & 0x07e0f81f;
   831                 d += (s - d) * alpha >> 5;
   832                 d &= 0x07e0f81f;
   833                 *dstp++ = (Uint16)(d | d >> 16);
   834             },{
   835                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   836                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   837 
   838                 /* red */
   839                 src2 = src1;
   840                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
   841 
   842                 dst2 = dst1;
   843                 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
   844 
   845                 /* blend */
   846                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   847                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   848                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   849                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   850                 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
   851 
   852                 mm_res = dst2; /* RED -> mm_res */
   853 
   854                 /* green -- process the bits in place */
   855                 src2 = src1;
   856                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   857 
   858                 dst2 = dst1;
   859                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   860 
   861                 /* blend */
   862                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   863                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   864                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   865                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   866 
   867                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
   868 
   869                 /* blue */
   870                 src2 = src1;
   871                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
   872 
   873                 dst2 = dst1;
   874                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
   875 
   876                 /* blend */
   877                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   878                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   879                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   880                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   881                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
   882 
   883                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
   884 
   885                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
   886 
   887                 srcp += 4;
   888                 dstp += 4;
   889             }, width);
   890             /* *INDENT-ON* */
   891             srcp += srcskip;
   892             dstp += dstskip;
   893         }
   894         _mm_empty();
   895     }
   896 }
   897 
   898 /* fast RGB555->RGB555 blending with surface alpha */
   899 static void
   900 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
   901 {
   902     unsigned alpha = info->a;
   903     if (alpha == 128) {
   904         Blit16to16SurfaceAlpha128(info, 0xfbde);
   905     } else {
   906         int width = info->dst_w;
   907         int height = info->dst_h;
   908         Uint16 *srcp = (Uint16 *) info->src;
   909         int srcskip = info->src_skip >> 1;
   910         Uint16 *dstp = (Uint16 *) info->dst;
   911         int dstskip = info->dst_skip >> 1;
   912         Uint32 s, d;
   913 
   914         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
   915 
   916         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   917         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
   918         alpha >>= 3;            /* downscale alpha to 5 bits */
   919 
   920         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
   921         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
   922         /* position alpha to allow for mullo and mulhi on diff channels
   923            to reduce the number of operations */
   924         mm_alpha = _mm_slli_si64(mm_alpha, 3);
   925 
   926         /* Setup the 555 color channel masks */
   927         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
   928         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
   929         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
   930 
   931         while (height--) {
   932             /* *INDENT-OFF* */
   933             DUFFS_LOOP_124(
   934             {
   935                 s = *srcp++;
   936                 d = *dstp;
   937                 /*
   938                  * shift out the middle component (green) to
   939                  * the high 16 bits, and process all three RGB
   940                  * components at the same time.
   941                  */
   942                 s = (s | s << 16) & 0x03e07c1f;
   943                 d = (d | d << 16) & 0x03e07c1f;
   944                 d += (s - d) * alpha >> 5;
   945                 d &= 0x03e07c1f;
   946                 *dstp++ = (Uint16)(d | d >> 16);
   947             },{
   948                 s = *srcp++;
   949                 d = *dstp;
   950                 /*
   951                  * shift out the middle component (green) to
   952                  * the high 16 bits, and process all three RGB
   953                  * components at the same time.
   954                  */
   955                 s = (s | s << 16) & 0x03e07c1f;
   956                 d = (d | d << 16) & 0x03e07c1f;
   957                 d += (s - d) * alpha >> 5;
   958                 d &= 0x03e07c1f;
   959                 *dstp++ = (Uint16)(d | d >> 16);
   960                     s = *srcp++;
   961                 d = *dstp;
   962                 /*
   963                  * shift out the middle component (green) to
   964                  * the high 16 bits, and process all three RGB
   965                  * components at the same time.
   966                  */
   967                 s = (s | s << 16) & 0x03e07c1f;
   968                 d = (d | d << 16) & 0x03e07c1f;
   969                 d += (s - d) * alpha >> 5;
   970                 d &= 0x03e07c1f;
   971                 *dstp++ = (Uint16)(d | d >> 16);
   972             },{
   973                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   974                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   975 
   976                 /* red -- process the bits in place */
   977                 src2 = src1;
   978                 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
   979 
   980                 dst2 = dst1;
   981                 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
   982 
   983                 /* blend */
   984                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   985                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   986                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   987                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   988                 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
   989 
   990                 mm_res = dst2; /* RED -> mm_res */
   991                 
   992                 /* green -- process the bits in place */
   993                 src2 = src1;
   994                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   995 
   996                 dst2 = dst1;
   997                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   998 
   999                 /* blend */
  1000                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1001                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1002                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1003                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1004 
  1005                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  1006 
  1007                 /* blue */
  1008                 src2 = src1; /* src -> src2 */
  1009                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  1010 
  1011                 dst2 = dst1; /* dst -> dst2 */
  1012                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  1013 
  1014                 /* blend */
  1015                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1016                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1017                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1018                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1019                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  1020 
  1021                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  1022 
  1023                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  1024 
  1025                 srcp += 4;
  1026                 dstp += 4;
  1027             }, width);
  1028             /* *INDENT-ON* */
  1029             srcp += srcskip;
  1030             dstp += dstskip;
  1031         }
  1032         _mm_empty();
  1033     }
  1034 }
  1035 
  1036 #endif /* __MMX__ */
  1037 
  1038 /* fast RGB565->RGB565 blending with surface alpha */
  1039 static void
  1040 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
  1041 {
  1042     unsigned alpha = info->a;
  1043     if (alpha == 128) {
  1044         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1045     } else {
  1046         int width = info->dst_w;
  1047         int height = info->dst_h;
  1048         Uint16 *srcp = (Uint16 *) info->src;
  1049         int srcskip = info->src_skip >> 1;
  1050         Uint16 *dstp = (Uint16 *) info->dst;
  1051         int dstskip = info->dst_skip >> 1;
  1052         alpha >>= 3;            /* downscale alpha to 5 bits */
  1053 
  1054         while (height--) {
  1055             /* *INDENT-OFF* */
  1056             DUFFS_LOOP4({
  1057                 Uint32 s = *srcp++;
  1058                 Uint32 d = *dstp;
  1059                 /*
  1060                  * shift out the middle component (green) to
  1061                  * the high 16 bits, and process all three RGB
  1062                  * components at the same time.
  1063                  */
  1064                 s = (s | s << 16) & 0x07e0f81f;
  1065                 d = (d | d << 16) & 0x07e0f81f;
  1066                 d += (s - d) * alpha >> 5;
  1067                 d &= 0x07e0f81f;
  1068                 *dstp++ = (Uint16)(d | d >> 16);
  1069             }, width);
  1070             /* *INDENT-ON* */
  1071             srcp += srcskip;
  1072             dstp += dstskip;
  1073         }
  1074     }
  1075 }
  1076 
  1077 /* fast RGB555->RGB555 blending with surface alpha */
  1078 static void
  1079 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  1080 {
  1081     unsigned alpha = info->a;   /* downscale alpha to 5 bits */
  1082     if (alpha == 128) {
  1083         Blit16to16SurfaceAlpha128(info, 0xfbde);
  1084     } else {
  1085         int width = info->dst_w;
  1086         int height = info->dst_h;
  1087         Uint16 *srcp = (Uint16 *) info->src;
  1088         int srcskip = info->src_skip >> 1;
  1089         Uint16 *dstp = (Uint16 *) info->dst;
  1090         int dstskip = info->dst_skip >> 1;
  1091         alpha >>= 3;            /* downscale alpha to 5 bits */
  1092 
  1093         while (height--) {
  1094             /* *INDENT-OFF* */
  1095             DUFFS_LOOP4({
  1096                 Uint32 s = *srcp++;
  1097                 Uint32 d = *dstp;
  1098                 /*
  1099                  * shift out the middle component (green) to
  1100                  * the high 16 bits, and process all three RGB
  1101                  * components at the same time.
  1102                  */
  1103                 s = (s | s << 16) & 0x03e07c1f;
  1104                 d = (d | d << 16) & 0x03e07c1f;
  1105                 d += (s - d) * alpha >> 5;
  1106                 d &= 0x03e07c1f;
  1107                 *dstp++ = (Uint16)(d | d >> 16);
  1108             }, width);
  1109             /* *INDENT-ON* */
  1110             srcp += srcskip;
  1111             dstp += dstskip;
  1112         }
  1113     }
  1114 }
  1115 
  1116 /* fast ARGB8888->RGB565 blending with pixel alpha */
  1117 static void
  1118 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  1119 {
  1120     int width = info->dst_w;
  1121     int height = info->dst_h;
  1122     Uint32 *srcp = (Uint32 *) info->src;
  1123     int srcskip = info->src_skip >> 2;
  1124     Uint16 *dstp = (Uint16 *) info->dst;
  1125     int dstskip = info->dst_skip >> 1;
  1126 
  1127     while (height--) {
  1128         /* *INDENT-OFF* */
  1129         DUFFS_LOOP4({
  1130         Uint32 s = *srcp;
  1131         unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  1132         /* FIXME: Here we special-case opaque alpha since the
  1133            compositioning used (>>8 instead of /255) doesn't handle
  1134            it correctly. Also special-case alpha=0 for speed?
  1135            Benchmark this! */
  1136         if(alpha) {   
  1137           if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1138             *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  1139           } else {
  1140             Uint32 d = *dstp;
  1141             /*
  1142              * convert source and destination to G0RAB65565
  1143              * and blend all components at the same time
  1144              */
  1145             s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  1146               + (s >> 3 & 0x1f);
  1147             d = (d | d << 16) & 0x07e0f81f;
  1148             d += (s - d) * alpha >> 5;
  1149             d &= 0x07e0f81f;
  1150             *dstp = (Uint16)(d | d >> 16);
  1151           }
  1152         }
  1153         srcp++;
  1154         dstp++;
  1155         }, width);
  1156         /* *INDENT-ON* */
  1157         srcp += srcskip;
  1158         dstp += dstskip;
  1159     }
  1160 }
  1161 
  1162 /* fast ARGB8888->RGB555 blending with pixel alpha */
  1163 static void
  1164 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  1165 {
  1166     int width = info->dst_w;
  1167     int height = info->dst_h;
  1168     Uint32 *srcp = (Uint32 *) info->src;
  1169     int srcskip = info->src_skip >> 2;
  1170     Uint16 *dstp = (Uint16 *) info->dst;
  1171     int dstskip = info->dst_skip >> 1;
  1172 
  1173     while (height--) {
  1174         /* *INDENT-OFF* */
  1175         DUFFS_LOOP4({
  1176         unsigned alpha;
  1177         Uint32 s = *srcp;
  1178         alpha = s >> 27; /* downscale alpha to 5 bits */
  1179         /* FIXME: Here we special-case opaque alpha since the
  1180            compositioning used (>>8 instead of /255) doesn't handle
  1181            it correctly. Also special-case alpha=0 for speed?
  1182            Benchmark this! */
  1183         if(alpha) {   
  1184           if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1185             *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  1186           } else {
  1187             Uint32 d = *dstp;
  1188             /*
  1189              * convert source and destination to G0RAB65565
  1190              * and blend all components at the same time
  1191              */
  1192             s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  1193               + (s >> 3 & 0x1f);
  1194             d = (d | d << 16) & 0x03e07c1f;
  1195             d += (s - d) * alpha >> 5;
  1196             d &= 0x03e07c1f;
  1197             *dstp = (Uint16)(d | d >> 16);
  1198           }
  1199         }
  1200         srcp++;
  1201         dstp++;
  1202         }, width);
  1203         /* *INDENT-ON* */
  1204         srcp += srcskip;
  1205         dstp += dstskip;
  1206     }
  1207 }
  1208 
  1209 /* General (slow) N->N blending with per-surface alpha */
  1210 static void
  1211 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  1212 {
  1213     int width = info->dst_w;
  1214     int height = info->dst_h;
  1215     Uint8 *src = info->src;
  1216     int srcskip = info->src_skip;
  1217     Uint8 *dst = info->dst;
  1218     int dstskip = info->dst_skip;
  1219     SDL_PixelFormat *srcfmt = info->src_fmt;
  1220     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1221     int srcbpp = srcfmt->BytesPerPixel;
  1222     int dstbpp = dstfmt->BytesPerPixel;
  1223     Uint32 Pixel;
  1224     unsigned sR, sG, sB;
  1225     unsigned dR, dG, dB, dA;
  1226     const unsigned sA = info->a;
  1227 
  1228     if (sA) {
  1229         while (height--) {
  1230         /* *INDENT-OFF* */
  1231         DUFFS_LOOP4(
  1232         {
  1233         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  1234         DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1235         ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1236         ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1237         src += srcbpp;
  1238         dst += dstbpp;
  1239         },
  1240         width);
  1241         /* *INDENT-ON* */
  1242             src += srcskip;
  1243             dst += dstskip;
  1244         }
  1245     }
  1246 }
  1247 
  1248 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  1249 static void
  1250 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  1251 {
  1252     int width = info->dst_w;
  1253     int height = info->dst_h;
  1254     Uint8 *src = info->src;
  1255     int srcskip = info->src_skip;
  1256     Uint8 *dst = info->dst;
  1257     int dstskip = info->dst_skip;
  1258     SDL_PixelFormat *srcfmt = info->src_fmt;
  1259     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1260     Uint32 ckey = info->colorkey;
  1261     int srcbpp = srcfmt->BytesPerPixel;
  1262     int dstbpp = dstfmt->BytesPerPixel;
  1263     Uint32 Pixel;
  1264     unsigned sR, sG, sB;
  1265     unsigned dR, dG, dB, dA;
  1266     const unsigned sA = info->a;
  1267 
  1268     while (height--) {
  1269         /* *INDENT-OFF* */
  1270         DUFFS_LOOP4(
  1271         {
  1272         RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  1273         if(sA && Pixel != ckey) {
  1274             RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  1275             DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1276             ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1277             ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1278         }
  1279         src += srcbpp;
  1280         dst += dstbpp;
  1281         },
  1282         width);
  1283         /* *INDENT-ON* */
  1284         src += srcskip;
  1285         dst += dstskip;
  1286     }
  1287 }
  1288 
  1289 /* General (slow) N->N blending with pixel alpha */
  1290 static void
  1291 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  1292 {
  1293     int width = info->dst_w;
  1294     int height = info->dst_h;
  1295     Uint8 *src = info->src;
  1296     int srcskip = info->src_skip;
  1297     Uint8 *dst = info->dst;
  1298     int dstskip = info->dst_skip;
  1299     SDL_PixelFormat *srcfmt = info->src_fmt;
  1300     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1301     int srcbpp;
  1302     int dstbpp;
  1303     Uint32 Pixel;
  1304     unsigned sR, sG, sB, sA;
  1305     unsigned dR, dG, dB, dA;
  1306 
  1307     /* Set up some basic variables */
  1308     srcbpp = srcfmt->BytesPerPixel;
  1309     dstbpp = dstfmt->BytesPerPixel;
  1310 
  1311     while (height--) {
  1312         /* *INDENT-OFF* */
  1313         DUFFS_LOOP4(
  1314         {
  1315         DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  1316         if(sA) {
  1317             DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1318             ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1319             ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1320         }
  1321         src += srcbpp;
  1322         dst += dstbpp;
  1323         },
  1324         width);
  1325         /* *INDENT-ON* */
  1326         src += srcskip;
  1327         dst += dstskip;
  1328     }
  1329 }
  1330 
  1331 
  1332 SDL_BlitFunc
  1333 SDL_CalculateBlitA(SDL_Surface * surface)
  1334 {
  1335     SDL_PixelFormat *sf = surface->format;
  1336     SDL_PixelFormat *df = surface->map->dst->format;
  1337 
  1338     switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
  1339     case SDL_COPY_BLEND:
  1340         /* Per-pixel alpha blits */
  1341         switch (df->BytesPerPixel) {
  1342         case 1:
  1343             if (df->palette != NULL) {
  1344                 return BlitNto1PixelAlpha;
  1345             } else {
  1346                 /* RGB332 has no palette ! */
  1347                 return BlitNtoNPixelAlpha;
  1348             }
  1349 
  1350         case 2:
  1351 #if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
  1352                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  1353                     && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
  1354                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  1355                     || (sf->Bmask == 0xff && df->Bmask == 0x1f)))
  1356                 {
  1357 #if SDL_ARM_NEON_BLITTERS
  1358                     if (SDL_HasNEON())
  1359                         return BlitARGBto565PixelAlphaARMNEON;
  1360 #endif
  1361 #if SDL_ARM_SIMD_BLITTERS
  1362                     if (SDL_HasARMSIMD())
  1363                         return BlitARGBto565PixelAlphaARMSIMD;
  1364 #endif
  1365                 }
  1366 #endif
  1367                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  1368                     && sf->Gmask == 0xff00
  1369                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  1370                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  1371                 if (df->Gmask == 0x7e0)
  1372                     return BlitARGBto565PixelAlpha;
  1373                 else if (df->Gmask == 0x3e0)
  1374                     return BlitARGBto555PixelAlpha;
  1375             }
  1376             return BlitNtoNPixelAlpha;
  1377 
  1378         case 4:
  1379             if (sf->Rmask == df->Rmask
  1380                 && sf->Gmask == df->Gmask
  1381                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  1382 #if defined(__MMX__) || defined(__3dNOW__)
  1383                 if (sf->Rshift % 8 == 0
  1384                     && sf->Gshift % 8 == 0
  1385                     && sf->Bshift % 8 == 0
  1386                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  1387 #ifdef __3dNOW__
  1388                     if (SDL_Has3DNow())
  1389                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  1390 #endif
  1391 #ifdef __MMX__
  1392                     if (SDL_HasMMX())
  1393                         return BlitRGBtoRGBPixelAlphaMMX;
  1394 #endif
  1395                 }
  1396 #endif /* __MMX__ || __3dNOW__ */
  1397                 if (sf->Amask == 0xff000000) {
  1398 #if SDL_ARM_NEON_BLITTERS
  1399                     if (SDL_HasNEON())
  1400                         return BlitRGBtoRGBPixelAlphaARMNEON;
  1401 #endif
  1402 #if SDL_ARM_SIMD_BLITTERS
  1403                     if (SDL_HasARMSIMD())
  1404                         return BlitRGBtoRGBPixelAlphaARMSIMD;
  1405 #endif
  1406                     return BlitRGBtoRGBPixelAlpha;
  1407                 }
  1408             }
  1409             return BlitNtoNPixelAlpha;
  1410 
  1411         case 3:
  1412         default:
  1413             break;
  1414         }
  1415         return BlitNtoNPixelAlpha;
  1416 
  1417     case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  1418         if (sf->Amask == 0) {
  1419             /* Per-surface alpha blits */
  1420             switch (df->BytesPerPixel) {
  1421             case 1:
  1422                 if (df->palette != NULL) {
  1423                     return BlitNto1SurfaceAlpha;
  1424                 } else {
  1425                     /* RGB332 has no palette ! */
  1426                     return BlitNtoNSurfaceAlpha;
  1427                 }
  1428 
  1429             case 2:
  1430                 if (surface->map->identity) {
  1431                     if (df->Gmask == 0x7e0) {
  1432 #ifdef __MMX__
  1433                         if (SDL_HasMMX())
  1434                             return Blit565to565SurfaceAlphaMMX;
  1435                         else
  1436 #endif
  1437                             return Blit565to565SurfaceAlpha;
  1438                     } else if (df->Gmask == 0x3e0) {
  1439 #ifdef __MMX__
  1440                         if (SDL_HasMMX())
  1441                             return Blit555to555SurfaceAlphaMMX;
  1442                         else
  1443 #endif
  1444                             return Blit555to555SurfaceAlpha;
  1445                     }
  1446                 }
  1447                 return BlitNtoNSurfaceAlpha;
  1448 
  1449             case 4:
  1450                 if (sf->Rmask == df->Rmask
  1451                     && sf->Gmask == df->Gmask
  1452                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  1453 #ifdef __MMX__
  1454                     if (sf->Rshift % 8 == 0
  1455                         && sf->Gshift % 8 == 0
  1456                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  1457                         return BlitRGBtoRGBSurfaceAlphaMMX;
  1458 #endif
  1459                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  1460                         return BlitRGBtoRGBSurfaceAlpha;
  1461                     }
  1462                 }
  1463                 return BlitNtoNSurfaceAlpha;
  1464 
  1465             case 3:
  1466             default:
  1467                 return BlitNtoNSurfaceAlpha;
  1468             }
  1469         }
  1470         break;
  1471 
  1472     case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  1473         if (sf->Amask == 0) {
  1474             if (df->BytesPerPixel == 1) {
  1475 
  1476                 if (df->palette != NULL) {
  1477                     return BlitNto1SurfaceAlphaKey;
  1478                 } else {
  1479                     /* RGB332 has no palette ! */
  1480                     return BlitNtoNSurfaceAlphaKey;
  1481                 }
  1482             } else {
  1483                 return BlitNtoNSurfaceAlphaKey;
  1484             }
  1485         }
  1486         break;
  1487     }
  1488 
  1489     return NULL;
  1490 }
  1491 
  1492 /* vi: set ts=4 sw=4 expandtab: */