src/video/SDL_blit_A.c
author Manuel Alfayate Corchete
Sat, 26 Sep 2020 19:18:09 +0200
changeset 14057 c9b9ade35084
parent 13440 ac297b67f6d9
permissions -rw-r--r--
kmsdrm: merge patches from Ozkan Sezer for removing C99-isms and raising libgbm version requirements.
     1 /*
     2   Simple DirectMedia Layer
     3   Copyright (C) 1997-2020 Sam Lantinga <slouken@libsdl.org>
     4 
     5   This software is provided 'as-is', without any express or implied
     6   warranty.  In no event will the authors be held liable for any damages
     7   arising from the use of this software.
     8 
     9   Permission is granted to anyone to use this software for any purpose,
    10   including commercial applications, and to alter it and redistribute it
    11   freely, subject to the following restrictions:
    12 
    13   1. The origin of this software must not be misrepresented; you must not
    14      claim that you wrote the original software. If you use this software
    15      in a product, an acknowledgment in the product documentation would be
    16      appreciated but is not required.
    17   2. Altered source versions must be plainly marked as such, and must not be
    18      misrepresented as being the original software.
    19   3. This notice may not be removed or altered from any source distribution.
    20 */
    21 #include "../SDL_internal.h"
    22 
    23 #if SDL_HAVE_BLIT_A
    24 
    25 #include "SDL_video.h"
    26 #include "SDL_blit.h"
    27 
    28 /* Functions to perform alpha blended blitting */
    29 
    30 /* N->1 blending with per-surface alpha */
    31 static void
    32 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    33 {
    34     int width = info->dst_w;
    35     int height = info->dst_h;
    36     Uint8 *src = info->src;
    37     int srcskip = info->src_skip;
    38     Uint8 *dst = info->dst;
    39     int dstskip = info->dst_skip;
    40     Uint8 *palmap = info->table;
    41     SDL_PixelFormat *srcfmt = info->src_fmt;
    42     SDL_PixelFormat *dstfmt = info->dst_fmt;
    43     int srcbpp = srcfmt->BytesPerPixel;
    44     Uint32 Pixel;
    45     unsigned sR, sG, sB;
    46     unsigned dR, dG, dB;
    47     const unsigned A = info->a;
    48 
    49     while (height--) {
    50         /* *INDENT-OFF* */
    51         DUFFS_LOOP4(
    52         {
    53         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    54         dR = dstfmt->palette->colors[*dst].r;
    55         dG = dstfmt->palette->colors[*dst].g;
    56         dB = dstfmt->palette->colors[*dst].b;
    57         ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
    58         dR &= 0xff;
    59         dG &= 0xff;
    60         dB &= 0xff;
    61         /* Pack RGB into 8bit pixel */
    62         if ( palmap == NULL ) {
    63             *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
    64         } else {
    65             *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
    66         }
    67         dst++;
    68         src += srcbpp;
    69         },
    70         width);
    71         /* *INDENT-ON* */
    72         src += srcskip;
    73         dst += dstskip;
    74     }
    75 }
    76 
    77 /* N->1 blending with pixel alpha */
    78 static void
    79 BlitNto1PixelAlpha(SDL_BlitInfo * info)
    80 {
    81     int width = info->dst_w;
    82     int height = info->dst_h;
    83     Uint8 *src = info->src;
    84     int srcskip = info->src_skip;
    85     Uint8 *dst = info->dst;
    86     int dstskip = info->dst_skip;
    87     Uint8 *palmap = info->table;
    88     SDL_PixelFormat *srcfmt = info->src_fmt;
    89     SDL_PixelFormat *dstfmt = info->dst_fmt;
    90     int srcbpp = srcfmt->BytesPerPixel;
    91     Uint32 Pixel;
    92     unsigned sR, sG, sB, sA;
    93     unsigned dR, dG, dB;
    94 
    95     while (height--) {
    96         /* *INDENT-OFF* */
    97         DUFFS_LOOP4(
    98         {
    99         DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
   100         dR = dstfmt->palette->colors[*dst].r;
   101         dG = dstfmt->palette->colors[*dst].g;
   102         dB = dstfmt->palette->colors[*dst].b;
   103         ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
   104         dR &= 0xff;
   105         dG &= 0xff;
   106         dB &= 0xff;
   107         /* Pack RGB into 8bit pixel */
   108         if ( palmap == NULL ) {
   109             *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
   110         } else {
   111             *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
   112         }
   113         dst++;
   114         src += srcbpp;
   115         },
   116         width);
   117         /* *INDENT-ON* */
   118         src += srcskip;
   119         dst += dstskip;
   120     }
   121 }
   122 
   123 /* colorkeyed N->1 blending with per-surface alpha */
   124 static void
   125 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   126 {
   127     int width = info->dst_w;
   128     int height = info->dst_h;
   129     Uint8 *src = info->src;
   130     int srcskip = info->src_skip;
   131     Uint8 *dst = info->dst;
   132     int dstskip = info->dst_skip;
   133     Uint8 *palmap = info->table;
   134     SDL_PixelFormat *srcfmt = info->src_fmt;
   135     SDL_PixelFormat *dstfmt = info->dst_fmt;
   136     int srcbpp = srcfmt->BytesPerPixel;
   137     Uint32 ckey = info->colorkey;
   138     Uint32 Pixel;
   139     unsigned sR, sG, sB;
   140     unsigned dR, dG, dB;
   141     const unsigned A = info->a;
   142 
   143     while (height--) {
   144         /* *INDENT-OFF* */
   145         DUFFS_LOOP(
   146         {
   147         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   148         if ( Pixel != ckey ) {
   149             dR = dstfmt->palette->colors[*dst].r;
   150             dG = dstfmt->palette->colors[*dst].g;
   151             dB = dstfmt->palette->colors[*dst].b;
   152             ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
   153             dR &= 0xff;
   154             dG &= 0xff;
   155             dB &= 0xff;
   156             /* Pack RGB into 8bit pixel */
   157             if ( palmap == NULL ) {
   158                 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
   159             } else {
   160                 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
   161             }
   162         }
   163         dst++;
   164         src += srcbpp;
   165         },
   166         width);
   167         /* *INDENT-ON* */
   168         src += srcskip;
   169         dst += dstskip;
   170     }
   171 }
   172 
   173 #ifdef __MMX__
   174 
   175 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   176 static void
   177 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   178 {
   179     int width = info->dst_w;
   180     int height = info->dst_h;
   181     Uint32 *srcp = (Uint32 *) info->src;
   182     int srcskip = info->src_skip >> 2;
   183     Uint32 *dstp = (Uint32 *) info->dst;
   184     int dstskip = info->dst_skip >> 2;
   185     Uint32 dalpha = info->dst_fmt->Amask;
   186 
   187     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   188 
   189     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   190     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   191     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   192 
   193     while (height--) {
   194         int n = width;
   195         if (n & 1) {
   196             Uint32 s = *srcp++;
   197             Uint32 d = *dstp;
   198             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   199                        + (s & d & 0x00010101)) | dalpha;
   200             n--;
   201         }
   202 
   203         for (n >>= 1; n > 0; --n) {
   204             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   205             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   206 
   207             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   208             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   209 
   210             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   211             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   212             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   213             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   214 
   215             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   216             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   217             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   218             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   219 
   220             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   221             dstp += 2;
   222             srcp += 2;
   223         }
   224 
   225         srcp += srcskip;
   226         dstp += dstskip;
   227     }
   228     _mm_empty();
   229 }
   230 
   231 /* fast RGB888->(A)RGB888 blending with surface alpha */
   232 static void
   233 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   234 {
   235     SDL_PixelFormat *df = info->dst_fmt;
   236     Uint32 chanmask;
   237     unsigned alpha = info->a;
   238 
   239     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   240         /* only call a128 version when R,G,B occupy lower bits */
   241         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   242     } else {
   243         int width = info->dst_w;
   244         int height = info->dst_h;
   245         Uint32 *srcp = (Uint32 *) info->src;
   246         int srcskip = info->src_skip >> 2;
   247         Uint32 *dstp = (Uint32 *) info->dst;
   248         int dstskip = info->dst_skip >> 2;
   249         Uint32 dalpha = df->Amask;
   250         Uint32 amult;
   251 
   252         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   253 
   254         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   255         /* form the alpha mult */
   256         amult = alpha | (alpha << 8);
   257         amult = amult | (amult << 16);
   258         chanmask =
   259             (0xff << df->Rshift) | (0xff << df->
   260                                     Gshift) | (0xff << df->Bshift);
   261         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   262         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   263         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   264         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   265 
   266         while (height--) {
   267             int n = width;
   268             if (n & 1) {
   269                 /* One Pixel Blend */
   270                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   271                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   272 
   273                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   274                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   275 
   276                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
   277                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   278                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   279                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   280 
   281                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   282                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   283                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   284 
   285                 ++srcp;
   286                 ++dstp;
   287 
   288                 n--;
   289             }
   290 
   291             for (n >>= 1; n > 0; --n) {
   292                 /* Two Pixels Blend */
   293                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   294                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   295                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   296                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   297 
   298                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   299                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   300                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   301                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   302 
   303                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   304                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   305                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   306                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   307 
   308                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   309                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   310                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   311                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   312 
   313                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   314                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   315 
   316                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   317 
   318                 srcp += 2;
   319                 dstp += 2;
   320             }
   321             srcp += srcskip;
   322             dstp += dstskip;
   323         }
   324         _mm_empty();
   325     }
   326 }
   327 
   328 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   329 static void
   330 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   331 {
   332     int width = info->dst_w;
   333     int height = info->dst_h;
   334     Uint32 *srcp = (Uint32 *) info->src;
   335     int srcskip = info->src_skip >> 2;
   336     Uint32 *dstp = (Uint32 *) info->dst;
   337     int dstskip = info->dst_skip >> 2;
   338     SDL_PixelFormat *sf = info->src_fmt;
   339     Uint32 amask = sf->Amask;
   340     Uint32 ashift = sf->Ashift;
   341     Uint64 multmask, multmask2;
   342 
   343     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
   344 
   345     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   346     multmask = 0x00FF;
   347     multmask <<= (ashift * 2);
   348     multmask2 = 0x00FF00FF00FF00FFULL;
   349 
   350     while (height--) {
   351         /* *INDENT-OFF* */
   352         DUFFS_LOOP4({
   353         Uint32 alpha = *srcp & amask;
   354         if (alpha == 0) {
   355             /* do nothing */
   356         } else if (alpha == amask) {
   357             *dstp = *srcp;
   358         } else {
   359             src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
   360             src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   361 
   362             dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   363             dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   364 
   365             mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   366             mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   367             mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   368             mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
   369             mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha */
   370             mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
   371 
   372             /* blend */            
   373             src1 = _mm_mullo_pi16(src1, mm_alpha);
   374             src1 = _mm_srli_pi16(src1, 8);
   375             dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
   376             dst1 = _mm_srli_pi16(dst1, 8);
   377             dst1 = _mm_add_pi16(src1, dst1);
   378             dst1 = _mm_packs_pu16(dst1, mm_zero);
   379             
   380             *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   381         }
   382         ++srcp;
   383         ++dstp;
   384         }, width);
   385         /* *INDENT-ON* */
   386         srcp += srcskip;
   387         dstp += dstskip;
   388     }
   389     _mm_empty();
   390 }
   391 
   392 #endif /* __MMX__ */
   393 
   394 #if SDL_ARM_SIMD_BLITTERS
   395 void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
   396 
   397 static void
   398 BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo * info)
   399 {
   400 	int32_t width = info->dst_w;
   401 	int32_t height = info->dst_h;
   402 	uint16_t *dstp = (uint16_t *)info->dst;
   403 	int32_t dststride = width + (info->dst_skip >> 1);
   404 	uint32_t *srcp = (uint32_t *)info->src;
   405 	int32_t srcstride = width + (info->src_skip >> 2);
   406 
   407 	BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
   408 }
   409 
   410 void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
   411 
   412 static void
   413 BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
   414 {
   415     int32_t width = info->dst_w;
   416     int32_t height = info->dst_h;
   417     uint32_t *dstp = (uint32_t *)info->dst;
   418     int32_t dststride = width + (info->dst_skip >> 2);
   419     uint32_t *srcp = (uint32_t *)info->src;
   420     int32_t srcstride = width + (info->src_skip >> 2);
   421 
   422     BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
   423 }
   424 #endif
   425 
   426 #if SDL_ARM_NEON_BLITTERS
   427 void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
   428 
   429 static void
   430 BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo * info)
   431 {
   432     int32_t width = info->dst_w;
   433     int32_t height = info->dst_h;
   434     uint16_t *dstp = (uint16_t *)info->dst;
   435     int32_t dststride = width + (info->dst_skip >> 1);
   436     uint32_t *srcp = (uint32_t *)info->src;
   437     int32_t srcstride = width + (info->src_skip >> 2);
   438 
   439     BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
   440 }
   441 
   442 void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
   443 
   444 static void
   445 BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo * info)
   446 {
   447 	int32_t width = info->dst_w;
   448 	int32_t height = info->dst_h;
   449 	uint32_t *dstp = (uint32_t *)info->dst;
   450 	int32_t dststride = width + (info->dst_skip >> 2);
   451 	uint32_t *srcp = (uint32_t *)info->src;
   452 	int32_t srcstride = width + (info->src_skip >> 2);
   453 
   454 	BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
   455 }
   456 #endif
   457 
   458 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   459 static void
   460 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
   461 {
   462     int width = info->dst_w;
   463     int height = info->dst_h;
   464     Uint32 *srcp = (Uint32 *) info->src;
   465     int srcskip = info->src_skip >> 2;
   466     Uint32 *dstp = (Uint32 *) info->dst;
   467     int dstskip = info->dst_skip >> 2;
   468 
   469     while (height--) {
   470         /* *INDENT-OFF* */
   471         DUFFS_LOOP4({
   472             Uint32 s = *srcp++;
   473             Uint32 d = *dstp;
   474             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   475                    + (s & d & 0x00010101)) | 0xff000000;
   476         }, width);
   477         /* *INDENT-ON* */
   478         srcp += srcskip;
   479         dstp += dstskip;
   480     }
   481 }
   482 
   483 /* fast RGB888->(A)RGB888 blending with surface alpha */
   484 static void
   485 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
   486 {
   487     unsigned alpha = info->a;
   488     if (alpha == 128) {
   489         BlitRGBtoRGBSurfaceAlpha128(info);
   490     } else {
   491         int width = info->dst_w;
   492         int height = info->dst_h;
   493         Uint32 *srcp = (Uint32 *) info->src;
   494         int srcskip = info->src_skip >> 2;
   495         Uint32 *dstp = (Uint32 *) info->dst;
   496         int dstskip = info->dst_skip >> 2;
   497         Uint32 s;
   498         Uint32 d;
   499         Uint32 s1;
   500         Uint32 d1;
   501 
   502         while (height--) {
   503             /* *INDENT-OFF* */
   504             DUFFS_LOOP4({
   505                 s = *srcp;
   506                 d = *dstp;
   507                 s1 = s & 0xff00ff;
   508                 d1 = d & 0xff00ff;
   509                 d1 = (d1 + ((s1 - d1) * alpha >> 8))
   510                      & 0xff00ff;
   511                 s &= 0xff00;
   512                 d &= 0xff00;
   513                 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   514                 *dstp = d1 | d | 0xff000000;
   515                 ++srcp;
   516                 ++dstp;
   517             }, width);
   518             /* *INDENT-ON* */
   519             srcp += srcskip;
   520             dstp += dstskip;
   521         }
   522     }
   523 }
   524 
   525 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   526 static void
   527 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
   528 {
   529     int width = info->dst_w;
   530     int height = info->dst_h;
   531     Uint32 *srcp = (Uint32 *) info->src;
   532     int srcskip = info->src_skip >> 2;
   533     Uint32 *dstp = (Uint32 *) info->dst;
   534     int dstskip = info->dst_skip >> 2;
   535 
   536     while (height--) {
   537         /* *INDENT-OFF* */
   538         DUFFS_LOOP4({
   539         Uint32 dalpha;
   540         Uint32 d;
   541         Uint32 s1;
   542         Uint32 d1;
   543         Uint32 s = *srcp;
   544         Uint32 alpha = s >> 24;
   545         /* FIXME: Here we special-case opaque alpha since the
   546            compositioning used (>>8 instead of /255) doesn't handle
   547            it correctly. Also special-case alpha=0 for speed?
   548            Benchmark this! */
   549         if (alpha) {
   550           if (alpha == SDL_ALPHA_OPAQUE) {
   551               *dstp = *srcp;
   552           } else {
   553             /*
   554              * take out the middle component (green), and process
   555              * the other two in parallel. One multiply less.
   556              */
   557             d = *dstp;
   558             dalpha = d >> 24;
   559             s1 = s & 0xff00ff;
   560             d1 = d & 0xff00ff;
   561             d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
   562             s &= 0xff00;
   563             d &= 0xff00;
   564             d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   565             dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
   566             *dstp = d1 | d | (dalpha << 24);
   567           }
   568         }
   569         ++srcp;
   570         ++dstp;
   571         }, width);
   572         /* *INDENT-ON* */
   573         srcp += srcskip;
   574         dstp += dstskip;
   575     }
   576 }
   577 
   578 #ifdef __3dNOW__
   579 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
   580 static void
   581 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
   582 {
   583     int width = info->dst_w;
   584     int height = info->dst_h;
   585     Uint32 *srcp = (Uint32 *) info->src;
   586     int srcskip = info->src_skip >> 2;
   587     Uint32 *dstp = (Uint32 *) info->dst;
   588     int dstskip = info->dst_skip >> 2;
   589     SDL_PixelFormat *sf = info->src_fmt;
   590     Uint32 amask = sf->Amask;
   591     Uint32 ashift = sf->Ashift;
   592     Uint64 multmask, multmask2;
   593 
   594     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
   595 
   596     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   597     multmask = 0x00FF;
   598     multmask <<= (ashift * 2);
   599     multmask2 = 0x00FF00FF00FF00FFULL;
   600 
   601     while (height--) {
   602         /* *INDENT-OFF* */
   603         DUFFS_LOOP4({
   604         Uint32 alpha;
   605 
   606         _m_prefetch(srcp + 16);
   607         _m_prefetch(dstp + 16);
   608 
   609         alpha = *srcp & amask;
   610         if (alpha == 0) {
   611             /* do nothing */
   612         } else if (alpha == amask) {
   613             *dstp = *srcp;
   614         } else {
   615             src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
   616             src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   617 
   618             dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   619             dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   620 
   621             mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   622             mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   623             mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   624             mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
   625             mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha */
   626             mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
   627 
   628 
   629             /* blend */            
   630             src1 = _mm_mullo_pi16(src1, mm_alpha);
   631             src1 = _mm_srli_pi16(src1, 8);
   632             dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
   633             dst1 = _mm_srli_pi16(dst1, 8);
   634             dst1 = _mm_add_pi16(src1, dst1);
   635             dst1 = _mm_packs_pu16(dst1, mm_zero);
   636             
   637             *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   638         }
   639         ++srcp;
   640         ++dstp;
   641         }, width);
   642         /* *INDENT-ON* */
   643         srcp += srcskip;
   644         dstp += dstskip;
   645     }
   646     _mm_empty();
   647 }
   648 
   649 #endif /* __3dNOW__ */
   650 
   651 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
   652 
   653 /* blend a single 16 bit pixel at 50% */
   654 #define BLEND16_50(d, s, mask)                        \
   655     ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
   656 
   657 /* blend two 16 bit pixels at 50% */
   658 #define BLEND2x16_50(d, s, mask)                         \
   659     (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
   660      + (s & d & (~(mask | mask << 16))))
   661 
   662 static void
   663 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
   664 {
   665     int width = info->dst_w;
   666     int height = info->dst_h;
   667     Uint16 *srcp = (Uint16 *) info->src;
   668     int srcskip = info->src_skip >> 1;
   669     Uint16 *dstp = (Uint16 *) info->dst;
   670     int dstskip = info->dst_skip >> 1;
   671 
   672     while (height--) {
   673         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
   674             /*
   675              * Source and destination not aligned, pipeline it.
   676              * This is mostly a win for big blits but no loss for
   677              * small ones
   678              */
   679             Uint32 prev_sw;
   680             int w = width;
   681 
   682             /* handle odd destination */
   683             if ((uintptr_t) dstp & 2) {
   684                 Uint16 d = *dstp, s = *srcp;
   685                 *dstp = BLEND16_50(d, s, mask);
   686                 dstp++;
   687                 srcp++;
   688                 w--;
   689             }
   690             srcp++;             /* srcp is now 32-bit aligned */
   691 
   692             /* bootstrap pipeline with first halfword */
   693             prev_sw = ((Uint32 *) srcp)[-1];
   694 
   695             while (w > 1) {
   696                 Uint32 sw, dw, s;
   697                 sw = *(Uint32 *) srcp;
   698                 dw = *(Uint32 *) dstp;
   699 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   700                 s = (prev_sw << 16) + (sw >> 16);
   701 #else
   702                 s = (prev_sw >> 16) + (sw << 16);
   703 #endif
   704                 prev_sw = sw;
   705                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
   706                 dstp += 2;
   707                 srcp += 2;
   708                 w -= 2;
   709             }
   710 
   711             /* final pixel if any */
   712             if (w) {
   713                 Uint16 d = *dstp, s;
   714 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   715                 s = (Uint16) prev_sw;
   716 #else
   717                 s = (Uint16) (prev_sw >> 16);
   718 #endif
   719                 *dstp = BLEND16_50(d, s, mask);
   720                 srcp++;
   721                 dstp++;
   722             }
   723             srcp += srcskip - 1;
   724             dstp += dstskip;
   725         } else {
   726             /* source and destination are aligned */
   727             int w = width;
   728 
   729             /* first odd pixel? */
   730             if ((uintptr_t) srcp & 2) {
   731                 Uint16 d = *dstp, s = *srcp;
   732                 *dstp = BLEND16_50(d, s, mask);
   733                 srcp++;
   734                 dstp++;
   735                 w--;
   736             }
   737             /* srcp and dstp are now 32-bit aligned */
   738 
   739             while (w > 1) {
   740                 Uint32 sw = *(Uint32 *) srcp;
   741                 Uint32 dw = *(Uint32 *) dstp;
   742                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
   743                 srcp += 2;
   744                 dstp += 2;
   745                 w -= 2;
   746             }
   747 
   748             /* last odd pixel? */
   749             if (w) {
   750                 Uint16 d = *dstp, s = *srcp;
   751                 *dstp = BLEND16_50(d, s, mask);
   752                 srcp++;
   753                 dstp++;
   754             }
   755             srcp += srcskip;
   756             dstp += dstskip;
   757         }
   758     }
   759 }
   760 
   761 #ifdef __MMX__
   762 
   763 /* fast RGB565->RGB565 blending with surface alpha */
   764 static void
   765 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
   766 {
   767     unsigned alpha = info->a;
   768     if (alpha == 128) {
   769         Blit16to16SurfaceAlpha128(info, 0xf7de);
   770     } else {
   771         int width = info->dst_w;
   772         int height = info->dst_h;
   773         Uint16 *srcp = (Uint16 *) info->src;
   774         int srcskip = info->src_skip >> 1;
   775         Uint16 *dstp = (Uint16 *) info->dst;
   776         int dstskip = info->dst_skip >> 1;
   777         Uint32 s, d;
   778 
   779         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
   780 
   781         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   782         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
   783         alpha >>= 3;            /* downscale alpha to 5 bits */
   784 
   785         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
   786         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
   787         /* position alpha to allow for mullo and mulhi on diff channels
   788            to reduce the number of operations */
   789         mm_alpha = _mm_slli_si64(mm_alpha, 3);
   790 
   791         /* Setup the 565 color channel masks */
   792         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
   793         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
   794 
   795         while (height--) {
   796             /* *INDENT-OFF* */
   797             DUFFS_LOOP_124(
   798             {
   799                 s = *srcp++;
   800                 d = *dstp;
   801                 /*
   802                  * shift out the middle component (green) to
   803                  * the high 16 bits, and process all three RGB
   804                  * components at the same time.
   805                  */
   806                 s = (s | s << 16) & 0x07e0f81f;
   807                 d = (d | d << 16) & 0x07e0f81f;
   808                 d += (s - d) * alpha >> 5;
   809                 d &= 0x07e0f81f;
   810                 *dstp++ = (Uint16)(d | d >> 16);
   811             },{
   812                 s = *srcp++;
   813                 d = *dstp;
   814                 /*
   815                  * shift out the middle component (green) to
   816                  * the high 16 bits, and process all three RGB
   817                  * components at the same time.
   818                  */
   819                 s = (s | s << 16) & 0x07e0f81f;
   820                 d = (d | d << 16) & 0x07e0f81f;
   821                 d += (s - d) * alpha >> 5;
   822                 d &= 0x07e0f81f;
   823                 *dstp++ = (Uint16)(d | d >> 16);
   824                 s = *srcp++;
   825                 d = *dstp;
   826                 /*
   827                  * shift out the middle component (green) to
   828                  * the high 16 bits, and process all three RGB
   829                  * components at the same time.
   830                  */
   831                 s = (s | s << 16) & 0x07e0f81f;
   832                 d = (d | d << 16) & 0x07e0f81f;
   833                 d += (s - d) * alpha >> 5;
   834                 d &= 0x07e0f81f;
   835                 *dstp++ = (Uint16)(d | d >> 16);
   836             },{
   837                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   838                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   839 
   840                 /* red */
   841                 src2 = src1;
   842                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
   843 
   844                 dst2 = dst1;
   845                 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
   846 
   847                 /* blend */
   848                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   849                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   850                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   851                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   852                 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
   853 
   854                 mm_res = dst2; /* RED -> mm_res */
   855 
   856                 /* green -- process the bits in place */
   857                 src2 = src1;
   858                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   859 
   860                 dst2 = dst1;
   861                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   862 
   863                 /* blend */
   864                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   865                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   866                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   867                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   868 
   869                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
   870 
   871                 /* blue */
   872                 src2 = src1;
   873                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
   874 
   875                 dst2 = dst1;
   876                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
   877 
   878                 /* blend */
   879                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   880                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   881                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   882                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   883                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
   884 
   885                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
   886 
   887                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
   888 
   889                 srcp += 4;
   890                 dstp += 4;
   891             }, width);
   892             /* *INDENT-ON* */
   893             srcp += srcskip;
   894             dstp += dstskip;
   895         }
   896         _mm_empty();
   897     }
   898 }
   899 
   900 /* fast RGB555->RGB555 blending with surface alpha */
   901 static void
   902 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
   903 {
   904     unsigned alpha = info->a;
   905     if (alpha == 128) {
   906         Blit16to16SurfaceAlpha128(info, 0xfbde);
   907     } else {
   908         int width = info->dst_w;
   909         int height = info->dst_h;
   910         Uint16 *srcp = (Uint16 *) info->src;
   911         int srcskip = info->src_skip >> 1;
   912         Uint16 *dstp = (Uint16 *) info->dst;
   913         int dstskip = info->dst_skip >> 1;
   914         Uint32 s, d;
   915 
   916         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
   917 
   918         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   919         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
   920         alpha >>= 3;            /* downscale alpha to 5 bits */
   921 
   922         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
   923         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
   924         /* position alpha to allow for mullo and mulhi on diff channels
   925            to reduce the number of operations */
   926         mm_alpha = _mm_slli_si64(mm_alpha, 3);
   927 
   928         /* Setup the 555 color channel masks */
   929         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
   930         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
   931         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
   932 
   933         while (height--) {
   934             /* *INDENT-OFF* */
   935             DUFFS_LOOP_124(
   936             {
   937                 s = *srcp++;
   938                 d = *dstp;
   939                 /*
   940                  * shift out the middle component (green) to
   941                  * the high 16 bits, and process all three RGB
   942                  * components at the same time.
   943                  */
   944                 s = (s | s << 16) & 0x03e07c1f;
   945                 d = (d | d << 16) & 0x03e07c1f;
   946                 d += (s - d) * alpha >> 5;
   947                 d &= 0x03e07c1f;
   948                 *dstp++ = (Uint16)(d | d >> 16);
   949             },{
   950                 s = *srcp++;
   951                 d = *dstp;
   952                 /*
   953                  * shift out the middle component (green) to
   954                  * the high 16 bits, and process all three RGB
   955                  * components at the same time.
   956                  */
   957                 s = (s | s << 16) & 0x03e07c1f;
   958                 d = (d | d << 16) & 0x03e07c1f;
   959                 d += (s - d) * alpha >> 5;
   960                 d &= 0x03e07c1f;
   961                 *dstp++ = (Uint16)(d | d >> 16);
   962                     s = *srcp++;
   963                 d = *dstp;
   964                 /*
   965                  * shift out the middle component (green) to
   966                  * the high 16 bits, and process all three RGB
   967                  * components at the same time.
   968                  */
   969                 s = (s | s << 16) & 0x03e07c1f;
   970                 d = (d | d << 16) & 0x03e07c1f;
   971                 d += (s - d) * alpha >> 5;
   972                 d &= 0x03e07c1f;
   973                 *dstp++ = (Uint16)(d | d >> 16);
   974             },{
   975                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   976                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   977 
   978                 /* red -- process the bits in place */
   979                 src2 = src1;
   980                 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
   981 
   982                 dst2 = dst1;
   983                 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
   984 
   985                 /* blend */
   986                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   987                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   988                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   989                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   990                 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
   991 
   992                 mm_res = dst2; /* RED -> mm_res */
   993                 
   994                 /* green -- process the bits in place */
   995                 src2 = src1;
   996                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   997 
   998                 dst2 = dst1;
   999                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
  1000 
  1001                 /* blend */
  1002                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1003                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1004                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
  1005                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1006 
  1007                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
  1008 
  1009                 /* blue */
  1010                 src2 = src1; /* src -> src2 */
  1011                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
  1012 
  1013                 dst2 = dst1; /* dst -> dst2 */
  1014                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
  1015 
  1016                 /* blend */
  1017                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
  1018                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
  1019                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
  1020                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
  1021                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
  1022 
  1023                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
  1024 
  1025                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
  1026 
  1027                 srcp += 4;
  1028                 dstp += 4;
  1029             }, width);
  1030             /* *INDENT-ON* */
  1031             srcp += srcskip;
  1032             dstp += dstskip;
  1033         }
  1034         _mm_empty();
  1035     }
  1036 }
  1037 
  1038 #endif /* __MMX__ */
  1039 
  1040 /* fast RGB565->RGB565 blending with surface alpha */
  1041 static void
  1042 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
  1043 {
  1044     unsigned alpha = info->a;
  1045     if (alpha == 128) {
  1046         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1047     } else {
  1048         int width = info->dst_w;
  1049         int height = info->dst_h;
  1050         Uint16 *srcp = (Uint16 *) info->src;
  1051         int srcskip = info->src_skip >> 1;
  1052         Uint16 *dstp = (Uint16 *) info->dst;
  1053         int dstskip = info->dst_skip >> 1;
  1054         alpha >>= 3;            /* downscale alpha to 5 bits */
  1055 
  1056         while (height--) {
  1057             /* *INDENT-OFF* */
  1058             DUFFS_LOOP4({
  1059                 Uint32 s = *srcp++;
  1060                 Uint32 d = *dstp;
  1061                 /*
  1062                  * shift out the middle component (green) to
  1063                  * the high 16 bits, and process all three RGB
  1064                  * components at the same time.
  1065                  */
  1066                 s = (s | s << 16) & 0x07e0f81f;
  1067                 d = (d | d << 16) & 0x07e0f81f;
  1068                 d += (s - d) * alpha >> 5;
  1069                 d &= 0x07e0f81f;
  1070                 *dstp++ = (Uint16)(d | d >> 16);
  1071             }, width);
  1072             /* *INDENT-ON* */
  1073             srcp += srcskip;
  1074             dstp += dstskip;
  1075         }
  1076     }
  1077 }
  1078 
  1079 /* fast RGB555->RGB555 blending with surface alpha */
  1080 static void
  1081 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  1082 {
  1083     unsigned alpha = info->a;   /* downscale alpha to 5 bits */
  1084     if (alpha == 128) {
  1085         Blit16to16SurfaceAlpha128(info, 0xfbde);
  1086     } else {
  1087         int width = info->dst_w;
  1088         int height = info->dst_h;
  1089         Uint16 *srcp = (Uint16 *) info->src;
  1090         int srcskip = info->src_skip >> 1;
  1091         Uint16 *dstp = (Uint16 *) info->dst;
  1092         int dstskip = info->dst_skip >> 1;
  1093         alpha >>= 3;            /* downscale alpha to 5 bits */
  1094 
  1095         while (height--) {
  1096             /* *INDENT-OFF* */
  1097             DUFFS_LOOP4({
  1098                 Uint32 s = *srcp++;
  1099                 Uint32 d = *dstp;
  1100                 /*
  1101                  * shift out the middle component (green) to
  1102                  * the high 16 bits, and process all three RGB
  1103                  * components at the same time.
  1104                  */
  1105                 s = (s | s << 16) & 0x03e07c1f;
  1106                 d = (d | d << 16) & 0x03e07c1f;
  1107                 d += (s - d) * alpha >> 5;
  1108                 d &= 0x03e07c1f;
  1109                 *dstp++ = (Uint16)(d | d >> 16);
  1110             }, width);
  1111             /* *INDENT-ON* */
  1112             srcp += srcskip;
  1113             dstp += dstskip;
  1114         }
  1115     }
  1116 }
  1117 
  1118 /* fast ARGB8888->RGB565 blending with pixel alpha */
  1119 static void
  1120 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  1121 {
  1122     int width = info->dst_w;
  1123     int height = info->dst_h;
  1124     Uint32 *srcp = (Uint32 *) info->src;
  1125     int srcskip = info->src_skip >> 2;
  1126     Uint16 *dstp = (Uint16 *) info->dst;
  1127     int dstskip = info->dst_skip >> 1;
  1128 
  1129     while (height--) {
  1130         /* *INDENT-OFF* */
  1131         DUFFS_LOOP4({
  1132         Uint32 s = *srcp;
  1133         unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  1134         /* FIXME: Here we special-case opaque alpha since the
  1135            compositioning used (>>8 instead of /255) doesn't handle
  1136            it correctly. Also special-case alpha=0 for speed?
  1137            Benchmark this! */
  1138         if(alpha) {   
  1139           if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1140             *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  1141           } else {
  1142             Uint32 d = *dstp;
  1143             /*
  1144              * convert source and destination to G0RAB65565
  1145              * and blend all components at the same time
  1146              */
  1147             s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  1148               + (s >> 3 & 0x1f);
  1149             d = (d | d << 16) & 0x07e0f81f;
  1150             d += (s - d) * alpha >> 5;
  1151             d &= 0x07e0f81f;
  1152             *dstp = (Uint16)(d | d >> 16);
  1153           }
  1154         }
  1155         srcp++;
  1156         dstp++;
  1157         }, width);
  1158         /* *INDENT-ON* */
  1159         srcp += srcskip;
  1160         dstp += dstskip;
  1161     }
  1162 }
  1163 
  1164 /* fast ARGB8888->RGB555 blending with pixel alpha */
  1165 static void
  1166 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  1167 {
  1168     int width = info->dst_w;
  1169     int height = info->dst_h;
  1170     Uint32 *srcp = (Uint32 *) info->src;
  1171     int srcskip = info->src_skip >> 2;
  1172     Uint16 *dstp = (Uint16 *) info->dst;
  1173     int dstskip = info->dst_skip >> 1;
  1174 
  1175     while (height--) {
  1176         /* *INDENT-OFF* */
  1177         DUFFS_LOOP4({
  1178         unsigned alpha;
  1179         Uint32 s = *srcp;
  1180         alpha = s >> 27; /* downscale alpha to 5 bits */
  1181         /* FIXME: Here we special-case opaque alpha since the
  1182            compositioning used (>>8 instead of /255) doesn't handle
  1183            it correctly. Also special-case alpha=0 for speed?
  1184            Benchmark this! */
  1185         if(alpha) {   
  1186           if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1187             *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  1188           } else {
  1189             Uint32 d = *dstp;
  1190             /*
  1191              * convert source and destination to G0RAB65565
  1192              * and blend all components at the same time
  1193              */
  1194             s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  1195               + (s >> 3 & 0x1f);
  1196             d = (d | d << 16) & 0x03e07c1f;
  1197             d += (s - d) * alpha >> 5;
  1198             d &= 0x03e07c1f;
  1199             *dstp = (Uint16)(d | d >> 16);
  1200           }
  1201         }
  1202         srcp++;
  1203         dstp++;
  1204         }, width);
  1205         /* *INDENT-ON* */
  1206         srcp += srcskip;
  1207         dstp += dstskip;
  1208     }
  1209 }
  1210 
  1211 /* General (slow) N->N blending with per-surface alpha */
  1212 static void
  1213 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  1214 {
  1215     int width = info->dst_w;
  1216     int height = info->dst_h;
  1217     Uint8 *src = info->src;
  1218     int srcskip = info->src_skip;
  1219     Uint8 *dst = info->dst;
  1220     int dstskip = info->dst_skip;
  1221     SDL_PixelFormat *srcfmt = info->src_fmt;
  1222     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1223     int srcbpp = srcfmt->BytesPerPixel;
  1224     int dstbpp = dstfmt->BytesPerPixel;
  1225     Uint32 Pixel;
  1226     unsigned sR, sG, sB;
  1227     unsigned dR, dG, dB, dA;
  1228     const unsigned sA = info->a;
  1229 
  1230     if (sA) {
  1231         while (height--) {
  1232         /* *INDENT-OFF* */
  1233         DUFFS_LOOP4(
  1234         {
  1235         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  1236         DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1237         ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1238         ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1239         src += srcbpp;
  1240         dst += dstbpp;
  1241         },
  1242         width);
  1243         /* *INDENT-ON* */
  1244             src += srcskip;
  1245             dst += dstskip;
  1246         }
  1247     }
  1248 }
  1249 
  1250 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  1251 static void
  1252 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  1253 {
  1254     int width = info->dst_w;
  1255     int height = info->dst_h;
  1256     Uint8 *src = info->src;
  1257     int srcskip = info->src_skip;
  1258     Uint8 *dst = info->dst;
  1259     int dstskip = info->dst_skip;
  1260     SDL_PixelFormat *srcfmt = info->src_fmt;
  1261     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1262     Uint32 ckey = info->colorkey;
  1263     int srcbpp = srcfmt->BytesPerPixel;
  1264     int dstbpp = dstfmt->BytesPerPixel;
  1265     Uint32 Pixel;
  1266     unsigned sR, sG, sB;
  1267     unsigned dR, dG, dB, dA;
  1268     const unsigned sA = info->a;
  1269 
  1270     while (height--) {
  1271         /* *INDENT-OFF* */
  1272         DUFFS_LOOP4(
  1273         {
  1274         RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  1275         if(sA && Pixel != ckey) {
  1276             RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  1277             DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1278             ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1279             ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1280         }
  1281         src += srcbpp;
  1282         dst += dstbpp;
  1283         },
  1284         width);
  1285         /* *INDENT-ON* */
  1286         src += srcskip;
  1287         dst += dstskip;
  1288     }
  1289 }
  1290 
  1291 /* General (slow) N->N blending with pixel alpha */
  1292 static void
  1293 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  1294 {
  1295     int width = info->dst_w;
  1296     int height = info->dst_h;
  1297     Uint8 *src = info->src;
  1298     int srcskip = info->src_skip;
  1299     Uint8 *dst = info->dst;
  1300     int dstskip = info->dst_skip;
  1301     SDL_PixelFormat *srcfmt = info->src_fmt;
  1302     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1303     int srcbpp;
  1304     int dstbpp;
  1305     Uint32 Pixel;
  1306     unsigned sR, sG, sB, sA;
  1307     unsigned dR, dG, dB, dA;
  1308 
  1309     /* Set up some basic variables */
  1310     srcbpp = srcfmt->BytesPerPixel;
  1311     dstbpp = dstfmt->BytesPerPixel;
  1312 
  1313     while (height--) {
  1314         /* *INDENT-OFF* */
  1315         DUFFS_LOOP4(
  1316         {
  1317         DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  1318         if(sA) {
  1319             DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1320             ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1321             ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1322         }
  1323         src += srcbpp;
  1324         dst += dstbpp;
  1325         },
  1326         width);
  1327         /* *INDENT-ON* */
  1328         src += srcskip;
  1329         dst += dstskip;
  1330     }
  1331 }
  1332 
  1333 
  1334 SDL_BlitFunc
  1335 SDL_CalculateBlitA(SDL_Surface * surface)
  1336 {
  1337     SDL_PixelFormat *sf = surface->format;
  1338     SDL_PixelFormat *df = surface->map->dst->format;
  1339 
  1340     switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
  1341     case SDL_COPY_BLEND:
  1342         /* Per-pixel alpha blits */
  1343         switch (df->BytesPerPixel) {
  1344         case 1:
  1345             if (df->palette != NULL) {
  1346                 return BlitNto1PixelAlpha;
  1347             } else {
  1348                 /* RGB332 has no palette ! */
  1349                 return BlitNtoNPixelAlpha;
  1350             }
  1351 
  1352         case 2:
  1353 #if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
  1354                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  1355                     && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
  1356                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  1357                     || (sf->Bmask == 0xff && df->Bmask == 0x1f)))
  1358                 {
  1359 #if SDL_ARM_NEON_BLITTERS
  1360                     if (SDL_HasNEON())
  1361                         return BlitARGBto565PixelAlphaARMNEON;
  1362 #endif
  1363 #if SDL_ARM_SIMD_BLITTERS
  1364                     if (SDL_HasARMSIMD())
  1365                         return BlitARGBto565PixelAlphaARMSIMD;
  1366 #endif
  1367                 }
  1368 #endif
  1369                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  1370                     && sf->Gmask == 0xff00
  1371                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  1372                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  1373                 if (df->Gmask == 0x7e0)
  1374                     return BlitARGBto565PixelAlpha;
  1375                 else if (df->Gmask == 0x3e0)
  1376                     return BlitARGBto555PixelAlpha;
  1377             }
  1378             return BlitNtoNPixelAlpha;
  1379 
  1380         case 4:
  1381             if (sf->Rmask == df->Rmask
  1382                 && sf->Gmask == df->Gmask
  1383                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  1384 #if defined(__MMX__) || defined(__3dNOW__)
  1385                 if (sf->Rshift % 8 == 0
  1386                     && sf->Gshift % 8 == 0
  1387                     && sf->Bshift % 8 == 0
  1388                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  1389 #ifdef __3dNOW__
  1390                     if (SDL_Has3DNow())
  1391                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  1392 #endif
  1393 #ifdef __MMX__
  1394                     if (SDL_HasMMX())
  1395                         return BlitRGBtoRGBPixelAlphaMMX;
  1396 #endif
  1397                 }
  1398 #endif /* __MMX__ || __3dNOW__ */
  1399                 if (sf->Amask == 0xff000000) {
  1400 #if SDL_ARM_NEON_BLITTERS
  1401                     if (SDL_HasNEON())
  1402                         return BlitRGBtoRGBPixelAlphaARMNEON;
  1403 #endif
  1404 #if SDL_ARM_SIMD_BLITTERS
  1405                     if (SDL_HasARMSIMD())
  1406                         return BlitRGBtoRGBPixelAlphaARMSIMD;
  1407 #endif
  1408                     return BlitRGBtoRGBPixelAlpha;
  1409                 }
  1410             }
  1411             return BlitNtoNPixelAlpha;
  1412 
  1413         case 3:
  1414         default:
  1415             break;
  1416         }
  1417         return BlitNtoNPixelAlpha;
  1418 
  1419     case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  1420         if (sf->Amask == 0) {
  1421             /* Per-surface alpha blits */
  1422             switch (df->BytesPerPixel) {
  1423             case 1:
  1424                 if (df->palette != NULL) {
  1425                     return BlitNto1SurfaceAlpha;
  1426                 } else {
  1427                     /* RGB332 has no palette ! */
  1428                     return BlitNtoNSurfaceAlpha;
  1429                 }
  1430 
  1431             case 2:
  1432                 if (surface->map->identity) {
  1433                     if (df->Gmask == 0x7e0) {
  1434 #ifdef __MMX__
  1435                         if (SDL_HasMMX())
  1436                             return Blit565to565SurfaceAlphaMMX;
  1437                         else
  1438 #endif
  1439                             return Blit565to565SurfaceAlpha;
  1440                     } else if (df->Gmask == 0x3e0) {
  1441 #ifdef __MMX__
  1442                         if (SDL_HasMMX())
  1443                             return Blit555to555SurfaceAlphaMMX;
  1444                         else
  1445 #endif
  1446                             return Blit555to555SurfaceAlpha;
  1447                     }
  1448                 }
  1449                 return BlitNtoNSurfaceAlpha;
  1450 
  1451             case 4:
  1452                 if (sf->Rmask == df->Rmask
  1453                     && sf->Gmask == df->Gmask
  1454                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  1455 #ifdef __MMX__
  1456                     if (sf->Rshift % 8 == 0
  1457                         && sf->Gshift % 8 == 0
  1458                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  1459                         return BlitRGBtoRGBSurfaceAlphaMMX;
  1460 #endif
  1461                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  1462                         return BlitRGBtoRGBSurfaceAlpha;
  1463                     }
  1464                 }
  1465                 return BlitNtoNSurfaceAlpha;
  1466 
  1467             case 3:
  1468             default:
  1469                 return BlitNtoNSurfaceAlpha;
  1470             }
  1471         }
  1472         break;
  1473 
  1474     case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  1475         if (sf->Amask == 0) {
  1476             if (df->BytesPerPixel == 1) {
  1477 
  1478                 if (df->palette != NULL) {
  1479                     return BlitNto1SurfaceAlphaKey;
  1480                 } else {
  1481                     /* RGB332 has no palette ! */
  1482                     return BlitNtoNSurfaceAlphaKey;
  1483                 }
  1484             } else {
  1485                 return BlitNtoNSurfaceAlphaKey;
  1486             }
  1487         }
  1488         break;
  1489     }
  1490 
  1491     return NULL;
  1492 }
  1493 
  1494 #endif /* SDL_HAVE_BLIT_A */
  1495 
  1496 /* vi: set ts=4 sw=4 expandtab: */