src/video/SDL_blit_A.c
author Sam Lantinga <slouken@libsdl.org>
Mon, 24 Sep 2018 11:49:25 -0700
changeset 12201 8bdc4d340419
parent 11811 5d94cb6b24d3
child 12503 806492103856
permissions -rw-r--r--
Fixed whitespace
     1 /*
     2   Simple DirectMedia Layer
     3   Copyright (C) 1997-2018 Sam Lantinga <slouken@libsdl.org>
     4 
     5   This software is provided 'as-is', without any express or implied
     6   warranty.  In no event will the authors be held liable for any damages
     7   arising from the use of this software.
     8 
     9   Permission is granted to anyone to use this software for any purpose,
    10   including commercial applications, and to alter it and redistribute it
    11   freely, subject to the following restrictions:
    12 
    13   1. The origin of this software must not be misrepresented; you must not
    14      claim that you wrote the original software. If you use this software
    15      in a product, an acknowledgment in the product documentation would be
    16      appreciated but is not required.
    17   2. Altered source versions must be plainly marked as such, and must not be
    18      misrepresented as being the original software.
    19   3. This notice may not be removed or altered from any source distribution.
    20 */
    21 #include "../SDL_internal.h"
    22 
    23 #include "SDL_video.h"
    24 #include "SDL_blit.h"
    25 
    26 /* Functions to perform alpha blended blitting */
    27 
    28 /* N->1 blending with per-surface alpha */
    29 static void
    30 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    31 {
    32     int width = info->dst_w;
    33     int height = info->dst_h;
    34     Uint8 *src = info->src;
    35     int srcskip = info->src_skip;
    36     Uint8 *dst = info->dst;
    37     int dstskip = info->dst_skip;
    38     Uint8 *palmap = info->table;
    39     SDL_PixelFormat *srcfmt = info->src_fmt;
    40     SDL_PixelFormat *dstfmt = info->dst_fmt;
    41     int srcbpp = srcfmt->BytesPerPixel;
    42     Uint32 Pixel;
    43     unsigned sR, sG, sB;
    44     unsigned dR, dG, dB;
    45     const unsigned A = info->a;
    46 
    47     while (height--) {
    48         /* *INDENT-OFF* */
    49         DUFFS_LOOP4(
    50         {
    51         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
    52         dR = dstfmt->palette->colors[*dst].r;
    53         dG = dstfmt->palette->colors[*dst].g;
    54         dB = dstfmt->palette->colors[*dst].b;
    55         ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
    56         dR &= 0xff;
    57         dG &= 0xff;
    58         dB &= 0xff;
    59         /* Pack RGB into 8bit pixel */
    60         if ( palmap == NULL ) {
    61             *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
    62         } else {
    63             *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
    64         }
    65         dst++;
    66         src += srcbpp;
    67         },
    68         width);
    69         /* *INDENT-ON* */
    70         src += srcskip;
    71         dst += dstskip;
    72     }
    73 }
    74 
    75 /* N->1 blending with pixel alpha */
    76 static void
    77 BlitNto1PixelAlpha(SDL_BlitInfo * info)
    78 {
    79     int width = info->dst_w;
    80     int height = info->dst_h;
    81     Uint8 *src = info->src;
    82     int srcskip = info->src_skip;
    83     Uint8 *dst = info->dst;
    84     int dstskip = info->dst_skip;
    85     Uint8 *palmap = info->table;
    86     SDL_PixelFormat *srcfmt = info->src_fmt;
    87     SDL_PixelFormat *dstfmt = info->dst_fmt;
    88     int srcbpp = srcfmt->BytesPerPixel;
    89     Uint32 Pixel;
    90     unsigned sR, sG, sB, sA;
    91     unsigned dR, dG, dB;
    92 
    93     while (height--) {
    94         /* *INDENT-OFF* */
    95         DUFFS_LOOP4(
    96         {
    97         DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
    98         dR = dstfmt->palette->colors[*dst].r;
    99         dG = dstfmt->palette->colors[*dst].g;
   100         dB = dstfmt->palette->colors[*dst].b;
   101         ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
   102         dR &= 0xff;
   103         dG &= 0xff;
   104         dB &= 0xff;
   105         /* Pack RGB into 8bit pixel */
   106         if ( palmap == NULL ) {
   107             *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
   108         } else {
   109             *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
   110         }
   111         dst++;
   112         src += srcbpp;
   113         },
   114         width);
   115         /* *INDENT-ON* */
   116         src += srcskip;
   117         dst += dstskip;
   118     }
   119 }
   120 
   121 /* colorkeyed N->1 blending with per-surface alpha */
   122 static void
   123 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   124 {
   125     int width = info->dst_w;
   126     int height = info->dst_h;
   127     Uint8 *src = info->src;
   128     int srcskip = info->src_skip;
   129     Uint8 *dst = info->dst;
   130     int dstskip = info->dst_skip;
   131     Uint8 *palmap = info->table;
   132     SDL_PixelFormat *srcfmt = info->src_fmt;
   133     SDL_PixelFormat *dstfmt = info->dst_fmt;
   134     int srcbpp = srcfmt->BytesPerPixel;
   135     Uint32 ckey = info->colorkey;
   136     Uint32 Pixel;
   137     unsigned sR, sG, sB;
   138     unsigned dR, dG, dB;
   139     const unsigned A = info->a;
   140 
   141     while (height--) {
   142         /* *INDENT-OFF* */
   143         DUFFS_LOOP(
   144         {
   145         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
   146         if ( Pixel != ckey ) {
   147             dR = dstfmt->palette->colors[*dst].r;
   148             dG = dstfmt->palette->colors[*dst].g;
   149             dB = dstfmt->palette->colors[*dst].b;
   150             ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
   151             dR &= 0xff;
   152             dG &= 0xff;
   153             dB &= 0xff;
   154             /* Pack RGB into 8bit pixel */
   155             if ( palmap == NULL ) {
   156                 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
   157             } else {
   158                 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
   159             }
   160         }
   161         dst++;
   162         src += srcbpp;
   163         },
   164         width);
   165         /* *INDENT-ON* */
   166         src += srcskip;
   167         dst += dstskip;
   168     }
   169 }
   170 
   171 #ifdef __MMX__
   172 
   173 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   174 static void
   175 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   176 {
   177     int width = info->dst_w;
   178     int height = info->dst_h;
   179     Uint32 *srcp = (Uint32 *) info->src;
   180     int srcskip = info->src_skip >> 2;
   181     Uint32 *dstp = (Uint32 *) info->dst;
   182     int dstskip = info->dst_skip >> 2;
   183     Uint32 dalpha = info->dst_fmt->Amask;
   184 
   185     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   186 
   187     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   188     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   189     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   190 
   191     while (height--) {
   192         int n = width;
   193         if (n & 1) {
   194             Uint32 s = *srcp++;
   195             Uint32 d = *dstp;
   196             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   197                        + (s & d & 0x00010101)) | dalpha;
   198             n--;
   199         }
   200 
   201         for (n >>= 1; n > 0; --n) {
   202             dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
   203             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   204 
   205             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   206             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   207 
   208             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   209             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   210             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   211             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   212 
   213             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   214             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   215             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   216             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   217 
   218             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   219             dstp += 2;
   220             srcp += 2;
   221         }
   222 
   223         srcp += srcskip;
   224         dstp += dstskip;
   225     }
   226     _mm_empty();
   227 }
   228 
   229 /* fast RGB888->(A)RGB888 blending with surface alpha */
   230 static void
   231 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   232 {
   233     SDL_PixelFormat *df = info->dst_fmt;
   234     Uint32 chanmask;
   235     unsigned alpha = info->a;
   236 
   237     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   238         /* only call a128 version when R,G,B occupy lower bits */
   239         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   240     } else {
   241         int width = info->dst_w;
   242         int height = info->dst_h;
   243         Uint32 *srcp = (Uint32 *) info->src;
   244         int srcskip = info->src_skip >> 2;
   245         Uint32 *dstp = (Uint32 *) info->dst;
   246         int dstskip = info->dst_skip >> 2;
   247         Uint32 dalpha = df->Amask;
   248         Uint32 amult;
   249 
   250         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   251 
   252         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   253         /* form the alpha mult */
   254         amult = alpha | (alpha << 8);
   255         amult = amult | (amult << 16);
   256         chanmask =
   257             (0xff << df->Rshift) | (0xff << df->
   258                                     Gshift) | (0xff << df->Bshift);
   259         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   260         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   261         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   262         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   263 
   264         while (height--) {
   265             int n = width;
   266             if (n & 1) {
   267                 /* One Pixel Blend */
   268                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   269                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   270 
   271                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   272                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   273 
   274                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
   275                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   276                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   277                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   278 
   279                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   280                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   281                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   282 
   283                 ++srcp;
   284                 ++dstp;
   285 
   286                 n--;
   287             }
   288 
   289             for (n >>= 1; n > 0; --n) {
   290                 /* Two Pixels Blend */
   291                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   292                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   293                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   294                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   295 
   296                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   297                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   298                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   299                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   300 
   301                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   302                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   303                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   304                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   305 
   306                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   307                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   308                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   309                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   310 
   311                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   312                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   313 
   314                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   315 
   316                 srcp += 2;
   317                 dstp += 2;
   318             }
   319             srcp += srcskip;
   320             dstp += dstskip;
   321         }
   322         _mm_empty();
   323     }
   324 }
   325 
   326 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   327 static void
   328 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   329 {
   330     int width = info->dst_w;
   331     int height = info->dst_h;
   332     Uint32 *srcp = (Uint32 *) info->src;
   333     int srcskip = info->src_skip >> 2;
   334     Uint32 *dstp = (Uint32 *) info->dst;
   335     int dstskip = info->dst_skip >> 2;
   336     SDL_PixelFormat *sf = info->src_fmt;
   337     Uint32 amask = sf->Amask;
   338     Uint32 ashift = sf->Ashift;
   339     Uint64 multmask, multmask2;
   340 
   341     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
   342 
   343     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   344     multmask = 0x00FF;
   345     multmask <<= (ashift * 2);
   346     multmask2 = 0x00FF00FF00FF00FFULL;
   347 
   348     while (height--) {
   349         /* *INDENT-OFF* */
   350         DUFFS_LOOP4({
   351         Uint32 alpha = *srcp & amask;
   352         if (alpha == 0) {
   353             /* do nothing */
   354         } else if (alpha == amask) {
   355             *dstp = *srcp;
   356         } else {
   357             src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
   358             src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   359 
   360             dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   361             dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   362 
   363             mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   364             mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   365             mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   366             mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
   367             mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha */
   368             mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
   369 
   370             /* blend */            
   371             src1 = _mm_mullo_pi16(src1, mm_alpha);
   372             src1 = _mm_srli_pi16(src1, 8);
   373             dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
   374             dst1 = _mm_srli_pi16(dst1, 8);
   375             dst1 = _mm_add_pi16(src1, dst1);
   376             dst1 = _mm_packs_pu16(dst1, mm_zero);
   377             
   378             *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   379         }
   380         ++srcp;
   381         ++dstp;
   382         }, width);
   383         /* *INDENT-ON* */
   384         srcp += srcskip;
   385         dstp += dstskip;
   386     }
   387     _mm_empty();
   388 }
   389 
   390 #endif /* __MMX__ */
   391 
   392 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   393 static void
   394 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
   395 {
   396     int width = info->dst_w;
   397     int height = info->dst_h;
   398     Uint32 *srcp = (Uint32 *) info->src;
   399     int srcskip = info->src_skip >> 2;
   400     Uint32 *dstp = (Uint32 *) info->dst;
   401     int dstskip = info->dst_skip >> 2;
   402 
   403     while (height--) {
   404         /* *INDENT-OFF* */
   405         DUFFS_LOOP4({
   406             Uint32 s = *srcp++;
   407             Uint32 d = *dstp;
   408             *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
   409                    + (s & d & 0x00010101)) | 0xff000000;
   410         }, width);
   411         /* *INDENT-ON* */
   412         srcp += srcskip;
   413         dstp += dstskip;
   414     }
   415 }
   416 
   417 /* fast RGB888->(A)RGB888 blending with surface alpha */
   418 static void
   419 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
   420 {
   421     unsigned alpha = info->a;
   422     if (alpha == 128) {
   423         BlitRGBtoRGBSurfaceAlpha128(info);
   424     } else {
   425         int width = info->dst_w;
   426         int height = info->dst_h;
   427         Uint32 *srcp = (Uint32 *) info->src;
   428         int srcskip = info->src_skip >> 2;
   429         Uint32 *dstp = (Uint32 *) info->dst;
   430         int dstskip = info->dst_skip >> 2;
   431         Uint32 s;
   432         Uint32 d;
   433         Uint32 s1;
   434         Uint32 d1;
   435 
   436         while (height--) {
   437             /* *INDENT-OFF* */
   438             DUFFS_LOOP4({
   439                 s = *srcp;
   440                 d = *dstp;
   441                 s1 = s & 0xff00ff;
   442                 d1 = d & 0xff00ff;
   443                 d1 = (d1 + ((s1 - d1) * alpha >> 8))
   444                      & 0xff00ff;
   445                 s &= 0xff00;
   446                 d &= 0xff00;
   447                 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   448                 *dstp = d1 | d | 0xff000000;
   449                 ++srcp;
   450                 ++dstp;
   451             }, width);
   452             /* *INDENT-ON* */
   453             srcp += srcskip;
   454             dstp += dstskip;
   455         }
   456     }
   457 }
   458 
   459 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   460 static void
   461 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
   462 {
   463     int width = info->dst_w;
   464     int height = info->dst_h;
   465     Uint32 *srcp = (Uint32 *) info->src;
   466     int srcskip = info->src_skip >> 2;
   467     Uint32 *dstp = (Uint32 *) info->dst;
   468     int dstskip = info->dst_skip >> 2;
   469 
   470     while (height--) {
   471         /* *INDENT-OFF* */
   472         DUFFS_LOOP4({
   473         Uint32 dalpha;
   474         Uint32 d;
   475         Uint32 s1;
   476         Uint32 d1;
   477         Uint32 s = *srcp;
   478         Uint32 alpha = s >> 24;
   479         /* FIXME: Here we special-case opaque alpha since the
   480            compositioning used (>>8 instead of /255) doesn't handle
   481            it correctly. Also special-case alpha=0 for speed?
   482            Benchmark this! */
   483         if (alpha) {
   484           if (alpha == SDL_ALPHA_OPAQUE) {
   485               *dstp = *srcp;
   486           } else {
   487             /*
   488              * take out the middle component (green), and process
   489              * the other two in parallel. One multiply less.
   490              */
   491             d = *dstp;
   492             dalpha = d >> 24;
   493             s1 = s & 0xff00ff;
   494             d1 = d & 0xff00ff;
   495             d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
   496             s &= 0xff00;
   497             d &= 0xff00;
   498             d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   499             dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
   500             *dstp = d1 | d | (dalpha << 24);
   501           }
   502         }
   503         ++srcp;
   504         ++dstp;
   505         }, width);
   506         /* *INDENT-ON* */
   507         srcp += srcskip;
   508         dstp += dstskip;
   509     }
   510 }
   511 
   512 #ifdef __3dNOW__
   513 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
   514 static void
   515 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
   516 {
   517     int width = info->dst_w;
   518     int height = info->dst_h;
   519     Uint32 *srcp = (Uint32 *) info->src;
   520     int srcskip = info->src_skip >> 2;
   521     Uint32 *dstp = (Uint32 *) info->dst;
   522     int dstskip = info->dst_skip >> 2;
   523     SDL_PixelFormat *sf = info->src_fmt;
   524     Uint32 amask = sf->Amask;
   525     Uint32 ashift = sf->Ashift;
   526     Uint64 multmask, multmask2;
   527 
   528     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
   529 
   530     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   531     multmask = 0x00FF;
   532     multmask <<= (ashift * 2);
   533     multmask2 = 0x00FF00FF00FF00FFULL;
   534 
   535     while (height--) {
   536         /* *INDENT-OFF* */
   537         DUFFS_LOOP4({
   538         Uint32 alpha;
   539 
   540         _m_prefetch(srcp + 16);
   541         _m_prefetch(dstp + 16);
   542 
   543         alpha = *srcp & amask;
   544         if (alpha == 0) {
   545             /* do nothing */
   546         } else if (alpha == amask) {
   547             *dstp = *srcp;
   548         } else {
   549             src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
   550             src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   551 
   552             dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   553             dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   554 
   555             mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   556             mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   557             mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   558             mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
   559             mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha */
   560             mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
   561 
   562 
   563             /* blend */            
   564             src1 = _mm_mullo_pi16(src1, mm_alpha);
   565             src1 = _mm_srli_pi16(src1, 8);
   566             dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
   567             dst1 = _mm_srli_pi16(dst1, 8);
   568             dst1 = _mm_add_pi16(src1, dst1);
   569             dst1 = _mm_packs_pu16(dst1, mm_zero);
   570             
   571             *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   572         }
   573         ++srcp;
   574         ++dstp;
   575         }, width);
   576         /* *INDENT-ON* */
   577         srcp += srcskip;
   578         dstp += dstskip;
   579     }
   580     _mm_empty();
   581 }
   582 
   583 #endif /* __3dNOW__ */
   584 
   585 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
   586 
   587 /* blend a single 16 bit pixel at 50% */
   588 #define BLEND16_50(d, s, mask)                        \
   589     ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff)))
   590 
   591 /* blend two 16 bit pixels at 50% */
   592 #define BLEND2x16_50(d, s, mask)                         \
   593     (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
   594      + (s & d & (~(mask | mask << 16))))
   595 
   596 static void
   597 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
   598 {
   599     int width = info->dst_w;
   600     int height = info->dst_h;
   601     Uint16 *srcp = (Uint16 *) info->src;
   602     int srcskip = info->src_skip >> 1;
   603     Uint16 *dstp = (Uint16 *) info->dst;
   604     int dstskip = info->dst_skip >> 1;
   605 
   606     while (height--) {
   607         if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
   608             /*
   609              * Source and destination not aligned, pipeline it.
   610              * This is mostly a win for big blits but no loss for
   611              * small ones
   612              */
   613             Uint32 prev_sw;
   614             int w = width;
   615 
   616             /* handle odd destination */
   617             if ((uintptr_t) dstp & 2) {
   618                 Uint16 d = *dstp, s = *srcp;
   619                 *dstp = BLEND16_50(d, s, mask);
   620                 dstp++;
   621                 srcp++;
   622                 w--;
   623             }
   624             srcp++;             /* srcp is now 32-bit aligned */
   625 
   626             /* bootstrap pipeline with first halfword */
   627             prev_sw = ((Uint32 *) srcp)[-1];
   628 
   629             while (w > 1) {
   630                 Uint32 sw, dw, s;
   631                 sw = *(Uint32 *) srcp;
   632                 dw = *(Uint32 *) dstp;
   633 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   634                 s = (prev_sw << 16) + (sw >> 16);
   635 #else
   636                 s = (prev_sw >> 16) + (sw << 16);
   637 #endif
   638                 prev_sw = sw;
   639                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
   640                 dstp += 2;
   641                 srcp += 2;
   642                 w -= 2;
   643             }
   644 
   645             /* final pixel if any */
   646             if (w) {
   647                 Uint16 d = *dstp, s;
   648 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
   649                 s = (Uint16) prev_sw;
   650 #else
   651                 s = (Uint16) (prev_sw >> 16);
   652 #endif
   653                 *dstp = BLEND16_50(d, s, mask);
   654                 srcp++;
   655                 dstp++;
   656             }
   657             srcp += srcskip - 1;
   658             dstp += dstskip;
   659         } else {
   660             /* source and destination are aligned */
   661             int w = width;
   662 
   663             /* first odd pixel? */
   664             if ((uintptr_t) srcp & 2) {
   665                 Uint16 d = *dstp, s = *srcp;
   666                 *dstp = BLEND16_50(d, s, mask);
   667                 srcp++;
   668                 dstp++;
   669                 w--;
   670             }
   671             /* srcp and dstp are now 32-bit aligned */
   672 
   673             while (w > 1) {
   674                 Uint32 sw = *(Uint32 *) srcp;
   675                 Uint32 dw = *(Uint32 *) dstp;
   676                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
   677                 srcp += 2;
   678                 dstp += 2;
   679                 w -= 2;
   680             }
   681 
   682             /* last odd pixel? */
   683             if (w) {
   684                 Uint16 d = *dstp, s = *srcp;
   685                 *dstp = BLEND16_50(d, s, mask);
   686                 srcp++;
   687                 dstp++;
   688             }
   689             srcp += srcskip;
   690             dstp += dstskip;
   691         }
   692     }
   693 }
   694 
   695 #ifdef __MMX__
   696 
   697 /* fast RGB565->RGB565 blending with surface alpha */
   698 static void
   699 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
   700 {
   701     unsigned alpha = info->a;
   702     if (alpha == 128) {
   703         Blit16to16SurfaceAlpha128(info, 0xf7de);
   704     } else {
   705         int width = info->dst_w;
   706         int height = info->dst_h;
   707         Uint16 *srcp = (Uint16 *) info->src;
   708         int srcskip = info->src_skip >> 1;
   709         Uint16 *dstp = (Uint16 *) info->dst;
   710         int dstskip = info->dst_skip >> 1;
   711         Uint32 s, d;
   712 
   713         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
   714 
   715         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   716         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
   717         alpha >>= 3;            /* downscale alpha to 5 bits */
   718 
   719         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
   720         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
   721         /* position alpha to allow for mullo and mulhi on diff channels
   722            to reduce the number of operations */
   723         mm_alpha = _mm_slli_si64(mm_alpha, 3);
   724 
   725         /* Setup the 565 color channel masks */
   726         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
   727         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
   728 
   729         while (height--) {
   730             /* *INDENT-OFF* */
   731             DUFFS_LOOP_124(
   732             {
   733                 s = *srcp++;
   734                 d = *dstp;
   735                 /*
   736                  * shift out the middle component (green) to
   737                  * the high 16 bits, and process all three RGB
   738                  * components at the same time.
   739                  */
   740                 s = (s | s << 16) & 0x07e0f81f;
   741                 d = (d | d << 16) & 0x07e0f81f;
   742                 d += (s - d) * alpha >> 5;
   743                 d &= 0x07e0f81f;
   744                 *dstp++ = (Uint16)(d | d >> 16);
   745             },{
   746                 s = *srcp++;
   747                 d = *dstp;
   748                 /*
   749                  * shift out the middle component (green) to
   750                  * the high 16 bits, and process all three RGB
   751                  * components at the same time.
   752                  */
   753                 s = (s | s << 16) & 0x07e0f81f;
   754                 d = (d | d << 16) & 0x07e0f81f;
   755                 d += (s - d) * alpha >> 5;
   756                 d &= 0x07e0f81f;
   757                 *dstp++ = (Uint16)(d | d >> 16);
   758                 s = *srcp++;
   759                 d = *dstp;
   760                 /*
   761                  * shift out the middle component (green) to
   762                  * the high 16 bits, and process all three RGB
   763                  * components at the same time.
   764                  */
   765                 s = (s | s << 16) & 0x07e0f81f;
   766                 d = (d | d << 16) & 0x07e0f81f;
   767                 d += (s - d) * alpha >> 5;
   768                 d &= 0x07e0f81f;
   769                 *dstp++ = (Uint16)(d | d >> 16);
   770             },{
   771                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   772                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   773 
   774                 /* red */
   775                 src2 = src1;
   776                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
   777 
   778                 dst2 = dst1;
   779                 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
   780 
   781                 /* blend */
   782                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   783                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   784                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   785                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   786                 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
   787 
   788                 mm_res = dst2; /* RED -> mm_res */
   789 
   790                 /* green -- process the bits in place */
   791                 src2 = src1;
   792                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   793 
   794                 dst2 = dst1;
   795                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   796 
   797                 /* blend */
   798                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   799                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   800                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   801                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   802 
   803                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
   804 
   805                 /* blue */
   806                 src2 = src1;
   807                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
   808 
   809                 dst2 = dst1;
   810                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
   811 
   812                 /* blend */
   813                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   814                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   815                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   816                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   817                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
   818 
   819                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
   820 
   821                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
   822 
   823                 srcp += 4;
   824                 dstp += 4;
   825             }, width);
   826             /* *INDENT-ON* */
   827             srcp += srcskip;
   828             dstp += dstskip;
   829         }
   830         _mm_empty();
   831     }
   832 }
   833 
   834 /* fast RGB555->RGB555 blending with surface alpha */
   835 static void
   836 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
   837 {
   838     unsigned alpha = info->a;
   839     if (alpha == 128) {
   840         Blit16to16SurfaceAlpha128(info, 0xfbde);
   841     } else {
   842         int width = info->dst_w;
   843         int height = info->dst_h;
   844         Uint16 *srcp = (Uint16 *) info->src;
   845         int srcskip = info->src_skip >> 1;
   846         Uint16 *dstp = (Uint16 *) info->dst;
   847         int dstskip = info->dst_skip >> 1;
   848         Uint32 s, d;
   849 
   850         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
   851 
   852         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
   853         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
   854         alpha >>= 3;            /* downscale alpha to 5 bits */
   855 
   856         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
   857         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
   858         /* position alpha to allow for mullo and mulhi on diff channels
   859            to reduce the number of operations */
   860         mm_alpha = _mm_slli_si64(mm_alpha, 3);
   861 
   862         /* Setup the 555 color channel masks */
   863         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
   864         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
   865         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
   866 
   867         while (height--) {
   868             /* *INDENT-OFF* */
   869             DUFFS_LOOP_124(
   870             {
   871                 s = *srcp++;
   872                 d = *dstp;
   873                 /*
   874                  * shift out the middle component (green) to
   875                  * the high 16 bits, and process all three RGB
   876                  * components at the same time.
   877                  */
   878                 s = (s | s << 16) & 0x03e07c1f;
   879                 d = (d | d << 16) & 0x03e07c1f;
   880                 d += (s - d) * alpha >> 5;
   881                 d &= 0x03e07c1f;
   882                 *dstp++ = (Uint16)(d | d >> 16);
   883             },{
   884                 s = *srcp++;
   885                 d = *dstp;
   886                 /*
   887                  * shift out the middle component (green) to
   888                  * the high 16 bits, and process all three RGB
   889                  * components at the same time.
   890                  */
   891                 s = (s | s << 16) & 0x03e07c1f;
   892                 d = (d | d << 16) & 0x03e07c1f;
   893                 d += (s - d) * alpha >> 5;
   894                 d &= 0x03e07c1f;
   895                 *dstp++ = (Uint16)(d | d >> 16);
   896                     s = *srcp++;
   897                 d = *dstp;
   898                 /*
   899                  * shift out the middle component (green) to
   900                  * the high 16 bits, and process all three RGB
   901                  * components at the same time.
   902                  */
   903                 s = (s | s << 16) & 0x03e07c1f;
   904                 d = (d | d << 16) & 0x03e07c1f;
   905                 d += (s - d) * alpha >> 5;
   906                 d &= 0x03e07c1f;
   907                 *dstp++ = (Uint16)(d | d >> 16);
   908             },{
   909                 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
   910                 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
   911 
   912                 /* red -- process the bits in place */
   913                 src2 = src1;
   914                 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
   915 
   916                 dst2 = dst1;
   917                 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
   918 
   919                 /* blend */
   920                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   921                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   922                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   923                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   924                 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
   925 
   926                 mm_res = dst2; /* RED -> mm_res */
   927                 
   928                 /* green -- process the bits in place */
   929                 src2 = src1;
   930                 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
   931 
   932                 dst2 = dst1;
   933                 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
   934 
   935                 /* blend */
   936                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   937                 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   938                 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
   939                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   940 
   941                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
   942 
   943                 /* blue */
   944                 src2 = src1; /* src -> src2 */
   945                 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
   946 
   947                 dst2 = dst1; /* dst -> dst2 */
   948                 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
   949 
   950                 /* blend */
   951                 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
   952                 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
   953                 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
   954                 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
   955                 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
   956 
   957                 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
   958 
   959                 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
   960 
   961                 srcp += 4;
   962                 dstp += 4;
   963             }, width);
   964             /* *INDENT-ON* */
   965             srcp += srcskip;
   966             dstp += dstskip;
   967         }
   968         _mm_empty();
   969     }
   970 }
   971 
   972 #endif /* __MMX__ */
   973 
   974 /* fast RGB565->RGB565 blending with surface alpha */
   975 static void
   976 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
   977 {
   978     unsigned alpha = info->a;
   979     if (alpha == 128) {
   980         Blit16to16SurfaceAlpha128(info, 0xf7de);
   981     } else {
   982         int width = info->dst_w;
   983         int height = info->dst_h;
   984         Uint16 *srcp = (Uint16 *) info->src;
   985         int srcskip = info->src_skip >> 1;
   986         Uint16 *dstp = (Uint16 *) info->dst;
   987         int dstskip = info->dst_skip >> 1;
   988         alpha >>= 3;            /* downscale alpha to 5 bits */
   989 
   990         while (height--) {
   991             /* *INDENT-OFF* */
   992             DUFFS_LOOP4({
   993                 Uint32 s = *srcp++;
   994                 Uint32 d = *dstp;
   995                 /*
   996                  * shift out the middle component (green) to
   997                  * the high 16 bits, and process all three RGB
   998                  * components at the same time.
   999                  */
  1000                 s = (s | s << 16) & 0x07e0f81f;
  1001                 d = (d | d << 16) & 0x07e0f81f;
  1002                 d += (s - d) * alpha >> 5;
  1003                 d &= 0x07e0f81f;
  1004                 *dstp++ = (Uint16)(d | d >> 16);
  1005             }, width);
  1006             /* *INDENT-ON* */
  1007             srcp += srcskip;
  1008             dstp += dstskip;
  1009         }
  1010     }
  1011 }
  1012 
  1013 /* fast RGB555->RGB555 blending with surface alpha */
  1014 static void
  1015 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  1016 {
  1017     unsigned alpha = info->a;   /* downscale alpha to 5 bits */
  1018     if (alpha == 128) {
  1019         Blit16to16SurfaceAlpha128(info, 0xfbde);
  1020     } else {
  1021         int width = info->dst_w;
  1022         int height = info->dst_h;
  1023         Uint16 *srcp = (Uint16 *) info->src;
  1024         int srcskip = info->src_skip >> 1;
  1025         Uint16 *dstp = (Uint16 *) info->dst;
  1026         int dstskip = info->dst_skip >> 1;
  1027         alpha >>= 3;            /* downscale alpha to 5 bits */
  1028 
  1029         while (height--) {
  1030             /* *INDENT-OFF* */
  1031             DUFFS_LOOP4({
  1032                 Uint32 s = *srcp++;
  1033                 Uint32 d = *dstp;
  1034                 /*
  1035                  * shift out the middle component (green) to
  1036                  * the high 16 bits, and process all three RGB
  1037                  * components at the same time.
  1038                  */
  1039                 s = (s | s << 16) & 0x03e07c1f;
  1040                 d = (d | d << 16) & 0x03e07c1f;
  1041                 d += (s - d) * alpha >> 5;
  1042                 d &= 0x03e07c1f;
  1043                 *dstp++ = (Uint16)(d | d >> 16);
  1044             }, width);
  1045             /* *INDENT-ON* */
  1046             srcp += srcskip;
  1047             dstp += dstskip;
  1048         }
  1049     }
  1050 }
  1051 
  1052 /* fast ARGB8888->RGB565 blending with pixel alpha */
  1053 static void
  1054 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  1055 {
  1056     int width = info->dst_w;
  1057     int height = info->dst_h;
  1058     Uint32 *srcp = (Uint32 *) info->src;
  1059     int srcskip = info->src_skip >> 2;
  1060     Uint16 *dstp = (Uint16 *) info->dst;
  1061     int dstskip = info->dst_skip >> 1;
  1062 
  1063     while (height--) {
  1064         /* *INDENT-OFF* */
  1065         DUFFS_LOOP4({
  1066         Uint32 s = *srcp;
  1067         unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
  1068         /* FIXME: Here we special-case opaque alpha since the
  1069            compositioning used (>>8 instead of /255) doesn't handle
  1070            it correctly. Also special-case alpha=0 for speed?
  1071            Benchmark this! */
  1072         if(alpha) {   
  1073           if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1074             *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
  1075           } else {
  1076             Uint32 d = *dstp;
  1077             /*
  1078              * convert source and destination to G0RAB65565
  1079              * and blend all components at the same time
  1080              */
  1081             s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
  1082               + (s >> 3 & 0x1f);
  1083             d = (d | d << 16) & 0x07e0f81f;
  1084             d += (s - d) * alpha >> 5;
  1085             d &= 0x07e0f81f;
  1086             *dstp = (Uint16)(d | d >> 16);
  1087           }
  1088         }
  1089         srcp++;
  1090         dstp++;
  1091         }, width);
  1092         /* *INDENT-ON* */
  1093         srcp += srcskip;
  1094         dstp += dstskip;
  1095     }
  1096 }
  1097 
  1098 /* fast ARGB8888->RGB555 blending with pixel alpha */
  1099 static void
  1100 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  1101 {
  1102     int width = info->dst_w;
  1103     int height = info->dst_h;
  1104     Uint32 *srcp = (Uint32 *) info->src;
  1105     int srcskip = info->src_skip >> 2;
  1106     Uint16 *dstp = (Uint16 *) info->dst;
  1107     int dstskip = info->dst_skip >> 1;
  1108 
  1109     while (height--) {
  1110         /* *INDENT-OFF* */
  1111         DUFFS_LOOP4({
  1112         unsigned alpha;
  1113         Uint32 s = *srcp;
  1114         alpha = s >> 27; /* downscale alpha to 5 bits */
  1115         /* FIXME: Here we special-case opaque alpha since the
  1116            compositioning used (>>8 instead of /255) doesn't handle
  1117            it correctly. Also special-case alpha=0 for speed?
  1118            Benchmark this! */
  1119         if(alpha) {   
  1120           if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
  1121             *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
  1122           } else {
  1123             Uint32 d = *dstp;
  1124             /*
  1125              * convert source and destination to G0RAB65565
  1126              * and blend all components at the same time
  1127              */
  1128             s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
  1129               + (s >> 3 & 0x1f);
  1130             d = (d | d << 16) & 0x03e07c1f;
  1131             d += (s - d) * alpha >> 5;
  1132             d &= 0x03e07c1f;
  1133             *dstp = (Uint16)(d | d >> 16);
  1134           }
  1135         }
  1136         srcp++;
  1137         dstp++;
  1138         }, width);
  1139         /* *INDENT-ON* */
  1140         srcp += srcskip;
  1141         dstp += dstskip;
  1142     }
  1143 }
  1144 
  1145 /* General (slow) N->N blending with per-surface alpha */
  1146 static void
  1147 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  1148 {
  1149     int width = info->dst_w;
  1150     int height = info->dst_h;
  1151     Uint8 *src = info->src;
  1152     int srcskip = info->src_skip;
  1153     Uint8 *dst = info->dst;
  1154     int dstskip = info->dst_skip;
  1155     SDL_PixelFormat *srcfmt = info->src_fmt;
  1156     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1157     int srcbpp = srcfmt->BytesPerPixel;
  1158     int dstbpp = dstfmt->BytesPerPixel;
  1159     Uint32 Pixel;
  1160     unsigned sR, sG, sB;
  1161     unsigned dR, dG, dB, dA;
  1162     const unsigned sA = info->a;
  1163 
  1164     if (sA) {
  1165         while (height--) {
  1166         /* *INDENT-OFF* */
  1167         DUFFS_LOOP4(
  1168         {
  1169         DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
  1170         DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1171         ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1172         ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1173         src += srcbpp;
  1174         dst += dstbpp;
  1175         },
  1176         width);
  1177         /* *INDENT-ON* */
  1178             src += srcskip;
  1179             dst += dstskip;
  1180         }
  1181     }
  1182 }
  1183 
  1184 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  1185 static void
  1186 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  1187 {
  1188     int width = info->dst_w;
  1189     int height = info->dst_h;
  1190     Uint8 *src = info->src;
  1191     int srcskip = info->src_skip;
  1192     Uint8 *dst = info->dst;
  1193     int dstskip = info->dst_skip;
  1194     SDL_PixelFormat *srcfmt = info->src_fmt;
  1195     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1196     Uint32 ckey = info->colorkey;
  1197     int srcbpp = srcfmt->BytesPerPixel;
  1198     int dstbpp = dstfmt->BytesPerPixel;
  1199     Uint32 Pixel;
  1200     unsigned sR, sG, sB;
  1201     unsigned dR, dG, dB, dA;
  1202     const unsigned sA = info->a;
  1203 
  1204     while (height--) {
  1205         /* *INDENT-OFF* */
  1206         DUFFS_LOOP4(
  1207         {
  1208         RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
  1209         if(sA && Pixel != ckey) {
  1210             RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
  1211             DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1212             ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1213             ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1214         }
  1215         src += srcbpp;
  1216         dst += dstbpp;
  1217         },
  1218         width);
  1219         /* *INDENT-ON* */
  1220         src += srcskip;
  1221         dst += dstskip;
  1222     }
  1223 }
  1224 
  1225 /* General (slow) N->N blending with pixel alpha */
  1226 static void
  1227 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  1228 {
  1229     int width = info->dst_w;
  1230     int height = info->dst_h;
  1231     Uint8 *src = info->src;
  1232     int srcskip = info->src_skip;
  1233     Uint8 *dst = info->dst;
  1234     int dstskip = info->dst_skip;
  1235     SDL_PixelFormat *srcfmt = info->src_fmt;
  1236     SDL_PixelFormat *dstfmt = info->dst_fmt;
  1237     int srcbpp;
  1238     int dstbpp;
  1239     Uint32 Pixel;
  1240     unsigned sR, sG, sB, sA;
  1241     unsigned dR, dG, dB, dA;
  1242 
  1243     /* Set up some basic variables */
  1244     srcbpp = srcfmt->BytesPerPixel;
  1245     dstbpp = dstfmt->BytesPerPixel;
  1246 
  1247     while (height--) {
  1248         /* *INDENT-OFF* */
  1249         DUFFS_LOOP4(
  1250         {
  1251         DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
  1252         if(sA) {
  1253             DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
  1254             ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
  1255             ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
  1256         }
  1257         src += srcbpp;
  1258         dst += dstbpp;
  1259         },
  1260         width);
  1261         /* *INDENT-ON* */
  1262         src += srcskip;
  1263         dst += dstskip;
  1264     }
  1265 }
  1266 
  1267 
  1268 SDL_BlitFunc
  1269 SDL_CalculateBlitA(SDL_Surface * surface)
  1270 {
  1271     SDL_PixelFormat *sf = surface->format;
  1272     SDL_PixelFormat *df = surface->map->dst->format;
  1273 
  1274     switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
  1275     case SDL_COPY_BLEND:
  1276         /* Per-pixel alpha blits */
  1277         switch (df->BytesPerPixel) {
  1278         case 1:
  1279             return BlitNto1PixelAlpha;
  1280 
  1281         case 2:
  1282                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  1283                     && sf->Gmask == 0xff00
  1284                     && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
  1285                         || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
  1286                 if (df->Gmask == 0x7e0)
  1287                     return BlitARGBto565PixelAlpha;
  1288                 else if (df->Gmask == 0x3e0)
  1289                     return BlitARGBto555PixelAlpha;
  1290             }
  1291             return BlitNtoNPixelAlpha;
  1292 
  1293         case 4:
  1294             if (sf->Rmask == df->Rmask
  1295                 && sf->Gmask == df->Gmask
  1296                 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  1297 #if defined(__MMX__) || defined(__3dNOW__)
  1298                 if (sf->Rshift % 8 == 0
  1299                     && sf->Gshift % 8 == 0
  1300                     && sf->Bshift % 8 == 0
  1301                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  1302 #ifdef __3dNOW__
  1303                     if (SDL_Has3DNow())
  1304                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  1305 #endif
  1306 #ifdef __MMX__
  1307                     if (SDL_HasMMX())
  1308                         return BlitRGBtoRGBPixelAlphaMMX;
  1309 #endif
  1310                 }
  1311 #endif /* __MMX__ || __3dNOW__ */
  1312                 if (sf->Amask == 0xff000000) {
  1313                     return BlitRGBtoRGBPixelAlpha;
  1314                 }
  1315             }
  1316             return BlitNtoNPixelAlpha;
  1317 
  1318         case 3:
  1319         default:
  1320             break;
  1321         }
  1322         return BlitNtoNPixelAlpha;
  1323 
  1324     case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  1325         if (sf->Amask == 0) {
  1326             /* Per-surface alpha blits */
  1327             switch (df->BytesPerPixel) {
  1328             case 1:
  1329                 return BlitNto1SurfaceAlpha;
  1330 
  1331             case 2:
  1332                 if (surface->map->identity) {
  1333                     if (df->Gmask == 0x7e0) {
  1334 #ifdef __MMX__
  1335                         if (SDL_HasMMX())
  1336                             return Blit565to565SurfaceAlphaMMX;
  1337                         else
  1338 #endif
  1339                             return Blit565to565SurfaceAlpha;
  1340                     } else if (df->Gmask == 0x3e0) {
  1341 #ifdef __MMX__
  1342                         if (SDL_HasMMX())
  1343                             return Blit555to555SurfaceAlphaMMX;
  1344                         else
  1345 #endif
  1346                             return Blit555to555SurfaceAlpha;
  1347                     }
  1348                 }
  1349                 return BlitNtoNSurfaceAlpha;
  1350 
  1351             case 4:
  1352                 if (sf->Rmask == df->Rmask
  1353                     && sf->Gmask == df->Gmask
  1354                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  1355 #ifdef __MMX__
  1356                     if (sf->Rshift % 8 == 0
  1357                         && sf->Gshift % 8 == 0
  1358                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  1359                         return BlitRGBtoRGBSurfaceAlphaMMX;
  1360 #endif
  1361                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  1362                         return BlitRGBtoRGBSurfaceAlpha;
  1363                     }
  1364                 }
  1365                 return BlitNtoNSurfaceAlpha;
  1366 
  1367             case 3:
  1368             default:
  1369                 return BlitNtoNSurfaceAlpha;
  1370             }
  1371         }
  1372         break;
  1373 
  1374     case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
  1375         if (sf->Amask == 0) {
  1376             if (df->BytesPerPixel == 1) {
  1377                 return BlitNto1SurfaceAlphaKey;
  1378             } else {
  1379                 return BlitNtoNSurfaceAlphaKey;
  1380             }
  1381         }
  1382         break;
  1383     }
  1384 
  1385     return NULL;
  1386 }
  1387 
  1388 /* vi: set ts=4 sw=4 expandtab: */