src/video/SDL_blit_A.c
author Sylvain Becker <sylvain.becker@gmail.com>
Wed, 30 Jan 2019 17:16:08 +0100
changeset 12585 dff36de37426
parent 12503 806492103856
permissions -rw-r--r--
Fix blit with blending (Blit_A) to RGB332 which has no palette
slouken@0
     1
/*
slouken@5535
     2
  Simple DirectMedia Layer
slouken@12503
     3
  Copyright (C) 1997-2019 Sam Lantinga <slouken@libsdl.org>
slouken@0
     4
slouken@5535
     5
  This software is provided 'as-is', without any express or implied
slouken@5535
     6
  warranty.  In no event will the authors be held liable for any damages
slouken@5535
     7
  arising from the use of this software.
slouken@0
     8
slouken@5535
     9
  Permission is granted to anyone to use this software for any purpose,
slouken@5535
    10
  including commercial applications, and to alter it and redistribute it
slouken@5535
    11
  freely, subject to the following restrictions:
slouken@0
    12
slouken@5535
    13
  1. The origin of this software must not be misrepresented; you must not
slouken@5535
    14
     claim that you wrote the original software. If you use this software
slouken@5535
    15
     in a product, an acknowledgment in the product documentation would be
slouken@5535
    16
     appreciated but is not required.
slouken@5535
    17
  2. Altered source versions must be plainly marked as such, and must not be
slouken@5535
    18
     misrepresented as being the original software.
slouken@5535
    19
  3. This notice may not be removed or altered from any source distribution.
slouken@0
    20
*/
icculus@8093
    21
#include "../SDL_internal.h"
slouken@0
    22
slouken@0
    23
#include "SDL_video.h"
slouken@0
    24
#include "SDL_blit.h"
slouken@0
    25
slouken@0
    26
/* Functions to perform alpha blended blitting */
slouken@0
    27
slouken@0
    28
/* N->1 blending with per-surface alpha */
slouken@1895
    29
static void
slouken@1895
    30
BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
slouken@0
    31
{
slouken@2262
    32
    int width = info->dst_w;
slouken@2262
    33
    int height = info->dst_h;
slouken@2262
    34
    Uint8 *src = info->src;
slouken@2267
    35
    int srcskip = info->src_skip;
slouken@2262
    36
    Uint8 *dst = info->dst;
slouken@2267
    37
    int dstskip = info->dst_skip;
slouken@1895
    38
    Uint8 *palmap = info->table;
slouken@2267
    39
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
    40
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
    41
    int srcbpp = srcfmt->BytesPerPixel;
slouken@7502
    42
    Uint32 Pixel;
slouken@7502
    43
    unsigned sR, sG, sB;
slouken@7502
    44
    unsigned dR, dG, dB;
slouken@2267
    45
    const unsigned A = info->a;
slouken@0
    46
slouken@1895
    47
    while (height--) {
slouken@12201
    48
        /* *INDENT-OFF* */
slouken@12201
    49
        DUFFS_LOOP4(
slouken@12201
    50
        {
slouken@12201
    51
        DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
slouken@12201
    52
        dR = dstfmt->palette->colors[*dst].r;
slouken@12201
    53
        dG = dstfmt->palette->colors[*dst].g;
slouken@12201
    54
        dB = dstfmt->palette->colors[*dst].b;
slouken@12201
    55
        ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
slouken@12201
    56
        dR &= 0xff;
slouken@12201
    57
        dG &= 0xff;
slouken@12201
    58
        dB &= 0xff;
slouken@12201
    59
        /* Pack RGB into 8bit pixel */
slouken@12201
    60
        if ( palmap == NULL ) {
slouken@12201
    61
            *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
slouken@12201
    62
        } else {
slouken@12201
    63
            *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
slouken@12201
    64
        }
slouken@12201
    65
        dst++;
slouken@12201
    66
        src += srcbpp;
slouken@12201
    67
        },
slouken@12201
    68
        width);
slouken@12201
    69
        /* *INDENT-ON* */
slouken@1895
    70
        src += srcskip;
slouken@1895
    71
        dst += dstskip;
slouken@1895
    72
    }
slouken@0
    73
}
slouken@0
    74
slouken@0
    75
/* N->1 blending with pixel alpha */
slouken@1895
    76
static void
slouken@1895
    77
BlitNto1PixelAlpha(SDL_BlitInfo * info)
slouken@0
    78
{
slouken@2262
    79
    int width = info->dst_w;
slouken@2262
    80
    int height = info->dst_h;
slouken@2262
    81
    Uint8 *src = info->src;
slouken@2267
    82
    int srcskip = info->src_skip;
slouken@2262
    83
    Uint8 *dst = info->dst;
slouken@2267
    84
    int dstskip = info->dst_skip;
slouken@1895
    85
    Uint8 *palmap = info->table;
slouken@2267
    86
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
    87
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
    88
    int srcbpp = srcfmt->BytesPerPixel;
slouken@7502
    89
    Uint32 Pixel;
slouken@7502
    90
    unsigned sR, sG, sB, sA;
slouken@7502
    91
    unsigned dR, dG, dB;
slouken@0
    92
slouken@1895
    93
    while (height--) {
slouken@12201
    94
        /* *INDENT-OFF* */
slouken@12201
    95
        DUFFS_LOOP4(
slouken@12201
    96
        {
slouken@12201
    97
        DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
slouken@12201
    98
        dR = dstfmt->palette->colors[*dst].r;
slouken@12201
    99
        dG = dstfmt->palette->colors[*dst].g;
slouken@12201
   100
        dB = dstfmt->palette->colors[*dst].b;
slouken@12201
   101
        ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
slouken@12201
   102
        dR &= 0xff;
slouken@12201
   103
        dG &= 0xff;
slouken@12201
   104
        dB &= 0xff;
slouken@12201
   105
        /* Pack RGB into 8bit pixel */
slouken@12201
   106
        if ( palmap == NULL ) {
slouken@12201
   107
            *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
slouken@12201
   108
        } else {
slouken@12201
   109
            *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
slouken@12201
   110
        }
slouken@12201
   111
        dst++;
slouken@12201
   112
        src += srcbpp;
slouken@12201
   113
        },
slouken@12201
   114
        width);
slouken@12201
   115
        /* *INDENT-ON* */
slouken@1895
   116
        src += srcskip;
slouken@1895
   117
        dst += dstskip;
slouken@1895
   118
    }
slouken@0
   119
}
slouken@0
   120
slouken@0
   121
/* colorkeyed N->1 blending with per-surface alpha */
slouken@1895
   122
static void
slouken@1895
   123
BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
slouken@0
   124
{
slouken@2262
   125
    int width = info->dst_w;
slouken@2262
   126
    int height = info->dst_h;
slouken@2262
   127
    Uint8 *src = info->src;
slouken@2267
   128
    int srcskip = info->src_skip;
slouken@2262
   129
    Uint8 *dst = info->dst;
slouken@2267
   130
    int dstskip = info->dst_skip;
slouken@1895
   131
    Uint8 *palmap = info->table;
slouken@2267
   132
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
   133
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
   134
    int srcbpp = srcfmt->BytesPerPixel;
slouken@2267
   135
    Uint32 ckey = info->colorkey;
slouken@7502
   136
    Uint32 Pixel;
slouken@7502
   137
    unsigned sR, sG, sB;
slouken@7502
   138
    unsigned dR, dG, dB;
slouken@7502
   139
    const unsigned A = info->a;
slouken@0
   140
slouken@1895
   141
    while (height--) {
slouken@12201
   142
        /* *INDENT-OFF* */
slouken@12201
   143
        DUFFS_LOOP(
slouken@12201
   144
        {
slouken@12201
   145
        DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
slouken@12201
   146
        if ( Pixel != ckey ) {
slouken@12201
   147
            dR = dstfmt->palette->colors[*dst].r;
slouken@12201
   148
            dG = dstfmt->palette->colors[*dst].g;
slouken@12201
   149
            dB = dstfmt->palette->colors[*dst].b;
slouken@12201
   150
            ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
slouken@12201
   151
            dR &= 0xff;
slouken@12201
   152
            dG &= 0xff;
slouken@12201
   153
            dB &= 0xff;
slouken@12201
   154
            /* Pack RGB into 8bit pixel */
slouken@12201
   155
            if ( palmap == NULL ) {
slouken@7502
   156
                *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
slouken@12201
   157
            } else {
slouken@7502
   158
                *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
slouken@12201
   159
            }
slouken@12201
   160
        }
slouken@12201
   161
        dst++;
slouken@12201
   162
        src += srcbpp;
slouken@12201
   163
        },
slouken@12201
   164
        width);
slouken@12201
   165
        /* *INDENT-ON* */
slouken@1895
   166
        src += srcskip;
slouken@1895
   167
        dst += dstskip;
slouken@1895
   168
    }
slouken@0
   169
}
slouken@0
   170
slouken@2255
   171
#ifdef __MMX__
slouken@1542
   172
slouken@1542
   173
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1895
   174
static void
slouken@1895
   175
BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
slouken@1542
   176
{
slouken@2262
   177
    int width = info->dst_w;
slouken@2262
   178
    int height = info->dst_h;
slouken@2262
   179
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   180
    int srcskip = info->src_skip >> 2;
slouken@2262
   181
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   182
    int dstskip = info->dst_skip >> 2;
slouken@2267
   183
    Uint32 dalpha = info->dst_fmt->Amask;
slouken@1542
   184
slouken@1895
   185
    __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
slouken@1542
   186
slouken@1895
   187
    hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
slouken@1895
   188
    lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
slouken@1895
   189
    dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
slouken@1542
   190
slouken@1895
   191
    while (height--) {
slouken@1895
   192
        int n = width;
slouken@1895
   193
        if (n & 1) {
slouken@1895
   194
            Uint32 s = *srcp++;
slouken@1895
   195
            Uint32 d = *dstp;
slouken@1895
   196
            *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1895
   197
                       + (s & d & 0x00010101)) | dalpha;
slouken@1895
   198
            n--;
slouken@1895
   199
        }
slouken@1542
   200
slouken@1895
   201
        for (n >>= 1; n > 0; --n) {
slouken@1895
   202
            dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
slouken@1895
   203
            dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
slouken@1542
   204
slouken@1895
   205
            src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
slouken@1895
   206
            src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
slouken@1895
   207
slouken@1895
   208
            dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
slouken@1895
   209
            src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
slouken@1895
   210
            src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
slouken@1895
   211
            src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
slouken@1895
   212
slouken@1895
   213
            dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
slouken@1895
   214
            dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
slouken@1895
   215
            dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
slouken@1895
   216
            dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
slouken@1895
   217
slouken@1895
   218
            *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
slouken@1895
   219
            dstp += 2;
slouken@1895
   220
            srcp += 2;
slouken@1895
   221
        }
slouken@1895
   222
slouken@1895
   223
        srcp += srcskip;
slouken@1895
   224
        dstp += dstskip;
slouken@1895
   225
    }
slouken@1895
   226
    _mm_empty();
slouken@1542
   227
}
slouken@1542
   228
slouken@1542
   229
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1895
   230
static void
slouken@1895
   231
BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   232
{
slouken@2267
   233
    SDL_PixelFormat *df = info->dst_fmt;
slouken@6863
   234
    Uint32 chanmask;
slouken@2267
   235
    unsigned alpha = info->a;
slouken@1542
   236
slouken@1895
   237
    if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
slouken@1895
   238
        /* only call a128 version when R,G,B occupy lower bits */
slouken@1895
   239
        BlitRGBtoRGBSurfaceAlpha128MMX(info);
slouken@1895
   240
    } else {
slouken@2262
   241
        int width = info->dst_w;
slouken@2262
   242
        int height = info->dst_h;
slouken@2262
   243
        Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   244
        int srcskip = info->src_skip >> 2;
slouken@2262
   245
        Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   246
        int dstskip = info->dst_skip >> 2;
slouken@1895
   247
        Uint32 dalpha = df->Amask;
slouken@1895
   248
        Uint32 amult;
slouken@1542
   249
slouken@1895
   250
        __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
slouken@1542
   251
slouken@1895
   252
        mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
slouken@1895
   253
        /* form the alpha mult */
slouken@1895
   254
        amult = alpha | (alpha << 8);
slouken@1895
   255
        amult = amult | (amult << 16);
slouken@1895
   256
        chanmask =
slouken@3013
   257
            (0xff << df->Rshift) | (0xff << df->
slouken@3013
   258
                                    Gshift) | (0xff << df->Bshift);
slouken@1895
   259
        mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
slouken@1895
   260
        mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
slouken@1895
   261
        /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
slouken@1895
   262
        dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
slouken@1542
   263
slouken@1895
   264
        while (height--) {
slouken@1895
   265
            int n = width;
slouken@1895
   266
            if (n & 1) {
slouken@1895
   267
                /* One Pixel Blend */
slouken@1895
   268
                src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
slouken@1895
   269
                src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
slouken@1542
   270
slouken@1895
   271
                dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
slouken@1895
   272
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
   273
slouken@1895
   274
                src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
slouken@1895
   275
                src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
slouken@1895
   276
                src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
slouken@1895
   277
                dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
slouken@1542
   278
slouken@1895
   279
                dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
slouken@1895
   280
                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
slouken@1895
   281
                *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
   282
slouken@1895
   283
                ++srcp;
slouken@1895
   284
                ++dstp;
slouken@1542
   285
slouken@1895
   286
                n--;
slouken@1895
   287
            }
slouken@1542
   288
slouken@1895
   289
            for (n >>= 1; n > 0; --n) {
slouken@1895
   290
                /* Two Pixels Blend */
slouken@1895
   291
                src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
slouken@1895
   292
                src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
slouken@1895
   293
                src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
slouken@1895
   294
                src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
slouken@1542
   295
slouken@1895
   296
                dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
slouken@1895
   297
                dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
slouken@1895
   298
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
slouken@1895
   299
                dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
slouken@1895
   300
slouken@1895
   301
                src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
slouken@1895
   302
                src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
slouken@1895
   303
                src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
slouken@1895
   304
                dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
slouken@1895
   305
slouken@1895
   306
                src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
slouken@1895
   307
                src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
slouken@1895
   308
                src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
slouken@1895
   309
                dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
slouken@1895
   310
slouken@1895
   311
                dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
slouken@1895
   312
                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
slouken@1895
   313
slouken@1895
   314
                *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
slouken@1895
   315
slouken@1895
   316
                srcp += 2;
slouken@1895
   317
                dstp += 2;
slouken@1895
   318
            }
slouken@1895
   319
            srcp += srcskip;
slouken@1895
   320
            dstp += dstskip;
slouken@1895
   321
        }
slouken@1895
   322
        _mm_empty();
slouken@1895
   323
    }
slouken@1542
   324
}
slouken@1542
   325
slouken@1542
   326
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
   327
static void
slouken@1895
   328
BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   329
{
slouken@2262
   330
    int width = info->dst_w;
slouken@2262
   331
    int height = info->dst_h;
slouken@2262
   332
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   333
    int srcskip = info->src_skip >> 2;
slouken@2262
   334
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   335
    int dstskip = info->dst_skip >> 2;
slouken@2267
   336
    SDL_PixelFormat *sf = info->src_fmt;
slouken@1895
   337
    Uint32 amask = sf->Amask;
slouken@1895
   338
    Uint32 ashift = sf->Ashift;
slouken@7640
   339
    Uint64 multmask, multmask2;
slouken@1542
   340
slouken@7640
   341
    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
slouken@1542
   342
slouken@1895
   343
    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
slouken@7640
   344
    multmask = 0x00FF;
slouken@12201
   345
    multmask <<= (ashift * 2);
slouken@12201
   346
    multmask2 = 0x00FF00FF00FF00FFULL;
slouken@1542
   347
slouken@1895
   348
    while (height--) {
slouken@12201
   349
        /* *INDENT-OFF* */
slouken@12201
   350
        DUFFS_LOOP4({
slouken@12201
   351
        Uint32 alpha = *srcp & amask;
slouken@12201
   352
        if (alpha == 0) {
slouken@12201
   353
            /* do nothing */
slouken@12201
   354
        } else if (alpha == amask) {
slouken@12201
   355
            *dstp = *srcp;
slouken@12201
   356
        } else {
slouken@12201
   357
            src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
slouken@12201
   358
            src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
slouken@1542
   359
slouken@12201
   360
            dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
slouken@12201
   361
            dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
   362
slouken@12201
   363
            mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
slouken@12201
   364
            mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
slouken@12201
   365
            mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
slouken@12201
   366
            mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
slouken@12201
   367
            mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha */
slouken@12201
   368
            mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
slouken@1542
   369
slouken@12201
   370
            /* blend */            
slouken@12201
   371
            src1 = _mm_mullo_pi16(src1, mm_alpha);
slouken@12201
   372
            src1 = _mm_srli_pi16(src1, 8);
slouken@12201
   373
            dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
slouken@12201
   374
            dst1 = _mm_srli_pi16(dst1, 8);
slouken@12201
   375
            dst1 = _mm_add_pi16(src1, dst1);
slouken@12201
   376
            dst1 = _mm_packs_pu16(dst1, mm_zero);
slouken@12201
   377
            
slouken@12201
   378
            *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@12201
   379
        }
slouken@12201
   380
        ++srcp;
slouken@12201
   381
        ++dstp;
slouken@12201
   382
        }, width);
slouken@12201
   383
        /* *INDENT-ON* */
slouken@1895
   384
        srcp += srcskip;
slouken@1895
   385
        dstp += dstskip;
slouken@1895
   386
    }
slouken@1895
   387
    _mm_empty();
slouken@1542
   388
}
slouken@1895
   389
slouken@2255
   390
#endif /* __MMX__ */
slouken@689
   391
slouken@1
   392
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1895
   393
static void
slouken@1895
   394
BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
slouken@0
   395
{
slouken@2262
   396
    int width = info->dst_w;
slouken@2262
   397
    int height = info->dst_h;
slouken@2262
   398
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   399
    int srcskip = info->src_skip >> 2;
slouken@2262
   400
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   401
    int dstskip = info->dst_skip >> 2;
slouken@0
   402
slouken@1895
   403
    while (height--) {
slouken@12201
   404
        /* *INDENT-OFF* */
slouken@12201
   405
        DUFFS_LOOP4({
slouken@12201
   406
            Uint32 s = *srcp++;
slouken@12201
   407
            Uint32 d = *dstp;
slouken@12201
   408
            *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@12201
   409
                   + (s & d & 0x00010101)) | 0xff000000;
slouken@12201
   410
        }, width);
slouken@12201
   411
        /* *INDENT-ON* */
slouken@1895
   412
        srcp += srcskip;
slouken@1895
   413
        dstp += dstskip;
slouken@1895
   414
    }
slouken@0
   415
}
slouken@0
   416
slouken@1
   417
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1895
   418
static void
slouken@1895
   419
BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
slouken@1
   420
{
slouken@2267
   421
    unsigned alpha = info->a;
slouken@1895
   422
    if (alpha == 128) {
slouken@1895
   423
        BlitRGBtoRGBSurfaceAlpha128(info);
slouken@1895
   424
    } else {
slouken@2262
   425
        int width = info->dst_w;
slouken@2262
   426
        int height = info->dst_h;
slouken@2262
   427
        Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   428
        int srcskip = info->src_skip >> 2;
slouken@2262
   429
        Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   430
        int dstskip = info->dst_skip >> 2;
slouken@1895
   431
        Uint32 s;
slouken@1895
   432
        Uint32 d;
slouken@1895
   433
        Uint32 s1;
slouken@1895
   434
        Uint32 d1;
slouken@1
   435
slouken@1895
   436
        while (height--) {
slouken@12201
   437
            /* *INDENT-OFF* */
slouken@12201
   438
            DUFFS_LOOP4({
slouken@12201
   439
                s = *srcp;
slouken@12201
   440
                d = *dstp;
slouken@12201
   441
                s1 = s & 0xff00ff;
slouken@12201
   442
                d1 = d & 0xff00ff;
slouken@12201
   443
                d1 = (d1 + ((s1 - d1) * alpha >> 8))
slouken@12201
   444
                     & 0xff00ff;
slouken@12201
   445
                s &= 0xff00;
slouken@12201
   446
                d &= 0xff00;
slouken@12201
   447
                d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@12201
   448
                *dstp = d1 | d | 0xff000000;
slouken@12201
   449
                ++srcp;
slouken@12201
   450
                ++dstp;
slouken@12201
   451
            }, width);
slouken@12201
   452
            /* *INDENT-ON* */
slouken@1895
   453
            srcp += srcskip;
slouken@1895
   454
            dstp += dstskip;
slouken@1895
   455
        }
slouken@1895
   456
    }
slouken@1
   457
}
slouken@1
   458
slouken@0
   459
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
   460
static void
slouken@1895
   461
BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
slouken@0
   462
{
slouken@2262
   463
    int width = info->dst_w;
slouken@2262
   464
    int height = info->dst_h;
slouken@2262
   465
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   466
    int srcskip = info->src_skip >> 2;
slouken@2262
   467
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   468
    int dstskip = info->dst_skip >> 2;
slouken@0
   469
slouken@1895
   470
    while (height--) {
slouken@12201
   471
        /* *INDENT-OFF* */
slouken@12201
   472
        DUFFS_LOOP4({
slouken@12201
   473
        Uint32 dalpha;
slouken@12201
   474
        Uint32 d;
slouken@12201
   475
        Uint32 s1;
slouken@12201
   476
        Uint32 d1;
slouken@12201
   477
        Uint32 s = *srcp;
slouken@12201
   478
        Uint32 alpha = s >> 24;
slouken@12201
   479
        /* FIXME: Here we special-case opaque alpha since the
slouken@12201
   480
           compositioning used (>>8 instead of /255) doesn't handle
slouken@12201
   481
           it correctly. Also special-case alpha=0 for speed?
slouken@12201
   482
           Benchmark this! */
slouken@12201
   483
        if (alpha) {
slouken@12201
   484
          if (alpha == SDL_ALPHA_OPAQUE) {
slouken@12201
   485
              *dstp = *srcp;
slouken@12201
   486
          } else {
slouken@12201
   487
            /*
slouken@12201
   488
             * take out the middle component (green), and process
slouken@12201
   489
             * the other two in parallel. One multiply less.
slouken@12201
   490
             */
slouken@12201
   491
            d = *dstp;
slouken@12201
   492
            dalpha = d >> 24;
slouken@12201
   493
            s1 = s & 0xff00ff;
slouken@12201
   494
            d1 = d & 0xff00ff;
slouken@12201
   495
            d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
slouken@12201
   496
            s &= 0xff00;
slouken@12201
   497
            d &= 0xff00;
slouken@12201
   498
            d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@12201
   499
            dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
slouken@12201
   500
            *dstp = d1 | d | (dalpha << 24);
slouken@12201
   501
          }
slouken@12201
   502
        }
slouken@12201
   503
        ++srcp;
slouken@12201
   504
        ++dstp;
slouken@12201
   505
        }, width);
slouken@12201
   506
        /* *INDENT-ON* */
slouken@1895
   507
        srcp += srcskip;
slouken@1895
   508
        dstp += dstskip;
slouken@1895
   509
    }
slouken@0
   510
}
slouken@0
   511
slouken@5389
   512
#ifdef __3dNOW__
slouken@5389
   513
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
slouken@5389
   514
static void
slouken@5389
   515
BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
slouken@5389
   516
{
slouken@5389
   517
    int width = info->dst_w;
slouken@5389
   518
    int height = info->dst_h;
slouken@5389
   519
    Uint32 *srcp = (Uint32 *) info->src;
slouken@5389
   520
    int srcskip = info->src_skip >> 2;
slouken@5389
   521
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@5389
   522
    int dstskip = info->dst_skip >> 2;
slouken@5389
   523
    SDL_PixelFormat *sf = info->src_fmt;
slouken@5389
   524
    Uint32 amask = sf->Amask;
slouken@5389
   525
    Uint32 ashift = sf->Ashift;
slouken@7640
   526
    Uint64 multmask, multmask2;
slouken@5389
   527
slouken@7640
   528
    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
slouken@5389
   529
slouken@5389
   530
    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
slouken@7640
   531
    multmask = 0x00FF;
slouken@5389
   532
    multmask <<= (ashift * 2);
sbc@8879
   533
    multmask2 = 0x00FF00FF00FF00FFULL;
slouken@5389
   534
slouken@5389
   535
    while (height--) {
slouken@12201
   536
        /* *INDENT-OFF* */
slouken@12201
   537
        DUFFS_LOOP4({
slouken@12201
   538
        Uint32 alpha;
slouken@5389
   539
slouken@12201
   540
        _m_prefetch(srcp + 16);
slouken@12201
   541
        _m_prefetch(dstp + 16);
slouken@5389
   542
slouken@12201
   543
        alpha = *srcp & amask;
slouken@12201
   544
        if (alpha == 0) {
slouken@12201
   545
            /* do nothing */
slouken@12201
   546
        } else if (alpha == amask) {
slouken@12201
   547
            *dstp = *srcp;
slouken@12201
   548
        } else {
slouken@12201
   549
            src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
slouken@12201
   550
            src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
slouken@5389
   551
slouken@12201
   552
            dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
slouken@12201
   553
            dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@5389
   554
slouken@12201
   555
            mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
slouken@12201
   556
            mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
slouken@12201
   557
            mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
slouken@12201
   558
            mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
slouken@12201
   559
            mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);    /* 0F0A0A0A -> mm_alpha */
slouken@12201
   560
            mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);    /* 255 - mm_alpha -> mm_alpha */
slouken@7640
   561
slouken@5389
   562
slouken@12201
   563
            /* blend */            
slouken@12201
   564
            src1 = _mm_mullo_pi16(src1, mm_alpha);
slouken@12201
   565
            src1 = _mm_srli_pi16(src1, 8);
slouken@12201
   566
            dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
slouken@12201
   567
            dst1 = _mm_srli_pi16(dst1, 8);
slouken@12201
   568
            dst1 = _mm_add_pi16(src1, dst1);
slouken@12201
   569
            dst1 = _mm_packs_pu16(dst1, mm_zero);
slouken@12201
   570
            
slouken@12201
   571
            *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@12201
   572
        }
slouken@12201
   573
        ++srcp;
slouken@12201
   574
        ++dstp;
slouken@12201
   575
        }, width);
slouken@12201
   576
        /* *INDENT-ON* */
slouken@5389
   577
        srcp += srcskip;
slouken@5389
   578
        dstp += dstskip;
slouken@5389
   579
    }
slouken@5389
   580
    _mm_empty();
slouken@5389
   581
}
slouken@5389
   582
philipp@9709
   583
#endif /* __3dNOW__ */
slouken@5389
   584
slouken@1
   585
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
slouken@1
   586
slouken@1
   587
/*
 * Blend a single 16 bit pixel at 50%.
 *
 * `mask` selects the bits that take part in the halved sum; the bits it
 * leaves out are kept only where both `s` and `d` have them set.
 * Every argument is parenthesized so compound expressions bind correctly.
 * Note that each argument is evaluated more than once.
 */
#define BLEND16_50(d, s, mask)                                  \
    (((((s) & (mask)) + ((d) & (mask))) >> 1)                   \
     + ((s) & (d) & (~(mask) & 0xffff)))
slouken@1
   590
slouken@1
   591
/*
 * Blend two 16 bit pixels, packed in one 32 bit word, at 50%.
 *
 * `mask` is the per-pixel 16 bit mask; it is replicated into both halves
 * of the word.  The mask is widened to Uint32 before being shifted so the
 * shift cannot overflow a signed int when a promoted Uint16 is passed.
 * Every argument is parenthesized; each is evaluated more than once.
 */
#define BLEND2x16_50(d, s, mask)                                         \
    ((((s) & ((Uint32)(mask) | (Uint32)(mask) << 16)) >> 1)              \
     + (((d) & ((Uint32)(mask) | (Uint32)(mask) << 16)) >> 1)            \
     + ((s) & (d) & (~((Uint32)(mask) | (Uint32)(mask) << 16))))
slouken@1
   595
slouken@1895
   596
static void
slouken@1895
   597
Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
slouken@0
   598
{
slouken@2262
   599
    int width = info->dst_w;
slouken@2262
   600
    int height = info->dst_h;
slouken@2262
   601
    Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   602
    int srcskip = info->src_skip >> 1;
slouken@2262
   603
    Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   604
    int dstskip = info->dst_skip >> 1;
slouken@0
   605
slouken@1895
   606
    while (height--) {
slouken@1895
   607
        if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
slouken@1895
   608
            /*
slouken@1895
   609
             * Source and destination not aligned, pipeline it.
slouken@1895
   610
             * This is mostly a win for big blits but no loss for
slouken@1895
   611
             * small ones
slouken@1895
   612
             */
slouken@1895
   613
            Uint32 prev_sw;
slouken@1895
   614
            int w = width;
slouken@1
   615
slouken@1895
   616
            /* handle odd destination */
slouken@1895
   617
            if ((uintptr_t) dstp & 2) {
slouken@1895
   618
                Uint16 d = *dstp, s = *srcp;
slouken@1895
   619
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   620
                dstp++;
slouken@1895
   621
                srcp++;
slouken@1895
   622
                w--;
slouken@1895
   623
            }
slouken@1895
   624
            srcp++;             /* srcp is now 32-bit aligned */
slouken@1
   625
slouken@1895
   626
            /* bootstrap pipeline with first halfword */
slouken@1895
   627
            prev_sw = ((Uint32 *) srcp)[-1];
slouken@1
   628
slouken@1895
   629
            while (w > 1) {
slouken@1895
   630
                Uint32 sw, dw, s;
slouken@1895
   631
                sw = *(Uint32 *) srcp;
slouken@1895
   632
                dw = *(Uint32 *) dstp;
slouken@1443
   633
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
slouken@1895
   634
                s = (prev_sw << 16) + (sw >> 16);
slouken@1443
   635
#else
slouken@1895
   636
                s = (prev_sw >> 16) + (sw << 16);
slouken@1443
   637
#endif
slouken@1895
   638
                prev_sw = sw;
slouken@1895
   639
                *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
slouken@1895
   640
                dstp += 2;
slouken@1895
   641
                srcp += 2;
slouken@1895
   642
                w -= 2;
slouken@1895
   643
            }
slouken@1
   644
slouken@1895
   645
            /* final pixel if any */
slouken@1895
   646
            if (w) {
slouken@1895
   647
                Uint16 d = *dstp, s;
slouken@1443
   648
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
slouken@1895
   649
                s = (Uint16) prev_sw;
slouken@1443
   650
#else
slouken@1895
   651
                s = (Uint16) (prev_sw >> 16);
slouken@1443
   652
#endif
slouken@1895
   653
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   654
                srcp++;
slouken@1895
   655
                dstp++;
slouken@1895
   656
            }
slouken@1895
   657
            srcp += srcskip - 1;
slouken@1895
   658
            dstp += dstskip;
slouken@1895
   659
        } else {
slouken@1895
   660
            /* source and destination are aligned */
slouken@1895
   661
            int w = width;
slouken@1
   662
slouken@1895
   663
            /* first odd pixel? */
slouken@1895
   664
            if ((uintptr_t) srcp & 2) {
slouken@1895
   665
                Uint16 d = *dstp, s = *srcp;
slouken@1895
   666
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   667
                srcp++;
slouken@1895
   668
                dstp++;
slouken@1895
   669
                w--;
slouken@1895
   670
            }
slouken@1895
   671
            /* srcp and dstp are now 32-bit aligned */
slouken@1
   672
slouken@1895
   673
            while (w > 1) {
slouken@1895
   674
                Uint32 sw = *(Uint32 *) srcp;
slouken@1895
   675
                Uint32 dw = *(Uint32 *) dstp;
slouken@1895
   676
                *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
slouken@1895
   677
                srcp += 2;
slouken@1895
   678
                dstp += 2;
slouken@1895
   679
                w -= 2;
slouken@1895
   680
            }
slouken@1
   681
slouken@1895
   682
            /* last odd pixel? */
slouken@1895
   683
            if (w) {
slouken@1895
   684
                Uint16 d = *dstp, s = *srcp;
slouken@1895
   685
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   686
                srcp++;
slouken@1895
   687
                dstp++;
slouken@1895
   688
            }
slouken@1895
   689
            srcp += srcskip;
slouken@1895
   690
            dstp += dstskip;
slouken@1895
   691
        }
slouken@1895
   692
    }
slouken@1
   693
}
slouken@1
   694
slouken@2255
   695
#ifdef __MMX__
slouken@689
   696
slouken@1542
   697
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1895
   698
static void
slouken@1895
   699
Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   700
{
slouken@2267
   701
    unsigned alpha = info->a;
slouken@1895
   702
    if (alpha == 128) {
slouken@1895
   703
        Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1895
   704
    } else {
slouken@2262
   705
        int width = info->dst_w;
slouken@2262
   706
        int height = info->dst_h;
slouken@2262
   707
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   708
        int srcskip = info->src_skip >> 1;
slouken@2262
   709
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   710
        int dstskip = info->dst_skip >> 1;
slouken@1895
   711
        Uint32 s, d;
slouken@1542
   712
slouken@1895
   713
        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
slouken@1542
   714
slouken@1895
   715
        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
slouken@1895
   716
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
slouken@1895
   717
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1895
   718
slouken@1895
   719
        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
slouken@1895
   720
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
slouken@1895
   721
        /* position alpha to allow for mullo and mulhi on diff channels
slouken@1895
   722
           to reduce the number of operations */
slouken@1895
   723
        mm_alpha = _mm_slli_si64(mm_alpha, 3);
slouken@1895
   724
slouken@1895
   725
        /* Setup the 565 color channel masks */
slouken@1895
   726
        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
slouken@1895
   727
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
slouken@1895
   728
slouken@1895
   729
        while (height--) {
slouken@12201
   730
            /* *INDENT-OFF* */
slouken@12201
   731
            DUFFS_LOOP_124(
slouken@12201
   732
            {
slouken@12201
   733
                s = *srcp++;
slouken@12201
   734
                d = *dstp;
slouken@12201
   735
                /*
slouken@12201
   736
                 * shift out the middle component (green) to
slouken@12201
   737
                 * the high 16 bits, and process all three RGB
slouken@12201
   738
                 * components at the same time.
slouken@12201
   739
                 */
slouken@12201
   740
                s = (s | s << 16) & 0x07e0f81f;
slouken@12201
   741
                d = (d | d << 16) & 0x07e0f81f;
slouken@12201
   742
                d += (s - d) * alpha >> 5;
slouken@12201
   743
                d &= 0x07e0f81f;
slouken@12201
   744
                *dstp++ = (Uint16)(d | d >> 16);
slouken@12201
   745
            },{
slouken@12201
   746
                s = *srcp++;
slouken@12201
   747
                d = *dstp;
slouken@12201
   748
                /*
slouken@12201
   749
                 * shift out the middle component (green) to
slouken@12201
   750
                 * the high 16 bits, and process all three RGB
slouken@12201
   751
                 * components at the same time.
slouken@12201
   752
                 */
slouken@12201
   753
                s = (s | s << 16) & 0x07e0f81f;
slouken@12201
   754
                d = (d | d << 16) & 0x07e0f81f;
slouken@12201
   755
                d += (s - d) * alpha >> 5;
slouken@12201
   756
                d &= 0x07e0f81f;
slouken@12201
   757
                *dstp++ = (Uint16)(d | d >> 16);
slouken@12201
   758
                s = *srcp++;
slouken@12201
   759
                d = *dstp;
slouken@12201
   760
                /*
slouken@12201
   761
                 * shift out the middle component (green) to
slouken@12201
   762
                 * the high 16 bits, and process all three RGB
slouken@12201
   763
                 * components at the same time.
slouken@12201
   764
                 */
slouken@12201
   765
                s = (s | s << 16) & 0x07e0f81f;
slouken@12201
   766
                d = (d | d << 16) & 0x07e0f81f;
slouken@12201
   767
                d += (s - d) * alpha >> 5;
slouken@12201
   768
                d &= 0x07e0f81f;
slouken@12201
   769
                *dstp++ = (Uint16)(d | d >> 16);
slouken@12201
   770
            },{
slouken@12201
   771
                src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
slouken@12201
   772
                dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
slouken@1542
   773
slouken@12201
   774
                /* red */
slouken@12201
   775
                src2 = src1;
slouken@12201
   776
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
slouken@1542
   777
slouken@12201
   778
                dst2 = dst1;
slouken@12201
   779
                dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
slouken@1542
   780
slouken@12201
   781
                /* blend */
slouken@12201
   782
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@12201
   783
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@12201
   784
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@12201
   785
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@12201
   786
                dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
slouken@1542
   787
slouken@12201
   788
                mm_res = dst2; /* RED -> mm_res */
slouken@1542
   789
slouken@12201
   790
                /* green -- process the bits in place */
slouken@12201
   791
                src2 = src1;
slouken@12201
   792
                src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
slouken@1542
   793
slouken@12201
   794
                dst2 = dst1;
slouken@12201
   795
                dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
slouken@1542
   796
slouken@12201
   797
                /* blend */
slouken@12201
   798
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@12201
   799
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@12201
   800
                src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@12201
   801
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   802
slouken@12201
   803
                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
slouken@1542
   804
slouken@12201
   805
                /* blue */
slouken@12201
   806
                src2 = src1;
slouken@12201
   807
                src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
slouken@1542
   808
slouken@12201
   809
                dst2 = dst1;
slouken@12201
   810
                dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
slouken@1542
   811
slouken@12201
   812
                /* blend */
slouken@12201
   813
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@12201
   814
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@12201
   815
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@12201
   816
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@12201
   817
                dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
slouken@1542
   818
slouken@12201
   819
                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
slouken@1542
   820
slouken@12201
   821
                *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
slouken@1542
   822
slouken@12201
   823
                srcp += 4;
slouken@12201
   824
                dstp += 4;
slouken@12201
   825
            }, width);
slouken@12201
   826
            /* *INDENT-ON* */
slouken@1895
   827
            srcp += srcskip;
slouken@1895
   828
            dstp += dstskip;
slouken@1895
   829
        }
slouken@1895
   830
        _mm_empty();
slouken@1895
   831
    }
slouken@1542
   832
}
slouken@1542
   833
slouken@1542
   834
/* fast RGB555->RGB555 blending with surface alpha */
slouken@1895
   835
static void
slouken@1895
   836
Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   837
{
slouken@2267
   838
    unsigned alpha = info->a;
slouken@1895
   839
    if (alpha == 128) {
slouken@1895
   840
        Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1895
   841
    } else {
slouken@2262
   842
        int width = info->dst_w;
slouken@2262
   843
        int height = info->dst_h;
slouken@2262
   844
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   845
        int srcskip = info->src_skip >> 1;
slouken@2262
   846
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   847
        int dstskip = info->dst_skip >> 1;
slouken@1895
   848
        Uint32 s, d;
slouken@1542
   849
slouken@1895
   850
        __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
slouken@1542
   851
slouken@1895
   852
        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
slouken@1895
   853
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
slouken@1895
   854
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1542
   855
slouken@1895
   856
        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
slouken@1895
   857
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
slouken@1895
   858
        /* position alpha to allow for mullo and mulhi on diff channels
slouken@1895
   859
           to reduce the number of operations */
slouken@1895
   860
        mm_alpha = _mm_slli_si64(mm_alpha, 3);
slouken@1895
   861
slouken@1895
   862
        /* Setup the 555 color channel masks */
slouken@1895
   863
        rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
slouken@1895
   864
        gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
slouken@1895
   865
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
slouken@1895
   866
slouken@1895
   867
        while (height--) {
slouken@12201
   868
            /* *INDENT-OFF* */
slouken@12201
   869
            DUFFS_LOOP_124(
slouken@12201
   870
            {
slouken@12201
   871
                s = *srcp++;
slouken@12201
   872
                d = *dstp;
slouken@12201
   873
                /*
slouken@12201
   874
                 * shift out the middle component (green) to
slouken@12201
   875
                 * the high 16 bits, and process all three RGB
slouken@12201
   876
                 * components at the same time.
slouken@12201
   877
                 */
slouken@12201
   878
                s = (s | s << 16) & 0x03e07c1f;
slouken@12201
   879
                d = (d | d << 16) & 0x03e07c1f;
slouken@12201
   880
                d += (s - d) * alpha >> 5;
slouken@12201
   881
                d &= 0x03e07c1f;
slouken@12201
   882
                *dstp++ = (Uint16)(d | d >> 16);
slouken@12201
   883
            },{
slouken@12201
   884
                s = *srcp++;
slouken@12201
   885
                d = *dstp;
slouken@12201
   886
                /*
slouken@12201
   887
                 * shift out the middle component (green) to
slouken@12201
   888
                 * the high 16 bits, and process all three RGB
slouken@12201
   889
                 * components at the same time.
slouken@12201
   890
                 */
slouken@12201
   891
                s = (s | s << 16) & 0x03e07c1f;
slouken@12201
   892
                d = (d | d << 16) & 0x03e07c1f;
slouken@12201
   893
                d += (s - d) * alpha >> 5;
slouken@12201
   894
                d &= 0x03e07c1f;
slouken@12201
   895
                *dstp++ = (Uint16)(d | d >> 16);
slouken@12201
   896
                    s = *srcp++;
slouken@12201
   897
                d = *dstp;
slouken@12201
   898
                /*
slouken@12201
   899
                 * shift out the middle component (green) to
slouken@12201
   900
                 * the high 16 bits, and process all three RGB
slouken@12201
   901
                 * components at the same time.
slouken@12201
   902
                 */
slouken@12201
   903
                s = (s | s << 16) & 0x03e07c1f;
slouken@12201
   904
                d = (d | d << 16) & 0x03e07c1f;
slouken@12201
   905
                d += (s - d) * alpha >> 5;
slouken@12201
   906
                d &= 0x03e07c1f;
slouken@12201
   907
                *dstp++ = (Uint16)(d | d >> 16);
slouken@12201
   908
            },{
slouken@12201
   909
                src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
slouken@12201
   910
                dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
slouken@1542
   911
slouken@12201
   912
                /* red -- process the bits in place */
slouken@12201
   913
                src2 = src1;
slouken@12201
   914
                src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
slouken@1542
   915
slouken@12201
   916
                dst2 = dst1;
slouken@12201
   917
                dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
slouken@1542
   918
slouken@12201
   919
                /* blend */
slouken@12201
   920
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@12201
   921
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@12201
   922
                src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@12201
   923
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@12201
   924
                dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
slouken@1542
   925
slouken@12201
   926
                mm_res = dst2; /* RED -> mm_res */
slouken@12201
   927
                
slouken@12201
   928
                /* green -- process the bits in place */
slouken@12201
   929
                src2 = src1;
slouken@12201
   930
                src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
slouken@1542
   931
slouken@12201
   932
                dst2 = dst1;
slouken@12201
   933
                dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
slouken@1542
   934
slouken@12201
   935
                /* blend */
slouken@12201
   936
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@12201
   937
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@12201
   938
                src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@12201
   939
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   940
slouken@12201
   941
                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
slouken@1542
   942
slouken@12201
   943
                /* blue */
slouken@12201
   944
                src2 = src1; /* src -> src2 */
slouken@12201
   945
                src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
slouken@1542
   946
slouken@12201
   947
                dst2 = dst1; /* dst -> dst2 */
slouken@12201
   948
                dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
slouken@1542
   949
slouken@12201
   950
                /* blend */
slouken@12201
   951
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@12201
   952
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@12201
   953
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@12201
   954
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@12201
   955
                dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
slouken@1542
   956
slouken@12201
   957
                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
slouken@1542
   958
slouken@12201
   959
                *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
slouken@1542
   960
slouken@12201
   961
                srcp += 4;
slouken@12201
   962
                dstp += 4;
slouken@12201
   963
            }, width);
slouken@12201
   964
            /* *INDENT-ON* */
slouken@1895
   965
            srcp += srcskip;
slouken@1895
   966
            dstp += dstskip;
slouken@1895
   967
        }
slouken@1895
   968
        _mm_empty();
slouken@1895
   969
    }
slouken@1542
   970
}
slouken@2255
   971
slouken@2255
   972
#endif /* __MMX__ */
slouken@689
   973
slouken@1
   974
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1895
   975
static void
slouken@1895
   976
Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
slouken@1
   977
{
slouken@2267
   978
    unsigned alpha = info->a;
slouken@1895
   979
    if (alpha == 128) {
slouken@1895
   980
        Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1895
   981
    } else {
slouken@2262
   982
        int width = info->dst_w;
slouken@2262
   983
        int height = info->dst_h;
slouken@2262
   984
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   985
        int srcskip = info->src_skip >> 1;
slouken@2262
   986
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   987
        int dstskip = info->dst_skip >> 1;
slouken@1895
   988
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1
   989
slouken@1895
   990
        while (height--) {
slouken@12201
   991
            /* *INDENT-OFF* */
slouken@12201
   992
            DUFFS_LOOP4({
slouken@12201
   993
                Uint32 s = *srcp++;
slouken@12201
   994
                Uint32 d = *dstp;
slouken@12201
   995
                /*
slouken@12201
   996
                 * shift out the middle component (green) to
slouken@12201
   997
                 * the high 16 bits, and process all three RGB
slouken@12201
   998
                 * components at the same time.
slouken@12201
   999
                 */
slouken@12201
  1000
                s = (s | s << 16) & 0x07e0f81f;
slouken@12201
  1001
                d = (d | d << 16) & 0x07e0f81f;
slouken@12201
  1002
                d += (s - d) * alpha >> 5;
slouken@12201
  1003
                d &= 0x07e0f81f;
slouken@12201
  1004
                *dstp++ = (Uint16)(d | d >> 16);
slouken@12201
  1005
            }, width);
slouken@12201
  1006
            /* *INDENT-ON* */
slouken@1895
  1007
            srcp += srcskip;
slouken@1895
  1008
            dstp += dstskip;
slouken@1895
  1009
        }
slouken@1895
  1010
    }
slouken@0
  1011
}
slouken@0
  1012
slouken@0
  1013
/* fast RGB555->RGB555 blending with surface alpha */
slouken@1895
  1014
static void
slouken@1895
  1015
Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
slouken@0
  1016
{
slouken@2267
  1017
    unsigned alpha = info->a;   /* downscale alpha to 5 bits */
slouken@1895
  1018
    if (alpha == 128) {
slouken@1895
  1019
        Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1895
  1020
    } else {
slouken@2262
  1021
        int width = info->dst_w;
slouken@2262
  1022
        int height = info->dst_h;
slouken@2262
  1023
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
  1024
        int srcskip = info->src_skip >> 1;
slouken@2262
  1025
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
  1026
        int dstskip = info->dst_skip >> 1;
slouken@1895
  1027
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@0
  1028
slouken@1895
  1029
        while (height--) {
slouken@12201
  1030
            /* *INDENT-OFF* */
slouken@12201
  1031
            DUFFS_LOOP4({
slouken@12201
  1032
                Uint32 s = *srcp++;
slouken@12201
  1033
                Uint32 d = *dstp;
slouken@12201
  1034
                /*
slouken@12201
  1035
                 * shift out the middle component (green) to
slouken@12201
  1036
                 * the high 16 bits, and process all three RGB
slouken@12201
  1037
                 * components at the same time.
slouken@12201
  1038
                 */
slouken@12201
  1039
                s = (s | s << 16) & 0x03e07c1f;
slouken@12201
  1040
                d = (d | d << 16) & 0x03e07c1f;
slouken@12201
  1041
                d += (s - d) * alpha >> 5;
slouken@12201
  1042
                d &= 0x03e07c1f;
slouken@12201
  1043
                *dstp++ = (Uint16)(d | d >> 16);
slouken@12201
  1044
            }, width);
slouken@12201
  1045
            /* *INDENT-ON* */
slouken@1895
  1046
            srcp += srcskip;
slouken@1895
  1047
            dstp += dstskip;
slouken@1895
  1048
        }
slouken@1895
  1049
    }
slouken@0
  1050
}
slouken@0
  1051
slouken@0
  1052
/* fast ARGB8888->RGB565 blending with pixel alpha */
slouken@1895
  1053
static void
slouken@1895
  1054
BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
slouken@0
  1055
{
slouken@2262
  1056
    int width = info->dst_w;
slouken@2262
  1057
    int height = info->dst_h;
slouken@2262
  1058
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
  1059
    int srcskip = info->src_skip >> 2;
slouken@2262
  1060
    Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
  1061
    int dstskip = info->dst_skip >> 1;
slouken@0
  1062
slouken@1895
  1063
    while (height--) {
slouken@12201
  1064
        /* *INDENT-OFF* */
slouken@12201
  1065
        DUFFS_LOOP4({
slouken@12201
  1066
        Uint32 s = *srcp;
slouken@12201
  1067
        unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@12201
  1068
        /* FIXME: Here we special-case opaque alpha since the
slouken@12201
  1069
           compositioning used (>>8 instead of /255) doesn't handle
slouken@12201
  1070
           it correctly. Also special-case alpha=0 for speed?
slouken@12201
  1071
           Benchmark this! */
slouken@12201
  1072
        if(alpha) {   
slouken@12201
  1073
          if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@12201
  1074
            *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
slouken@12201
  1075
          } else {
slouken@12201
  1076
            Uint32 d = *dstp;
slouken@12201
  1077
            /*
slouken@12201
  1078
             * convert source and destination to G0RAB65565
slouken@12201
  1079
             * and blend all components at the same time
slouken@12201
  1080
             */
slouken@12201
  1081
            s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
slouken@12201
  1082
              + (s >> 3 & 0x1f);
slouken@12201
  1083
            d = (d | d << 16) & 0x07e0f81f;
slouken@12201
  1084
            d += (s - d) * alpha >> 5;
slouken@12201
  1085
            d &= 0x07e0f81f;
slouken@12201
  1086
            *dstp = (Uint16)(d | d >> 16);
slouken@12201
  1087
          }
slouken@12201
  1088
        }
slouken@12201
  1089
        srcp++;
slouken@12201
  1090
        dstp++;
slouken@12201
  1091
        }, width);
slouken@12201
  1092
        /* *INDENT-ON* */
slouken@1895
  1093
        srcp += srcskip;
slouken@1895
  1094
        dstp += dstskip;
slouken@1895
  1095
    }
slouken@0
  1096
}
slouken@0
  1097
slouken@0
  1098
/* fast ARGB8888->RGB555 blending with pixel alpha */
slouken@1895
  1099
static void
slouken@1895
  1100
BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
slouken@0
  1101
{
slouken@2262
  1102
    int width = info->dst_w;
slouken@2262
  1103
    int height = info->dst_h;
slouken@2262
  1104
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
  1105
    int srcskip = info->src_skip >> 2;
slouken@2262
  1106
    Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
  1107
    int dstskip = info->dst_skip >> 1;
slouken@0
  1108
slouken@1895
  1109
    while (height--) {
slouken@12201
  1110
        /* *INDENT-OFF* */
slouken@12201
  1111
        DUFFS_LOOP4({
slouken@12201
  1112
        unsigned alpha;
slouken@12201
  1113
        Uint32 s = *srcp;
slouken@12201
  1114
        alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@12201
  1115
        /* FIXME: Here we special-case opaque alpha since the
slouken@12201
  1116
           compositioning used (>>8 instead of /255) doesn't handle
slouken@12201
  1117
           it correctly. Also special-case alpha=0 for speed?
slouken@12201
  1118
           Benchmark this! */
slouken@12201
  1119
        if(alpha) {   
slouken@12201
  1120
          if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@12201
  1121
            *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
slouken@12201
  1122
          } else {
slouken@12201
  1123
            Uint32 d = *dstp;
slouken@12201
  1124
            /*
slouken@12201
  1125
             * convert source and destination to G0RAB65565
slouken@12201
  1126
             * and blend all components at the same time
slouken@12201
  1127
             */
slouken@12201
  1128
            s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
slouken@12201
  1129
              + (s >> 3 & 0x1f);
slouken@12201
  1130
            d = (d | d << 16) & 0x03e07c1f;
slouken@12201
  1131
            d += (s - d) * alpha >> 5;
slouken@12201
  1132
            d &= 0x03e07c1f;
slouken@12201
  1133
            *dstp = (Uint16)(d | d >> 16);
slouken@12201
  1134
          }
slouken@12201
  1135
        }
slouken@12201
  1136
        srcp++;
slouken@12201
  1137
        dstp++;
slouken@12201
  1138
        }, width);
slouken@12201
  1139
        /* *INDENT-ON* */
slouken@1895
  1140
        srcp += srcskip;
slouken@1895
  1141
        dstp += dstskip;
slouken@1895
  1142
    }
slouken@0
  1143
}
slouken@0
  1144
slouken@0
  1145
/* General (slow) N->N blending with per-surface alpha */
slouken@1895
  1146
static void
slouken@1895
  1147
BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
slouken@0
  1148
{
slouken@2262
  1149
    int width = info->dst_w;
slouken@2262
  1150
    int height = info->dst_h;
slouken@2262
  1151
    Uint8 *src = info->src;
slouken@2267
  1152
    int srcskip = info->src_skip;
slouken@2262
  1153
    Uint8 *dst = info->dst;
slouken@2267
  1154
    int dstskip = info->dst_skip;
slouken@2267
  1155
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
  1156
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
  1157
    int srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  1158
    int dstbpp = dstfmt->BytesPerPixel;
slouken@7502
  1159
    Uint32 Pixel;
slouken@7502
  1160
    unsigned sR, sG, sB;
slouken@7502
  1161
    unsigned dR, dG, dB, dA;
slouken@7502
  1162
    const unsigned sA = info->a;
slouken@0
  1163
slouken@1895
  1164
    if (sA) {
slouken@1895
  1165
        while (height--) {
slouken@12201
  1166
        /* *INDENT-OFF* */
slouken@12201
  1167
        DUFFS_LOOP4(
slouken@12201
  1168
        {
slouken@12201
  1169
        DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
slouken@12201
  1170
        DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
slouken@12201
  1171
        ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
slouken@12201
  1172
        ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@12201
  1173
        src += srcbpp;
slouken@12201
  1174
        dst += dstbpp;
slouken@12201
  1175
        },
slouken@12201
  1176
        width);
slouken@12201
  1177
        /* *INDENT-ON* */
slouken@1895
  1178
            src += srcskip;
slouken@1895
  1179
            dst += dstskip;
slouken@1895
  1180
        }
slouken@1895
  1181
    }
slouken@0
  1182
}
slouken@0
  1183
slouken@0
  1184
/* General (slow) colorkeyed N->N blending with per-surface alpha */
slouken@1895
  1185
static void
slouken@1895
  1186
BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
slouken@0
  1187
{
slouken@2262
  1188
    int width = info->dst_w;
slouken@2262
  1189
    int height = info->dst_h;
slouken@2262
  1190
    Uint8 *src = info->src;
slouken@2267
  1191
    int srcskip = info->src_skip;
slouken@2262
  1192
    Uint8 *dst = info->dst;
slouken@2267
  1193
    int dstskip = info->dst_skip;
slouken@2267
  1194
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
  1195
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@2267
  1196
    Uint32 ckey = info->colorkey;
slouken@1895
  1197
    int srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  1198
    int dstbpp = dstfmt->BytesPerPixel;
slouken@7502
  1199
    Uint32 Pixel;
slouken@7502
  1200
    unsigned sR, sG, sB;
slouken@7502
  1201
    unsigned dR, dG, dB, dA;
slouken@7502
  1202
    const unsigned sA = info->a;
slouken@0
  1203
slouken@1895
  1204
    while (height--) {
slouken@12201
  1205
        /* *INDENT-OFF* */
slouken@12201
  1206
        DUFFS_LOOP4(
slouken@12201
  1207
        {
slouken@12201
  1208
        RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
slouken@12201
  1209
        if(sA && Pixel != ckey) {
slouken@12201
  1210
            RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
slouken@12201
  1211
            DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
slouken@12201
  1212
            ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
slouken@12201
  1213
            ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@12201
  1214
        }
slouken@12201
  1215
        src += srcbpp;
slouken@12201
  1216
        dst += dstbpp;
slouken@12201
  1217
        },
slouken@12201
  1218
        width);
slouken@12201
  1219
        /* *INDENT-ON* */
slouken@1895
  1220
        src += srcskip;
slouken@1895
  1221
        dst += dstskip;
slouken@1895
  1222
    }
slouken@0
  1223
}
slouken@0
  1224
slouken@0
  1225
/* General (slow) N->N blending with pixel alpha */
slouken@1895
  1226
static void
slouken@1895
  1227
BlitNtoNPixelAlpha(SDL_BlitInfo * info)
slouken@0
  1228
{
slouken@2262
  1229
    int width = info->dst_w;
slouken@2262
  1230
    int height = info->dst_h;
slouken@2262
  1231
    Uint8 *src = info->src;
slouken@2267
  1232
    int srcskip = info->src_skip;
slouken@2262
  1233
    Uint8 *dst = info->dst;
slouken@2267
  1234
    int dstskip = info->dst_skip;
slouken@2267
  1235
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
  1236
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
  1237
    int srcbpp;
slouken@1895
  1238
    int dstbpp;
slouken@7502
  1239
    Uint32 Pixel;
slouken@7502
  1240
    unsigned sR, sG, sB, sA;
slouken@7502
  1241
    unsigned dR, dG, dB, dA;
slouken@0
  1242
slouken@1895
  1243
    /* Set up some basic variables */
slouken@1895
  1244
    srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  1245
    dstbpp = dstfmt->BytesPerPixel;
slouken@0
  1246
slouken@1895
  1247
    while (height--) {
slouken@12201
  1248
        /* *INDENT-OFF* */
slouken@12201
  1249
        DUFFS_LOOP4(
slouken@12201
  1250
        {
slouken@12201
  1251
        DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
slouken@12201
  1252
        if(sA) {
slouken@12201
  1253
            DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
slouken@12201
  1254
            ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
slouken@12201
  1255
            ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@12201
  1256
        }
slouken@12201
  1257
        src += srcbpp;
slouken@12201
  1258
        dst += dstbpp;
slouken@12201
  1259
        },
slouken@12201
  1260
        width);
slouken@12201
  1261
        /* *INDENT-ON* */
slouken@1895
  1262
        src += srcskip;
slouken@1895
  1263
        dst += dstskip;
slouken@1895
  1264
    }
slouken@0
  1265
}
slouken@0
  1266
slouken@0
  1267
slouken@2267
  1268
SDL_BlitFunc
slouken@2267
  1269
SDL_CalculateBlitA(SDL_Surface * surface)
slouken@0
  1270
{
slouken@0
  1271
    SDL_PixelFormat *sf = surface->format;
slouken@0
  1272
    SDL_PixelFormat *df = surface->map->dst->format;
slouken@0
  1273
slouken@2853
  1274
    switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
slouken@2267
  1275
    case SDL_COPY_BLEND:
slouken@1895
  1276
        /* Per-pixel alpha blits */
slouken@1895
  1277
        switch (df->BytesPerPixel) {
slouken@1895
  1278
        case 1:
sylvain@12585
  1279
            if (df->palette != NULL) {
sylvain@12585
  1280
                return BlitNto1PixelAlpha;
sylvain@12585
  1281
            } else {
sylvain@12585
  1282
                /* RGB332 has no palette ! */
sylvain@12585
  1283
                return BlitNtoNPixelAlpha;
sylvain@12585
  1284
            }
slouken@0
  1285
slouken@1895
  1286
        case 2:
slouken@5389
  1287
                if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
slouken@5389
  1288
                    && sf->Gmask == 0xff00
slouken@5389
  1289
                    && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
slouken@5389
  1290
                        || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
slouken@1895
  1291
                if (df->Gmask == 0x7e0)
slouken@1895
  1292
                    return BlitARGBto565PixelAlpha;
slouken@1895
  1293
                else if (df->Gmask == 0x3e0)
slouken@1895
  1294
                    return BlitARGBto555PixelAlpha;
slouken@1895
  1295
            }
slouken@1895
  1296
            return BlitNtoNPixelAlpha;
slouken@0
  1297
slouken@1895
  1298
        case 4:
slouken@1895
  1299
            if (sf->Rmask == df->Rmask
slouken@1895
  1300
                && sf->Gmask == df->Gmask
slouken@1895
  1301
                && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
slouken@5389
  1302
#if defined(__MMX__) || defined(__3dNOW__)
slouken@1895
  1303
                if (sf->Rshift % 8 == 0
slouken@1895
  1304
                    && sf->Gshift % 8 == 0
slouken@1895
  1305
                    && sf->Bshift % 8 == 0
slouken@1895
  1306
                    && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
slouken@5389
  1307
#ifdef __3dNOW__
slouken@5389
  1308
                    if (SDL_Has3DNow())
slouken@5389
  1309
                        return BlitRGBtoRGBPixelAlphaMMX3DNOW;
slouken@5389
  1310
#endif
slouken@5389
  1311
#ifdef __MMX__
slouken@1895
  1312
                    if (SDL_HasMMX())
slouken@1895
  1313
                        return BlitRGBtoRGBPixelAlphaMMX;
slouken@5389
  1314
#endif
slouken@1895
  1315
                }
slouken@5389
  1316
#endif /* __MMX__ || __3dNOW__ */
slouken@1895
  1317
                if (sf->Amask == 0xff000000) {
slouken@1895
  1318
                    return BlitRGBtoRGBPixelAlpha;
slouken@1895
  1319
                }
slouken@1895
  1320
            }
slouken@7502
  1321
            return BlitNtoNPixelAlpha;
slouken@0
  1322
slouken@1895
  1323
        case 3:
slouken@1895
  1324
        default:
icculus@10926
  1325
            break;
slouken@1895
  1326
        }
icculus@10926
  1327
        return BlitNtoNPixelAlpha;
slouken@2267
  1328
slouken@2267
  1329
    case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
slouken@2267
  1330
        if (sf->Amask == 0) {
slouken@2267
  1331
            /* Per-surface alpha blits */
slouken@2267
  1332
            switch (df->BytesPerPixel) {
slouken@2267
  1333
            case 1:
sylvain@12585
  1334
                if (df->palette != NULL) {
sylvain@12585
  1335
                    return BlitNto1SurfaceAlpha;
sylvain@12585
  1336
                } else {
sylvain@12585
  1337
                    /* RGB332 has no palette ! */
sylvain@12585
  1338
                    return BlitNtoNSurfaceAlpha;
sylvain@12585
  1339
                }
slouken@2267
  1340
slouken@2267
  1341
            case 2:
slouken@2267
  1342
                if (surface->map->identity) {
slouken@2267
  1343
                    if (df->Gmask == 0x7e0) {
slouken@2267
  1344
#ifdef __MMX__
slouken@2267
  1345
                        if (SDL_HasMMX())
slouken@2267
  1346
                            return Blit565to565SurfaceAlphaMMX;
slouken@2267
  1347
                        else
slouken@2267
  1348
#endif
slouken@2267
  1349
                            return Blit565to565SurfaceAlpha;
slouken@2267
  1350
                    } else if (df->Gmask == 0x3e0) {
slouken@2267
  1351
#ifdef __MMX__
slouken@2267
  1352
                        if (SDL_HasMMX())
slouken@2267
  1353
                            return Blit555to555SurfaceAlphaMMX;
slouken@2267
  1354
                        else
slouken@2267
  1355
#endif
slouken@2267
  1356
                            return Blit555to555SurfaceAlpha;
slouken@2267
  1357
                    }
slouken@2267
  1358
                }
slouken@2267
  1359
                return BlitNtoNSurfaceAlpha;
slouken@2267
  1360
slouken@2267
  1361
            case 4:
slouken@2267
  1362
                if (sf->Rmask == df->Rmask
slouken@2267
  1363
                    && sf->Gmask == df->Gmask
slouken@2267
  1364
                    && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
slouken@2267
  1365
#ifdef __MMX__
slouken@2267
  1366
                    if (sf->Rshift % 8 == 0
slouken@2267
  1367
                        && sf->Gshift % 8 == 0
slouken@2267
  1368
                        && sf->Bshift % 8 == 0 && SDL_HasMMX())
slouken@2267
  1369
                        return BlitRGBtoRGBSurfaceAlphaMMX;
slouken@2267
  1370
#endif
slouken@2267
  1371
                    if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
slouken@2267
  1372
                        return BlitRGBtoRGBSurfaceAlpha;
slouken@2267
  1373
                    }
slouken@2267
  1374
                }
slouken@7502
  1375
                return BlitNtoNSurfaceAlpha;
slouken@2267
  1376
slouken@2267
  1377
            case 3:
slouken@2267
  1378
            default:
slouken@2267
  1379
                return BlitNtoNSurfaceAlpha;
slouken@2267
  1380
            }
slouken@2267
  1381
        }
slouken@2267
  1382
        break;
slouken@2267
  1383
slouken@2267
  1384
    case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
slouken@2267
  1385
        if (sf->Amask == 0) {
slouken@7502
  1386
            if (df->BytesPerPixel == 1) {
sylvain@12585
  1387
sylvain@12585
  1388
                if (df->palette != NULL) {
sylvain@12585
  1389
                    return BlitNto1SurfaceAlphaKey;
sylvain@12585
  1390
                } else {
sylvain@12585
  1391
                    /* RGB332 has no palette ! */
sylvain@12585
  1392
                    return BlitNtoNSurfaceAlphaKey;
sylvain@12585
  1393
                }
slouken@7502
  1394
            } else {
slouken@2267
  1395
                return BlitNtoNSurfaceAlphaKey;
slouken@7502
  1396
            }
slouken@2267
  1397
        }
slouken@2267
  1398
        break;
slouken@0
  1399
    }
slouken@2267
  1400
slouken@2267
  1401
    return NULL;
slouken@0
  1402
}
slouken@0
  1403
slouken@1895
  1404
/* vi: set ts=4 sw=4 expandtab: */