src/video/SDL_blit_A.c
author Gabriel Jacobo <gabomdq@gmail.com>
Wed, 21 Aug 2013 09:43:09 -0300
changeset 7677 871d43c6968a
parent 7641 0cd36d20df2b
child 7790 8136ce6b3950
permissions -rw-r--r--
OCD fixes: Adds a space before */
slouken@0
     1
/*
slouken@5535
     2
  Simple DirectMedia Layer
slouken@6885
     3
  Copyright (C) 1997-2013 Sam Lantinga <slouken@libsdl.org>
slouken@0
     4
slouken@5535
     5
  This software is provided 'as-is', without any express or implied
slouken@5535
     6
  warranty.  In no event will the authors be held liable for any damages
slouken@5535
     7
  arising from the use of this software.
slouken@0
     8
slouken@5535
     9
  Permission is granted to anyone to use this software for any purpose,
slouken@5535
    10
  including commercial applications, and to alter it and redistribute it
slouken@5535
    11
  freely, subject to the following restrictions:
slouken@0
    12
slouken@5535
    13
  1. The origin of this software must not be misrepresented; you must not
slouken@5535
    14
     claim that you wrote the original software. If you use this software
slouken@5535
    15
     in a product, an acknowledgment in the product documentation would be
slouken@5535
    16
     appreciated but is not required.
slouken@5535
    17
  2. Altered source versions must be plainly marked as such, and must not be
slouken@5535
    18
     misrepresented as being the original software.
slouken@5535
    19
  3. This notice may not be removed or altered from any source distribution.
slouken@0
    20
*/
slouken@1402
    21
#include "SDL_config.h"
slouken@0
    22
slouken@0
    23
#include "SDL_video.h"
slouken@0
    24
#include "SDL_blit.h"
slouken@0
    25
slouken@0
    26
/* Functions to perform alpha blended blitting */
slouken@0
    27
slouken@0
    28
/* N->1 blending with per-surface alpha */
slouken@1895
    29
static void
slouken@1895
    30
BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
slouken@0
    31
{
slouken@2262
    32
    int width = info->dst_w;
slouken@2262
    33
    int height = info->dst_h;
slouken@2262
    34
    Uint8 *src = info->src;
slouken@2267
    35
    int srcskip = info->src_skip;
slouken@2262
    36
    Uint8 *dst = info->dst;
slouken@2267
    37
    int dstskip = info->dst_skip;
slouken@1895
    38
    Uint8 *palmap = info->table;
slouken@2267
    39
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
    40
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
    41
    int srcbpp = srcfmt->BytesPerPixel;
slouken@7502
    42
    Uint32 Pixel;
slouken@7502
    43
    unsigned sR, sG, sB;
slouken@7502
    44
    unsigned dR, dG, dB;
slouken@2267
    45
    const unsigned A = info->a;
slouken@0
    46
slouken@1895
    47
    while (height--) {
slouken@1895
    48
	    /* *INDENT-OFF* */
slouken@0
    49
	    DUFFS_LOOP4(
slouken@0
    50
	    {
icculus@1162
    51
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
slouken@0
    52
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
    53
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
    54
		dB = dstfmt->palette->colors[*dst].b;
slouken@7502
    55
		ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
slouken@0
    56
		dR &= 0xff;
slouken@0
    57
		dG &= 0xff;
slouken@0
    58
		dB &= 0xff;
slouken@0
    59
		/* Pack RGB into 8bit pixel */
slouken@0
    60
		if ( palmap == NULL ) {
slouken@7502
    61
		    *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
slouken@0
    62
		} else {
slouken@7502
    63
		    *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
slouken@0
    64
		}
slouken@0
    65
		dst++;
slouken@0
    66
		src += srcbpp;
slouken@0
    67
	    },
slouken@0
    68
	    width);
slouken@1895
    69
	    /* *INDENT-ON* */
slouken@1895
    70
        src += srcskip;
slouken@1895
    71
        dst += dstskip;
slouken@1895
    72
    }
slouken@0
    73
}
slouken@0
    74
slouken@0
    75
/* N->1 blending with pixel alpha */
slouken@1895
    76
static void
slouken@1895
    77
BlitNto1PixelAlpha(SDL_BlitInfo * info)
slouken@0
    78
{
slouken@2262
    79
    int width = info->dst_w;
slouken@2262
    80
    int height = info->dst_h;
slouken@2262
    81
    Uint8 *src = info->src;
slouken@2267
    82
    int srcskip = info->src_skip;
slouken@2262
    83
    Uint8 *dst = info->dst;
slouken@2267
    84
    int dstskip = info->dst_skip;
slouken@1895
    85
    Uint8 *palmap = info->table;
slouken@2267
    86
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
    87
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
    88
    int srcbpp = srcfmt->BytesPerPixel;
slouken@7502
    89
    Uint32 Pixel;
slouken@7502
    90
    unsigned sR, sG, sB, sA;
slouken@7502
    91
    unsigned dR, dG, dB;
slouken@0
    92
slouken@1895
    93
    while (height--) {
slouken@1895
    94
	    /* *INDENT-OFF* */
slouken@0
    95
	    DUFFS_LOOP4(
slouken@0
    96
	    {
icculus@1162
    97
		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
slouken@0
    98
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
    99
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
   100
		dB = dstfmt->palette->colors[*dst].b;
slouken@7502
   101
		ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
slouken@0
   102
		dR &= 0xff;
slouken@0
   103
		dG &= 0xff;
slouken@0
   104
		dB &= 0xff;
slouken@0
   105
		/* Pack RGB into 8bit pixel */
slouken@0
   106
		if ( palmap == NULL ) {
slouken@7502
   107
		    *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
slouken@0
   108
		} else {
slouken@7502
   109
		    *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
slouken@0
   110
		}
slouken@0
   111
		dst++;
slouken@0
   112
		src += srcbpp;
slouken@0
   113
	    },
slouken@0
   114
	    width);
slouken@1895
   115
	    /* *INDENT-ON* */
slouken@1895
   116
        src += srcskip;
slouken@1895
   117
        dst += dstskip;
slouken@1895
   118
    }
slouken@0
   119
}
slouken@0
   120
slouken@0
   121
/* colorkeyed N->1 blending with per-surface alpha */
slouken@1895
   122
static void
slouken@1895
   123
BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
slouken@0
   124
{
slouken@2262
   125
    int width = info->dst_w;
slouken@2262
   126
    int height = info->dst_h;
slouken@2262
   127
    Uint8 *src = info->src;
slouken@2267
   128
    int srcskip = info->src_skip;
slouken@2262
   129
    Uint8 *dst = info->dst;
slouken@2267
   130
    int dstskip = info->dst_skip;
slouken@1895
   131
    Uint8 *palmap = info->table;
slouken@2267
   132
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
   133
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
   134
    int srcbpp = srcfmt->BytesPerPixel;
slouken@2267
   135
    Uint32 ckey = info->colorkey;
slouken@7502
   136
    Uint32 Pixel;
slouken@7502
   137
    unsigned sR, sG, sB;
slouken@7502
   138
    unsigned dR, dG, dB;
slouken@7502
   139
    const unsigned A = info->a;
slouken@0
   140
slouken@1895
   141
    while (height--) {
slouken@1895
   142
	    /* *INDENT-OFF* */
slouken@0
   143
	    DUFFS_LOOP(
slouken@0
   144
	    {
icculus@1162
   145
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
icculus@1162
   146
		if ( Pixel != ckey ) {
slouken@0
   147
		    dR = dstfmt->palette->colors[*dst].r;
slouken@0
   148
		    dG = dstfmt->palette->colors[*dst].g;
slouken@0
   149
		    dB = dstfmt->palette->colors[*dst].b;
slouken@7502
   150
		    ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
slouken@0
   151
		    dR &= 0xff;
slouken@0
   152
		    dG &= 0xff;
slouken@0
   153
		    dB &= 0xff;
slouken@0
   154
		    /* Pack RGB into 8bit pixel */
slouken@0
   155
		    if ( palmap == NULL ) {
slouken@7502
   156
                *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
slouken@0
   157
		    } else {
slouken@7502
   158
                *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
slouken@0
   159
		    }
slouken@0
   160
		}
slouken@0
   161
		dst++;
slouken@0
   162
		src += srcbpp;
slouken@0
   163
	    },
slouken@0
   164
	    width);
slouken@1895
   165
	    /* *INDENT-ON* */
slouken@1895
   166
        src += srcskip;
slouken@1895
   167
        dst += dstskip;
slouken@1895
   168
    }
slouken@0
   169
}
slouken@0
   170
slouken@2255
   171
#ifdef __MMX__
slouken@1542
   172
slouken@1542
   173
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1895
   174
static void
slouken@1895
   175
BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
slouken@1542
   176
{
slouken@2262
   177
    int width = info->dst_w;
slouken@2262
   178
    int height = info->dst_h;
slouken@2262
   179
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   180
    int srcskip = info->src_skip >> 2;
slouken@2262
   181
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   182
    int dstskip = info->dst_skip >> 2;
slouken@2267
   183
    Uint32 dalpha = info->dst_fmt->Amask;
slouken@1542
   184
slouken@1895
   185
    __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
slouken@1542
   186
slouken@1895
   187
    hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
slouken@1895
   188
    lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
slouken@1895
   189
    dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
slouken@1542
   190
slouken@1895
   191
    while (height--) {
slouken@1895
   192
        int n = width;
slouken@1895
   193
        if (n & 1) {
slouken@1895
   194
            Uint32 s = *srcp++;
slouken@1895
   195
            Uint32 d = *dstp;
slouken@1895
   196
            *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1895
   197
                       + (s & d & 0x00010101)) | dalpha;
slouken@1895
   198
            n--;
slouken@1895
   199
        }
slouken@1542
   200
slouken@1895
   201
        for (n >>= 1; n > 0; --n) {
slouken@1895
   202
            dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
slouken@1895
   203
            dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
slouken@1542
   204
slouken@1895
   205
            src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
slouken@1895
   206
            src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
slouken@1895
   207
slouken@1895
   208
            dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
slouken@1895
   209
            src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
slouken@1895
   210
            src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
slouken@1895
   211
            src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
slouken@1895
   212
slouken@1895
   213
            dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
slouken@1895
   214
            dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
slouken@1895
   215
            dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
slouken@1895
   216
            dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
slouken@1895
   217
slouken@1895
   218
            *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
slouken@1895
   219
            dstp += 2;
slouken@1895
   220
            srcp += 2;
slouken@1895
   221
        }
slouken@1895
   222
slouken@1895
   223
        srcp += srcskip;
slouken@1895
   224
        dstp += dstskip;
slouken@1895
   225
    }
slouken@1895
   226
    _mm_empty();
slouken@1542
   227
}
slouken@1542
   228
slouken@1542
   229
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1895
   230
static void
slouken@1895
   231
BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   232
{
slouken@2267
   233
    SDL_PixelFormat *df = info->dst_fmt;
slouken@6863
   234
    Uint32 chanmask;
slouken@2267
   235
    unsigned alpha = info->a;
slouken@1542
   236
slouken@1895
   237
    if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
slouken@1895
   238
        /* only call a128 version when R,G,B occupy lower bits */
slouken@1895
   239
        BlitRGBtoRGBSurfaceAlpha128MMX(info);
slouken@1895
   240
    } else {
slouken@2262
   241
        int width = info->dst_w;
slouken@2262
   242
        int height = info->dst_h;
slouken@2262
   243
        Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   244
        int srcskip = info->src_skip >> 2;
slouken@2262
   245
        Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   246
        int dstskip = info->dst_skip >> 2;
slouken@1895
   247
        Uint32 dalpha = df->Amask;
slouken@1895
   248
        Uint32 amult;
slouken@1542
   249
slouken@1895
   250
        __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
slouken@1542
   251
slouken@1895
   252
        mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
slouken@1895
   253
        /* form the alpha mult */
slouken@1895
   254
        amult = alpha | (alpha << 8);
slouken@1895
   255
        amult = amult | (amult << 16);
slouken@1895
   256
        chanmask =
slouken@3013
   257
            (0xff << df->Rshift) | (0xff << df->
slouken@3013
   258
                                    Gshift) | (0xff << df->Bshift);
slouken@1895
   259
        mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
slouken@1895
   260
        mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
slouken@1895
   261
        /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
slouken@1895
   262
        dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
slouken@1542
   263
slouken@1895
   264
        while (height--) {
slouken@1895
   265
            int n = width;
slouken@1895
   266
            if (n & 1) {
slouken@1895
   267
                /* One Pixel Blend */
slouken@1895
   268
                src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
slouken@1895
   269
                src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
slouken@1542
   270
slouken@1895
   271
                dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
slouken@1895
   272
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
   273
slouken@1895
   274
                src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
slouken@1895
   275
                src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
slouken@1895
   276
                src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
slouken@1895
   277
                dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
slouken@1542
   278
slouken@1895
   279
                dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
slouken@1895
   280
                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
slouken@1895
   281
                *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
   282
slouken@1895
   283
                ++srcp;
slouken@1895
   284
                ++dstp;
slouken@1542
   285
slouken@1895
   286
                n--;
slouken@1895
   287
            }
slouken@1542
   288
slouken@1895
   289
            for (n >>= 1; n > 0; --n) {
slouken@1895
   290
                /* Two Pixels Blend */
slouken@1895
   291
                src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
slouken@1895
   292
                src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
slouken@1895
   293
                src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
slouken@1895
   294
                src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
slouken@1542
   295
slouken@1895
   296
                dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
slouken@1895
   297
                dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
slouken@1895
   298
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
slouken@1895
   299
                dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
slouken@1895
   300
slouken@1895
   301
                src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
slouken@1895
   302
                src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
slouken@1895
   303
                src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
slouken@1895
   304
                dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
slouken@1895
   305
slouken@1895
   306
                src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
slouken@1895
   307
                src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
slouken@1895
   308
                src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
slouken@1895
   309
                dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
slouken@1895
   310
slouken@1895
   311
                dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
slouken@1895
   312
                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
slouken@1895
   313
slouken@1895
   314
                *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
slouken@1895
   315
slouken@1895
   316
                srcp += 2;
slouken@1895
   317
                dstp += 2;
slouken@1895
   318
            }
slouken@1895
   319
            srcp += srcskip;
slouken@1895
   320
            dstp += dstskip;
slouken@1895
   321
        }
slouken@1895
   322
        _mm_empty();
slouken@1895
   323
    }
slouken@1542
   324
}
slouken@1542
   325
slouken@1542
   326
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
   327
static void
slouken@1895
   328
BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   329
{
slouken@2262
   330
    int width = info->dst_w;
slouken@2262
   331
    int height = info->dst_h;
slouken@2262
   332
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   333
    int srcskip = info->src_skip >> 2;
slouken@2262
   334
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   335
    int dstskip = info->dst_skip >> 2;
slouken@2267
   336
    SDL_PixelFormat *sf = info->src_fmt;
slouken@1895
   337
    Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
slouken@1895
   338
    Uint32 amask = sf->Amask;
slouken@1895
   339
    Uint32 ashift = sf->Ashift;
slouken@7640
   340
    Uint64 multmask, multmask2;
slouken@1542
   341
slouken@7640
   342
    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
slouken@1542
   343
slouken@1895
   344
    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
slouken@7640
   345
    multmask = 0x00FF;
slouken@7640
   346
	multmask <<= (ashift * 2);
slouken@7640
   347
	multmask2 = 0x00FF00FF00FF00FF;
slouken@1542
   348
slouken@1895
   349
    while (height--) {
slouken@1895
   350
		/* *INDENT-OFF* */
slouken@1542
   351
		DUFFS_LOOP4({
slouken@1542
   352
		Uint32 alpha = *srcp & amask;
slouken@1542
   353
		if (alpha == 0) {
slouken@1542
   354
			/* do nothing */
slouken@7641
   355
		} else if (alpha == amask) {
slouken@7640
   356
			*dstp = *srcp;
slouken@1542
   357
		} else {
gabomdq@7677
   358
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
slouken@1542
   359
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
slouken@1542
   360
gabomdq@7677
   361
			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
slouken@1542
   362
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
   363
slouken@1542
   364
			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
slouken@1542
   365
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
slouken@1542
   366
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
slouken@7640
   367
			mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
gabomdq@7677
   368
			mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);	/* 0F0A0A0A -> mm_alpha */
gabomdq@7677
   369
			mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);	/* 255 - mm_alpha -> mm_alpha */
slouken@1542
   370
slouken@1542
   371
			/* blend */		    
slouken@7640
   372
			src1 = _mm_mullo_pi16(src1, mm_alpha);
slouken@7640
   373
			src1 = _mm_srli_pi16(src1, 8);
slouken@7640
   374
			dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
slouken@7640
   375
			dst1 = _mm_srli_pi16(dst1, 8);
slouken@7640
   376
			dst1 = _mm_add_pi16(src1, dst1);
slouken@7640
   377
			dst1 = _mm_packs_pu16(dst1, mm_zero);
slouken@1542
   378
			
slouken@1542
   379
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
   380
		}
slouken@1542
   381
		++srcp;
slouken@1542
   382
		++dstp;
slouken@1542
   383
	    }, width);
slouken@1895
   384
		/* *INDENT-ON* */
slouken@1895
   385
        srcp += srcskip;
slouken@1895
   386
        dstp += dstskip;
slouken@1895
   387
    }
slouken@1895
   388
    _mm_empty();
slouken@1542
   389
}
slouken@1895
   390
slouken@2255
   391
#endif /* __MMX__ */
slouken@689
   392
slouken@1
   393
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1895
   394
static void
slouken@1895
   395
BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
slouken@0
   396
{
slouken@2262
   397
    int width = info->dst_w;
slouken@2262
   398
    int height = info->dst_h;
slouken@2262
   399
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   400
    int srcskip = info->src_skip >> 2;
slouken@2262
   401
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   402
    int dstskip = info->dst_skip >> 2;
slouken@0
   403
slouken@1895
   404
    while (height--) {
slouken@1895
   405
	    /* *INDENT-OFF* */
slouken@0
   406
	    DUFFS_LOOP4({
slouken@1
   407
		    Uint32 s = *srcp++;
slouken@1
   408
		    Uint32 d = *dstp;
slouken@1
   409
		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1
   410
			       + (s & d & 0x00010101)) | 0xff000000;
slouken@0
   411
	    }, width);
slouken@1895
   412
	    /* *INDENT-ON* */
slouken@1895
   413
        srcp += srcskip;
slouken@1895
   414
        dstp += dstskip;
slouken@1895
   415
    }
slouken@0
   416
}
slouken@0
   417
slouken@1
   418
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1895
   419
static void
slouken@1895
   420
BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
slouken@1
   421
{
slouken@2267
   422
    unsigned alpha = info->a;
slouken@1895
   423
    if (alpha == 128) {
slouken@1895
   424
        BlitRGBtoRGBSurfaceAlpha128(info);
slouken@1895
   425
    } else {
slouken@2262
   426
        int width = info->dst_w;
slouken@2262
   427
        int height = info->dst_h;
slouken@2262
   428
        Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   429
        int srcskip = info->src_skip >> 2;
slouken@2262
   430
        Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   431
        int dstskip = info->dst_skip >> 2;
slouken@1895
   432
        Uint32 s;
slouken@1895
   433
        Uint32 d;
slouken@1895
   434
        Uint32 s1;
slouken@1895
   435
        Uint32 d1;
slouken@1
   436
slouken@1895
   437
        while (height--) {
slouken@1895
   438
			/* *INDENT-OFF* */
slouken@3035
   439
			DUFFS_LOOP4({
slouken@1
   440
				s = *srcp;
slouken@1
   441
				d = *dstp;
slouken@1
   442
				s1 = s & 0xff00ff;
slouken@1
   443
				d1 = d & 0xff00ff;
slouken@1
   444
				d1 = (d1 + ((s1 - d1) * alpha >> 8))
slouken@1
   445
				     & 0xff00ff;
slouken@1
   446
				s &= 0xff00;
slouken@1
   447
				d &= 0xff00;
slouken@1
   448
				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@1
   449
				*dstp = d1 | d | 0xff000000;
slouken@1
   450
				++srcp;
slouken@1
   451
				++dstp;
slouken@1
   452
			}, width);
slouken@1895
   453
			/* *INDENT-ON* */
slouken@1895
   454
            srcp += srcskip;
slouken@1895
   455
            dstp += dstskip;
slouken@1895
   456
        }
slouken@1895
   457
    }
slouken@1
   458
}
slouken@1
   459
slouken@0
   460
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
   461
static void
slouken@1895
   462
BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
slouken@0
   463
{
slouken@2262
   464
    int width = info->dst_w;
slouken@2262
   465
    int height = info->dst_h;
slouken@2262
   466
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   467
    int srcskip = info->src_skip >> 2;
slouken@2262
   468
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   469
    int dstskip = info->dst_skip >> 2;
slouken@0
   470
slouken@1895
   471
    while (height--) {
slouken@1895
   472
	    /* *INDENT-OFF* */
slouken@0
   473
	    DUFFS_LOOP4({
slouken@0
   474
		Uint32 dalpha;
slouken@0
   475
		Uint32 d;
slouken@0
   476
		Uint32 s1;
slouken@0
   477
		Uint32 d1;
slouken@0
   478
		Uint32 s = *srcp;
slouken@0
   479
		Uint32 alpha = s >> 24;
slouken@0
   480
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
   481
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
   482
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
   483
		   Benchmark this! */
slouken@7640
   484
		if (alpha) {
slouken@7640
   485
		  if (alpha == SDL_ALPHA_OPAQUE) {
slouken@7640
   486
			  *dstp = *srcp;
slouken@689
   487
		  } else {
slouken@0
   488
		    /*
slouken@0
   489
		     * take out the middle component (green), and process
slouken@0
   490
		     * the other two in parallel. One multiply less.
slouken@0
   491
		     */
slouken@0
   492
		    d = *dstp;
slouken@7640
   493
			dalpha = d >> 24;
slouken@0
   494
		    s1 = s & 0xff00ff;
slouken@0
   495
		    d1 = d & 0xff00ff;
slouken@0
   496
		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
slouken@0
   497
		    s &= 0xff00;
slouken@0
   498
		    d &= 0xff00;
slouken@0
   499
		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@7640
   500
			dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
slouken@7640
   501
		    *dstp = d1 | d | (dalpha << 24);
slouken@689
   502
		  }
slouken@0
   503
		}
slouken@0
   504
		++srcp;
slouken@0
   505
		++dstp;
slouken@0
   506
	    }, width);
slouken@1895
   507
	    /* *INDENT-ON* */
slouken@1895
   508
        srcp += srcskip;
slouken@1895
   509
        dstp += dstskip;
slouken@1895
   510
    }
slouken@0
   511
}
slouken@0
   512
slouken@5389
   513
#ifdef __3dNOW__
slouken@5389
   514
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
slouken@5389
   515
static void
slouken@5389
   516
BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
slouken@5389
   517
{
slouken@5389
   518
    int width = info->dst_w;
slouken@5389
   519
    int height = info->dst_h;
slouken@5389
   520
    Uint32 *srcp = (Uint32 *) info->src;
slouken@5389
   521
    int srcskip = info->src_skip >> 2;
slouken@5389
   522
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@5389
   523
    int dstskip = info->dst_skip >> 2;
slouken@5389
   524
    SDL_PixelFormat *sf = info->src_fmt;
slouken@5389
   525
    Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
slouken@5389
   526
    Uint32 amask = sf->Amask;
slouken@5389
   527
    Uint32 ashift = sf->Ashift;
slouken@7640
   528
    Uint64 multmask, multmask2;
slouken@5389
   529
slouken@7640
   530
    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
slouken@5389
   531
slouken@5389
   532
    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
slouken@7640
   533
    multmask = 0x00FF;
slouken@5389
   534
    multmask <<= (ashift * 2);
slouken@7640
   535
    multmask2 = 0x00FF00FF00FF00FF;
slouken@5389
   536
slouken@5389
   537
    while (height--) {
slouken@5389
   538
	    /* *INDENT-OFF* */
slouken@5389
   539
	    DUFFS_LOOP4({
slouken@5389
   540
		Uint32 alpha;
slouken@5389
   541
slouken@5389
   542
		_m_prefetch(srcp + 16);
slouken@5389
   543
		_m_prefetch(dstp + 16);
slouken@5389
   544
slouken@5389
   545
		alpha = *srcp & amask;
slouken@5389
   546
		if (alpha == 0) {
slouken@5389
   547
			/* do nothing */
slouken@7641
   548
		} else if (alpha == amask) {
slouken@7640
   549
			*dstp = *srcp;
slouken@5389
   550
		} else {
gabomdq@7677
   551
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
slouken@5389
   552
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
slouken@5389
   553
gabomdq@7677
   554
			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
slouken@5389
   555
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@5389
   556
slouken@5389
   557
			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
slouken@5389
   558
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
slouken@5389
   559
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
slouken@7640
   560
			mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
gabomdq@7677
   561
			mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);	/* 0F0A0A0A -> mm_alpha */
gabomdq@7677
   562
			mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);	/* 255 - mm_alpha -> mm_alpha */
slouken@7640
   563
slouken@5389
   564
slouken@5389
   565
			/* blend */		    
slouken@7640
   566
			src1 = _mm_mullo_pi16(src1, mm_alpha);
slouken@7640
   567
			src1 = _mm_srli_pi16(src1, 8);
slouken@7640
   568
			dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
slouken@7640
   569
			dst1 = _mm_srli_pi16(dst1, 8);
slouken@7640
   570
			dst1 = _mm_add_pi16(src1, dst1);
slouken@7640
   571
			dst1 = _mm_packs_pu16(dst1, mm_zero);
slouken@5389
   572
			
slouken@5389
   573
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@5389
   574
		}
slouken@5389
   575
		++srcp;
slouken@5389
   576
		++dstp;
slouken@5389
   577
	    }, width);
slouken@5389
   578
	    /* *INDENT-ON* */
slouken@5389
   579
        srcp += srcskip;
slouken@5389
   580
        dstp += dstskip;
slouken@5389
   581
    }
slouken@5389
   582
    _mm_empty();
slouken@5389
   583
}
slouken@5389
   584
slouken@5389
   585
#endif /* __3dNOW__ */
slouken@5389
   586
slouken@1
   587
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
slouken@1
   588
slouken@1
   589
/* blend a single 16 bit pixel at 50% */
slouken@1
   590
/*
 * Average two 16-bit pixels d and s.
 *
 * `mask` selects the bits of each colour field that take part in the
 * add-and-halve step.  The bits NOT in `mask` (within the low 16 bits) are
 * added back only where they are set in both d and s.
 * Every argument is parenthesized so that expressions may be passed safely;
 * note that d, s and mask are each evaluated more than once.
 */
#define BLEND16_50(d, s, mask)						\
	(((((s) & (mask)) + ((d) & (mask))) >> 1) +			\
	 ((s) & (d) & (~(mask) & 0xffff)))
slouken@1
   592
slouken@1
   593
/* blend two 16 bit pixels at 50% */
slouken@1
   594
/*
 * Average two pairs of 16-bit pixels packed into the 32-bit words d and s.
 *
 * The 16-bit `mask` is replicated into both halves of a 32-bit word.  It is
 * widened to Uint32 BEFORE shifting: a Uint16 promotes to int, and shifting
 * a value with its top bit set left by 16 would overflow that int.
 * Every argument is parenthesized; d, s and mask are evaluated more than once.
 */
#define BLEND2x16_50(d, s, mask)					     \
	((((s) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1) +	     \
	 (((d) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1) +	     \
	 ((s) & (d) & (~((Uint32)(mask) | ((Uint32)(mask) << 16)))))
slouken@1
   597
slouken@1895
   598
static void
slouken@1895
   599
Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
slouken@0
   600
{
slouken@2262
   601
    int width = info->dst_w;
slouken@2262
   602
    int height = info->dst_h;
slouken@2262
   603
    Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   604
    int srcskip = info->src_skip >> 1;
slouken@2262
   605
    Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   606
    int dstskip = info->dst_skip >> 1;
slouken@0
   607
slouken@1895
   608
    while (height--) {
slouken@1895
   609
        if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
slouken@1895
   610
            /*
slouken@1895
   611
             * Source and destination not aligned, pipeline it.
slouken@1895
   612
             * This is mostly a win for big blits but no loss for
slouken@1895
   613
             * small ones
slouken@1895
   614
             */
slouken@1895
   615
            Uint32 prev_sw;
slouken@1895
   616
            int w = width;
slouken@1
   617
slouken@1895
   618
            /* handle odd destination */
slouken@1895
   619
            if ((uintptr_t) dstp & 2) {
slouken@1895
   620
                Uint16 d = *dstp, s = *srcp;
slouken@1895
   621
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   622
                dstp++;
slouken@1895
   623
                srcp++;
slouken@1895
   624
                w--;
slouken@1895
   625
            }
slouken@1895
   626
            srcp++;             /* srcp is now 32-bit aligned */
slouken@1
   627
slouken@1895
   628
            /* bootstrap pipeline with first halfword */
slouken@1895
   629
            prev_sw = ((Uint32 *) srcp)[-1];
slouken@1
   630
slouken@1895
   631
            while (w > 1) {
slouken@1895
   632
                Uint32 sw, dw, s;
slouken@1895
   633
                sw = *(Uint32 *) srcp;
slouken@1895
   634
                dw = *(Uint32 *) dstp;
slouken@1443
   635
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
slouken@1895
   636
                s = (prev_sw << 16) + (sw >> 16);
slouken@1443
   637
#else
slouken@1895
   638
                s = (prev_sw >> 16) + (sw << 16);
slouken@1443
   639
#endif
slouken@1895
   640
                prev_sw = sw;
slouken@1895
   641
                *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
slouken@1895
   642
                dstp += 2;
slouken@1895
   643
                srcp += 2;
slouken@1895
   644
                w -= 2;
slouken@1895
   645
            }
slouken@1
   646
slouken@1895
   647
            /* final pixel if any */
slouken@1895
   648
            if (w) {
slouken@1895
   649
                Uint16 d = *dstp, s;
slouken@1443
   650
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
slouken@1895
   651
                s = (Uint16) prev_sw;
slouken@1443
   652
#else
slouken@1895
   653
                s = (Uint16) (prev_sw >> 16);
slouken@1443
   654
#endif
slouken@1895
   655
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   656
                srcp++;
slouken@1895
   657
                dstp++;
slouken@1895
   658
            }
slouken@1895
   659
            srcp += srcskip - 1;
slouken@1895
   660
            dstp += dstskip;
slouken@1895
   661
        } else {
slouken@1895
   662
            /* source and destination are aligned */
slouken@1895
   663
            int w = width;
slouken@1
   664
slouken@1895
   665
            /* first odd pixel? */
slouken@1895
   666
            if ((uintptr_t) srcp & 2) {
slouken@1895
   667
                Uint16 d = *dstp, s = *srcp;
slouken@1895
   668
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   669
                srcp++;
slouken@1895
   670
                dstp++;
slouken@1895
   671
                w--;
slouken@1895
   672
            }
slouken@1895
   673
            /* srcp and dstp are now 32-bit aligned */
slouken@1
   674
slouken@1895
   675
            while (w > 1) {
slouken@1895
   676
                Uint32 sw = *(Uint32 *) srcp;
slouken@1895
   677
                Uint32 dw = *(Uint32 *) dstp;
slouken@1895
   678
                *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
slouken@1895
   679
                srcp += 2;
slouken@1895
   680
                dstp += 2;
slouken@1895
   681
                w -= 2;
slouken@1895
   682
            }
slouken@1
   683
slouken@1895
   684
            /* last odd pixel? */
slouken@1895
   685
            if (w) {
slouken@1895
   686
                Uint16 d = *dstp, s = *srcp;
slouken@1895
   687
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   688
                srcp++;
slouken@1895
   689
                dstp++;
slouken@1895
   690
            }
slouken@1895
   691
            srcp += srcskip;
slouken@1895
   692
            dstp += dstskip;
slouken@1895
   693
        }
slouken@1895
   694
    }
slouken@1
   695
}
slouken@1
   696
slouken@2255
   697
#ifdef __MMX__
slouken@689
   698
slouken@1542
   699
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1895
   700
static void
slouken@1895
   701
Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   702
{
slouken@2267
   703
    unsigned alpha = info->a;
slouken@1895
   704
    if (alpha == 128) {
slouken@1895
   705
        Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1895
   706
    } else {
slouken@2262
   707
        int width = info->dst_w;
slouken@2262
   708
        int height = info->dst_h;
slouken@2262
   709
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   710
        int srcskip = info->src_skip >> 1;
slouken@2262
   711
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   712
        int dstskip = info->dst_skip >> 1;
slouken@1895
   713
        Uint32 s, d;
slouken@1542
   714
slouken@1895
   715
        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
slouken@1542
   716
slouken@1895
   717
        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
slouken@1895
   718
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
slouken@1895
   719
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1895
   720
slouken@1895
   721
        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
slouken@1895
   722
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
slouken@1895
   723
        /* position alpha to allow for mullo and mulhi on diff channels
slouken@1895
   724
           to reduce the number of operations */
slouken@1895
   725
        mm_alpha = _mm_slli_si64(mm_alpha, 3);
slouken@1895
   726
slouken@1895
   727
        /* Setup the 565 color channel masks */
slouken@1895
   728
        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
slouken@1895
   729
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
slouken@1895
   730
slouken@1895
   731
        while (height--) {
slouken@1895
   732
			/* *INDENT-OFF* */
slouken@3035
   733
			DUFFS_LOOP_124(
slouken@1542
   734
			{
slouken@1542
   735
				s = *srcp++;
slouken@1542
   736
				d = *dstp;
slouken@1542
   737
				/*
slouken@1542
   738
				 * shift out the middle component (green) to
slouken@1542
   739
				 * the high 16 bits, and process all three RGB
slouken@1542
   740
				 * components at the same time.
slouken@1542
   741
				 */
slouken@1542
   742
				s = (s | s << 16) & 0x07e0f81f;
slouken@1542
   743
				d = (d | d << 16) & 0x07e0f81f;
slouken@1542
   744
				d += (s - d) * alpha >> 5;
slouken@1542
   745
				d &= 0x07e0f81f;
slouken@1546
   746
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   747
			},{
slouken@1542
   748
				s = *srcp++;
slouken@1542
   749
				d = *dstp;
slouken@1542
   750
				/*
slouken@1542
   751
				 * shift out the middle component (green) to
slouken@1542
   752
				 * the high 16 bits, and process all three RGB
slouken@1542
   753
				 * components at the same time.
slouken@1542
   754
				 */
slouken@1542
   755
				s = (s | s << 16) & 0x07e0f81f;
slouken@1542
   756
				d = (d | d << 16) & 0x07e0f81f;
slouken@1542
   757
				d += (s - d) * alpha >> 5;
slouken@1542
   758
				d &= 0x07e0f81f;
slouken@1546
   759
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   760
				s = *srcp++;
slouken@1542
   761
				d = *dstp;
slouken@1542
   762
				/*
slouken@1542
   763
				 * shift out the middle component (green) to
slouken@1542
   764
				 * the high 16 bits, and process all three RGB
slouken@1542
   765
				 * components at the same time.
slouken@1542
   766
				 */
slouken@1542
   767
				s = (s | s << 16) & 0x07e0f81f;
slouken@1542
   768
				d = (d | d << 16) & 0x07e0f81f;
slouken@1542
   769
				d += (s - d) * alpha >> 5;
slouken@1542
   770
				d &= 0x07e0f81f;
slouken@1546
   771
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   772
			},{
slouken@1542
   773
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
slouken@1542
   774
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
slouken@1542
   775
slouken@1542
   776
				/* red */
slouken@1542
   777
				src2 = src1;
slouken@1542
   778
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
slouken@1542
   779
slouken@1542
   780
				dst2 = dst1;
slouken@1542
   781
				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
slouken@1542
   782
slouken@1542
   783
				/* blend */
slouken@1542
   784
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   785
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   786
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@1542
   787
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   788
				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
slouken@1542
   789
slouken@1542
   790
				mm_res = dst2; /* RED -> mm_res */
slouken@1542
   791
slouken@1542
   792
				/* green -- process the bits in place */
slouken@1542
   793
				src2 = src1;
slouken@1542
   794
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
slouken@1542
   795
slouken@1542
   796
				dst2 = dst1;
slouken@1542
   797
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
slouken@1542
   798
slouken@1542
   799
				/* blend */
slouken@1542
   800
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   801
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   802
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@1542
   803
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   804
slouken@1542
   805
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
slouken@1542
   806
slouken@1542
   807
				/* blue */
slouken@1542
   808
				src2 = src1;
slouken@1542
   809
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
slouken@1542
   810
slouken@1542
   811
				dst2 = dst1;
slouken@1542
   812
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
slouken@1542
   813
slouken@1542
   814
				/* blend */
slouken@1542
   815
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   816
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   817
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@1542
   818
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   819
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
slouken@1542
   820
slouken@1542
   821
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
slouken@1542
   822
slouken@1542
   823
				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
slouken@1542
   824
slouken@1542
   825
				srcp += 4;
slouken@1542
   826
				dstp += 4;
slouken@1895
   827
			}, width);
slouken@1895
   828
			/* *INDENT-ON* */
slouken@1895
   829
            srcp += srcskip;
slouken@1895
   830
            dstp += dstskip;
slouken@1895
   831
        }
slouken@1895
   832
        _mm_empty();
slouken@1895
   833
    }
slouken@1542
   834
}
slouken@1542
   835
slouken@1542
   836
/* fast RGB555->RGB555 blending with surface alpha */
slouken@1895
   837
static void
slouken@1895
   838
Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   839
{
slouken@2267
   840
    unsigned alpha = info->a;
slouken@1895
   841
    if (alpha == 128) {
slouken@1895
   842
        Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1895
   843
    } else {
slouken@2262
   844
        int width = info->dst_w;
slouken@2262
   845
        int height = info->dst_h;
slouken@2262
   846
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   847
        int srcskip = info->src_skip >> 1;
slouken@2262
   848
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   849
        int dstskip = info->dst_skip >> 1;
slouken@1895
   850
        Uint32 s, d;
slouken@1542
   851
slouken@1895
   852
        __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
slouken@1542
   853
slouken@1895
   854
        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
slouken@1895
   855
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
slouken@1895
   856
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1542
   857
slouken@1895
   858
        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
slouken@1895
   859
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
slouken@1895
   860
        /* position alpha to allow for mullo and mulhi on diff channels
slouken@1895
   861
           to reduce the number of operations */
slouken@1895
   862
        mm_alpha = _mm_slli_si64(mm_alpha, 3);
slouken@1895
   863
slouken@1895
   864
        /* Setup the 555 color channel masks */
slouken@1895
   865
        rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
slouken@1895
   866
        gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
slouken@1895
   867
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
slouken@1895
   868
slouken@1895
   869
        while (height--) {
slouken@1895
   870
			/* *INDENT-OFF* */
slouken@3035
   871
			DUFFS_LOOP_124(
slouken@1542
   872
			{
slouken@1542
   873
				s = *srcp++;
slouken@1542
   874
				d = *dstp;
slouken@1542
   875
				/*
slouken@1542
   876
				 * shift out the middle component (green) to
slouken@1542
   877
				 * the high 16 bits, and process all three RGB
slouken@1542
   878
				 * components at the same time.
slouken@1542
   879
				 */
slouken@1542
   880
				s = (s | s << 16) & 0x03e07c1f;
slouken@1542
   881
				d = (d | d << 16) & 0x03e07c1f;
slouken@1542
   882
				d += (s - d) * alpha >> 5;
slouken@1542
   883
				d &= 0x03e07c1f;
slouken@1546
   884
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   885
			},{
slouken@1542
   886
				s = *srcp++;
slouken@1542
   887
				d = *dstp;
slouken@1542
   888
				/*
slouken@1542
   889
				 * shift out the middle component (green) to
slouken@1542
   890
				 * the high 16 bits, and process all three RGB
slouken@1542
   891
				 * components at the same time.
slouken@1542
   892
				 */
slouken@1542
   893
				s = (s | s << 16) & 0x03e07c1f;
slouken@1542
   894
				d = (d | d << 16) & 0x03e07c1f;
slouken@1542
   895
				d += (s - d) * alpha >> 5;
slouken@1542
   896
				d &= 0x03e07c1f;
slouken@1546
   897
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   898
			        s = *srcp++;
slouken@1542
   899
				d = *dstp;
slouken@1542
   900
				/*
slouken@1542
   901
				 * shift out the middle component (green) to
slouken@1542
   902
				 * the high 16 bits, and process all three RGB
slouken@1542
   903
				 * components at the same time.
slouken@1542
   904
				 */
slouken@1542
   905
				s = (s | s << 16) & 0x03e07c1f;
slouken@1542
   906
				d = (d | d << 16) & 0x03e07c1f;
slouken@1542
   907
				d += (s - d) * alpha >> 5;
slouken@1542
   908
				d &= 0x03e07c1f;
slouken@1546
   909
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   910
			},{
slouken@1542
   911
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
slouken@1542
   912
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
slouken@1542
   913
slouken@1542
   914
				/* red -- process the bits in place */
slouken@1542
   915
				src2 = src1;
slouken@1542
   916
				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
slouken@1542
   917
slouken@1542
   918
				dst2 = dst1;
slouken@1542
   919
				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
slouken@1542
   920
slouken@1542
   921
				/* blend */
slouken@1542
   922
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   923
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   924
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@1542
   925
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   926
				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
slouken@1542
   927
slouken@1542
   928
				mm_res = dst2; /* RED -> mm_res */
slouken@1542
   929
				
slouken@1542
   930
				/* green -- process the bits in place */
slouken@1542
   931
				src2 = src1;
slouken@1542
   932
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
slouken@1542
   933
slouken@1542
   934
				dst2 = dst1;
slouken@1542
   935
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
slouken@1542
   936
slouken@1542
   937
				/* blend */
slouken@1542
   938
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   939
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   940
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@1542
   941
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   942
slouken@1542
   943
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
slouken@1542
   944
slouken@1542
   945
				/* blue */
slouken@1542
   946
				src2 = src1; /* src -> src2 */
slouken@1542
   947
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
slouken@1542
   948
slouken@1542
   949
				dst2 = dst1; /* dst -> dst2 */
slouken@1542
   950
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
slouken@1542
   951
slouken@1542
   952
				/* blend */
slouken@1542
   953
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   954
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   955
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@1542
   956
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   957
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
slouken@1542
   958
slouken@1542
   959
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
slouken@1542
   960
slouken@1542
   961
				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
slouken@1542
   962
slouken@1542
   963
				srcp += 4;
slouken@1542
   964
				dstp += 4;
slouken@1895
   965
			}, width);
slouken@1895
   966
			/* *INDENT-ON* */
slouken@1895
   967
            srcp += srcskip;
slouken@1895
   968
            dstp += dstskip;
slouken@1895
   969
        }
slouken@1895
   970
        _mm_empty();
slouken@1895
   971
    }
slouken@1542
   972
}
slouken@2255
   973
slouken@2255
   974
#endif /* __MMX__ */
slouken@689
   975
slouken@1
   976
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1895
   977
static void
slouken@1895
   978
Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
slouken@1
   979
{
slouken@2267
   980
    unsigned alpha = info->a;
slouken@1895
   981
    if (alpha == 128) {
slouken@1895
   982
        Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1895
   983
    } else {
slouken@2262
   984
        int width = info->dst_w;
slouken@2262
   985
        int height = info->dst_h;
slouken@2262
   986
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   987
        int srcskip = info->src_skip >> 1;
slouken@2262
   988
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   989
        int dstskip = info->dst_skip >> 1;
slouken@1895
   990
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1
   991
slouken@1895
   992
        while (height--) {
slouken@1895
   993
			/* *INDENT-OFF* */
slouken@1
   994
			DUFFS_LOOP4({
slouken@1
   995
				Uint32 s = *srcp++;
slouken@1
   996
				Uint32 d = *dstp;
slouken@1
   997
				/*
slouken@1
   998
				 * shift out the middle component (green) to
slouken@1
   999
				 * the high 16 bits, and process all three RGB
slouken@1
  1000
				 * components at the same time.
slouken@1
  1001
				 */
slouken@1
  1002
				s = (s | s << 16) & 0x07e0f81f;
slouken@1
  1003
				d = (d | d << 16) & 0x07e0f81f;
slouken@1
  1004
				d += (s - d) * alpha >> 5;
slouken@1
  1005
				d &= 0x07e0f81f;
slouken@1428
  1006
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1
  1007
			}, width);
slouken@1895
  1008
			/* *INDENT-ON* */
slouken@1895
  1009
            srcp += srcskip;
slouken@1895
  1010
            dstp += dstskip;
slouken@1895
  1011
        }
slouken@1895
  1012
    }
slouken@0
  1013
}
slouken@0
  1014
slouken@0
  1015
/* fast RGB555->RGB555 blending with surface alpha */
slouken@1895
  1016
static void
slouken@1895
  1017
Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
slouken@0
  1018
{
slouken@2267
  1019
    unsigned alpha = info->a;   /* downscale alpha to 5 bits */
slouken@1895
  1020
    if (alpha == 128) {
slouken@1895
  1021
        Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1895
  1022
    } else {
slouken@2262
  1023
        int width = info->dst_w;
slouken@2262
  1024
        int height = info->dst_h;
slouken@2262
  1025
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
  1026
        int srcskip = info->src_skip >> 1;
slouken@2262
  1027
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
  1028
        int dstskip = info->dst_skip >> 1;
slouken@1895
  1029
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@0
  1030
slouken@1895
  1031
        while (height--) {
slouken@1895
  1032
			/* *INDENT-OFF* */
slouken@1
  1033
			DUFFS_LOOP4({
slouken@1
  1034
				Uint32 s = *srcp++;
slouken@1
  1035
				Uint32 d = *dstp;
slouken@1
  1036
				/*
slouken@1
  1037
				 * shift out the middle component (green) to
slouken@1
  1038
				 * the high 16 bits, and process all three RGB
slouken@1
  1039
				 * components at the same time.
slouken@1
  1040
				 */
slouken@1
  1041
				s = (s | s << 16) & 0x03e07c1f;
slouken@1
  1042
				d = (d | d << 16) & 0x03e07c1f;
slouken@1
  1043
				d += (s - d) * alpha >> 5;
slouken@1
  1044
				d &= 0x03e07c1f;
slouken@1428
  1045
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1
  1046
			}, width);
slouken@1895
  1047
			/* *INDENT-ON* */
slouken@1895
  1048
            srcp += srcskip;
slouken@1895
  1049
            dstp += dstskip;
slouken@1895
  1050
        }
slouken@1895
  1051
    }
slouken@0
  1052
}
slouken@0
  1053
slouken@0
  1054
/* fast ARGB8888->RGB565 blending with pixel alpha */
slouken@1895
  1055
static void
slouken@1895
  1056
BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
slouken@0
  1057
{
slouken@2262
  1058
    int width = info->dst_w;
slouken@2262
  1059
    int height = info->dst_h;
slouken@2262
  1060
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
  1061
    int srcskip = info->src_skip >> 2;
slouken@2262
  1062
    Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
  1063
    int dstskip = info->dst_skip >> 1;
slouken@0
  1064
slouken@1895
  1065
    while (height--) {
slouken@1895
  1066
	    /* *INDENT-OFF* */
slouken@0
  1067
	    DUFFS_LOOP4({
slouken@0
  1068
		Uint32 s = *srcp;
slouken@0
  1069
		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  1070
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  1071
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  1072
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  1073
		   Benchmark this! */
slouken@689
  1074
		if(alpha) {   
slouken@689
  1075
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@1428
  1076
		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
slouken@689
  1077
		  } else {
slouken@0
  1078
		    Uint32 d = *dstp;
slouken@0
  1079
		    /*
slouken@0
  1080
		     * convert source and destination to G0RAB65565
slouken@0
  1081
		     * and blend all components at the same time
slouken@0
  1082
		     */
slouken@0
  1083
		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
slouken@0
  1084
		      + (s >> 3 & 0x1f);
slouken@0
  1085
		    d = (d | d << 16) & 0x07e0f81f;
slouken@0
  1086
		    d += (s - d) * alpha >> 5;
slouken@0
  1087
		    d &= 0x07e0f81f;
slouken@1428
  1088
		    *dstp = (Uint16)(d | d >> 16);
slouken@689
  1089
		  }
slouken@0
  1090
		}
slouken@0
  1091
		srcp++;
slouken@0
  1092
		dstp++;
slouken@0
  1093
	    }, width);
slouken@1895
  1094
	    /* *INDENT-ON* */
slouken@1895
  1095
        srcp += srcskip;
slouken@1895
  1096
        dstp += dstskip;
slouken@1895
  1097
    }
slouken@0
  1098
}
slouken@0
  1099
slouken@0
  1100
/* fast ARGB8888->RGB555 blending with pixel alpha */
slouken@1895
  1101
static void
slouken@1895
  1102
BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
slouken@0
  1103
{
slouken@2262
  1104
    int width = info->dst_w;
slouken@2262
  1105
    int height = info->dst_h;
slouken@2262
  1106
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
  1107
    int srcskip = info->src_skip >> 2;
slouken@2262
  1108
    Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
  1109
    int dstskip = info->dst_skip >> 1;
slouken@0
  1110
slouken@1895
  1111
    while (height--) {
slouken@1895
  1112
	    /* *INDENT-OFF* */
slouken@0
  1113
	    DUFFS_LOOP4({
slouken@0
  1114
		unsigned alpha;
slouken@0
  1115
		Uint32 s = *srcp;
slouken@0
  1116
		alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  1117
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  1118
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  1119
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  1120
		   Benchmark this! */
slouken@689
  1121
		if(alpha) {   
slouken@689
  1122
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@1428
  1123
		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
slouken@689
  1124
		  } else {
slouken@0
  1125
		    Uint32 d = *dstp;
slouken@0
  1126
		    /*
slouken@0
  1127
		     * convert source and destination to G0RAB65565
slouken@0
  1128
		     * and blend all components at the same time
slouken@0
  1129
		     */
slouken@0
  1130
		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
slouken@0
  1131
		      + (s >> 3 & 0x1f);
slouken@0
  1132
		    d = (d | d << 16) & 0x03e07c1f;
slouken@0
  1133
		    d += (s - d) * alpha >> 5;
slouken@0
  1134
		    d &= 0x03e07c1f;
slouken@1428
  1135
		    *dstp = (Uint16)(d | d >> 16);
slouken@689
  1136
		  }
slouken@0
  1137
		}
slouken@0
  1138
		srcp++;
slouken@0
  1139
		dstp++;
slouken@0
  1140
	    }, width);
slouken@1895
  1141
	    /* *INDENT-ON* */
slouken@1895
  1142
        srcp += srcskip;
slouken@1895
  1143
        dstp += dstskip;
slouken@1895
  1144
    }
slouken@0
  1145
}
slouken@0
  1146
slouken@0
  1147
/* General (slow) N->N blending with per-surface alpha */
slouken@1895
  1148
static void
slouken@1895
  1149
BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
slouken@0
  1150
{
slouken@2262
  1151
    int width = info->dst_w;
slouken@2262
  1152
    int height = info->dst_h;
slouken@2262
  1153
    Uint8 *src = info->src;
slouken@2267
  1154
    int srcskip = info->src_skip;
slouken@2262
  1155
    Uint8 *dst = info->dst;
slouken@2267
  1156
    int dstskip = info->dst_skip;
slouken@2267
  1157
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
  1158
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
  1159
    int srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  1160
    int dstbpp = dstfmt->BytesPerPixel;
slouken@7502
  1161
    Uint32 Pixel;
slouken@7502
  1162
    unsigned sR, sG, sB;
slouken@7502
  1163
    unsigned dR, dG, dB, dA;
slouken@7502
  1164
    const unsigned sA = info->a;
slouken@0
  1165
slouken@1895
  1166
    if (sA) {
slouken@1895
  1167
        while (height--) {
slouken@1895
  1168
	    /* *INDENT-OFF* */
slouken@0
  1169
	    DUFFS_LOOP4(
slouken@0
  1170
	    {
icculus@1162
  1171
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
slouken@7502
  1172
		DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
slouken@7502
  1173
		ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
slouken@0
  1174
		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  1175
		src += srcbpp;
slouken@0
  1176
		dst += dstbpp;
slouken@0
  1177
	    },
slouken@0
  1178
	    width);
slouken@1895
  1179
	    /* *INDENT-ON* */
slouken@1895
  1180
            src += srcskip;
slouken@1895
  1181
            dst += dstskip;
slouken@1895
  1182
        }
slouken@1895
  1183
    }
slouken@0
  1184
}
slouken@0
  1185
slouken@0
  1186
/* General (slow) colorkeyed N->N blending with per-surface alpha */
slouken@1895
  1187
static void
slouken@1895
  1188
BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
slouken@0
  1189
{
slouken@2262
  1190
    int width = info->dst_w;
slouken@2262
  1191
    int height = info->dst_h;
slouken@2262
  1192
    Uint8 *src = info->src;
slouken@2267
  1193
    int srcskip = info->src_skip;
slouken@2262
  1194
    Uint8 *dst = info->dst;
slouken@2267
  1195
    int dstskip = info->dst_skip;
slouken@2267
  1196
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
  1197
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@2267
  1198
    Uint32 ckey = info->colorkey;
slouken@1895
  1199
    int srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  1200
    int dstbpp = dstfmt->BytesPerPixel;
slouken@7502
  1201
    Uint32 Pixel;
slouken@7502
  1202
    unsigned sR, sG, sB;
slouken@7502
  1203
    unsigned dR, dG, dB, dA;
slouken@7502
  1204
    const unsigned sA = info->a;
slouken@0
  1205
slouken@1895
  1206
    while (height--) {
slouken@1895
  1207
	    /* *INDENT-OFF* */
slouken@0
  1208
	    DUFFS_LOOP4(
slouken@0
  1209
	    {
icculus@1162
  1210
		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
icculus@1162
  1211
		if(sA && Pixel != ckey) {
icculus@1162
  1212
		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
slouken@7502
  1213
		    DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
slouken@7502
  1214
		    ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
slouken@0
  1215
		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  1216
		}
slouken@0
  1217
		src += srcbpp;
slouken@0
  1218
		dst += dstbpp;
slouken@0
  1219
	    },
slouken@0
  1220
	    width);
slouken@1895
  1221
	    /* *INDENT-ON* */
slouken@1895
  1222
        src += srcskip;
slouken@1895
  1223
        dst += dstskip;
slouken@1895
  1224
    }
slouken@0
  1225
}
slouken@0
  1226
slouken@0
  1227
/* General (slow) N->N blending with pixel alpha */
slouken@1895
  1228
static void
slouken@1895
  1229
BlitNtoNPixelAlpha(SDL_BlitInfo * info)
slouken@0
  1230
{
slouken@2262
  1231
    int width = info->dst_w;
slouken@2262
  1232
    int height = info->dst_h;
slouken@2262
  1233
    Uint8 *src = info->src;
slouken@2267
  1234
    int srcskip = info->src_skip;
slouken@2262
  1235
    Uint8 *dst = info->dst;
slouken@2267
  1236
    int dstskip = info->dst_skip;
slouken@2267
  1237
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
  1238
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
  1239
    int srcbpp;
slouken@1895
  1240
    int dstbpp;
slouken@7502
  1241
    Uint32 Pixel;
slouken@7502
  1242
    unsigned sR, sG, sB, sA;
slouken@7502
  1243
    unsigned dR, dG, dB, dA;
slouken@0
  1244
slouken@1895
  1245
    /* Set up some basic variables */
slouken@1895
  1246
    srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  1247
    dstbpp = dstfmt->BytesPerPixel;
slouken@0
  1248
slouken@1895
  1249
    while (height--) {
slouken@1895
  1250
	    /* *INDENT-OFF* */
slouken@0
  1251
	    DUFFS_LOOP4(
slouken@0
  1252
	    {
icculus@1162
  1253
		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
slouken@689
  1254
		if(sA) {
slouken@7502
  1255
		    DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
slouken@7502
  1256
		    ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
slouken@7502
  1257
		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@689
  1258
		}
slouken@0
  1259
		src += srcbpp;
slouken@0
  1260
		dst += dstbpp;
slouken@0
  1261
	    },
slouken@0
  1262
	    width);
slouken@1895
  1263
	    /* *INDENT-ON* */
slouken@1895
  1264
        src += srcskip;
slouken@1895
  1265
        dst += dstskip;
slouken@1895
  1266
    }
slouken@0
  1267
}
slouken@0
  1268
slouken@0
  1269
slouken@2267
  1270
SDL_BlitFunc
slouken@2267
  1271
SDL_CalculateBlitA(SDL_Surface * surface)
slouken@0
  1272
{
slouken@0
  1273
    SDL_PixelFormat *sf = surface->format;
slouken@0
  1274
    SDL_PixelFormat *df = surface->map->dst->format;
slouken@0
  1275
slouken@2853
  1276
    switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
slouken@2267
  1277
    case SDL_COPY_BLEND:
slouken@1895
  1278
        /* Per-pixel alpha blits */
slouken@1895
  1279
        switch (df->BytesPerPixel) {
slouken@1895
  1280
        case 1:
slouken@1895
  1281
            return BlitNto1PixelAlpha;
slouken@0
  1282
slouken@1895
  1283
        case 2:
slouken@5389
  1284
                if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
slouken@5389
  1285
                    && sf->Gmask == 0xff00
slouken@5389
  1286
                    && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
slouken@5389
  1287
                        || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
slouken@1895
  1288
                if (df->Gmask == 0x7e0)
slouken@1895
  1289
                    return BlitARGBto565PixelAlpha;
slouken@1895
  1290
                else if (df->Gmask == 0x3e0)
slouken@1895
  1291
                    return BlitARGBto555PixelAlpha;
slouken@1895
  1292
            }
slouken@1895
  1293
            return BlitNtoNPixelAlpha;
slouken@0
  1294
slouken@1895
  1295
        case 4:
slouken@1895
  1296
            if (sf->Rmask == df->Rmask
slouken@1895
  1297
                && sf->Gmask == df->Gmask
slouken@1895
  1298
                && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
slouken@5389
  1299
#if defined(__MMX__) || defined(__3dNOW__)
slouken@1895
  1300
                if (sf->Rshift % 8 == 0
slouken@1895
  1301
                    && sf->Gshift % 8 == 0
slouken@1895
  1302
                    && sf->Bshift % 8 == 0
slouken@1895
  1303
                    && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
slouken@5389
  1304
#ifdef __3dNOW__
slouken@5389
  1305
                    if (SDL_Has3DNow())
slouken@5389
  1306
                        return BlitRGBtoRGBPixelAlphaMMX3DNOW;
slouken@5389
  1307
#endif
slouken@5389
  1308
#ifdef __MMX__
slouken@1895
  1309
                    if (SDL_HasMMX())
slouken@1895
  1310
                        return BlitRGBtoRGBPixelAlphaMMX;
slouken@5389
  1311
#endif
slouken@1895
  1312
                }
slouken@5389
  1313
#endif /* __MMX__ || __3dNOW__ */
slouken@1895
  1314
                if (sf->Amask == 0xff000000) {
slouken@1895
  1315
                    return BlitRGBtoRGBPixelAlpha;
slouken@1895
  1316
                }
slouken@1895
  1317
            }
slouken@7502
  1318
            return BlitNtoNPixelAlpha;
slouken@0
  1319
slouken@1895
  1320
        case 3:
slouken@1895
  1321
        default:
slouken@1895
  1322
            return BlitNtoNPixelAlpha;
slouken@1895
  1323
        }
slouken@2267
  1324
        break;
slouken@2267
  1325
slouken@2267
  1326
    case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
slouken@2267
  1327
        if (sf->Amask == 0) {
slouken@2267
  1328
            /* Per-surface alpha blits */
slouken@2267
  1329
            switch (df->BytesPerPixel) {
slouken@2267
  1330
            case 1:
slouken@2267
  1331
                return BlitNto1SurfaceAlpha;
slouken@2267
  1332
slouken@2267
  1333
            case 2:
slouken@2267
  1334
                if (surface->map->identity) {
slouken@2267
  1335
                    if (df->Gmask == 0x7e0) {
slouken@2267
  1336
#ifdef __MMX__
slouken@2267
  1337
                        if (SDL_HasMMX())
slouken@2267
  1338
                            return Blit565to565SurfaceAlphaMMX;
slouken@2267
  1339
                        else
slouken@2267
  1340
#endif
slouken@2267
  1341
                            return Blit565to565SurfaceAlpha;
slouken@2267
  1342
                    } else if (df->Gmask == 0x3e0) {
slouken@2267
  1343
#ifdef __MMX__
slouken@2267
  1344
                        if (SDL_HasMMX())
slouken@2267
  1345
                            return Blit555to555SurfaceAlphaMMX;
slouken@2267
  1346
                        else
slouken@2267
  1347
#endif
slouken@2267
  1348
                            return Blit555to555SurfaceAlpha;
slouken@2267
  1349
                    }
slouken@2267
  1350
                }
slouken@2267
  1351
                return BlitNtoNSurfaceAlpha;
slouken@2267
  1352
slouken@2267
  1353
            case 4:
slouken@2267
  1354
                if (sf->Rmask == df->Rmask
slouken@2267
  1355
                    && sf->Gmask == df->Gmask
slouken@2267
  1356
                    && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
slouken@2267
  1357
#ifdef __MMX__
slouken@2267
  1358
                    if (sf->Rshift % 8 == 0
slouken@2267
  1359
                        && sf->Gshift % 8 == 0
slouken@2267
  1360
                        && sf->Bshift % 8 == 0 && SDL_HasMMX())
slouken@2267
  1361
                        return BlitRGBtoRGBSurfaceAlphaMMX;
slouken@2267
  1362
#endif
slouken@2267
  1363
                    if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
slouken@2267
  1364
                        return BlitRGBtoRGBSurfaceAlpha;
slouken@2267
  1365
                    }
slouken@2267
  1366
                }
slouken@7502
  1367
                return BlitNtoNSurfaceAlpha;
slouken@2267
  1368
slouken@2267
  1369
            case 3:
slouken@2267
  1370
            default:
slouken@2267
  1371
                return BlitNtoNSurfaceAlpha;
slouken@2267
  1372
            }
slouken@2267
  1373
        }
slouken@2267
  1374
        break;
slouken@2267
  1375
slouken@2267
  1376
    case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
slouken@2267
  1377
        if (sf->Amask == 0) {
slouken@7502
  1378
            if (df->BytesPerPixel == 1) {
slouken@2267
  1379
                return BlitNto1SurfaceAlphaKey;
slouken@7502
  1380
            } else {
slouken@2267
  1381
                return BlitNtoNSurfaceAlphaKey;
slouken@7502
  1382
            }
slouken@2267
  1383
        }
slouken@2267
  1384
        break;
slouken@0
  1385
    }
slouken@2267
  1386
slouken@2267
  1387
    return NULL;
slouken@0
  1388
}
slouken@0
  1389
slouken@1895
  1390
/* vi: set ts=4 sw=4 expandtab: */