src/video/SDL_blit_A.c
author Ryan C. Gordon <icculus@icculus.org>
Fri, 03 Mar 2017 16:38:17 -0500
changeset 10926 97c829825e0e
parent 10737 3406a0f8b041
child 11811 5d94cb6b24d3
permissions -rw-r--r--
Fix some more compiler warnings on armcc.
slouken@0
     1
/*
slouken@5535
     2
  Simple DirectMedia Layer
slouken@10737
     3
  Copyright (C) 1997-2017 Sam Lantinga <slouken@libsdl.org>
slouken@0
     4
slouken@5535
     5
  This software is provided 'as-is', without any express or implied
slouken@5535
     6
  warranty.  In no event will the authors be held liable for any damages
slouken@5535
     7
  arising from the use of this software.
slouken@0
     8
slouken@5535
     9
  Permission is granted to anyone to use this software for any purpose,
slouken@5535
    10
  including commercial applications, and to alter it and redistribute it
slouken@5535
    11
  freely, subject to the following restrictions:
slouken@0
    12
slouken@5535
    13
  1. The origin of this software must not be misrepresented; you must not
slouken@5535
    14
     claim that you wrote the original software. If you use this software
slouken@5535
    15
     in a product, an acknowledgment in the product documentation would be
slouken@5535
    16
     appreciated but is not required.
slouken@5535
    17
  2. Altered source versions must be plainly marked as such, and must not be
slouken@5535
    18
     misrepresented as being the original software.
slouken@5535
    19
  3. This notice may not be removed or altered from any source distribution.
slouken@0
    20
*/
icculus@8093
    21
#include "../SDL_internal.h"
slouken@0
    22
slouken@0
    23
#include "SDL_video.h"
slouken@0
    24
#include "SDL_blit.h"
slouken@0
    25
slouken@0
    26
/* Functions to perform alpha blended blitting */
slouken@0
    27
slouken@0
    28
/* N->1 blending with per-surface alpha */
slouken@1895
    29
static void
slouken@1895
    30
BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
slouken@0
    31
{
slouken@2262
    32
    int width = info->dst_w;
slouken@2262
    33
    int height = info->dst_h;
slouken@2262
    34
    Uint8 *src = info->src;
slouken@2267
    35
    int srcskip = info->src_skip;
slouken@2262
    36
    Uint8 *dst = info->dst;
slouken@2267
    37
    int dstskip = info->dst_skip;
slouken@1895
    38
    Uint8 *palmap = info->table;
slouken@2267
    39
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
    40
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
    41
    int srcbpp = srcfmt->BytesPerPixel;
slouken@7502
    42
    Uint32 Pixel;
slouken@7502
    43
    unsigned sR, sG, sB;
slouken@7502
    44
    unsigned dR, dG, dB;
slouken@2267
    45
    const unsigned A = info->a;
slouken@0
    46
slouken@1895
    47
    while (height--) {
slouken@1895
    48
	    /* *INDENT-OFF* */
slouken@0
    49
	    DUFFS_LOOP4(
slouken@0
    50
	    {
icculus@1162
    51
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
slouken@0
    52
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
    53
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
    54
		dB = dstfmt->palette->colors[*dst].b;
slouken@7502
    55
		ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
slouken@0
    56
		dR &= 0xff;
slouken@0
    57
		dG &= 0xff;
slouken@0
    58
		dB &= 0xff;
slouken@0
    59
		/* Pack RGB into 8bit pixel */
slouken@0
    60
		if ( palmap == NULL ) {
slouken@7502
    61
		    *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
slouken@0
    62
		} else {
slouken@7502
    63
		    *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
slouken@0
    64
		}
slouken@0
    65
		dst++;
slouken@0
    66
		src += srcbpp;
slouken@0
    67
	    },
slouken@0
    68
	    width);
slouken@1895
    69
	    /* *INDENT-ON* */
slouken@1895
    70
        src += srcskip;
slouken@1895
    71
        dst += dstskip;
slouken@1895
    72
    }
slouken@0
    73
}
slouken@0
    74
slouken@0
    75
/* N->1 blending with pixel alpha */
slouken@1895
    76
static void
slouken@1895
    77
BlitNto1PixelAlpha(SDL_BlitInfo * info)
slouken@0
    78
{
slouken@2262
    79
    int width = info->dst_w;
slouken@2262
    80
    int height = info->dst_h;
slouken@2262
    81
    Uint8 *src = info->src;
slouken@2267
    82
    int srcskip = info->src_skip;
slouken@2262
    83
    Uint8 *dst = info->dst;
slouken@2267
    84
    int dstskip = info->dst_skip;
slouken@1895
    85
    Uint8 *palmap = info->table;
slouken@2267
    86
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
    87
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
    88
    int srcbpp = srcfmt->BytesPerPixel;
slouken@7502
    89
    Uint32 Pixel;
slouken@7502
    90
    unsigned sR, sG, sB, sA;
slouken@7502
    91
    unsigned dR, dG, dB;
slouken@0
    92
slouken@1895
    93
    while (height--) {
slouken@1895
    94
	    /* *INDENT-OFF* */
slouken@0
    95
	    DUFFS_LOOP4(
slouken@0
    96
	    {
icculus@1162
    97
		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
slouken@0
    98
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
    99
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
   100
		dB = dstfmt->palette->colors[*dst].b;
slouken@7502
   101
		ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
slouken@0
   102
		dR &= 0xff;
slouken@0
   103
		dG &= 0xff;
slouken@0
   104
		dB &= 0xff;
slouken@0
   105
		/* Pack RGB into 8bit pixel */
slouken@0
   106
		if ( palmap == NULL ) {
slouken@7502
   107
		    *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
slouken@0
   108
		} else {
slouken@7502
   109
		    *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
slouken@0
   110
		}
slouken@0
   111
		dst++;
slouken@0
   112
		src += srcbpp;
slouken@0
   113
	    },
slouken@0
   114
	    width);
slouken@1895
   115
	    /* *INDENT-ON* */
slouken@1895
   116
        src += srcskip;
slouken@1895
   117
        dst += dstskip;
slouken@1895
   118
    }
slouken@0
   119
}
slouken@0
   120
slouken@0
   121
/* colorkeyed N->1 blending with per-surface alpha */
slouken@1895
   122
static void
slouken@1895
   123
BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
slouken@0
   124
{
slouken@2262
   125
    int width = info->dst_w;
slouken@2262
   126
    int height = info->dst_h;
slouken@2262
   127
    Uint8 *src = info->src;
slouken@2267
   128
    int srcskip = info->src_skip;
slouken@2262
   129
    Uint8 *dst = info->dst;
slouken@2267
   130
    int dstskip = info->dst_skip;
slouken@1895
   131
    Uint8 *palmap = info->table;
slouken@2267
   132
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
   133
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
   134
    int srcbpp = srcfmt->BytesPerPixel;
slouken@2267
   135
    Uint32 ckey = info->colorkey;
slouken@7502
   136
    Uint32 Pixel;
slouken@7502
   137
    unsigned sR, sG, sB;
slouken@7502
   138
    unsigned dR, dG, dB;
slouken@7502
   139
    const unsigned A = info->a;
slouken@0
   140
slouken@1895
   141
    while (height--) {
slouken@1895
   142
	    /* *INDENT-OFF* */
slouken@0
   143
	    DUFFS_LOOP(
slouken@0
   144
	    {
icculus@1162
   145
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
icculus@1162
   146
		if ( Pixel != ckey ) {
slouken@0
   147
		    dR = dstfmt->palette->colors[*dst].r;
slouken@0
   148
		    dG = dstfmt->palette->colors[*dst].g;
slouken@0
   149
		    dB = dstfmt->palette->colors[*dst].b;
slouken@7502
   150
		    ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
slouken@0
   151
		    dR &= 0xff;
slouken@0
   152
		    dG &= 0xff;
slouken@0
   153
		    dB &= 0xff;
slouken@0
   154
		    /* Pack RGB into 8bit pixel */
slouken@0
   155
		    if ( palmap == NULL ) {
slouken@7502
   156
                *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
slouken@0
   157
		    } else {
slouken@7502
   158
                *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
slouken@0
   159
		    }
slouken@0
   160
		}
slouken@0
   161
		dst++;
slouken@0
   162
		src += srcbpp;
slouken@0
   163
	    },
slouken@0
   164
	    width);
slouken@1895
   165
	    /* *INDENT-ON* */
slouken@1895
   166
        src += srcskip;
slouken@1895
   167
        dst += dstskip;
slouken@1895
   168
    }
slouken@0
   169
}
slouken@0
   170
slouken@2255
   171
#ifdef __MMX__
slouken@1542
   172
slouken@1542
   173
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1895
   174
static void
slouken@1895
   175
BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
slouken@1542
   176
{
slouken@2262
   177
    int width = info->dst_w;
slouken@2262
   178
    int height = info->dst_h;
slouken@2262
   179
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   180
    int srcskip = info->src_skip >> 2;
slouken@2262
   181
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   182
    int dstskip = info->dst_skip >> 2;
slouken@2267
   183
    Uint32 dalpha = info->dst_fmt->Amask;
slouken@1542
   184
slouken@1895
   185
    __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
slouken@1542
   186
slouken@1895
   187
    hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
slouken@1895
   188
    lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
slouken@1895
   189
    dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
slouken@1542
   190
slouken@1895
   191
    while (height--) {
slouken@1895
   192
        int n = width;
slouken@1895
   193
        if (n & 1) {
slouken@1895
   194
            Uint32 s = *srcp++;
slouken@1895
   195
            Uint32 d = *dstp;
slouken@1895
   196
            *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1895
   197
                       + (s & d & 0x00010101)) | dalpha;
slouken@1895
   198
            n--;
slouken@1895
   199
        }
slouken@1542
   200
slouken@1895
   201
        for (n >>= 1; n > 0; --n) {
slouken@1895
   202
            dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
slouken@1895
   203
            dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
slouken@1542
   204
slouken@1895
   205
            src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
slouken@1895
   206
            src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
slouken@1895
   207
slouken@1895
   208
            dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
slouken@1895
   209
            src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
slouken@1895
   210
            src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
slouken@1895
   211
            src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
slouken@1895
   212
slouken@1895
   213
            dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
slouken@1895
   214
            dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
slouken@1895
   215
            dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
slouken@1895
   216
            dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
slouken@1895
   217
slouken@1895
   218
            *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
slouken@1895
   219
            dstp += 2;
slouken@1895
   220
            srcp += 2;
slouken@1895
   221
        }
slouken@1895
   222
slouken@1895
   223
        srcp += srcskip;
slouken@1895
   224
        dstp += dstskip;
slouken@1895
   225
    }
slouken@1895
   226
    _mm_empty();
slouken@1542
   227
}
slouken@1542
   228
slouken@1542
   229
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1895
   230
static void
slouken@1895
   231
BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   232
{
slouken@2267
   233
    SDL_PixelFormat *df = info->dst_fmt;
slouken@6863
   234
    Uint32 chanmask;
slouken@2267
   235
    unsigned alpha = info->a;
slouken@1542
   236
slouken@1895
   237
    if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
slouken@1895
   238
        /* only call a128 version when R,G,B occupy lower bits */
slouken@1895
   239
        BlitRGBtoRGBSurfaceAlpha128MMX(info);
slouken@1895
   240
    } else {
slouken@2262
   241
        int width = info->dst_w;
slouken@2262
   242
        int height = info->dst_h;
slouken@2262
   243
        Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   244
        int srcskip = info->src_skip >> 2;
slouken@2262
   245
        Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   246
        int dstskip = info->dst_skip >> 2;
slouken@1895
   247
        Uint32 dalpha = df->Amask;
slouken@1895
   248
        Uint32 amult;
slouken@1542
   249
slouken@1895
   250
        __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
slouken@1542
   251
slouken@1895
   252
        mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
slouken@1895
   253
        /* form the alpha mult */
slouken@1895
   254
        amult = alpha | (alpha << 8);
slouken@1895
   255
        amult = amult | (amult << 16);
slouken@1895
   256
        chanmask =
slouken@3013
   257
            (0xff << df->Rshift) | (0xff << df->
slouken@3013
   258
                                    Gshift) | (0xff << df->Bshift);
slouken@1895
   259
        mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
slouken@1895
   260
        mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
slouken@1895
   261
        /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
slouken@1895
   262
        dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
slouken@1542
   263
slouken@1895
   264
        while (height--) {
slouken@1895
   265
            int n = width;
slouken@1895
   266
            if (n & 1) {
slouken@1895
   267
                /* One Pixel Blend */
slouken@1895
   268
                src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
slouken@1895
   269
                src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
slouken@1542
   270
slouken@1895
   271
                dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
slouken@1895
   272
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
   273
slouken@1895
   274
                src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
slouken@1895
   275
                src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
slouken@1895
   276
                src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
slouken@1895
   277
                dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
slouken@1542
   278
slouken@1895
   279
                dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
slouken@1895
   280
                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
slouken@1895
   281
                *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
   282
slouken@1895
   283
                ++srcp;
slouken@1895
   284
                ++dstp;
slouken@1542
   285
slouken@1895
   286
                n--;
slouken@1895
   287
            }
slouken@1542
   288
slouken@1895
   289
            for (n >>= 1; n > 0; --n) {
slouken@1895
   290
                /* Two Pixels Blend */
slouken@1895
   291
                src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
slouken@1895
   292
                src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
slouken@1895
   293
                src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
slouken@1895
   294
                src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
slouken@1542
   295
slouken@1895
   296
                dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
slouken@1895
   297
                dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
slouken@1895
   298
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
slouken@1895
   299
                dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
slouken@1895
   300
slouken@1895
   301
                src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
slouken@1895
   302
                src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
slouken@1895
   303
                src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
slouken@1895
   304
                dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
slouken@1895
   305
slouken@1895
   306
                src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
slouken@1895
   307
                src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
slouken@1895
   308
                src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
slouken@1895
   309
                dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
slouken@1895
   310
slouken@1895
   311
                dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
slouken@1895
   312
                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
slouken@1895
   313
slouken@1895
   314
                *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
slouken@1895
   315
slouken@1895
   316
                srcp += 2;
slouken@1895
   317
                dstp += 2;
slouken@1895
   318
            }
slouken@1895
   319
            srcp += srcskip;
slouken@1895
   320
            dstp += dstskip;
slouken@1895
   321
        }
slouken@1895
   322
        _mm_empty();
slouken@1895
   323
    }
slouken@1542
   324
}
slouken@1542
   325
slouken@1542
   326
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
   327
static void
slouken@1895
   328
BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   329
{
slouken@2262
   330
    int width = info->dst_w;
slouken@2262
   331
    int height = info->dst_h;
slouken@2262
   332
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   333
    int srcskip = info->src_skip >> 2;
slouken@2262
   334
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   335
    int dstskip = info->dst_skip >> 2;
slouken@2267
   336
    SDL_PixelFormat *sf = info->src_fmt;
slouken@1895
   337
    Uint32 amask = sf->Amask;
slouken@1895
   338
    Uint32 ashift = sf->Ashift;
slouken@7640
   339
    Uint64 multmask, multmask2;
slouken@1542
   340
slouken@7640
   341
    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
slouken@1542
   342
slouken@1895
   343
    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
slouken@7640
   344
    multmask = 0x00FF;
slouken@7640
   345
	multmask <<= (ashift * 2);
sbc@8879
   346
	multmask2 = 0x00FF00FF00FF00FFULL;
slouken@1542
   347
slouken@1895
   348
    while (height--) {
slouken@1895
   349
		/* *INDENT-OFF* */
slouken@1542
   350
		DUFFS_LOOP4({
slouken@1542
   351
		Uint32 alpha = *srcp & amask;
slouken@1542
   352
		if (alpha == 0) {
slouken@1542
   353
			/* do nothing */
slouken@7641
   354
		} else if (alpha == amask) {
slouken@7640
   355
			*dstp = *srcp;
slouken@1542
   356
		} else {
gabomdq@7677
   357
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
slouken@1542
   358
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
slouken@1542
   359
gabomdq@7677
   360
			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
slouken@1542
   361
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
   362
slouken@1542
   363
			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
slouken@1542
   364
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
slouken@1542
   365
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
slouken@7640
   366
			mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
gabomdq@7677
   367
			mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);	/* 0F0A0A0A -> mm_alpha */
gabomdq@7677
   368
			mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);	/* 255 - mm_alpha -> mm_alpha */
slouken@1542
   369
slouken@1542
   370
			/* blend */		    
slouken@7640
   371
			src1 = _mm_mullo_pi16(src1, mm_alpha);
slouken@7640
   372
			src1 = _mm_srli_pi16(src1, 8);
slouken@7640
   373
			dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
slouken@7640
   374
			dst1 = _mm_srli_pi16(dst1, 8);
slouken@7640
   375
			dst1 = _mm_add_pi16(src1, dst1);
slouken@7640
   376
			dst1 = _mm_packs_pu16(dst1, mm_zero);
slouken@1542
   377
			
slouken@1542
   378
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
   379
		}
slouken@1542
   380
		++srcp;
slouken@1542
   381
		++dstp;
slouken@1542
   382
	    }, width);
slouken@1895
   383
		/* *INDENT-ON* */
slouken@1895
   384
        srcp += srcskip;
slouken@1895
   385
        dstp += dstskip;
slouken@1895
   386
    }
slouken@1895
   387
    _mm_empty();
slouken@1542
   388
}
slouken@1895
   389
slouken@2255
   390
#endif /* __MMX__ */
slouken@689
   391
slouken@1
   392
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1895
   393
static void
slouken@1895
   394
BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
slouken@0
   395
{
slouken@2262
   396
    int width = info->dst_w;
slouken@2262
   397
    int height = info->dst_h;
slouken@2262
   398
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   399
    int srcskip = info->src_skip >> 2;
slouken@2262
   400
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   401
    int dstskip = info->dst_skip >> 2;
slouken@0
   402
slouken@1895
   403
    while (height--) {
slouken@1895
   404
	    /* *INDENT-OFF* */
slouken@0
   405
	    DUFFS_LOOP4({
slouken@1
   406
		    Uint32 s = *srcp++;
slouken@1
   407
		    Uint32 d = *dstp;
slouken@1
   408
		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1
   409
			       + (s & d & 0x00010101)) | 0xff000000;
slouken@0
   410
	    }, width);
slouken@1895
   411
	    /* *INDENT-ON* */
slouken@1895
   412
        srcp += srcskip;
slouken@1895
   413
        dstp += dstskip;
slouken@1895
   414
    }
slouken@0
   415
}
slouken@0
   416
slouken@1
   417
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1895
   418
static void
slouken@1895
   419
BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
slouken@1
   420
{
slouken@2267
   421
    unsigned alpha = info->a;
slouken@1895
   422
    if (alpha == 128) {
slouken@1895
   423
        BlitRGBtoRGBSurfaceAlpha128(info);
slouken@1895
   424
    } else {
slouken@2262
   425
        int width = info->dst_w;
slouken@2262
   426
        int height = info->dst_h;
slouken@2262
   427
        Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   428
        int srcskip = info->src_skip >> 2;
slouken@2262
   429
        Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   430
        int dstskip = info->dst_skip >> 2;
slouken@1895
   431
        Uint32 s;
slouken@1895
   432
        Uint32 d;
slouken@1895
   433
        Uint32 s1;
slouken@1895
   434
        Uint32 d1;
slouken@1
   435
slouken@1895
   436
        while (height--) {
slouken@1895
   437
			/* *INDENT-OFF* */
slouken@3035
   438
			DUFFS_LOOP4({
slouken@1
   439
				s = *srcp;
slouken@1
   440
				d = *dstp;
slouken@1
   441
				s1 = s & 0xff00ff;
slouken@1
   442
				d1 = d & 0xff00ff;
slouken@1
   443
				d1 = (d1 + ((s1 - d1) * alpha >> 8))
slouken@1
   444
				     & 0xff00ff;
slouken@1
   445
				s &= 0xff00;
slouken@1
   446
				d &= 0xff00;
slouken@1
   447
				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@1
   448
				*dstp = d1 | d | 0xff000000;
slouken@1
   449
				++srcp;
slouken@1
   450
				++dstp;
slouken@1
   451
			}, width);
slouken@1895
   452
			/* *INDENT-ON* */
slouken@1895
   453
            srcp += srcskip;
slouken@1895
   454
            dstp += dstskip;
slouken@1895
   455
        }
slouken@1895
   456
    }
slouken@1
   457
}
slouken@1
   458
slouken@0
   459
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
   460
static void
slouken@1895
   461
BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
slouken@0
   462
{
slouken@2262
   463
    int width = info->dst_w;
slouken@2262
   464
    int height = info->dst_h;
slouken@2262
   465
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   466
    int srcskip = info->src_skip >> 2;
slouken@2262
   467
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   468
    int dstskip = info->dst_skip >> 2;
slouken@0
   469
slouken@1895
   470
    while (height--) {
slouken@1895
   471
	    /* *INDENT-OFF* */
slouken@0
   472
	    DUFFS_LOOP4({
slouken@0
   473
		Uint32 dalpha;
slouken@0
   474
		Uint32 d;
slouken@0
   475
		Uint32 s1;
slouken@0
   476
		Uint32 d1;
slouken@0
   477
		Uint32 s = *srcp;
slouken@0
   478
		Uint32 alpha = s >> 24;
slouken@0
   479
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
   480
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
   481
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
   482
		   Benchmark this! */
slouken@7640
   483
		if (alpha) {
slouken@7640
   484
		  if (alpha == SDL_ALPHA_OPAQUE) {
slouken@7640
   485
			  *dstp = *srcp;
slouken@689
   486
		  } else {
slouken@0
   487
		    /*
slouken@0
   488
		     * take out the middle component (green), and process
slouken@0
   489
		     * the other two in parallel. One multiply less.
slouken@0
   490
		     */
slouken@0
   491
		    d = *dstp;
slouken@7640
   492
			dalpha = d >> 24;
slouken@0
   493
		    s1 = s & 0xff00ff;
slouken@0
   494
		    d1 = d & 0xff00ff;
slouken@0
   495
		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
slouken@0
   496
		    s &= 0xff00;
slouken@0
   497
		    d &= 0xff00;
slouken@0
   498
		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@7640
   499
			dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
slouken@7640
   500
		    *dstp = d1 | d | (dalpha << 24);
slouken@689
   501
		  }
slouken@0
   502
		}
slouken@0
   503
		++srcp;
slouken@0
   504
		++dstp;
slouken@0
   505
	    }, width);
slouken@1895
   506
	    /* *INDENT-ON* */
slouken@1895
   507
        srcp += srcskip;
slouken@1895
   508
        dstp += dstskip;
slouken@1895
   509
    }
slouken@0
   510
}
slouken@0
   511
slouken@5389
   512
#ifdef __3dNOW__
slouken@5389
   513
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
slouken@5389
   514
static void
slouken@5389
   515
BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
slouken@5389
   516
{
slouken@5389
   517
    int width = info->dst_w;
slouken@5389
   518
    int height = info->dst_h;
slouken@5389
   519
    Uint32 *srcp = (Uint32 *) info->src;
slouken@5389
   520
    int srcskip = info->src_skip >> 2;
slouken@5389
   521
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@5389
   522
    int dstskip = info->dst_skip >> 2;
slouken@5389
   523
    SDL_PixelFormat *sf = info->src_fmt;
slouken@5389
   524
    Uint32 amask = sf->Amask;
slouken@5389
   525
    Uint32 ashift = sf->Ashift;
slouken@7640
   526
    Uint64 multmask, multmask2;
slouken@5389
   527
slouken@7640
   528
    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
slouken@5389
   529
slouken@5389
   530
    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
slouken@7640
   531
    multmask = 0x00FF;
slouken@5389
   532
    multmask <<= (ashift * 2);
sbc@8879
   533
    multmask2 = 0x00FF00FF00FF00FFULL;
slouken@5389
   534
slouken@5389
   535
    while (height--) {
slouken@5389
   536
	    /* *INDENT-OFF* */
slouken@5389
   537
	    DUFFS_LOOP4({
slouken@5389
   538
		Uint32 alpha;
slouken@5389
   539
slouken@5389
   540
		_m_prefetch(srcp + 16);
slouken@5389
   541
		_m_prefetch(dstp + 16);
slouken@5389
   542
slouken@5389
   543
		alpha = *srcp & amask;
slouken@5389
   544
		if (alpha == 0) {
slouken@5389
   545
			/* do nothing */
slouken@7641
   546
		} else if (alpha == amask) {
slouken@7640
   547
			*dstp = *srcp;
slouken@5389
   548
		} else {
gabomdq@7677
   549
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
slouken@5389
   550
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
slouken@5389
   551
gabomdq@7677
   552
			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
slouken@5389
   553
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@5389
   554
slouken@5389
   555
			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
slouken@5389
   556
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
slouken@5389
   557
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
slouken@7640
   558
			mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
gabomdq@7677
   559
			mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);	/* 0F0A0A0A -> mm_alpha */
gabomdq@7677
   560
			mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);	/* 255 - mm_alpha -> mm_alpha */
slouken@7640
   561
slouken@5389
   562
slouken@5389
   563
			/* blend */		    
slouken@7640
   564
			src1 = _mm_mullo_pi16(src1, mm_alpha);
slouken@7640
   565
			src1 = _mm_srli_pi16(src1, 8);
slouken@7640
   566
			dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
slouken@7640
   567
			dst1 = _mm_srli_pi16(dst1, 8);
slouken@7640
   568
			dst1 = _mm_add_pi16(src1, dst1);
slouken@7640
   569
			dst1 = _mm_packs_pu16(dst1, mm_zero);
slouken@5389
   570
			
slouken@5389
   571
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@5389
   572
		}
slouken@5389
   573
		++srcp;
slouken@5389
   574
		++dstp;
slouken@5389
   575
	    }, width);
slouken@5389
   576
	    /* *INDENT-ON* */
slouken@5389
   577
        srcp += srcskip;
slouken@5389
   578
        dstp += dstskip;
slouken@5389
   579
    }
slouken@5389
   580
    _mm_empty();
slouken@5389
   581
}
slouken@5389
   582
philipp@9709
   583
#endif /* __3dNOW__ */
slouken@5389
   584
slouken@1
   585
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
slouken@1
   586
slouken@1
   587
/*
 * blend a single 16 bit pixel at 50%
 *
 * `mask` selects every channel bit except the lowest bit of each channel.
 * The masked halves are added and shifted right once; the bit lost when
 * both low bits were set is restored from (s & d & ~mask).
 * Arguments are evaluated more than once: pass side-effect-free expressions.
 */
#define BLEND16_50(d, s, mask)                                              \
    (((((s) & (mask)) + ((d) & (mask))) >> 1)                               \
     + ((s) & (d) & (~(mask) & 0xffff)))

/*
 * blend two 16 bit pixels at 50%
 *
 * `d` and `s` each hold two 16-bit pixels packed in 32 bits; `mask` is the
 * same 16-bit mask BLEND16_50 takes.  The mask is widened to Uint32 before
 * it is replicated into the upper half: shifting a promoted (signed) int
 * left by 16 overflows when bit 15 of the mask is set, which is undefined
 * behaviour.
 * Arguments are evaluated more than once: pass side-effect-free expressions.
 */
#define BLEND2x16_50(d, s, mask)                                            \
    ((((s) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1)               \
     + (((d) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1)             \
     + ((s) & (d) & ~((Uint32)(mask) | ((Uint32)(mask) << 16))))
slouken@1
   595
slouken@1895
   596
static void
slouken@1895
   597
Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
slouken@0
   598
{
slouken@2262
   599
    int width = info->dst_w;
slouken@2262
   600
    int height = info->dst_h;
slouken@2262
   601
    Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   602
    int srcskip = info->src_skip >> 1;
slouken@2262
   603
    Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   604
    int dstskip = info->dst_skip >> 1;
slouken@0
   605
slouken@1895
   606
    while (height--) {
slouken@1895
   607
        if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
slouken@1895
   608
            /*
slouken@1895
   609
             * Source and destination not aligned, pipeline it.
slouken@1895
   610
             * This is mostly a win for big blits but no loss for
slouken@1895
   611
             * small ones
slouken@1895
   612
             */
slouken@1895
   613
            Uint32 prev_sw;
slouken@1895
   614
            int w = width;
slouken@1
   615
slouken@1895
   616
            /* handle odd destination */
slouken@1895
   617
            if ((uintptr_t) dstp & 2) {
slouken@1895
   618
                Uint16 d = *dstp, s = *srcp;
slouken@1895
   619
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   620
                dstp++;
slouken@1895
   621
                srcp++;
slouken@1895
   622
                w--;
slouken@1895
   623
            }
slouken@1895
   624
            srcp++;             /* srcp is now 32-bit aligned */
slouken@1
   625
slouken@1895
   626
            /* bootstrap pipeline with first halfword */
slouken@1895
   627
            prev_sw = ((Uint32 *) srcp)[-1];
slouken@1
   628
slouken@1895
   629
            while (w > 1) {
slouken@1895
   630
                Uint32 sw, dw, s;
slouken@1895
   631
                sw = *(Uint32 *) srcp;
slouken@1895
   632
                dw = *(Uint32 *) dstp;
slouken@1443
   633
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
slouken@1895
   634
                s = (prev_sw << 16) + (sw >> 16);
slouken@1443
   635
#else
slouken@1895
   636
                s = (prev_sw >> 16) + (sw << 16);
slouken@1443
   637
#endif
slouken@1895
   638
                prev_sw = sw;
slouken@1895
   639
                *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
slouken@1895
   640
                dstp += 2;
slouken@1895
   641
                srcp += 2;
slouken@1895
   642
                w -= 2;
slouken@1895
   643
            }
slouken@1
   644
slouken@1895
   645
            /* final pixel if any */
slouken@1895
   646
            if (w) {
slouken@1895
   647
                Uint16 d = *dstp, s;
slouken@1443
   648
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
slouken@1895
   649
                s = (Uint16) prev_sw;
slouken@1443
   650
#else
slouken@1895
   651
                s = (Uint16) (prev_sw >> 16);
slouken@1443
   652
#endif
slouken@1895
   653
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   654
                srcp++;
slouken@1895
   655
                dstp++;
slouken@1895
   656
            }
slouken@1895
   657
            srcp += srcskip - 1;
slouken@1895
   658
            dstp += dstskip;
slouken@1895
   659
        } else {
slouken@1895
   660
            /* source and destination are aligned */
slouken@1895
   661
            int w = width;
slouken@1
   662
slouken@1895
   663
            /* first odd pixel? */
slouken@1895
   664
            if ((uintptr_t) srcp & 2) {
slouken@1895
   665
                Uint16 d = *dstp, s = *srcp;
slouken@1895
   666
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   667
                srcp++;
slouken@1895
   668
                dstp++;
slouken@1895
   669
                w--;
slouken@1895
   670
            }
slouken@1895
   671
            /* srcp and dstp are now 32-bit aligned */
slouken@1
   672
slouken@1895
   673
            while (w > 1) {
slouken@1895
   674
                Uint32 sw = *(Uint32 *) srcp;
slouken@1895
   675
                Uint32 dw = *(Uint32 *) dstp;
slouken@1895
   676
                *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
slouken@1895
   677
                srcp += 2;
slouken@1895
   678
                dstp += 2;
slouken@1895
   679
                w -= 2;
slouken@1895
   680
            }
slouken@1
   681
slouken@1895
   682
            /* last odd pixel? */
slouken@1895
   683
            if (w) {
slouken@1895
   684
                Uint16 d = *dstp, s = *srcp;
slouken@1895
   685
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   686
                srcp++;
slouken@1895
   687
                dstp++;
slouken@1895
   688
            }
slouken@1895
   689
            srcp += srcskip;
slouken@1895
   690
            dstp += dstskip;
slouken@1895
   691
        }
slouken@1895
   692
    }
slouken@1
   693
}
slouken@1
   694
slouken@2255
   695
#ifdef __MMX__
slouken@689
   696
slouken@1542
   697
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1895
   698
static void
slouken@1895
   699
Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   700
{
slouken@2267
   701
    unsigned alpha = info->a;
slouken@1895
   702
    if (alpha == 128) {
slouken@1895
   703
        Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1895
   704
    } else {
slouken@2262
   705
        int width = info->dst_w;
slouken@2262
   706
        int height = info->dst_h;
slouken@2262
   707
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   708
        int srcskip = info->src_skip >> 1;
slouken@2262
   709
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   710
        int dstskip = info->dst_skip >> 1;
slouken@1895
   711
        Uint32 s, d;
slouken@1542
   712
slouken@1895
   713
        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
slouken@1542
   714
slouken@1895
   715
        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
slouken@1895
   716
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
slouken@1895
   717
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1895
   718
slouken@1895
   719
        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
slouken@1895
   720
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
slouken@1895
   721
        /* position alpha to allow for mullo and mulhi on diff channels
slouken@1895
   722
           to reduce the number of operations */
slouken@1895
   723
        mm_alpha = _mm_slli_si64(mm_alpha, 3);
slouken@1895
   724
slouken@1895
   725
        /* Setup the 565 color channel masks */
slouken@1895
   726
        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
slouken@1895
   727
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
slouken@1895
   728
slouken@1895
   729
        while (height--) {
slouken@1895
   730
			/* *INDENT-OFF* */
slouken@3035
   731
			DUFFS_LOOP_124(
slouken@1542
   732
			{
slouken@1542
   733
				s = *srcp++;
slouken@1542
   734
				d = *dstp;
slouken@1542
   735
				/*
slouken@1542
   736
				 * shift out the middle component (green) to
slouken@1542
   737
				 * the high 16 bits, and process all three RGB
slouken@1542
   738
				 * components at the same time.
slouken@1542
   739
				 */
slouken@1542
   740
				s = (s | s << 16) & 0x07e0f81f;
slouken@1542
   741
				d = (d | d << 16) & 0x07e0f81f;
slouken@1542
   742
				d += (s - d) * alpha >> 5;
slouken@1542
   743
				d &= 0x07e0f81f;
slouken@1546
   744
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   745
			},{
slouken@1542
   746
				s = *srcp++;
slouken@1542
   747
				d = *dstp;
slouken@1542
   748
				/*
slouken@1542
   749
				 * shift out the middle component (green) to
slouken@1542
   750
				 * the high 16 bits, and process all three RGB
slouken@1542
   751
				 * components at the same time.
slouken@1542
   752
				 */
slouken@1542
   753
				s = (s | s << 16) & 0x07e0f81f;
slouken@1542
   754
				d = (d | d << 16) & 0x07e0f81f;
slouken@1542
   755
				d += (s - d) * alpha >> 5;
slouken@1542
   756
				d &= 0x07e0f81f;
slouken@1546
   757
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   758
				s = *srcp++;
slouken@1542
   759
				d = *dstp;
slouken@1542
   760
				/*
slouken@1542
   761
				 * shift out the middle component (green) to
slouken@1542
   762
				 * the high 16 bits, and process all three RGB
slouken@1542
   763
				 * components at the same time.
slouken@1542
   764
				 */
slouken@1542
   765
				s = (s | s << 16) & 0x07e0f81f;
slouken@1542
   766
				d = (d | d << 16) & 0x07e0f81f;
slouken@1542
   767
				d += (s - d) * alpha >> 5;
slouken@1542
   768
				d &= 0x07e0f81f;
slouken@1546
   769
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   770
			},{
slouken@1542
   771
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
slouken@1542
   772
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
slouken@1542
   773
slouken@1542
   774
				/* red */
slouken@1542
   775
				src2 = src1;
slouken@1542
   776
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
slouken@1542
   777
slouken@1542
   778
				dst2 = dst1;
slouken@1542
   779
				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
slouken@1542
   780
slouken@1542
   781
				/* blend */
slouken@1542
   782
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   783
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   784
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@1542
   785
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   786
				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
slouken@1542
   787
slouken@1542
   788
				mm_res = dst2; /* RED -> mm_res */
slouken@1542
   789
slouken@1542
   790
				/* green -- process the bits in place */
slouken@1542
   791
				src2 = src1;
slouken@1542
   792
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
slouken@1542
   793
slouken@1542
   794
				dst2 = dst1;
slouken@1542
   795
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
slouken@1542
   796
slouken@1542
   797
				/* blend */
slouken@1542
   798
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   799
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   800
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@1542
   801
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   802
slouken@1542
   803
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
slouken@1542
   804
slouken@1542
   805
				/* blue */
slouken@1542
   806
				src2 = src1;
slouken@1542
   807
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
slouken@1542
   808
slouken@1542
   809
				dst2 = dst1;
slouken@1542
   810
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
slouken@1542
   811
slouken@1542
   812
				/* blend */
slouken@1542
   813
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   814
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   815
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@1542
   816
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   817
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
slouken@1542
   818
slouken@1542
   819
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
slouken@1542
   820
slouken@1542
   821
				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
slouken@1542
   822
slouken@1542
   823
				srcp += 4;
slouken@1542
   824
				dstp += 4;
slouken@1895
   825
			}, width);
slouken@1895
   826
			/* *INDENT-ON* */
slouken@1895
   827
            srcp += srcskip;
slouken@1895
   828
            dstp += dstskip;
slouken@1895
   829
        }
slouken@1895
   830
        _mm_empty();
slouken@1895
   831
    }
slouken@1542
   832
}
slouken@1542
   833
slouken@1542
   834
/* fast RGB555->RGB555 blending with surface alpha */
slouken@1895
   835
static void
slouken@1895
   836
Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   837
{
slouken@2267
   838
    unsigned alpha = info->a;
slouken@1895
   839
    if (alpha == 128) {
slouken@1895
   840
        Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1895
   841
    } else {
slouken@2262
   842
        int width = info->dst_w;
slouken@2262
   843
        int height = info->dst_h;
slouken@2262
   844
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   845
        int srcskip = info->src_skip >> 1;
slouken@2262
   846
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   847
        int dstskip = info->dst_skip >> 1;
slouken@1895
   848
        Uint32 s, d;
slouken@1542
   849
slouken@1895
   850
        __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
slouken@1542
   851
slouken@1895
   852
        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
slouken@1895
   853
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
slouken@1895
   854
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1542
   855
slouken@1895
   856
        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
slouken@1895
   857
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
slouken@1895
   858
        /* position alpha to allow for mullo and mulhi on diff channels
slouken@1895
   859
           to reduce the number of operations */
slouken@1895
   860
        mm_alpha = _mm_slli_si64(mm_alpha, 3);
slouken@1895
   861
slouken@1895
   862
        /* Setup the 555 color channel masks */
slouken@1895
   863
        rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
slouken@1895
   864
        gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
slouken@1895
   865
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
slouken@1895
   866
slouken@1895
   867
        while (height--) {
slouken@1895
   868
			/* *INDENT-OFF* */
slouken@3035
   869
			DUFFS_LOOP_124(
slouken@1542
   870
			{
slouken@1542
   871
				s = *srcp++;
slouken@1542
   872
				d = *dstp;
slouken@1542
   873
				/*
slouken@1542
   874
				 * shift out the middle component (green) to
slouken@1542
   875
				 * the high 16 bits, and process all three RGB
slouken@1542
   876
				 * components at the same time.
slouken@1542
   877
				 */
slouken@1542
   878
				s = (s | s << 16) & 0x03e07c1f;
slouken@1542
   879
				d = (d | d << 16) & 0x03e07c1f;
slouken@1542
   880
				d += (s - d) * alpha >> 5;
slouken@1542
   881
				d &= 0x03e07c1f;
slouken@1546
   882
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   883
			},{
slouken@1542
   884
				s = *srcp++;
slouken@1542
   885
				d = *dstp;
slouken@1542
   886
				/*
slouken@1542
   887
				 * shift out the middle component (green) to
slouken@1542
   888
				 * the high 16 bits, and process all three RGB
slouken@1542
   889
				 * components at the same time.
slouken@1542
   890
				 */
slouken@1542
   891
				s = (s | s << 16) & 0x03e07c1f;
slouken@1542
   892
				d = (d | d << 16) & 0x03e07c1f;
slouken@1542
   893
				d += (s - d) * alpha >> 5;
slouken@1542
   894
				d &= 0x03e07c1f;
slouken@1546
   895
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   896
			        s = *srcp++;
slouken@1542
   897
				d = *dstp;
slouken@1542
   898
				/*
slouken@1542
   899
				 * shift out the middle component (green) to
slouken@1542
   900
				 * the high 16 bits, and process all three RGB
slouken@1542
   901
				 * components at the same time.
slouken@1542
   902
				 */
slouken@1542
   903
				s = (s | s << 16) & 0x03e07c1f;
slouken@1542
   904
				d = (d | d << 16) & 0x03e07c1f;
slouken@1542
   905
				d += (s - d) * alpha >> 5;
slouken@1542
   906
				d &= 0x03e07c1f;
slouken@1546
   907
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   908
			},{
slouken@1542
   909
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
slouken@1542
   910
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
slouken@1542
   911
slouken@1542
   912
				/* red -- process the bits in place */
slouken@1542
   913
				src2 = src1;
slouken@1542
   914
				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
slouken@1542
   915
slouken@1542
   916
				dst2 = dst1;
slouken@1542
   917
				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
slouken@1542
   918
slouken@1542
   919
				/* blend */
slouken@1542
   920
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   921
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   922
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@1542
   923
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   924
				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
slouken@1542
   925
slouken@1542
   926
				mm_res = dst2; /* RED -> mm_res */
slouken@1542
   927
				
slouken@1542
   928
				/* green -- process the bits in place */
slouken@1542
   929
				src2 = src1;
slouken@1542
   930
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
slouken@1542
   931
slouken@1542
   932
				dst2 = dst1;
slouken@1542
   933
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
slouken@1542
   934
slouken@1542
   935
				/* blend */
slouken@1542
   936
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   937
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   938
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@1542
   939
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   940
slouken@1542
   941
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
slouken@1542
   942
slouken@1542
   943
				/* blue */
slouken@1542
   944
				src2 = src1; /* src -> src2 */
slouken@1542
   945
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
slouken@1542
   946
slouken@1542
   947
				dst2 = dst1; /* dst -> dst2 */
slouken@1542
   948
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
slouken@1542
   949
slouken@1542
   950
				/* blend */
slouken@1542
   951
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   952
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   953
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@1542
   954
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   955
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
slouken@1542
   956
slouken@1542
   957
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
slouken@1542
   958
slouken@1542
   959
				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
slouken@1542
   960
slouken@1542
   961
				srcp += 4;
slouken@1542
   962
				dstp += 4;
slouken@1895
   963
			}, width);
slouken@1895
   964
			/* *INDENT-ON* */
slouken@1895
   965
            srcp += srcskip;
slouken@1895
   966
            dstp += dstskip;
slouken@1895
   967
        }
slouken@1895
   968
        _mm_empty();
slouken@1895
   969
    }
slouken@1542
   970
}
slouken@2255
   971
slouken@2255
   972
#endif /* __MMX__ */
slouken@689
   973
slouken@1
   974
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1895
   975
static void
slouken@1895
   976
Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
slouken@1
   977
{
slouken@2267
   978
    unsigned alpha = info->a;
slouken@1895
   979
    if (alpha == 128) {
slouken@1895
   980
        Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1895
   981
    } else {
slouken@2262
   982
        int width = info->dst_w;
slouken@2262
   983
        int height = info->dst_h;
slouken@2262
   984
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   985
        int srcskip = info->src_skip >> 1;
slouken@2262
   986
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   987
        int dstskip = info->dst_skip >> 1;
slouken@1895
   988
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1
   989
slouken@1895
   990
        while (height--) {
slouken@1895
   991
			/* *INDENT-OFF* */
slouken@1
   992
			DUFFS_LOOP4({
slouken@1
   993
				Uint32 s = *srcp++;
slouken@1
   994
				Uint32 d = *dstp;
slouken@1
   995
				/*
slouken@1
   996
				 * shift out the middle component (green) to
slouken@1
   997
				 * the high 16 bits, and process all three RGB
slouken@1
   998
				 * components at the same time.
slouken@1
   999
				 */
slouken@1
  1000
				s = (s | s << 16) & 0x07e0f81f;
slouken@1
  1001
				d = (d | d << 16) & 0x07e0f81f;
slouken@1
  1002
				d += (s - d) * alpha >> 5;
slouken@1
  1003
				d &= 0x07e0f81f;
slouken@1428
  1004
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1
  1005
			}, width);
slouken@1895
  1006
			/* *INDENT-ON* */
slouken@1895
  1007
            srcp += srcskip;
slouken@1895
  1008
            dstp += dstskip;
slouken@1895
  1009
        }
slouken@1895
  1010
    }
slouken@0
  1011
}
slouken@0
  1012
slouken@0
  1013
/* fast RGB555->RGB555 blending with surface alpha */
slouken@1895
  1014
static void
slouken@1895
  1015
Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
slouken@0
  1016
{
slouken@2267
  1017
    unsigned alpha = info->a;   /* downscale alpha to 5 bits */
slouken@1895
  1018
    if (alpha == 128) {
slouken@1895
  1019
        Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1895
  1020
    } else {
slouken@2262
  1021
        int width = info->dst_w;
slouken@2262
  1022
        int height = info->dst_h;
slouken@2262
  1023
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
  1024
        int srcskip = info->src_skip >> 1;
slouken@2262
  1025
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
  1026
        int dstskip = info->dst_skip >> 1;
slouken@1895
  1027
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@0
  1028
slouken@1895
  1029
        while (height--) {
slouken@1895
  1030
			/* *INDENT-OFF* */
slouken@1
  1031
			DUFFS_LOOP4({
slouken@1
  1032
				Uint32 s = *srcp++;
slouken@1
  1033
				Uint32 d = *dstp;
slouken@1
  1034
				/*
slouken@1
  1035
				 * shift out the middle component (green) to
slouken@1
  1036
				 * the high 16 bits, and process all three RGB
slouken@1
  1037
				 * components at the same time.
slouken@1
  1038
				 */
slouken@1
  1039
				s = (s | s << 16) & 0x03e07c1f;
slouken@1
  1040
				d = (d | d << 16) & 0x03e07c1f;
slouken@1
  1041
				d += (s - d) * alpha >> 5;
slouken@1
  1042
				d &= 0x03e07c1f;
slouken@1428
  1043
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1
  1044
			}, width);
slouken@1895
  1045
			/* *INDENT-ON* */
slouken@1895
  1046
            srcp += srcskip;
slouken@1895
  1047
            dstp += dstskip;
slouken@1895
  1048
        }
slouken@1895
  1049
    }
slouken@0
  1050
}
slouken@0
  1051
slouken@0
  1052
/* fast ARGB8888->RGB565 blending with pixel alpha */
slouken@1895
  1053
static void
slouken@1895
  1054
BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
slouken@0
  1055
{
slouken@2262
  1056
    int width = info->dst_w;
slouken@2262
  1057
    int height = info->dst_h;
slouken@2262
  1058
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
  1059
    int srcskip = info->src_skip >> 2;
slouken@2262
  1060
    Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
  1061
    int dstskip = info->dst_skip >> 1;
slouken@0
  1062
slouken@1895
  1063
    while (height--) {
slouken@1895
  1064
	    /* *INDENT-OFF* */
slouken@0
  1065
	    DUFFS_LOOP4({
slouken@0
  1066
		Uint32 s = *srcp;
slouken@0
  1067
		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  1068
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  1069
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  1070
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  1071
		   Benchmark this! */
slouken@689
  1072
		if(alpha) {   
slouken@689
  1073
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@1428
  1074
		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
slouken@689
  1075
		  } else {
slouken@0
  1076
		    Uint32 d = *dstp;
slouken@0
  1077
		    /*
slouken@0
  1078
		     * convert source and destination to G0RAB65565
slouken@0
  1079
		     * and blend all components at the same time
slouken@0
  1080
		     */
slouken@0
  1081
		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
slouken@0
  1082
		      + (s >> 3 & 0x1f);
slouken@0
  1083
		    d = (d | d << 16) & 0x07e0f81f;
slouken@0
  1084
		    d += (s - d) * alpha >> 5;
slouken@0
  1085
		    d &= 0x07e0f81f;
slouken@1428
  1086
		    *dstp = (Uint16)(d | d >> 16);
slouken@689
  1087
		  }
slouken@0
  1088
		}
slouken@0
  1089
		srcp++;
slouken@0
  1090
		dstp++;
slouken@0
  1091
	    }, width);
slouken@1895
  1092
	    /* *INDENT-ON* */
slouken@1895
  1093
        srcp += srcskip;
slouken@1895
  1094
        dstp += dstskip;
slouken@1895
  1095
    }
slouken@0
  1096
}
slouken@0
  1097
slouken@0
  1098
/* fast ARGB8888->RGB555 blending with pixel alpha */
slouken@1895
  1099
static void
slouken@1895
  1100
BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
slouken@0
  1101
{
slouken@2262
  1102
    int width = info->dst_w;
slouken@2262
  1103
    int height = info->dst_h;
slouken@2262
  1104
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
  1105
    int srcskip = info->src_skip >> 2;
slouken@2262
  1106
    Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
  1107
    int dstskip = info->dst_skip >> 1;
slouken@0
  1108
slouken@1895
  1109
    while (height--) {
slouken@1895
  1110
	    /* *INDENT-OFF* */
slouken@0
  1111
	    DUFFS_LOOP4({
slouken@0
  1112
		unsigned alpha;
slouken@0
  1113
		Uint32 s = *srcp;
slouken@0
  1114
		alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  1115
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  1116
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  1117
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  1118
		   Benchmark this! */
slouken@689
  1119
		if(alpha) {   
slouken@689
  1120
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@1428
  1121
		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
slouken@689
  1122
		  } else {
slouken@0
  1123
		    Uint32 d = *dstp;
slouken@0
  1124
		    /*
slouken@0
  1125
		     * convert source and destination to G0RAB65565
slouken@0
  1126
		     * and blend all components at the same time
slouken@0
  1127
		     */
slouken@0
  1128
		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
slouken@0
  1129
		      + (s >> 3 & 0x1f);
slouken@0
  1130
		    d = (d | d << 16) & 0x03e07c1f;
slouken@0
  1131
		    d += (s - d) * alpha >> 5;
slouken@0
  1132
		    d &= 0x03e07c1f;
slouken@1428
  1133
		    *dstp = (Uint16)(d | d >> 16);
slouken@689
  1134
		  }
slouken@0
  1135
		}
slouken@0
  1136
		srcp++;
slouken@0
  1137
		dstp++;
slouken@0
  1138
	    }, width);
slouken@1895
  1139
	    /* *INDENT-ON* */
slouken@1895
  1140
        srcp += srcskip;
slouken@1895
  1141
        dstp += dstskip;
slouken@1895
  1142
    }
slouken@0
  1143
}
slouken@0
  1144
slouken@0
  1145
/* General (slow) N->N blending with per-surface alpha */
slouken@1895
  1146
static void
slouken@1895
  1147
BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
slouken@0
  1148
{
slouken@2262
  1149
    int width = info->dst_w;
slouken@2262
  1150
    int height = info->dst_h;
slouken@2262
  1151
    Uint8 *src = info->src;
slouken@2267
  1152
    int srcskip = info->src_skip;
slouken@2262
  1153
    Uint8 *dst = info->dst;
slouken@2267
  1154
    int dstskip = info->dst_skip;
slouken@2267
  1155
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
  1156
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
  1157
    int srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  1158
    int dstbpp = dstfmt->BytesPerPixel;
slouken@7502
  1159
    Uint32 Pixel;
slouken@7502
  1160
    unsigned sR, sG, sB;
slouken@7502
  1161
    unsigned dR, dG, dB, dA;
slouken@7502
  1162
    const unsigned sA = info->a;
slouken@0
  1163
slouken@1895
  1164
    if (sA) {
slouken@1895
  1165
        while (height--) {
slouken@1895
  1166
	    /* *INDENT-OFF* */
slouken@0
  1167
	    DUFFS_LOOP4(
slouken@0
  1168
	    {
icculus@1162
  1169
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
slouken@7502
  1170
		DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
slouken@7502
  1171
		ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
slouken@0
  1172
		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  1173
		src += srcbpp;
slouken@0
  1174
		dst += dstbpp;
slouken@0
  1175
	    },
slouken@0
  1176
	    width);
slouken@1895
  1177
	    /* *INDENT-ON* */
slouken@1895
  1178
            src += srcskip;
slouken@1895
  1179
            dst += dstskip;
slouken@1895
  1180
        }
slouken@1895
  1181
    }
slouken@0
  1182
}
slouken@0
  1183
slouken@0
  1184
/* General (slow) colorkeyed N->N blending with per-surface alpha */
slouken@1895
  1185
static void
slouken@1895
  1186
BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
slouken@0
  1187
{
slouken@2262
  1188
    int width = info->dst_w;
slouken@2262
  1189
    int height = info->dst_h;
slouken@2262
  1190
    Uint8 *src = info->src;
slouken@2267
  1191
    int srcskip = info->src_skip;
slouken@2262
  1192
    Uint8 *dst = info->dst;
slouken@2267
  1193
    int dstskip = info->dst_skip;
slouken@2267
  1194
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
  1195
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@2267
  1196
    Uint32 ckey = info->colorkey;
slouken@1895
  1197
    int srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  1198
    int dstbpp = dstfmt->BytesPerPixel;
slouken@7502
  1199
    Uint32 Pixel;
slouken@7502
  1200
    unsigned sR, sG, sB;
slouken@7502
  1201
    unsigned dR, dG, dB, dA;
slouken@7502
  1202
    const unsigned sA = info->a;
slouken@0
  1203
slouken@1895
  1204
    while (height--) {
slouken@1895
  1205
	    /* *INDENT-OFF* */
slouken@0
  1206
	    DUFFS_LOOP4(
slouken@0
  1207
	    {
icculus@1162
  1208
		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
icculus@1162
  1209
		if(sA && Pixel != ckey) {
icculus@1162
  1210
		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
slouken@7502
  1211
		    DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
slouken@7502
  1212
		    ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
slouken@0
  1213
		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  1214
		}
slouken@0
  1215
		src += srcbpp;
slouken@0
  1216
		dst += dstbpp;
slouken@0
  1217
	    },
slouken@0
  1218
	    width);
slouken@1895
  1219
	    /* *INDENT-ON* */
slouken@1895
  1220
        src += srcskip;
slouken@1895
  1221
        dst += dstskip;
slouken@1895
  1222
    }
slouken@0
  1223
}
slouken@0
  1224
slouken@0
  1225
/* General (slow) N->N blending with pixel alpha */
slouken@1895
  1226
static void
slouken@1895
  1227
BlitNtoNPixelAlpha(SDL_BlitInfo * info)
slouken@0
  1228
{
slouken@2262
  1229
    int width = info->dst_w;
slouken@2262
  1230
    int height = info->dst_h;
slouken@2262
  1231
    Uint8 *src = info->src;
slouken@2267
  1232
    int srcskip = info->src_skip;
slouken@2262
  1233
    Uint8 *dst = info->dst;
slouken@2267
  1234
    int dstskip = info->dst_skip;
slouken@2267
  1235
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
  1236
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
  1237
    int srcbpp;
slouken@1895
  1238
    int dstbpp;
slouken@7502
  1239
    Uint32 Pixel;
slouken@7502
  1240
    unsigned sR, sG, sB, sA;
slouken@7502
  1241
    unsigned dR, dG, dB, dA;
slouken@0
  1242
slouken@1895
  1243
    /* Set up some basic variables */
slouken@1895
  1244
    srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  1245
    dstbpp = dstfmt->BytesPerPixel;
slouken@0
  1246
slouken@1895
  1247
    while (height--) {
slouken@1895
  1248
	    /* *INDENT-OFF* */
slouken@0
  1249
	    DUFFS_LOOP4(
slouken@0
  1250
	    {
icculus@1162
  1251
		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
slouken@689
  1252
		if(sA) {
slouken@7502
  1253
		    DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
slouken@7502
  1254
		    ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
slouken@7502
  1255
		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@689
  1256
		}
slouken@0
  1257
		src += srcbpp;
slouken@0
  1258
		dst += dstbpp;
slouken@0
  1259
	    },
slouken@0
  1260
	    width);
slouken@1895
  1261
	    /* *INDENT-ON* */
slouken@1895
  1262
        src += srcskip;
slouken@1895
  1263
        dst += dstskip;
slouken@1895
  1264
    }
slouken@0
  1265
}
slouken@0
  1266
slouken@0
  1267
slouken@2267
  1268
SDL_BlitFunc
slouken@2267
  1269
SDL_CalculateBlitA(SDL_Surface * surface)
slouken@0
  1270
{
slouken@0
  1271
    SDL_PixelFormat *sf = surface->format;
slouken@0
  1272
    SDL_PixelFormat *df = surface->map->dst->format;
slouken@0
  1273
slouken@2853
  1274
    switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
slouken@2267
  1275
    case SDL_COPY_BLEND:
slouken@1895
  1276
        /* Per-pixel alpha blits */
slouken@1895
  1277
        switch (df->BytesPerPixel) {
slouken@1895
  1278
        case 1:
slouken@1895
  1279
            return BlitNto1PixelAlpha;
slouken@0
  1280
slouken@1895
  1281
        case 2:
slouken@5389
  1282
                if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
slouken@5389
  1283
                    && sf->Gmask == 0xff00
slouken@5389
  1284
                    && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
slouken@5389
  1285
                        || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
slouken@1895
  1286
                if (df->Gmask == 0x7e0)
slouken@1895
  1287
                    return BlitARGBto565PixelAlpha;
slouken@1895
  1288
                else if (df->Gmask == 0x3e0)
slouken@1895
  1289
                    return BlitARGBto555PixelAlpha;
slouken@1895
  1290
            }
slouken@1895
  1291
            return BlitNtoNPixelAlpha;
slouken@0
  1292
slouken@1895
  1293
        case 4:
slouken@1895
  1294
            if (sf->Rmask == df->Rmask
slouken@1895
  1295
                && sf->Gmask == df->Gmask
slouken@1895
  1296
                && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
slouken@5389
  1297
#if defined(__MMX__) || defined(__3dNOW__)
slouken@1895
  1298
                if (sf->Rshift % 8 == 0
slouken@1895
  1299
                    && sf->Gshift % 8 == 0
slouken@1895
  1300
                    && sf->Bshift % 8 == 0
slouken@1895
  1301
                    && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
slouken@5389
  1302
#ifdef __3dNOW__
slouken@5389
  1303
                    if (SDL_Has3DNow())
slouken@5389
  1304
                        return BlitRGBtoRGBPixelAlphaMMX3DNOW;
slouken@5389
  1305
#endif
slouken@5389
  1306
#ifdef __MMX__
slouken@1895
  1307
                    if (SDL_HasMMX())
slouken@1895
  1308
                        return BlitRGBtoRGBPixelAlphaMMX;
slouken@5389
  1309
#endif
slouken@1895
  1310
                }
slouken@5389
  1311
#endif /* __MMX__ || __3dNOW__ */
slouken@1895
  1312
                if (sf->Amask == 0xff000000) {
slouken@1895
  1313
                    return BlitRGBtoRGBPixelAlpha;
slouken@1895
  1314
                }
slouken@1895
  1315
            }
slouken@7502
  1316
            return BlitNtoNPixelAlpha;
slouken@0
  1317
slouken@1895
  1318
        case 3:
slouken@1895
  1319
        default:
icculus@10926
  1320
            break;
slouken@1895
  1321
        }
icculus@10926
  1322
        return BlitNtoNPixelAlpha;
slouken@2267
  1323
slouken@2267
  1324
    case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
slouken@2267
  1325
        if (sf->Amask == 0) {
slouken@2267
  1326
            /* Per-surface alpha blits */
slouken@2267
  1327
            switch (df->BytesPerPixel) {
slouken@2267
  1328
            case 1:
slouken@2267
  1329
                return BlitNto1SurfaceAlpha;
slouken@2267
  1330
slouken@2267
  1331
            case 2:
slouken@2267
  1332
                if (surface->map->identity) {
slouken@2267
  1333
                    if (df->Gmask == 0x7e0) {
slouken@2267
  1334
#ifdef __MMX__
slouken@2267
  1335
                        if (SDL_HasMMX())
slouken@2267
  1336
                            return Blit565to565SurfaceAlphaMMX;
slouken@2267
  1337
                        else
slouken@2267
  1338
#endif
slouken@2267
  1339
                            return Blit565to565SurfaceAlpha;
slouken@2267
  1340
                    } else if (df->Gmask == 0x3e0) {
slouken@2267
  1341
#ifdef __MMX__
slouken@2267
  1342
                        if (SDL_HasMMX())
slouken@2267
  1343
                            return Blit555to555SurfaceAlphaMMX;
slouken@2267
  1344
                        else
slouken@2267
  1345
#endif
slouken@2267
  1346
                            return Blit555to555SurfaceAlpha;
slouken@2267
  1347
                    }
slouken@2267
  1348
                }
slouken@2267
  1349
                return BlitNtoNSurfaceAlpha;
slouken@2267
  1350
slouken@2267
  1351
            case 4:
slouken@2267
  1352
                if (sf->Rmask == df->Rmask
slouken@2267
  1353
                    && sf->Gmask == df->Gmask
slouken@2267
  1354
                    && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
slouken@2267
  1355
#ifdef __MMX__
slouken@2267
  1356
                    if (sf->Rshift % 8 == 0
slouken@2267
  1357
                        && sf->Gshift % 8 == 0
slouken@2267
  1358
                        && sf->Bshift % 8 == 0 && SDL_HasMMX())
slouken@2267
  1359
                        return BlitRGBtoRGBSurfaceAlphaMMX;
slouken@2267
  1360
#endif
slouken@2267
  1361
                    if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
slouken@2267
  1362
                        return BlitRGBtoRGBSurfaceAlpha;
slouken@2267
  1363
                    }
slouken@2267
  1364
                }
slouken@7502
  1365
                return BlitNtoNSurfaceAlpha;
slouken@2267
  1366
slouken@2267
  1367
            case 3:
slouken@2267
  1368
            default:
slouken@2267
  1369
                return BlitNtoNSurfaceAlpha;
slouken@2267
  1370
            }
slouken@2267
  1371
        }
slouken@2267
  1372
        break;
slouken@2267
  1373
slouken@2267
  1374
    case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
slouken@2267
  1375
        if (sf->Amask == 0) {
slouken@7502
  1376
            if (df->BytesPerPixel == 1) {
slouken@2267
  1377
                return BlitNto1SurfaceAlphaKey;
slouken@7502
  1378
            } else {
slouken@2267
  1379
                return BlitNtoNSurfaceAlphaKey;
slouken@7502
  1380
            }
slouken@2267
  1381
        }
slouken@2267
  1382
        break;
slouken@0
  1383
    }
slouken@2267
  1384
slouken@2267
  1385
    return NULL;
slouken@0
  1386
}
slouken@0
  1387
slouken@1895
  1388
/* vi: set ts=4 sw=4 expandtab: */