src/video/SDL_blit_A.c
author Sam Lantinga <slouken@libsdl.org>
Fri, 11 Feb 2011 22:37:15 -0800
changeset 5262 b530ef003506
parent 5259 6a65c1fc07af
child 5389 24903690f48a
permissions -rw-r--r--
Happy 2011! :)
slouken@0
     1
/*
slouken@0
     2
    SDL - Simple DirectMedia Layer
slouken@5262
     3
    Copyright (C) 1997-2011 Sam Lantinga
slouken@0
     4
slouken@0
     5
    This library is free software; you can redistribute it and/or
slouken@1312
     6
    modify it under the terms of the GNU Lesser General Public
slouken@0
     7
    License as published by the Free Software Foundation; either
slouken@1312
     8
    version 2.1 of the License, or (at your option) any later version.
slouken@0
     9
slouken@0
    10
    This library is distributed in the hope that it will be useful,
slouken@0
    11
    but WITHOUT ANY WARRANTY; without even the implied warranty of
slouken@0
    12
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
slouken@1312
    13
    Lesser General Public License for more details.
slouken@0
    14
slouken@1312
    15
    You should have received a copy of the GNU Lesser General Public
slouken@1312
    16
    License along with this library; if not, write to the Free Software
slouken@1312
    17
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
slouken@0
    18
slouken@0
    19
    Sam Lantinga
slouken@252
    20
    slouken@libsdl.org
slouken@0
    21
*/
slouken@1402
    22
#include "SDL_config.h"
slouken@0
    23
slouken@0
    24
#include "SDL_video.h"
slouken@0
    25
#include "SDL_blit.h"
slouken@0
    26
slouken@0
    27
/* Functions to perform alpha blended blitting */
slouken@0
    28
slouken@0
    29
/* N->1 blending with per-surface alpha */
slouken@1895
    30
static void
slouken@1895
    31
BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
slouken@0
    32
{
slouken@2262
    33
    int width = info->dst_w;
slouken@2262
    34
    int height = info->dst_h;
slouken@2262
    35
    Uint8 *src = info->src;
slouken@2267
    36
    int srcskip = info->src_skip;
slouken@2262
    37
    Uint8 *dst = info->dst;
slouken@2267
    38
    int dstskip = info->dst_skip;
slouken@1895
    39
    Uint8 *palmap = info->table;
slouken@2267
    40
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
    41
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
    42
    int srcbpp = srcfmt->BytesPerPixel;
slouken@0
    43
slouken@2267
    44
    const unsigned A = info->a;
slouken@0
    45
slouken@1895
    46
    while (height--) {
slouken@1895
    47
	    /* *INDENT-OFF* */
slouken@0
    48
	    DUFFS_LOOP4(
slouken@0
    49
	    {
icculus@1162
    50
		Uint32 Pixel;
slouken@0
    51
		unsigned sR;
slouken@0
    52
		unsigned sG;
slouken@0
    53
		unsigned sB;
slouken@0
    54
		unsigned dR;
slouken@0
    55
		unsigned dG;
slouken@0
    56
		unsigned dB;
icculus@1162
    57
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
slouken@0
    58
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
    59
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
    60
		dB = dstfmt->palette->colors[*dst].b;
slouken@0
    61
		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
slouken@0
    62
		dR &= 0xff;
slouken@0
    63
		dG &= 0xff;
slouken@0
    64
		dB &= 0xff;
slouken@0
    65
		/* Pack RGB into 8bit pixel */
slouken@0
    66
		if ( palmap == NULL ) {
slouken@0
    67
		    *dst =((dR>>5)<<(3+2))|
slouken@0
    68
			  ((dG>>5)<<(2))|
slouken@0
    69
			  ((dB>>6)<<(0));
slouken@0
    70
		} else {
slouken@0
    71
		    *dst = palmap[((dR>>5)<<(3+2))|
slouken@0
    72
				  ((dG>>5)<<(2))  |
slouken@0
    73
				  ((dB>>6)<<(0))];
slouken@0
    74
		}
slouken@0
    75
		dst++;
slouken@0
    76
		src += srcbpp;
slouken@0
    77
	    },
slouken@0
    78
	    width);
slouken@1895
    79
	    /* *INDENT-ON* */
slouken@1895
    80
        src += srcskip;
slouken@1895
    81
        dst += dstskip;
slouken@1895
    82
    }
slouken@0
    83
}
slouken@0
    84
slouken@0
    85
/* N->1 blending with pixel alpha */
slouken@1895
    86
static void
slouken@1895
    87
BlitNto1PixelAlpha(SDL_BlitInfo * info)
slouken@0
    88
{
slouken@2262
    89
    int width = info->dst_w;
slouken@2262
    90
    int height = info->dst_h;
slouken@2262
    91
    Uint8 *src = info->src;
slouken@2267
    92
    int srcskip = info->src_skip;
slouken@2262
    93
    Uint8 *dst = info->dst;
slouken@2267
    94
    int dstskip = info->dst_skip;
slouken@1895
    95
    Uint8 *palmap = info->table;
slouken@2267
    96
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
    97
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
    98
    int srcbpp = srcfmt->BytesPerPixel;
slouken@0
    99
slouken@1895
   100
    /* FIXME: fix alpha bit field expansion here too? */
slouken@1895
   101
    while (height--) {
slouken@1895
   102
	    /* *INDENT-OFF* */
slouken@0
   103
	    DUFFS_LOOP4(
slouken@0
   104
	    {
icculus@1162
   105
		Uint32 Pixel;
slouken@0
   106
		unsigned sR;
slouken@0
   107
		unsigned sG;
slouken@0
   108
		unsigned sB;
slouken@0
   109
		unsigned sA;
slouken@0
   110
		unsigned dR;
slouken@0
   111
		unsigned dG;
slouken@0
   112
		unsigned dB;
icculus@1162
   113
		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
slouken@0
   114
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
   115
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
   116
		dB = dstfmt->palette->colors[*dst].b;
slouken@0
   117
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
   118
		dR &= 0xff;
slouken@0
   119
		dG &= 0xff;
slouken@0
   120
		dB &= 0xff;
slouken@0
   121
		/* Pack RGB into 8bit pixel */
slouken@0
   122
		if ( palmap == NULL ) {
slouken@0
   123
		    *dst =((dR>>5)<<(3+2))|
slouken@0
   124
			  ((dG>>5)<<(2))|
slouken@0
   125
			  ((dB>>6)<<(0));
slouken@0
   126
		} else {
slouken@0
   127
		    *dst = palmap[((dR>>5)<<(3+2))|
slouken@0
   128
				  ((dG>>5)<<(2))  |
slouken@0
   129
				  ((dB>>6)<<(0))  ];
slouken@0
   130
		}
slouken@0
   131
		dst++;
slouken@0
   132
		src += srcbpp;
slouken@0
   133
	    },
slouken@0
   134
	    width);
slouken@1895
   135
	    /* *INDENT-ON* */
slouken@1895
   136
        src += srcskip;
slouken@1895
   137
        dst += dstskip;
slouken@1895
   138
    }
slouken@0
   139
}
slouken@0
   140
slouken@0
   141
/* colorkeyed N->1 blending with per-surface alpha */
slouken@1895
   142
static void
slouken@1895
   143
BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
slouken@0
   144
{
slouken@2262
   145
    int width = info->dst_w;
slouken@2262
   146
    int height = info->dst_h;
slouken@2262
   147
    Uint8 *src = info->src;
slouken@2267
   148
    int srcskip = info->src_skip;
slouken@2262
   149
    Uint8 *dst = info->dst;
slouken@2267
   150
    int dstskip = info->dst_skip;
slouken@1895
   151
    Uint8 *palmap = info->table;
slouken@2267
   152
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
   153
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
   154
    int srcbpp = srcfmt->BytesPerPixel;
slouken@2267
   155
    Uint32 ckey = info->colorkey;
slouken@0
   156
slouken@2267
   157
    const int A = info->a;
slouken@0
   158
slouken@1895
   159
    while (height--) {
slouken@1895
   160
	    /* *INDENT-OFF* */
slouken@0
   161
	    DUFFS_LOOP(
slouken@0
   162
	    {
icculus@1162
   163
		Uint32 Pixel;
slouken@0
   164
		unsigned sR;
slouken@0
   165
		unsigned sG;
slouken@0
   166
		unsigned sB;
slouken@0
   167
		unsigned dR;
slouken@0
   168
		unsigned dG;
slouken@0
   169
		unsigned dB;
icculus@1162
   170
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
icculus@1162
   171
		if ( Pixel != ckey ) {
slouken@0
   172
		    dR = dstfmt->palette->colors[*dst].r;
slouken@0
   173
		    dG = dstfmt->palette->colors[*dst].g;
slouken@0
   174
		    dB = dstfmt->palette->colors[*dst].b;
slouken@0
   175
		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
slouken@0
   176
		    dR &= 0xff;
slouken@0
   177
		    dG &= 0xff;
slouken@0
   178
		    dB &= 0xff;
slouken@0
   179
		    /* Pack RGB into 8bit pixel */
slouken@0
   180
		    if ( palmap == NULL ) {
slouken@0
   181
			*dst =((dR>>5)<<(3+2))|
slouken@0
   182
			      ((dG>>5)<<(2)) |
slouken@0
   183
			      ((dB>>6)<<(0));
slouken@0
   184
		    } else {
slouken@0
   185
			*dst = palmap[((dR>>5)<<(3+2))|
slouken@0
   186
				      ((dG>>5)<<(2))  |
slouken@0
   187
				      ((dB>>6)<<(0))  ];
slouken@0
   188
		    }
slouken@0
   189
		}
slouken@0
   190
		dst++;
slouken@0
   191
		src += srcbpp;
slouken@0
   192
	    },
slouken@0
   193
	    width);
slouken@1895
   194
	    /* *INDENT-ON* */
slouken@1895
   195
        src += srcskip;
slouken@1895
   196
        dst += dstskip;
slouken@1895
   197
    }
slouken@0
   198
}
slouken@0
   199
slouken@2255
   200
#ifdef __MMX__
slouken@1542
   201
slouken@1542
   202
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1895
   203
static void
slouken@1895
   204
BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
slouken@1542
   205
{
slouken@2262
   206
    int width = info->dst_w;
slouken@2262
   207
    int height = info->dst_h;
slouken@2262
   208
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   209
    int srcskip = info->src_skip >> 2;
slouken@2262
   210
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   211
    int dstskip = info->dst_skip >> 2;
slouken@2267
   212
    Uint32 dalpha = info->dst_fmt->Amask;
slouken@1542
   213
slouken@1895
   214
    __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
slouken@1542
   215
slouken@1895
   216
    hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
slouken@1895
   217
    lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
slouken@1895
   218
    dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
slouken@1542
   219
slouken@1895
   220
    while (height--) {
slouken@1895
   221
        int n = width;
slouken@1895
   222
        if (n & 1) {
slouken@1895
   223
            Uint32 s = *srcp++;
slouken@1895
   224
            Uint32 d = *dstp;
slouken@1895
   225
            *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1895
   226
                       + (s & d & 0x00010101)) | dalpha;
slouken@1895
   227
            n--;
slouken@1895
   228
        }
slouken@1542
   229
slouken@1895
   230
        for (n >>= 1; n > 0; --n) {
slouken@1895
   231
            dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
slouken@1895
   232
            dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
slouken@1542
   233
slouken@1895
   234
            src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
slouken@1895
   235
            src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
slouken@1895
   236
slouken@1895
   237
            dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
slouken@1895
   238
            src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
slouken@1895
   239
            src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
slouken@1895
   240
            src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
slouken@1895
   241
slouken@1895
   242
            dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
slouken@1895
   243
            dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
slouken@1895
   244
            dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
slouken@1895
   245
            dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
slouken@1895
   246
slouken@1895
   247
            *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
slouken@1895
   248
            dstp += 2;
slouken@1895
   249
            srcp += 2;
slouken@1895
   250
        }
slouken@1895
   251
slouken@1895
   252
        srcp += srcskip;
slouken@1895
   253
        dstp += dstskip;
slouken@1895
   254
    }
slouken@1895
   255
    _mm_empty();
slouken@1542
   256
}
slouken@1542
   257
slouken@1542
   258
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1895
   259
static void
slouken@1895
   260
BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   261
{
slouken@2267
   262
    SDL_PixelFormat *df = info->dst_fmt;
slouken@1895
   263
    Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
slouken@2267
   264
    unsigned alpha = info->a;
slouken@1542
   265
slouken@1895
   266
    if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
slouken@1895
   267
        /* only call a128 version when R,G,B occupy lower bits */
slouken@1895
   268
        BlitRGBtoRGBSurfaceAlpha128MMX(info);
slouken@1895
   269
    } else {
slouken@2262
   270
        int width = info->dst_w;
slouken@2262
   271
        int height = info->dst_h;
slouken@2262
   272
        Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   273
        int srcskip = info->src_skip >> 2;
slouken@2262
   274
        Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   275
        int dstskip = info->dst_skip >> 2;
slouken@1895
   276
        Uint32 dalpha = df->Amask;
slouken@1895
   277
        Uint32 amult;
slouken@1542
   278
slouken@1895
   279
        __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
slouken@1542
   280
slouken@1895
   281
        mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
slouken@1895
   282
        /* form the alpha mult */
slouken@1895
   283
        amult = alpha | (alpha << 8);
slouken@1895
   284
        amult = amult | (amult << 16);
slouken@1895
   285
        chanmask =
slouken@3013
   286
            (0xff << df->Rshift) | (0xff << df->
slouken@3013
   287
                                    Gshift) | (0xff << df->Bshift);
slouken@1895
   288
        mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
slouken@1895
   289
        mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
slouken@1895
   290
        /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
slouken@1895
   291
        dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
slouken@1542
   292
slouken@1895
   293
        while (height--) {
slouken@1895
   294
            int n = width;
slouken@1895
   295
            if (n & 1) {
slouken@1895
   296
                /* One Pixel Blend */
slouken@1895
   297
                src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
slouken@1895
   298
                src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
slouken@1542
   299
slouken@1895
   300
                dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
slouken@1895
   301
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
   302
slouken@1895
   303
                src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
slouken@1895
   304
                src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
slouken@1895
   305
                src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
slouken@1895
   306
                dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
slouken@1542
   307
slouken@1895
   308
                dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
slouken@1895
   309
                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
slouken@1895
   310
                *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
   311
slouken@1895
   312
                ++srcp;
slouken@1895
   313
                ++dstp;
slouken@1542
   314
slouken@1895
   315
                n--;
slouken@1895
   316
            }
slouken@1542
   317
slouken@1895
   318
            for (n >>= 1; n > 0; --n) {
slouken@1895
   319
                /* Two Pixels Blend */
slouken@1895
   320
                src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
slouken@1895
   321
                src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
slouken@1895
   322
                src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
slouken@1895
   323
                src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
slouken@1542
   324
slouken@1895
   325
                dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
slouken@1895
   326
                dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
slouken@1895
   327
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
slouken@1895
   328
                dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
slouken@1895
   329
slouken@1895
   330
                src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
slouken@1895
   331
                src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
slouken@1895
   332
                src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
slouken@1895
   333
                dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
slouken@1895
   334
slouken@1895
   335
                src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
slouken@1895
   336
                src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
slouken@1895
   337
                src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
slouken@1895
   338
                dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
slouken@1895
   339
slouken@1895
   340
                dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
slouken@1895
   341
                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
slouken@1895
   342
slouken@1895
   343
                *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
slouken@1895
   344
slouken@1895
   345
                srcp += 2;
slouken@1895
   346
                dstp += 2;
slouken@1895
   347
            }
slouken@1895
   348
            srcp += srcskip;
slouken@1895
   349
            dstp += dstskip;
slouken@1895
   350
        }
slouken@1895
   351
        _mm_empty();
slouken@1895
   352
    }
slouken@1542
   353
}
slouken@1542
   354
slouken@1542
   355
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
   356
static void
slouken@1895
   357
BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   358
{
slouken@2262
   359
    int width = info->dst_w;
slouken@2262
   360
    int height = info->dst_h;
slouken@2262
   361
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   362
    int srcskip = info->src_skip >> 2;
slouken@2262
   363
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   364
    int dstskip = info->dst_skip >> 2;
slouken@2267
   365
    SDL_PixelFormat *sf = info->src_fmt;
slouken@1895
   366
    Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
slouken@1895
   367
    Uint32 amask = sf->Amask;
slouken@1895
   368
    Uint32 ashift = sf->Ashift;
slouken@1895
   369
    Uint64 multmask;
slouken@1542
   370
slouken@1895
   371
    __m64 src1, dst1, mm_alpha, mm_zero, dmask;
slouken@1542
   372
slouken@1895
   373
    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
slouken@2260
   374
    multmask = 0xFFFF;
slouken@2255
   375
    multmask <<= (ashift * 2);
slouken@2255
   376
    multmask = ~multmask;
slouken@1895
   377
    dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
slouken@1542
   378
slouken@1895
   379
    while (height--) {
slouken@1895
   380
		/* *INDENT-OFF* */
slouken@1542
   381
		DUFFS_LOOP4({
slouken@1542
   382
		Uint32 alpha = *srcp & amask;
slouken@1542
   383
		if (alpha == 0) {
slouken@1542
   384
			/* do nothing */
slouken@1542
   385
		} else if (alpha == amask) {
slouken@1542
   386
			/* opaque alpha -- copy RGB, keep dst alpha */
slouken@1542
   387
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
slouken@1542
   388
		} else {
slouken@1542
   389
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
slouken@1542
   390
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
slouken@1542
   391
slouken@1542
   392
			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
slouken@1542
   393
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
   394
slouken@1542
   395
			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
slouken@1542
   396
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
slouken@1542
   397
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
slouken@1542
   398
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
slouken@1542
   399
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
slouken@1542
   400
slouken@1542
   401
			/* blend */		    
slouken@1542
   402
			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
slouken@1542
   403
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
slouken@1542
   404
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
slouken@1542
   405
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
slouken@1542
   406
			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
slouken@1542
   407
			
slouken@1542
   408
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
   409
		}
slouken@1542
   410
		++srcp;
slouken@1542
   411
		++dstp;
slouken@1542
   412
	    }, width);
slouken@1895
   413
		/* *INDENT-ON* */
slouken@1895
   414
        srcp += srcskip;
slouken@1895
   415
        dstp += dstskip;
slouken@1895
   416
    }
slouken@1895
   417
    _mm_empty();
slouken@1542
   418
}
slouken@1895
   419
slouken@2255
   420
#endif /* __MMX__ */
slouken@689
   421
slouken@1
   422
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1895
   423
static void
slouken@1895
   424
BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
slouken@0
   425
{
slouken@2262
   426
    int width = info->dst_w;
slouken@2262
   427
    int height = info->dst_h;
slouken@2262
   428
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   429
    int srcskip = info->src_skip >> 2;
slouken@2262
   430
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   431
    int dstskip = info->dst_skip >> 2;
slouken@0
   432
slouken@1895
   433
    while (height--) {
slouken@1895
   434
	    /* *INDENT-OFF* */
slouken@0
   435
	    DUFFS_LOOP4({
slouken@1
   436
		    Uint32 s = *srcp++;
slouken@1
   437
		    Uint32 d = *dstp;
slouken@1
   438
		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1
   439
			       + (s & d & 0x00010101)) | 0xff000000;
slouken@0
   440
	    }, width);
slouken@1895
   441
	    /* *INDENT-ON* */
slouken@1895
   442
        srcp += srcskip;
slouken@1895
   443
        dstp += dstskip;
slouken@1895
   444
    }
slouken@0
   445
}
slouken@0
   446
slouken@1
   447
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1895
   448
static void
slouken@1895
   449
BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
slouken@1
   450
{
slouken@2267
   451
    unsigned alpha = info->a;
slouken@1895
   452
    if (alpha == 128) {
slouken@1895
   453
        BlitRGBtoRGBSurfaceAlpha128(info);
slouken@1895
   454
    } else {
slouken@2262
   455
        int width = info->dst_w;
slouken@2262
   456
        int height = info->dst_h;
slouken@2262
   457
        Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   458
        int srcskip = info->src_skip >> 2;
slouken@2262
   459
        Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   460
        int dstskip = info->dst_skip >> 2;
slouken@1895
   461
        Uint32 s;
slouken@1895
   462
        Uint32 d;
slouken@1895
   463
        Uint32 s1;
slouken@1895
   464
        Uint32 d1;
slouken@1
   465
slouken@1895
   466
        while (height--) {
slouken@1895
   467
			/* *INDENT-OFF* */
slouken@3035
   468
			DUFFS_LOOP4({
slouken@1
   469
				s = *srcp;
slouken@1
   470
				d = *dstp;
slouken@1
   471
				s1 = s & 0xff00ff;
slouken@1
   472
				d1 = d & 0xff00ff;
slouken@1
   473
				d1 = (d1 + ((s1 - d1) * alpha >> 8))
slouken@1
   474
				     & 0xff00ff;
slouken@1
   475
				s &= 0xff00;
slouken@1
   476
				d &= 0xff00;
slouken@1
   477
				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@1
   478
				*dstp = d1 | d | 0xff000000;
slouken@1
   479
				++srcp;
slouken@1
   480
				++dstp;
slouken@1
   481
			}, width);
slouken@1895
   482
			/* *INDENT-ON* */
slouken@1895
   483
            srcp += srcskip;
slouken@1895
   484
            dstp += dstskip;
slouken@1895
   485
        }
slouken@1895
   486
    }
slouken@1
   487
}
slouken@1
   488
slouken@0
   489
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
   490
static void
slouken@1895
   491
BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
slouken@0
   492
{
slouken@2262
   493
    int width = info->dst_w;
slouken@2262
   494
    int height = info->dst_h;
slouken@2262
   495
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
   496
    int srcskip = info->src_skip >> 2;
slouken@2262
   497
    Uint32 *dstp = (Uint32 *) info->dst;
slouken@2267
   498
    int dstskip = info->dst_skip >> 2;
slouken@0
   499
slouken@1895
   500
    while (height--) {
slouken@1895
   501
	    /* *INDENT-OFF* */
slouken@0
   502
	    DUFFS_LOOP4({
slouken@0
   503
		Uint32 dalpha;
slouken@0
   504
		Uint32 d;
slouken@0
   505
		Uint32 s1;
slouken@0
   506
		Uint32 d1;
slouken@0
   507
		Uint32 s = *srcp;
slouken@0
   508
		Uint32 alpha = s >> 24;
slouken@0
   509
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
   510
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
   511
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
   512
		   Benchmark this! */
slouken@689
   513
		if(alpha) {   
slouken@689
   514
		  if(alpha == SDL_ALPHA_OPAQUE) {
slouken@0
   515
		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
slouken@689
   516
		  } else {
slouken@0
   517
		    /*
slouken@0
   518
		     * take out the middle component (green), and process
slouken@0
   519
		     * the other two in parallel. One multiply less.
slouken@0
   520
		     */
slouken@0
   521
		    d = *dstp;
slouken@0
   522
		    dalpha = d & 0xff000000;
slouken@0
   523
		    s1 = s & 0xff00ff;
slouken@0
   524
		    d1 = d & 0xff00ff;
slouken@0
   525
		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
slouken@0
   526
		    s &= 0xff00;
slouken@0
   527
		    d &= 0xff00;
slouken@0
   528
		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@0
   529
		    *dstp = d1 | d | dalpha;
slouken@689
   530
		  }
slouken@0
   531
		}
slouken@0
   532
		++srcp;
slouken@0
   533
		++dstp;
slouken@0
   534
	    }, width);
slouken@1895
   535
	    /* *INDENT-ON* */
slouken@1895
   536
        srcp += srcskip;
slouken@1895
   537
        dstp += dstskip;
slouken@1895
   538
    }
slouken@0
   539
}
slouken@0
   540
slouken@1
   541
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */

/*
 * Blend a single 16 bit pixel at 50%.
 *
 * `mask` selects every bit except the lowest bit of each colour channel.
 * The masked values are added and halved, then the low channel bits are
 * added back where they are set in both pixels.  All arguments are
 * parenthesized so that compound expressions can be passed safely.
 */
#define BLEND16_50(d, s, mask)						\
	(((((s) & (mask)) + ((d) & (mask))) >> 1)			\
	 + ((s) & (d) & (~(mask) & 0xffff)))

/*
 * The 16 bit channel mask replicated into both halves of a 32 bit word.
 * The mask is widened to Uint32 before shifting: a Uint16 would be promoted
 * to (signed) int, and shifting a value with bit 15 set left by 16 overflows.
 */
#define BLEND2x16_MASK(mask)						\
	((Uint32) (mask) | ((Uint32) (mask) << 16))

/* blend two 16 bit pixels (packed in one 32 bit word each) at 50% */
#define BLEND2x16_50(d, s, mask)					\
	((((s) & BLEND2x16_MASK(mask)) >> 1)				\
	 + (((d) & BLEND2x16_MASK(mask)) >> 1)				\
	 + ((s) & (d) & ~BLEND2x16_MASK(mask)))
slouken@1
   551
slouken@1895
   552
static void
slouken@1895
   553
Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
slouken@0
   554
{
slouken@2262
   555
    int width = info->dst_w;
slouken@2262
   556
    int height = info->dst_h;
slouken@2262
   557
    Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   558
    int srcskip = info->src_skip >> 1;
slouken@2262
   559
    Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   560
    int dstskip = info->dst_skip >> 1;
slouken@0
   561
slouken@1895
   562
    while (height--) {
slouken@1895
   563
        if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
slouken@1895
   564
            /*
slouken@1895
   565
             * Source and destination not aligned, pipeline it.
slouken@1895
   566
             * This is mostly a win for big blits but no loss for
slouken@1895
   567
             * small ones
slouken@1895
   568
             */
slouken@1895
   569
            Uint32 prev_sw;
slouken@1895
   570
            int w = width;
slouken@1
   571
slouken@1895
   572
            /* handle odd destination */
slouken@1895
   573
            if ((uintptr_t) dstp & 2) {
slouken@1895
   574
                Uint16 d = *dstp, s = *srcp;
slouken@1895
   575
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   576
                dstp++;
slouken@1895
   577
                srcp++;
slouken@1895
   578
                w--;
slouken@1895
   579
            }
slouken@1895
   580
            srcp++;             /* srcp is now 32-bit aligned */
slouken@1
   581
slouken@1895
   582
            /* bootstrap pipeline with first halfword */
slouken@1895
   583
            prev_sw = ((Uint32 *) srcp)[-1];
slouken@1
   584
slouken@1895
   585
            while (w > 1) {
slouken@1895
   586
                Uint32 sw, dw, s;
slouken@1895
   587
                sw = *(Uint32 *) srcp;
slouken@1895
   588
                dw = *(Uint32 *) dstp;
slouken@1443
   589
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
slouken@1895
   590
                s = (prev_sw << 16) + (sw >> 16);
slouken@1443
   591
#else
slouken@1895
   592
                s = (prev_sw >> 16) + (sw << 16);
slouken@1443
   593
#endif
slouken@1895
   594
                prev_sw = sw;
slouken@1895
   595
                *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
slouken@1895
   596
                dstp += 2;
slouken@1895
   597
                srcp += 2;
slouken@1895
   598
                w -= 2;
slouken@1895
   599
            }
slouken@1
   600
slouken@1895
   601
            /* final pixel if any */
slouken@1895
   602
            if (w) {
slouken@1895
   603
                Uint16 d = *dstp, s;
slouken@1443
   604
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
slouken@1895
   605
                s = (Uint16) prev_sw;
slouken@1443
   606
#else
slouken@1895
   607
                s = (Uint16) (prev_sw >> 16);
slouken@1443
   608
#endif
slouken@1895
   609
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   610
                srcp++;
slouken@1895
   611
                dstp++;
slouken@1895
   612
            }
slouken@1895
   613
            srcp += srcskip - 1;
slouken@1895
   614
            dstp += dstskip;
slouken@1895
   615
        } else {
slouken@1895
   616
            /* source and destination are aligned */
slouken@1895
   617
            int w = width;
slouken@1
   618
slouken@1895
   619
            /* first odd pixel? */
slouken@1895
   620
            if ((uintptr_t) srcp & 2) {
slouken@1895
   621
                Uint16 d = *dstp, s = *srcp;
slouken@1895
   622
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   623
                srcp++;
slouken@1895
   624
                dstp++;
slouken@1895
   625
                w--;
slouken@1895
   626
            }
slouken@1895
   627
            /* srcp and dstp are now 32-bit aligned */
slouken@1
   628
slouken@1895
   629
            while (w > 1) {
slouken@1895
   630
                Uint32 sw = *(Uint32 *) srcp;
slouken@1895
   631
                Uint32 dw = *(Uint32 *) dstp;
slouken@1895
   632
                *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
slouken@1895
   633
                srcp += 2;
slouken@1895
   634
                dstp += 2;
slouken@1895
   635
                w -= 2;
slouken@1895
   636
            }
slouken@1
   637
slouken@1895
   638
            /* last odd pixel? */
slouken@1895
   639
            if (w) {
slouken@1895
   640
                Uint16 d = *dstp, s = *srcp;
slouken@1895
   641
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
   642
                srcp++;
slouken@1895
   643
                dstp++;
slouken@1895
   644
            }
slouken@1895
   645
            srcp += srcskip;
slouken@1895
   646
            dstp += dstskip;
slouken@1895
   647
        }
slouken@1895
   648
    }
slouken@1
   649
}
slouken@1
   650
slouken@2255
   651
#ifdef __MMX__
slouken@689
   652
slouken@1542
   653
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1895
   654
static void
slouken@1895
   655
Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   656
{
slouken@2267
   657
    unsigned alpha = info->a;
slouken@1895
   658
    if (alpha == 128) {
slouken@1895
   659
        Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1895
   660
    } else {
slouken@2262
   661
        int width = info->dst_w;
slouken@2262
   662
        int height = info->dst_h;
slouken@2262
   663
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   664
        int srcskip = info->src_skip >> 1;
slouken@2262
   665
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   666
        int dstskip = info->dst_skip >> 1;
slouken@1895
   667
        Uint32 s, d;
slouken@1542
   668
slouken@1895
   669
        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
slouken@1542
   670
slouken@1895
   671
        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
slouken@1895
   672
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
slouken@1895
   673
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1895
   674
slouken@1895
   675
        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
slouken@1895
   676
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
slouken@1895
   677
        /* position alpha to allow for mullo and mulhi on diff channels
slouken@1895
   678
           to reduce the number of operations */
slouken@1895
   679
        mm_alpha = _mm_slli_si64(mm_alpha, 3);
slouken@1895
   680
slouken@1895
   681
        /* Setup the 565 color channel masks */
slouken@1895
   682
        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
slouken@1895
   683
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
slouken@1895
   684
slouken@1895
   685
        while (height--) {
slouken@1895
   686
			/* *INDENT-OFF* */
slouken@3035
   687
			DUFFS_LOOP_124(
slouken@1542
   688
			{
slouken@1542
   689
				s = *srcp++;
slouken@1542
   690
				d = *dstp;
slouken@1542
   691
				/*
slouken@1542
   692
				 * shift out the middle component (green) to
slouken@1542
   693
				 * the high 16 bits, and process all three RGB
slouken@1542
   694
				 * components at the same time.
slouken@1542
   695
				 */
slouken@1542
   696
				s = (s | s << 16) & 0x07e0f81f;
slouken@1542
   697
				d = (d | d << 16) & 0x07e0f81f;
slouken@1542
   698
				d += (s - d) * alpha >> 5;
slouken@1542
   699
				d &= 0x07e0f81f;
slouken@1546
   700
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   701
			},{
slouken@1542
   702
				s = *srcp++;
slouken@1542
   703
				d = *dstp;
slouken@1542
   704
				/*
slouken@1542
   705
				 * shift out the middle component (green) to
slouken@1542
   706
				 * the high 16 bits, and process all three RGB
slouken@1542
   707
				 * components at the same time.
slouken@1542
   708
				 */
slouken@1542
   709
				s = (s | s << 16) & 0x07e0f81f;
slouken@1542
   710
				d = (d | d << 16) & 0x07e0f81f;
slouken@1542
   711
				d += (s - d) * alpha >> 5;
slouken@1542
   712
				d &= 0x07e0f81f;
slouken@1546
   713
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   714
				s = *srcp++;
slouken@1542
   715
				d = *dstp;
slouken@1542
   716
				/*
slouken@1542
   717
				 * shift out the middle component (green) to
slouken@1542
   718
				 * the high 16 bits, and process all three RGB
slouken@1542
   719
				 * components at the same time.
slouken@1542
   720
				 */
slouken@1542
   721
				s = (s | s << 16) & 0x07e0f81f;
slouken@1542
   722
				d = (d | d << 16) & 0x07e0f81f;
slouken@1542
   723
				d += (s - d) * alpha >> 5;
slouken@1542
   724
				d &= 0x07e0f81f;
slouken@1546
   725
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   726
			},{
slouken@1542
   727
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
slouken@1542
   728
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
slouken@1542
   729
slouken@1542
   730
				/* red */
slouken@1542
   731
				src2 = src1;
slouken@1542
   732
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
slouken@1542
   733
slouken@1542
   734
				dst2 = dst1;
slouken@1542
   735
				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
slouken@1542
   736
slouken@1542
   737
				/* blend */
slouken@1542
   738
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   739
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   740
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@1542
   741
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   742
				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
slouken@1542
   743
slouken@1542
   744
				mm_res = dst2; /* RED -> mm_res */
slouken@1542
   745
slouken@1542
   746
				/* green -- process the bits in place */
slouken@1542
   747
				src2 = src1;
slouken@1542
   748
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
slouken@1542
   749
slouken@1542
   750
				dst2 = dst1;
slouken@1542
   751
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
slouken@1542
   752
slouken@1542
   753
				/* blend */
slouken@1542
   754
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   755
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   756
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@1542
   757
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   758
slouken@1542
   759
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
slouken@1542
   760
slouken@1542
   761
				/* blue */
slouken@1542
   762
				src2 = src1;
slouken@1542
   763
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
slouken@1542
   764
slouken@1542
   765
				dst2 = dst1;
slouken@1542
   766
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
slouken@1542
   767
slouken@1542
   768
				/* blend */
slouken@1542
   769
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   770
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   771
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@1542
   772
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   773
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
slouken@1542
   774
slouken@1542
   775
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
slouken@1542
   776
slouken@1542
   777
				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
slouken@1542
   778
slouken@1542
   779
				srcp += 4;
slouken@1542
   780
				dstp += 4;
slouken@1895
   781
			}, width);
slouken@1895
   782
			/* *INDENT-ON* */
slouken@1895
   783
            srcp += srcskip;
slouken@1895
   784
            dstp += dstskip;
slouken@1895
   785
        }
slouken@1895
   786
        _mm_empty();
slouken@1895
   787
    }
slouken@1542
   788
}
slouken@1542
   789
slouken@1542
   790
/* fast RGB555->RGB555 blending with surface alpha */
slouken@1895
   791
static void
slouken@1895
   792
Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   793
{
slouken@2267
   794
    unsigned alpha = info->a;
slouken@1895
   795
    if (alpha == 128) {
slouken@1895
   796
        Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1895
   797
    } else {
slouken@2262
   798
        int width = info->dst_w;
slouken@2262
   799
        int height = info->dst_h;
slouken@2262
   800
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   801
        int srcskip = info->src_skip >> 1;
slouken@2262
   802
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   803
        int dstskip = info->dst_skip >> 1;
slouken@1895
   804
        Uint32 s, d;
slouken@1542
   805
slouken@1895
   806
        __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
slouken@1542
   807
slouken@1895
   808
        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
slouken@1895
   809
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
slouken@1895
   810
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1542
   811
slouken@1895
   812
        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
slouken@1895
   813
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
slouken@1895
   814
        /* position alpha to allow for mullo and mulhi on diff channels
slouken@1895
   815
           to reduce the number of operations */
slouken@1895
   816
        mm_alpha = _mm_slli_si64(mm_alpha, 3);
slouken@1895
   817
slouken@1895
   818
        /* Setup the 555 color channel masks */
slouken@1895
   819
        rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
slouken@1895
   820
        gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
slouken@1895
   821
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
slouken@1895
   822
slouken@1895
   823
        while (height--) {
slouken@1895
   824
			/* *INDENT-OFF* */
slouken@3035
   825
			DUFFS_LOOP_124(
slouken@1542
   826
			{
slouken@1542
   827
				s = *srcp++;
slouken@1542
   828
				d = *dstp;
slouken@1542
   829
				/*
slouken@1542
   830
				 * shift out the middle component (green) to
slouken@1542
   831
				 * the high 16 bits, and process all three RGB
slouken@1542
   832
				 * components at the same time.
slouken@1542
   833
				 */
slouken@1542
   834
				s = (s | s << 16) & 0x03e07c1f;
slouken@1542
   835
				d = (d | d << 16) & 0x03e07c1f;
slouken@1542
   836
				d += (s - d) * alpha >> 5;
slouken@1542
   837
				d &= 0x03e07c1f;
slouken@1546
   838
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   839
			},{
slouken@1542
   840
				s = *srcp++;
slouken@1542
   841
				d = *dstp;
slouken@1542
   842
				/*
slouken@1542
   843
				 * shift out the middle component (green) to
slouken@1542
   844
				 * the high 16 bits, and process all three RGB
slouken@1542
   845
				 * components at the same time.
slouken@1542
   846
				 */
slouken@1542
   847
				s = (s | s << 16) & 0x03e07c1f;
slouken@1542
   848
				d = (d | d << 16) & 0x03e07c1f;
slouken@1542
   849
				d += (s - d) * alpha >> 5;
slouken@1542
   850
				d &= 0x03e07c1f;
slouken@1546
   851
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   852
			        s = *srcp++;
slouken@1542
   853
				d = *dstp;
slouken@1542
   854
				/*
slouken@1542
   855
				 * shift out the middle component (green) to
slouken@1542
   856
				 * the high 16 bits, and process all three RGB
slouken@1542
   857
				 * components at the same time.
slouken@1542
   858
				 */
slouken@1542
   859
				s = (s | s << 16) & 0x03e07c1f;
slouken@1542
   860
				d = (d | d << 16) & 0x03e07c1f;
slouken@1542
   861
				d += (s - d) * alpha >> 5;
slouken@1542
   862
				d &= 0x03e07c1f;
slouken@1546
   863
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
   864
			},{
slouken@1542
   865
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
slouken@1542
   866
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
slouken@1542
   867
slouken@1542
   868
				/* red -- process the bits in place */
slouken@1542
   869
				src2 = src1;
slouken@1542
   870
				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
slouken@1542
   871
slouken@1542
   872
				dst2 = dst1;
slouken@1542
   873
				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
slouken@1542
   874
slouken@1542
   875
				/* blend */
slouken@1542
   876
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   877
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   878
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@1542
   879
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   880
				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
slouken@1542
   881
slouken@1542
   882
				mm_res = dst2; /* RED -> mm_res */
slouken@1542
   883
				
slouken@1542
   884
				/* green -- process the bits in place */
slouken@1542
   885
				src2 = src1;
slouken@1542
   886
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
slouken@1542
   887
slouken@1542
   888
				dst2 = dst1;
slouken@1542
   889
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
slouken@1542
   890
slouken@1542
   891
				/* blend */
slouken@1542
   892
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   893
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   894
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@1542
   895
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   896
slouken@1542
   897
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
slouken@1542
   898
slouken@1542
   899
				/* blue */
slouken@1542
   900
				src2 = src1; /* src -> src2 */
slouken@1542
   901
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
slouken@1542
   902
slouken@1542
   903
				dst2 = dst1; /* dst -> dst2 */
slouken@1542
   904
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
slouken@1542
   905
slouken@1542
   906
				/* blend */
slouken@1542
   907
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
   908
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   909
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@1542
   910
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
   911
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
slouken@1542
   912
slouken@1542
   913
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
slouken@1542
   914
slouken@1542
   915
				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
slouken@1542
   916
slouken@1542
   917
				srcp += 4;
slouken@1542
   918
				dstp += 4;
slouken@1895
   919
			}, width);
slouken@1895
   920
			/* *INDENT-ON* */
slouken@1895
   921
            srcp += srcskip;
slouken@1895
   922
            dstp += dstskip;
slouken@1895
   923
        }
slouken@1895
   924
        _mm_empty();
slouken@1895
   925
    }
slouken@1542
   926
}
slouken@2255
   927
slouken@2255
   928
#endif /* __MMX__ */
slouken@689
   929
slouken@1
   930
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1895
   931
static void
slouken@1895
   932
Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
slouken@1
   933
{
slouken@2267
   934
    unsigned alpha = info->a;
slouken@1895
   935
    if (alpha == 128) {
slouken@1895
   936
        Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1895
   937
    } else {
slouken@2262
   938
        int width = info->dst_w;
slouken@2262
   939
        int height = info->dst_h;
slouken@2262
   940
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   941
        int srcskip = info->src_skip >> 1;
slouken@2262
   942
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   943
        int dstskip = info->dst_skip >> 1;
slouken@1895
   944
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1
   945
slouken@1895
   946
        while (height--) {
slouken@1895
   947
			/* *INDENT-OFF* */
slouken@1
   948
			DUFFS_LOOP4({
slouken@1
   949
				Uint32 s = *srcp++;
slouken@1
   950
				Uint32 d = *dstp;
slouken@1
   951
				/*
slouken@1
   952
				 * shift out the middle component (green) to
slouken@1
   953
				 * the high 16 bits, and process all three RGB
slouken@1
   954
				 * components at the same time.
slouken@1
   955
				 */
slouken@1
   956
				s = (s | s << 16) & 0x07e0f81f;
slouken@1
   957
				d = (d | d << 16) & 0x07e0f81f;
slouken@1
   958
				d += (s - d) * alpha >> 5;
slouken@1
   959
				d &= 0x07e0f81f;
slouken@1428
   960
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1
   961
			}, width);
slouken@1895
   962
			/* *INDENT-ON* */
slouken@1895
   963
            srcp += srcskip;
slouken@1895
   964
            dstp += dstskip;
slouken@1895
   965
        }
slouken@1895
   966
    }
slouken@0
   967
}
slouken@0
   968
slouken@0
   969
/* fast RGB555->RGB555 blending with surface alpha */
slouken@1895
   970
static void
slouken@1895
   971
Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
slouken@0
   972
{
slouken@2267
   973
    unsigned alpha = info->a;   /* downscale alpha to 5 bits */
slouken@1895
   974
    if (alpha == 128) {
slouken@1895
   975
        Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1895
   976
    } else {
slouken@2262
   977
        int width = info->dst_w;
slouken@2262
   978
        int height = info->dst_h;
slouken@2262
   979
        Uint16 *srcp = (Uint16 *) info->src;
slouken@2267
   980
        int srcskip = info->src_skip >> 1;
slouken@2262
   981
        Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
   982
        int dstskip = info->dst_skip >> 1;
slouken@1895
   983
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@0
   984
slouken@1895
   985
        while (height--) {
slouken@1895
   986
			/* *INDENT-OFF* */
slouken@1
   987
			DUFFS_LOOP4({
slouken@1
   988
				Uint32 s = *srcp++;
slouken@1
   989
				Uint32 d = *dstp;
slouken@1
   990
				/*
slouken@1
   991
				 * shift out the middle component (green) to
slouken@1
   992
				 * the high 16 bits, and process all three RGB
slouken@1
   993
				 * components at the same time.
slouken@1
   994
				 */
slouken@1
   995
				s = (s | s << 16) & 0x03e07c1f;
slouken@1
   996
				d = (d | d << 16) & 0x03e07c1f;
slouken@1
   997
				d += (s - d) * alpha >> 5;
slouken@1
   998
				d &= 0x03e07c1f;
slouken@1428
   999
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1
  1000
			}, width);
slouken@1895
  1001
			/* *INDENT-ON* */
slouken@1895
  1002
            srcp += srcskip;
slouken@1895
  1003
            dstp += dstskip;
slouken@1895
  1004
        }
slouken@1895
  1005
    }
slouken@0
  1006
}
slouken@0
  1007
slouken@0
  1008
/* fast ARGB8888->RGB565 blending with pixel alpha */
slouken@1895
  1009
static void
slouken@1895
  1010
BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
slouken@0
  1011
{
slouken@2262
  1012
    int width = info->dst_w;
slouken@2262
  1013
    int height = info->dst_h;
slouken@2262
  1014
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
  1015
    int srcskip = info->src_skip >> 2;
slouken@2262
  1016
    Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
  1017
    int dstskip = info->dst_skip >> 1;
slouken@0
  1018
slouken@1895
  1019
    while (height--) {
slouken@1895
  1020
	    /* *INDENT-OFF* */
slouken@0
  1021
	    DUFFS_LOOP4({
slouken@0
  1022
		Uint32 s = *srcp;
slouken@0
  1023
		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  1024
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  1025
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  1026
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  1027
		   Benchmark this! */
slouken@689
  1028
		if(alpha) {   
slouken@689
  1029
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@1428
  1030
		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
slouken@689
  1031
		  } else {
slouken@0
  1032
		    Uint32 d = *dstp;
slouken@0
  1033
		    /*
slouken@0
  1034
		     * convert source and destination to G0RAB65565
slouken@0
  1035
		     * and blend all components at the same time
slouken@0
  1036
		     */
slouken@0
  1037
		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
slouken@0
  1038
		      + (s >> 3 & 0x1f);
slouken@0
  1039
		    d = (d | d << 16) & 0x07e0f81f;
slouken@0
  1040
		    d += (s - d) * alpha >> 5;
slouken@0
  1041
		    d &= 0x07e0f81f;
slouken@1428
  1042
		    *dstp = (Uint16)(d | d >> 16);
slouken@689
  1043
		  }
slouken@0
  1044
		}
slouken@0
  1045
		srcp++;
slouken@0
  1046
		dstp++;
slouken@0
  1047
	    }, width);
slouken@1895
  1048
	    /* *INDENT-ON* */
slouken@1895
  1049
        srcp += srcskip;
slouken@1895
  1050
        dstp += dstskip;
slouken@1895
  1051
    }
slouken@0
  1052
}
slouken@0
  1053
slouken@0
  1054
/* fast ARGB8888->RGB555 blending with pixel alpha */
slouken@1895
  1055
static void
slouken@1895
  1056
BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
slouken@0
  1057
{
slouken@2262
  1058
    int width = info->dst_w;
slouken@2262
  1059
    int height = info->dst_h;
slouken@2262
  1060
    Uint32 *srcp = (Uint32 *) info->src;
slouken@2267
  1061
    int srcskip = info->src_skip >> 2;
slouken@2262
  1062
    Uint16 *dstp = (Uint16 *) info->dst;
slouken@2267
  1063
    int dstskip = info->dst_skip >> 1;
slouken@0
  1064
slouken@1895
  1065
    while (height--) {
slouken@1895
  1066
	    /* *INDENT-OFF* */
slouken@0
  1067
	    DUFFS_LOOP4({
slouken@0
  1068
		unsigned alpha;
slouken@0
  1069
		Uint32 s = *srcp;
slouken@0
  1070
		alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  1071
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  1072
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  1073
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  1074
		   Benchmark this! */
slouken@689
  1075
		if(alpha) {   
slouken@689
  1076
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@1428
  1077
		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
slouken@689
  1078
		  } else {
slouken@0
  1079
		    Uint32 d = *dstp;
slouken@0
  1080
		    /*
slouken@0
  1081
		     * convert source and destination to G0RAB65565
slouken@0
  1082
		     * and blend all components at the same time
slouken@0
  1083
		     */
slouken@0
  1084
		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
slouken@0
  1085
		      + (s >> 3 & 0x1f);
slouken@0
  1086
		    d = (d | d << 16) & 0x03e07c1f;
slouken@0
  1087
		    d += (s - d) * alpha >> 5;
slouken@0
  1088
		    d &= 0x03e07c1f;
slouken@1428
  1089
		    *dstp = (Uint16)(d | d >> 16);
slouken@689
  1090
		  }
slouken@0
  1091
		}
slouken@0
  1092
		srcp++;
slouken@0
  1093
		dstp++;
slouken@0
  1094
	    }, width);
slouken@1895
  1095
	    /* *INDENT-ON* */
slouken@1895
  1096
        srcp += srcskip;
slouken@1895
  1097
        dstp += dstskip;
slouken@1895
  1098
    }
slouken@0
  1099
}
slouken@0
  1100
slouken@0
  1101
/* General (slow) N->N blending with per-surface alpha */
slouken@1895
  1102
static void
slouken@1895
  1103
BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
slouken@0
  1104
{
slouken@2262
  1105
    int width = info->dst_w;
slouken@2262
  1106
    int height = info->dst_h;
slouken@2262
  1107
    Uint8 *src = info->src;
slouken@2267
  1108
    int srcskip = info->src_skip;
slouken@2262
  1109
    Uint8 *dst = info->dst;
slouken@2267
  1110
    int dstskip = info->dst_skip;
slouken@2267
  1111
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
  1112
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@1895
  1113
    int srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  1114
    int dstbpp = dstfmt->BytesPerPixel;
slouken@2267
  1115
    unsigned sA = info->a;
slouken@1895
  1116
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
slouken@0
  1117
slouken@1895
  1118
    if (sA) {
slouken@1895
  1119
        while (height--) {
slouken@1895
  1120
	    /* *INDENT-OFF* */
slouken@0
  1121
	    DUFFS_LOOP4(
slouken@0
  1122
	    {
icculus@1162
  1123
		Uint32 Pixel;
slouken@0
  1124
		unsigned sR;
slouken@0
  1125
		unsigned sG;
slouken@0
  1126
		unsigned sB;
slouken@0
  1127
		unsigned dR;
slouken@0
  1128
		unsigned dG;
slouken@0
  1129
		unsigned dB;
icculus@1162
  1130
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
icculus@1162
  1131
		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
slouken@0
  1132
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
  1133
		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  1134
		src += srcbpp;
slouken@0
  1135
		dst += dstbpp;
slouken@0
  1136
	    },
slouken@0
  1137
	    width);
slouken@1895
  1138
	    /* *INDENT-ON* */
slouken@1895
  1139
            src += srcskip;
slouken@1895
  1140
            dst += dstskip;
slouken@1895
  1141
        }
slouken@1895
  1142
    }
slouken@0
  1143
}
slouken@0
  1144
slouken@0
  1145
/* General (slow) colorkeyed N->N blending with per-surface alpha */
slouken@1895
  1146
static void
slouken@1895
  1147
BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
slouken@0
  1148
{
slouken@2262
  1149
    int width = info->dst_w;
slouken@2262
  1150
    int height = info->dst_h;
slouken@2262
  1151
    Uint8 *src = info->src;
slouken@2267
  1152
    int srcskip = info->src_skip;
slouken@2262
  1153
    Uint8 *dst = info->dst;
slouken@2267
  1154
    int dstskip = info->dst_skip;
slouken@2267
  1155
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
  1156
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@2267
  1157
    Uint32 ckey = info->colorkey;
slouken@1895
  1158
    int srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  1159
    int dstbpp = dstfmt->BytesPerPixel;
slouken@2267
  1160
    unsigned sA = info->a;
slouken@1895
  1161
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
slouken@0
  1162
slouken@1895
  1163
    while (height--) {
slouken@1895
  1164
	    /* *INDENT-OFF* */
slouken@0
  1165
	    DUFFS_LOOP4(
slouken@0
  1166
	    {
icculus@1162
  1167
		Uint32 Pixel;
slouken@0
  1168
		unsigned sR;
slouken@0
  1169
		unsigned sG;
slouken@0
  1170
		unsigned sB;
slouken@0
  1171
		unsigned dR;
slouken@0
  1172
		unsigned dG;
slouken@0
  1173
		unsigned dB;
icculus@1162
  1174
		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
icculus@1162
  1175
		if(sA && Pixel != ckey) {
icculus@1162
  1176
		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
icculus@1162
  1177
		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
slouken@0
  1178
		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
  1179
		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  1180
		}
slouken@0
  1181
		src += srcbpp;
slouken@0
  1182
		dst += dstbpp;
slouken@0
  1183
	    },
slouken@0
  1184
	    width);
slouken@1895
  1185
	    /* *INDENT-ON* */
slouken@1895
  1186
        src += srcskip;
slouken@1895
  1187
        dst += dstskip;
slouken@1895
  1188
    }
slouken@0
  1189
}
slouken@0
  1190
slouken@0
  1191
/* General (slow) N->N blending with pixel alpha */
slouken@1895
  1192
static void
slouken@1895
  1193
BlitNtoNPixelAlpha(SDL_BlitInfo * info)
slouken@0
  1194
{
slouken@2262
  1195
    int width = info->dst_w;
slouken@2262
  1196
    int height = info->dst_h;
slouken@2262
  1197
    Uint8 *src = info->src;
slouken@2267
  1198
    int srcskip = info->src_skip;
slouken@2262
  1199
    Uint8 *dst = info->dst;
slouken@2267
  1200
    int dstskip = info->dst_skip;
slouken@2267
  1201
    SDL_PixelFormat *srcfmt = info->src_fmt;
slouken@2267
  1202
    SDL_PixelFormat *dstfmt = info->dst_fmt;
slouken@0
  1203
slouken@1895
  1204
    int srcbpp;
slouken@1895
  1205
    int dstbpp;
slouken@0
  1206
slouken@1895
  1207
    /* Set up some basic variables */
slouken@1895
  1208
    srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  1209
    dstbpp = dstfmt->BytesPerPixel;
slouken@0
  1210
slouken@1895
  1211
    /* FIXME: for 8bpp source alpha, this doesn't get opaque values
slouken@1895
  1212
       quite right. for <8bpp source alpha, it gets them very wrong
slouken@1895
  1213
       (check all macros!)
slouken@1895
  1214
       It is unclear whether there is a good general solution that doesn't
slouken@1895
  1215
       need a branch (or a divide). */
slouken@1895
  1216
    while (height--) {
slouken@1895
  1217
	    /* *INDENT-OFF* */
slouken@0
  1218
	    DUFFS_LOOP4(
slouken@0
  1219
	    {
icculus@1162
  1220
		Uint32 Pixel;
slouken@0
  1221
		unsigned sR;
slouken@0
  1222
		unsigned sG;
slouken@0
  1223
		unsigned sB;
slouken@0
  1224
		unsigned dR;
slouken@0
  1225
		unsigned dG;
slouken@0
  1226
		unsigned dB;
slouken@0
  1227
		unsigned sA;
slouken@0
  1228
		unsigned dA;
icculus@1162
  1229
		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
slouken@689
  1230
		if(sA) {
icculus@1162
  1231
		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
slouken@689
  1232
		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@689
  1233
		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@689
  1234
		}
slouken@0
  1235
		src += srcbpp;
slouken@0
  1236
		dst += dstbpp;
slouken@0
  1237
	    },
slouken@0
  1238
	    width);
slouken@1895
  1239
	    /* *INDENT-ON* */
slouken@1895
  1240
        src += srcskip;
slouken@1895
  1241
        dst += dstskip;
slouken@1895
  1242
    }
slouken@0
  1243
}
slouken@0
  1244
slouken@0
  1245
slouken@2267
  1246
SDL_BlitFunc
slouken@2267
  1247
SDL_CalculateBlitA(SDL_Surface * surface)
slouken@0
  1248
{
slouken@0
  1249
    SDL_PixelFormat *sf = surface->format;
slouken@0
  1250
    SDL_PixelFormat *df = surface->map->dst->format;
slouken@0
  1251
slouken@2853
  1252
    switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
slouken@2267
  1253
    case SDL_COPY_BLEND:
slouken@1895
  1254
        /* Per-pixel alpha blits */
slouken@1895
  1255
        switch (df->BytesPerPixel) {
slouken@1895
  1256
        case 1:
slouken@1895
  1257
            return BlitNto1PixelAlpha;
slouken@0
  1258
slouken@1895
  1259
        case 2:
slouken@5259
  1260
            if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
slouken@5259
  1261
                && sf->Gmask == 0xff00
slouken@5259
  1262
                && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
slouken@5259
  1263
                    || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
slouken@1895
  1264
                if (df->Gmask == 0x7e0)
slouken@1895
  1265
                    return BlitARGBto565PixelAlpha;
slouken@1895
  1266
                else if (df->Gmask == 0x3e0)
slouken@1895
  1267
                    return BlitARGBto555PixelAlpha;
slouken@1895
  1268
            }
slouken@1895
  1269
            return BlitNtoNPixelAlpha;
slouken@0
  1270
slouken@1895
  1271
        case 4:
slouken@1895
  1272
            if (sf->Rmask == df->Rmask
slouken@1895
  1273
                && sf->Gmask == df->Gmask
slouken@1895
  1274
                && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
slouken@5259
  1275
#if defined(__MMX__)
slouken@1895
  1276
                if (sf->Rshift % 8 == 0
slouken@1895
  1277
                    && sf->Gshift % 8 == 0
slouken@1895
  1278
                    && sf->Bshift % 8 == 0
slouken@1895
  1279
                    && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
slouken@1895
  1280
                    if (SDL_HasMMX())
slouken@1895
  1281
                        return BlitRGBtoRGBPixelAlphaMMX;
slouken@1895
  1282
                }
slouken@5259
  1283
#endif /* __MMX__ */
slouken@1895
  1284
                if (sf->Amask == 0xff000000) {
slouken@1895
  1285
                    return BlitRGBtoRGBPixelAlpha;
slouken@1895
  1286
                }
slouken@1895
  1287
            }
slouken@5259
  1288
            return BlitNtoNPixelAlpha;
slouken@0
  1289
slouken@1895
  1290
        case 3:
slouken@1895
  1291
        default:
slouken@1895
  1292
            return BlitNtoNPixelAlpha;
slouken@1895
  1293
        }
slouken@2267
  1294
        break;
slouken@2267
  1295
slouken@2267
  1296
    case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
slouken@2267
  1297
        if (sf->Amask == 0) {
slouken@2267
  1298
            /* Per-surface alpha blits */
slouken@2267
  1299
            switch (df->BytesPerPixel) {
slouken@2267
  1300
            case 1:
slouken@2267
  1301
                return BlitNto1SurfaceAlpha;
slouken@2267
  1302
slouken@2267
  1303
            case 2:
slouken@2267
  1304
                if (surface->map->identity) {
slouken@2267
  1305
                    if (df->Gmask == 0x7e0) {
slouken@2267
  1306
#ifdef __MMX__
slouken@2267
  1307
                        if (SDL_HasMMX())
slouken@2267
  1308
                            return Blit565to565SurfaceAlphaMMX;
slouken@2267
  1309
                        else
slouken@2267
  1310
#endif
slouken@2267
  1311
                            return Blit565to565SurfaceAlpha;
slouken@2267
  1312
                    } else if (df->Gmask == 0x3e0) {
slouken@2267
  1313
#ifdef __MMX__
slouken@2267
  1314
                        if (SDL_HasMMX())
slouken@2267
  1315
                            return Blit555to555SurfaceAlphaMMX;
slouken@2267
  1316
                        else
slouken@2267
  1317
#endif
slouken@2267
  1318
                            return Blit555to555SurfaceAlpha;
slouken@2267
  1319
                    }
slouken@2267
  1320
                }
slouken@2267
  1321
                return BlitNtoNSurfaceAlpha;
slouken@2267
  1322
slouken@2267
  1323
            case 4:
slouken@2267
  1324
                if (sf->Rmask == df->Rmask
slouken@2267
  1325
                    && sf->Gmask == df->Gmask
slouken@2267
  1326
                    && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
slouken@2267
  1327
#ifdef __MMX__
slouken@2267
  1328
                    if (sf->Rshift % 8 == 0
slouken@2267
  1329
                        && sf->Gshift % 8 == 0
slouken@2267
  1330
                        && sf->Bshift % 8 == 0 && SDL_HasMMX())
slouken@2267
  1331
                        return BlitRGBtoRGBSurfaceAlphaMMX;
slouken@2267
  1332
#endif
slouken@2267
  1333
                    if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
slouken@2267
  1334
                        return BlitRGBtoRGBSurfaceAlpha;
slouken@2267
  1335
                    }
slouken@2267
  1336
                }
slouken@5259
  1337
                return BlitNtoNSurfaceAlpha;
slouken@2267
  1338
slouken@2267
  1339
            case 3:
slouken@2267
  1340
            default:
slouken@2267
  1341
                return BlitNtoNSurfaceAlpha;
slouken@2267
  1342
            }
slouken@2267
  1343
        }
slouken@2267
  1344
        break;
slouken@2267
  1345
slouken@2267
  1346
    case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
slouken@2267
  1347
        if (sf->Amask == 0) {
slouken@2267
  1348
            if (df->BytesPerPixel == 1)
slouken@2267
  1349
                return BlitNto1SurfaceAlphaKey;
slouken@2267
  1350
            else
slouken@2267
  1351
                return BlitNtoNSurfaceAlphaKey;
slouken@2267
  1352
        }
slouken@2267
  1353
        break;
slouken@0
  1354
    }
slouken@2267
  1355
slouken@2267
  1356
    return NULL;
slouken@0
  1357
}
slouken@0
  1358
slouken@1895
  1359
/* vi: set ts=4 sw=4 expandtab: */