src/video/SDL_blit_A.c
author Ryan C. Gordon <icculus@icculus.org>
Thu, 21 Jun 2007 18:21:49 +0000
changeset 2132 46648dc418ec
parent 2120 2c835d58faad
child 2141 e1a70460c354
permissions -rw-r--r--
Merged r3094:3095 from branches/SDL-1.2: Visual C++ 6.0 fixes.
slouken@0
     1
/*
slouken@0
     2
    SDL - Simple DirectMedia Layer
slouken@1312
     3
    Copyright (C) 1997-2006 Sam Lantinga
slouken@0
     4
slouken@0
     5
    This library is free software; you can redistribute it and/or
slouken@1312
     6
    modify it under the terms of the GNU Lesser General Public
slouken@0
     7
    License as published by the Free Software Foundation; either
slouken@1312
     8
    version 2.1 of the License, or (at your option) any later version.
slouken@0
     9
slouken@0
    10
    This library is distributed in the hope that it will be useful,
slouken@0
    11
    but WITHOUT ANY WARRANTY; without even the implied warranty of
slouken@0
    12
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
slouken@1312
    13
    Lesser General Public License for more details.
slouken@0
    14
slouken@1312
    15
    You should have received a copy of the GNU Lesser General Public
slouken@1312
    16
    License along with this library; if not, write to the Free Software
slouken@1312
    17
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
slouken@0
    18
slouken@0
    19
    Sam Lantinga
slouken@252
    20
    slouken@libsdl.org
slouken@0
    21
*/
slouken@1402
    22
#include "SDL_config.h"
slouken@0
    23
slouken@0
    24
#include "SDL_video.h"
slouken@0
    25
#include "SDL_blit.h"
slouken@0
    26
icculus@2132
    27
/*
icculus@2132
    28
  In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
icculus@2132
    29
   Checking if _mm_free is #defined in malloc.h is the only way to
icculus@2132
    30
   determine if the Processor Pack is installed, as far as I can tell.
icculus@2132
    31
*/
icculus@2132
    32
slouken@1542
    33
#if SDL_ASSEMBLY_ROUTINES
icculus@2132
    34
#  if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
icculus@2132
    35
#    define MMX_ASMBLIT 1
icculus@2132
    36
#    define GCC_ASMBLIT 1
icculus@2132
    37
#  elif defined(_MSC_VER) && defined(_M_IX86)
icculus@2132
    38
#    if (_MSC_VER <= 1200)  
icculus@2132
    39
#      include <malloc.h>   
icculus@2132
    40
#      if defined(_mm_free)
icculus@2132
    41
#          define HAVE_MMINTRIN_H 1
icculus@2132
    42
#      endif
icculus@2132
    43
#    else  /* Visual Studio > VC6 always has mmintrin.h */
icculus@2132
    44
#      define HAVE_MMINTRIN_H 1
icculus@2132
    45
#    endif
icculus@2132
    46
#    if HAVE_MMINTRIN_H
icculus@2132
    47
#      define MMX_ASMBLIT 1
icculus@2132
    48
#      define MSVC_ASMBLIT 1
icculus@2132
    49
#    endif
icculus@2132
    50
#  endif
slouken@1542
    51
#endif /* SDL_ASSEMBLY_ROUTINES */
slouken@880
    52
slouken@739
    53
/* Function to check the CPU flags */
slouken@739
    54
#include "SDL_cpuinfo.h"
slouken@1542
    55
#if GCC_ASMBLIT
slouken@689
    56
#include "mmx.h"
slouken@1542
    57
#elif MSVC_ASMBLIT
slouken@1542
    58
#include <mmintrin.h>
slouken@1542
    59
#include <mm3dnow.h>
slouken@689
    60
#endif
slouken@689
    61
slouken@0
    62
/* Functions to perform alpha blended blitting */
slouken@0
    63
slouken@0
    64
/* N->1 blending with per-surface alpha */
slouken@1895
    65
static void
slouken@1895
    66
BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
slouken@0
    67
{
slouken@1895
    68
    int width = info->d_width;
slouken@1895
    69
    int height = info->d_height;
slouken@1895
    70
    Uint8 *src = info->s_pixels;
slouken@1895
    71
    int srcskip = info->s_skip;
slouken@1895
    72
    Uint8 *dst = info->d_pixels;
slouken@1895
    73
    int dstskip = info->d_skip;
slouken@1895
    74
    Uint8 *palmap = info->table;
slouken@1895
    75
    SDL_PixelFormat *srcfmt = info->src;
slouken@1895
    76
    SDL_PixelFormat *dstfmt = info->dst;
slouken@1895
    77
    int srcbpp = srcfmt->BytesPerPixel;
slouken@0
    78
slouken@1895
    79
    const unsigned A = srcfmt->alpha;
slouken@0
    80
slouken@1895
    81
    while (height--) {
slouken@1895
    82
	    /* *INDENT-OFF* */
slouken@0
    83
	    DUFFS_LOOP4(
slouken@0
    84
	    {
icculus@1162
    85
		Uint32 Pixel;
slouken@0
    86
		unsigned sR;
slouken@0
    87
		unsigned sG;
slouken@0
    88
		unsigned sB;
slouken@0
    89
		unsigned dR;
slouken@0
    90
		unsigned dG;
slouken@0
    91
		unsigned dB;
icculus@1162
    92
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
slouken@0
    93
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
    94
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
    95
		dB = dstfmt->palette->colors[*dst].b;
slouken@0
    96
		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
slouken@0
    97
		dR &= 0xff;
slouken@0
    98
		dG &= 0xff;
slouken@0
    99
		dB &= 0xff;
slouken@0
   100
		/* Pack RGB into 8bit pixel */
slouken@0
   101
		if ( palmap == NULL ) {
slouken@0
   102
		    *dst =((dR>>5)<<(3+2))|
slouken@0
   103
			  ((dG>>5)<<(2))|
slouken@0
   104
			  ((dB>>6)<<(0));
slouken@0
   105
		} else {
slouken@0
   106
		    *dst = palmap[((dR>>5)<<(3+2))|
slouken@0
   107
				  ((dG>>5)<<(2))  |
slouken@0
   108
				  ((dB>>6)<<(0))];
slouken@0
   109
		}
slouken@0
   110
		dst++;
slouken@0
   111
		src += srcbpp;
slouken@0
   112
	    },
slouken@0
   113
	    width);
slouken@1895
   114
	    /* *INDENT-ON* */
slouken@1895
   115
        src += srcskip;
slouken@1895
   116
        dst += dstskip;
slouken@1895
   117
    }
slouken@0
   118
}
slouken@0
   119
slouken@0
   120
/* N->1 blending with pixel alpha */
slouken@1895
   121
static void
slouken@1895
   122
BlitNto1PixelAlpha(SDL_BlitInfo * info)
slouken@0
   123
{
slouken@1895
   124
    int width = info->d_width;
slouken@1895
   125
    int height = info->d_height;
slouken@1895
   126
    Uint8 *src = info->s_pixels;
slouken@1895
   127
    int srcskip = info->s_skip;
slouken@1895
   128
    Uint8 *dst = info->d_pixels;
slouken@1895
   129
    int dstskip = info->d_skip;
slouken@1895
   130
    Uint8 *palmap = info->table;
slouken@1895
   131
    SDL_PixelFormat *srcfmt = info->src;
slouken@1895
   132
    SDL_PixelFormat *dstfmt = info->dst;
slouken@1895
   133
    int srcbpp = srcfmt->BytesPerPixel;
slouken@0
   134
slouken@1895
   135
    /* FIXME: fix alpha bit field expansion here too? */
slouken@1895
   136
    while (height--) {
slouken@1895
   137
	    /* *INDENT-OFF* */
slouken@0
   138
	    DUFFS_LOOP4(
slouken@0
   139
	    {
icculus@1162
   140
		Uint32 Pixel;
slouken@0
   141
		unsigned sR;
slouken@0
   142
		unsigned sG;
slouken@0
   143
		unsigned sB;
slouken@0
   144
		unsigned sA;
slouken@0
   145
		unsigned dR;
slouken@0
   146
		unsigned dG;
slouken@0
   147
		unsigned dB;
icculus@1162
   148
		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
slouken@0
   149
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
   150
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
   151
		dB = dstfmt->palette->colors[*dst].b;
slouken@0
   152
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
   153
		dR &= 0xff;
slouken@0
   154
		dG &= 0xff;
slouken@0
   155
		dB &= 0xff;
slouken@0
   156
		/* Pack RGB into 8bit pixel */
slouken@0
   157
		if ( palmap == NULL ) {
slouken@0
   158
		    *dst =((dR>>5)<<(3+2))|
slouken@0
   159
			  ((dG>>5)<<(2))|
slouken@0
   160
			  ((dB>>6)<<(0));
slouken@0
   161
		} else {
slouken@0
   162
		    *dst = palmap[((dR>>5)<<(3+2))|
slouken@0
   163
				  ((dG>>5)<<(2))  |
slouken@0
   164
				  ((dB>>6)<<(0))  ];
slouken@0
   165
		}
slouken@0
   166
		dst++;
slouken@0
   167
		src += srcbpp;
slouken@0
   168
	    },
slouken@0
   169
	    width);
slouken@1895
   170
	    /* *INDENT-ON* */
slouken@1895
   171
        src += srcskip;
slouken@1895
   172
        dst += dstskip;
slouken@1895
   173
    }
slouken@0
   174
}
slouken@0
   175
slouken@0
   176
/* colorkeyed N->1 blending with per-surface alpha */
slouken@1895
   177
static void
slouken@1895
   178
BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
slouken@0
   179
{
slouken@1895
   180
    int width = info->d_width;
slouken@1895
   181
    int height = info->d_height;
slouken@1895
   182
    Uint8 *src = info->s_pixels;
slouken@1895
   183
    int srcskip = info->s_skip;
slouken@1895
   184
    Uint8 *dst = info->d_pixels;
slouken@1895
   185
    int dstskip = info->d_skip;
slouken@1895
   186
    Uint8 *palmap = info->table;
slouken@1895
   187
    SDL_PixelFormat *srcfmt = info->src;
slouken@1895
   188
    SDL_PixelFormat *dstfmt = info->dst;
slouken@1895
   189
    int srcbpp = srcfmt->BytesPerPixel;
slouken@1895
   190
    Uint32 ckey = srcfmt->colorkey;
slouken@0
   191
slouken@1895
   192
    const int A = srcfmt->alpha;
slouken@0
   193
slouken@1895
   194
    while (height--) {
slouken@1895
   195
	    /* *INDENT-OFF* */
slouken@0
   196
	    DUFFS_LOOP(
slouken@0
   197
	    {
icculus@1162
   198
		Uint32 Pixel;
slouken@0
   199
		unsigned sR;
slouken@0
   200
		unsigned sG;
slouken@0
   201
		unsigned sB;
slouken@0
   202
		unsigned dR;
slouken@0
   203
		unsigned dG;
slouken@0
   204
		unsigned dB;
icculus@1162
   205
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
icculus@1162
   206
		if ( Pixel != ckey ) {
slouken@0
   207
		    dR = dstfmt->palette->colors[*dst].r;
slouken@0
   208
		    dG = dstfmt->palette->colors[*dst].g;
slouken@0
   209
		    dB = dstfmt->palette->colors[*dst].b;
slouken@0
   210
		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
slouken@0
   211
		    dR &= 0xff;
slouken@0
   212
		    dG &= 0xff;
slouken@0
   213
		    dB &= 0xff;
slouken@0
   214
		    /* Pack RGB into 8bit pixel */
slouken@0
   215
		    if ( palmap == NULL ) {
slouken@0
   216
			*dst =((dR>>5)<<(3+2))|
slouken@0
   217
			      ((dG>>5)<<(2)) |
slouken@0
   218
			      ((dB>>6)<<(0));
slouken@0
   219
		    } else {
slouken@0
   220
			*dst = palmap[((dR>>5)<<(3+2))|
slouken@0
   221
				      ((dG>>5)<<(2))  |
slouken@0
   222
				      ((dB>>6)<<(0))  ];
slouken@0
   223
		    }
slouken@0
   224
		}
slouken@0
   225
		dst++;
slouken@0
   226
		src += srcbpp;
slouken@0
   227
	    },
slouken@0
   228
	    width);
slouken@1895
   229
	    /* *INDENT-ON* */
slouken@1895
   230
        src += srcskip;
slouken@1895
   231
        dst += dstskip;
slouken@1895
   232
    }
slouken@0
   233
}
slouken@0
   234
slouken@1542
   235
#if GCC_ASMBLIT
slouken@689
   236
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1895
   237
static void
slouken@1895
   238
BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
slouken@689
   239
{
slouken@1895
   240
    int width = info->d_width;
slouken@1895
   241
    int height = info->d_height;
slouken@1895
   242
    Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
   243
    int srcskip = info->s_skip >> 2;
slouken@1895
   244
    Uint32 *dstp = (Uint32 *) info->d_pixels;
slouken@1895
   245
    int dstskip = info->d_skip >> 2;
slouken@1895
   246
    Uint32 dalpha = info->dst->Amask;
slouken@1895
   247
    Uint8 load[8];
slouken@1542
   248
slouken@1895
   249
    *(Uint64 *) load = 0x00fefefe00fefefeULL;   /* alpha128 mask */
slouken@1895
   250
    movq_m2r(*load, mm4);       /* alpha128 mask -> mm4 */
slouken@1895
   251
    *(Uint64 *) load = 0x0001010100010101ULL;   /* !alpha128 mask */
slouken@1895
   252
    movq_m2r(*load, mm3);       /* !alpha128 mask -> mm3 */
slouken@1895
   253
    movd_m2r(dalpha, mm7);      /* dst alpha mask */
slouken@1895
   254
    punpckldq_r2r(mm7, mm7);    /* dst alpha mask | dst alpha mask -> mm7 */
slouken@1895
   255
    while (height--) {
slouken@1895
   256
		/* *INDENT-OFF* */
slouken@1542
   257
		DUFFS_LOOP_DOUBLE2(
slouken@1542
   258
		{
slouken@1542
   259
			Uint32 s = *srcp++;
slouken@1542
   260
			Uint32 d = *dstp;
slouken@1542
   261
			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1542
   262
				   + (s & d & 0x00010101)) | dalpha;
slouken@1542
   263
		},{
slouken@1542
   264
			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
slouken@1542
   265
			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
slouken@1542
   266
slouken@1542
   267
			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
slouken@1542
   268
			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
slouken@1542
   269
slouken@1542
   270
			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
slouken@1542
   271
			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
slouken@1542
   272
			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
slouken@1542
   273
			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
slouken@1542
   274
			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
slouken@1542
   275
			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
slouken@1542
   276
			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
slouken@1542
   277
			
slouken@1542
   278
			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
slouken@1542
   279
			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
slouken@1542
   280
			dstp += 2;
slouken@1542
   281
			srcp += 2;
slouken@1542
   282
		}, width);
slouken@1895
   283
		/* *INDENT-ON* */
slouken@1895
   284
        srcp += srcskip;
slouken@1895
   285
        dstp += dstskip;
slouken@1895
   286
    }
slouken@1895
   287
    emms();
slouken@689
   288
}
slouken@689
   289
slouken@689
   290
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1895
   291
static void
slouken@1895
   292
BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@689
   293
{
slouken@1895
   294
    SDL_PixelFormat *df = info->dst;
slouken@1895
   295
    unsigned alpha = info->src->alpha;
slouken@1895
   296
slouken@1895
   297
    if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
slouken@1895
   298
        /* only call a128 version when R,G,B occupy lower bits */
slouken@1895
   299
        BlitRGBtoRGBSurfaceAlpha128MMX(info);
slouken@1895
   300
    } else {
slouken@1895
   301
        int width = info->d_width;
slouken@1895
   302
        int height = info->d_height;
slouken@1895
   303
        Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
   304
        int srcskip = info->s_skip >> 2;
slouken@1895
   305
        Uint32 *dstp = (Uint32 *) info->d_pixels;
slouken@1895
   306
        int dstskip = info->d_skip >> 2;
slouken@1542
   307
slouken@1895
   308
        pxor_r2r(mm5, mm5);     /* 0 -> mm5 */
slouken@1895
   309
        /* form the alpha mult */
slouken@1895
   310
        movd_m2r(alpha, mm4);   /* 0000000A -> mm4 */
slouken@1895
   311
        punpcklwd_r2r(mm4, mm4);        /* 00000A0A -> mm4 */
slouken@1895
   312
        punpckldq_r2r(mm4, mm4);        /* 0A0A0A0A -> mm4 */
slouken@1895
   313
        alpha =
slouken@1895
   314
            (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->
slouken@1895
   315
                                                           Bshift);
slouken@1895
   316
        movd_m2r(alpha, mm0);   /* 00000FFF -> mm0 */
slouken@1895
   317
        punpcklbw_r2r(mm0, mm0);        /* 00FFFFFF -> mm0 */
slouken@1895
   318
        pand_r2r(mm0, mm4);     /* 0A0A0A0A -> mm4, minus 1 chan */
slouken@1895
   319
        /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
slouken@1895
   320
        movd_m2r(df->Amask, mm7);       /* dst alpha mask */
slouken@1895
   321
        punpckldq_r2r(mm7, mm7);        /* dst alpha mask | dst alpha mask -> mm7 */
slouken@1542
   322
slouken@1895
   323
        while (height--) {
slouken@1895
   324
			/* *INDENT-OFF* */
slouken@689
   325
			DUFFS_LOOP_DOUBLE2({
slouken@689
   326
				/* One Pixel Blend */
slouken@1542
   327
				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
slouken@1542
   328
				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
slouken@1542
   329
				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
slouken@1542
   330
				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
slouken@1542
   331
slouken@1542
   332
				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
slouken@1542
   333
				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
slouken@1542
   334
				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
slouken@1542
   335
				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
slouken@1542
   336
slouken@1542
   337
				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
slouken@1542
   338
				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
slouken@1542
   339
				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
slouken@689
   340
				++srcp;
slouken@689
   341
				++dstp;
slouken@689
   342
			},{
slouken@1542
   343
				/* Two Pixels Blend */
slouken@689
   344
				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
slouken@1542
   345
				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
slouken@1542
   346
				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
slouken@1542
   347
				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
slouken@1542
   348
slouken@1542
   349
				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
slouken@1542
   350
				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
slouken@1542
   351
				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
slouken@1542
   352
				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
slouken@1542
   353
slouken@1542
   354
				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
slouken@1542
   355
				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
slouken@1542
   356
				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
slouken@1542
   357
				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
slouken@1542
   358
slouken@1542
   359
				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
slouken@1542
   360
				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
slouken@1542
   361
				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
slouken@1542
   362
				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
slouken@1542
   363
slouken@1542
   364
				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
slouken@1542
   365
				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
slouken@1542
   366
				
slouken@1542
   367
				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
slouken@1542
   368
slouken@1542
   369
  				srcp += 2;
slouken@1542
   370
  				dstp += 2;
slouken@1542
   371
  			}, width);
slouken@1895
   372
			/* *INDENT-ON* */
slouken@1895
   373
            srcp += srcskip;
slouken@1895
   374
            dstp += dstskip;
slouken@1895
   375
        }
slouken@1895
   376
        emms();
slouken@1895
   377
    }
slouken@689
   378
}
slouken@689
   379
slouken@689
   380
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
   381
static void
slouken@1895
   382
BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
slouken@689
   383
{
slouken@1895
   384
    int width = info->d_width;
slouken@1895
   385
    int height = info->d_height;
slouken@1895
   386
    Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
   387
    int srcskip = info->s_skip >> 2;
slouken@1895
   388
    Uint32 *dstp = (Uint32 *) info->d_pixels;
slouken@1895
   389
    int dstskip = info->d_skip >> 2;
slouken@1895
   390
    SDL_PixelFormat *sf = info->src;
slouken@1895
   391
    Uint32 amask = sf->Amask;
slouken@1542
   392
slouken@1895
   393
    pxor_r2r(mm6, mm6);         /* 0 -> mm6 */
slouken@1895
   394
    /* form multiplication mask */
slouken@1895
   395
    movd_m2r(sf->Amask, mm7);   /* 0000F000 -> mm7 */
slouken@1895
   396
    punpcklbw_r2r(mm7, mm7);    /* FF000000 -> mm7 */
slouken@1895
   397
    pcmpeqb_r2r(mm0, mm0);      /* FFFFFFFF -> mm0 */
slouken@1895
   398
    movq_r2r(mm0, mm3);         /* FFFFFFFF -> mm3 (for later) */
slouken@1895
   399
    pxor_r2r(mm0, mm7);         /* 00FFFFFF -> mm7 (mult mask) */
slouken@1895
   400
    /* form channel masks */
slouken@1895
   401
    movq_r2r(mm7, mm0);         /* 00FFFFFF -> mm0 */
slouken@1895
   402
    packsswb_r2r(mm6, mm0);     /* 00000FFF -> mm0 (channel mask) */
slouken@1895
   403
    packsswb_r2r(mm6, mm3);     /* 0000FFFF -> mm3 */
slouken@1895
   404
    pxor_r2r(mm0, mm3);         /* 0000F000 -> mm3 (~channel mask) */
slouken@1895
   405
    /* get alpha channel shift */
slouken@2120
   406
    /* *INDENT-OFF* */
icculus@2101
   407
    __asm__ __volatile__ (
icculus@2101
   408
        "movd %0, %%mm5"
icculus@2101
   409
        : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
slouken@2120
   410
    /* *INDENT-ON* */
slouken@1542
   411
slouken@1895
   412
    while (height--) {
slouken@1895
   413
	    /* *INDENT-OFF* */
slouken@689
   414
	    DUFFS_LOOP4({
slouken@1542
   415
		Uint32 alpha = *srcp & amask;
slouken@689
   416
		/* FIXME: Here we special-case opaque alpha since the
slouken@1542
   417
			compositioning used (>>8 instead of /255) doesn't handle
slouken@1542
   418
			it correctly. Also special-case alpha=0 for speed?
slouken@1542
   419
			Benchmark this! */
slouken@1542
   420
		if(alpha == 0) {
slouken@1542
   421
			/* do nothing */
slouken@1542
   422
		} else if(alpha == amask) {
slouken@1542
   423
			/* opaque alpha -- copy RGB, keep dst alpha */
slouken@1542
   424
			/* using MMX here to free up regular registers for other things */
slouken@1542
   425
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
slouken@1542
   426
			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
slouken@1542
   427
			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
slouken@1542
   428
			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
slouken@1542
   429
			por_r2r(mm1, mm2); /* src | dst -> mm2 */
slouken@1542
   430
			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
slouken@1542
   431
		} else {
slouken@1542
   432
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
slouken@1542
   433
			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
slouken@1542
   434
slouken@1542
   435
			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
slouken@1542
   436
			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
slouken@1542
   437
slouken@1542
   438
			__asm__ __volatile__ (
slouken@1542
   439
				"movd %0, %%mm4"
slouken@1542
   440
				: : "r" (alpha) ); /* 0000A000 -> mm4 */
slouken@1542
   441
			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
slouken@1542
   442
			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
slouken@1542
   443
			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
slouken@1542
   444
			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
slouken@1542
   445
slouken@1542
   446
			/* blend */		    
slouken@1542
   447
			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
slouken@1542
   448
			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
slouken@1542
   449
			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
slouken@1542
   450
			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
slouken@1542
   451
			
slouken@1542
   452
			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
slouken@1542
   453
			movd_r2m(mm2, *dstp);/* mm2 -> dst */
slouken@689
   454
		}
slouken@689
   455
		++srcp;
slouken@689
   456
		++dstp;
slouken@689
   457
	    }, width);
slouken@1895
   458
	    /* *INDENT-ON* */
slouken@1895
   459
        srcp += srcskip;
slouken@1895
   460
        dstp += dstskip;
slouken@1895
   461
    }
slouken@1895
   462
    emms();
slouken@689
   463
}
slouken@1895
   464
slouken@1542
   465
/* End GCC_ASMBLIT */
slouken@1542
   466
slouken@1542
   467
#elif MSVC_ASMBLIT
slouken@1542
   468
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1895
   469
static void
slouken@1895
   470
BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
slouken@1542
   471
{
slouken@1895
   472
    int width = info->d_width;
slouken@1895
   473
    int height = info->d_height;
slouken@1895
   474
    Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
   475
    int srcskip = info->s_skip >> 2;
slouken@1895
   476
    Uint32 *dstp = (Uint32 *) info->d_pixels;
slouken@1895
   477
    int dstskip = info->d_skip >> 2;
slouken@1895
   478
    Uint32 dalpha = info->dst->Amask;
slouken@1895
   479
slouken@1895
   480
    __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
slouken@1542
   481
slouken@1895
   482
    hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
slouken@1895
   483
    lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
slouken@1895
   484
    dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
slouken@1895
   485
slouken@1895
   486
    while (height--) {
slouken@1895
   487
        int n = width;
slouken@1895
   488
        if (n & 1) {
slouken@1895
   489
            Uint32 s = *srcp++;
slouken@1895
   490
            Uint32 d = *dstp;
slouken@1895
   491
            *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1895
   492
                       + (s & d & 0x00010101)) | dalpha;
slouken@1895
   493
            n--;
slouken@1895
   494
        }
slouken@1542
   495
slouken@1895
   496
        for (n >>= 1; n > 0; --n) {
slouken@1895
   497
            dst1 = *(__m64 *) dstp;     /* 2 x dst -> dst1(ARGBARGB) */
slouken@1895
   498
            dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
slouken@1542
   499
slouken@1895
   500
            src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
slouken@1895
   501
            src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
slouken@1895
   502
slouken@1895
   503
            dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
slouken@1895
   504
            src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
slouken@1895
   505
            src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
slouken@1895
   506
            src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
slouken@1542
   507
slouken@1895
   508
            dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
slouken@1895
   509
            dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
slouken@1895
   510
            dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
slouken@1895
   511
            dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
slouken@1542
   512
slouken@1895
   513
            *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
slouken@1895
   514
            dstp += 2;
slouken@1895
   515
            srcp += 2;
slouken@1895
   516
        }
slouken@1895
   517
slouken@1895
   518
        srcp += srcskip;
slouken@1895
   519
        dstp += dstskip;
slouken@1895
   520
    }
slouken@1895
   521
    _mm_empty();
slouken@1542
   522
}
slouken@1542
   523
slouken@1542
   524
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1895
   525
static void
slouken@1895
   526
BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   527
{
slouken@1895
   528
    SDL_PixelFormat *df = info->dst;
slouken@1895
   529
    Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
slouken@1895
   530
    unsigned alpha = info->src->alpha;
slouken@1542
   531
slouken@1895
   532
    if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
slouken@1895
   533
        /* only call a128 version when R,G,B occupy lower bits */
slouken@1895
   534
        BlitRGBtoRGBSurfaceAlpha128MMX(info);
slouken@1895
   535
    } else {
slouken@1895
   536
        int width = info->d_width;
slouken@1895
   537
        int height = info->d_height;
slouken@1895
   538
        Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
   539
        int srcskip = info->s_skip >> 2;
slouken@1895
   540
        Uint32 *dstp = (Uint32 *) info->d_pixels;
slouken@1895
   541
        int dstskip = info->d_skip >> 2;
slouken@1895
   542
        Uint32 dalpha = df->Amask;
slouken@1895
   543
        Uint32 amult;
slouken@1895
   544
slouken@1895
   545
        __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
slouken@1542
   546
slouken@1895
   547
        mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
slouken@1895
   548
        /* form the alpha mult */
slouken@1895
   549
        amult = alpha | (alpha << 8);
slouken@1895
   550
        amult = amult | (amult << 16);
slouken@1895
   551
        chanmask =
slouken@1895
   552
            (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->
slouken@1895
   553
                                                           Bshift);
slouken@1895
   554
        mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
slouken@1895
   555
        mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
slouken@1895
   556
        /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
slouken@1895
   557
        dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
slouken@1542
   558
slouken@1895
   559
        while (height--) {
slouken@1895
   560
            int n = width;
slouken@1895
   561
            if (n & 1) {
slouken@1895
   562
                /* One Pixel Blend */
slouken@1895
   563
                src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
slouken@1895
   564
                src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
slouken@1895
   565
slouken@1895
   566
                dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
slouken@1895
   567
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
   568
slouken@1895
   569
                src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
slouken@1895
   570
                src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
slouken@1895
   571
                src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
slouken@1895
   572
                dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
slouken@1895
   573
slouken@1895
   574
                dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
slouken@1895
   575
                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
slouken@1895
   576
                *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
   577
slouken@1895
   578
                ++srcp;
slouken@1895
   579
                ++dstp;
slouken@1895
   580
slouken@1895
   581
                n--;
slouken@1895
   582
            }
slouken@1542
   583
slouken@1895
   584
            for (n >>= 1; n > 0; --n) {
slouken@1895
   585
                /* Two Pixels Blend */
slouken@1895
   586
                src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
slouken@1895
   587
                src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
slouken@1895
   588
                src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
slouken@1895
   589
                src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
slouken@1542
   590
slouken@1895
   591
                dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
slouken@1895
   592
                dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
slouken@1895
   593
                dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
slouken@1895
   594
                dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
slouken@1542
   595
slouken@1895
   596
                src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
slouken@1895
   597
                src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
slouken@1895
   598
                src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
slouken@1895
   599
                dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
slouken@1542
   600
slouken@1895
   601
                src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
slouken@1895
   602
                src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
slouken@1895
   603
                src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
slouken@1895
   604
                dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
slouken@1895
   605
slouken@1895
   606
                dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
slouken@1895
   607
                dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
slouken@1542
   608
slouken@1895
   609
                *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
slouken@1542
   610
slouken@1895
   611
                srcp += 2;
slouken@1895
   612
                dstp += 2;
slouken@1895
   613
            }
slouken@1895
   614
            srcp += srcskip;
slouken@1895
   615
            dstp += dstskip;
slouken@1895
   616
        }
slouken@1895
   617
        _mm_empty();
slouken@1895
   618
    }
slouken@1542
   619
}
slouken@1542
   620
slouken@1542
   621
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
   622
static void
slouken@1895
   623
BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
slouken@1542
   624
{
slouken@1895
   625
    int width = info->d_width;
slouken@1895
   626
    int height = info->d_height;
slouken@1895
   627
    Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
   628
    int srcskip = info->s_skip >> 2;
slouken@1895
   629
    Uint32 *dstp = (Uint32 *) info->d_pixels;
slouken@1895
   630
    int dstskip = info->d_skip >> 2;
slouken@1895
   631
    SDL_PixelFormat *sf = info->src;
slouken@1895
   632
    Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
slouken@1895
   633
    Uint32 amask = sf->Amask;
slouken@1895
   634
    Uint32 ashift = sf->Ashift;
slouken@1895
   635
    Uint64 multmask;
slouken@1542
   636
slouken@1895
   637
    __m64 src1, dst1, mm_alpha, mm_zero, dmask;
slouken@1542
   638
slouken@1895
   639
    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
slouken@1895
   640
	/* *INDENT-OFF* */
slouken@1895
   641
	multmask = ~(0xFFFFI64 << (ashift * 2));
slouken@1895
   642
	/* *INDENT-ON* */
slouken@1895
   643
    dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
slouken@1542
   644
slouken@1895
   645
    while (height--) {
slouken@1895
   646
		/* *INDENT-OFF* */
slouken@1542
   647
		DUFFS_LOOP4({
slouken@1542
   648
		Uint32 alpha = *srcp & amask;
slouken@1542
   649
		if (alpha == 0) {
slouken@1542
   650
			/* do nothing */
slouken@1542
   651
		} else if (alpha == amask) {
slouken@1542
   652
			/* opaque alpha -- copy RGB, keep dst alpha */
slouken@1542
   653
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
slouken@1542
   654
		} else {
slouken@1542
   655
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
slouken@1542
   656
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
slouken@1542
   657
slouken@1542
   658
			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
slouken@1542
   659
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
   660
slouken@1542
   661
			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
slouken@1542
   662
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
slouken@1542
   663
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
slouken@1542
   664
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
slouken@1542
   665
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
slouken@1542
   666
slouken@1542
   667
			/* blend */		    
slouken@1542
   668
			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
slouken@1542
   669
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
slouken@1542
   670
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
slouken@1542
   671
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
slouken@1542
   672
			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
slouken@1542
   673
			
slouken@1542
   674
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
   675
		}
slouken@1542
   676
		++srcp;
slouken@1542
   677
		++dstp;
slouken@1542
   678
	    }, width);
slouken@1895
   679
		/* *INDENT-ON* */
slouken@1895
   680
        srcp += srcskip;
slouken@1895
   681
        dstp += dstskip;
slouken@1895
   682
    }
slouken@1895
   683
    _mm_empty();
slouken@1542
   684
}
slouken@1895
   685
slouken@1542
   686
/* End MSVC_ASMBLIT */
slouken@1542
   687
slouken@1542
   688
#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
slouken@689
   689
slouken@1361
   690
#if SDL_ALTIVEC_BLITTERS
slouken@1795
   691
#if __MWERKS__
slouken@1795
   692
#pragma altivec_model on
slouken@1795
   693
#endif
slouken@1361
   694
#if HAVE_ALTIVEC_H
icculus@1162
   695
#include <altivec.h>
icculus@1175
   696
#endif
icculus@1047
   697
#include <assert.h>
icculus@1162
   698
slouken@1402
   699
/*
 * Vector literal helpers.
 *
 * VECUINT8_LITERAL builds a `vector unsigned char` from 16 byte values and
 * VECUINT16_LITERAL builds a `vector unsigned short` from 8 values.  On
 * Mac OS X with GCC older than 4 the element list is written in parentheses;
 * every other compiler gets the brace form.
 * NOTE(review): presumably the older Apple GCC only accepts the parenthesised
 * AltiVec literal syntax -- confirm before changing the condition.
 */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) { a,b,c,d,e,f,g,h }
#endif
icculus@1162
   710
icculus@1047
   711
/*
 * UNALIGNED_PTR(x): low four bits of the address x, i.e. non-zero when x is
 * not on a 16-byte boundary.  The argument is parenthesised so that an
 * expression such as `p + 1` is evaluated as a pointer before the cast.
 */
#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)
/*
 * VECPRINT(msg, v): debugging aid that prints the four 32-bit words of
 * vector v in hexadecimal, prefixed by the string msg.
 */
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
} while (0)
icculus@1047
   717
icculus@1047
   718
/* the permutation vector that takes the high bytes out of all the appropriate shorts
    (vector unsigned char)(
        0x00, 0x10, 0x02, 0x12,
        0x04, 0x14, 0x06, 0x16,
        0x08, 0x18, 0x0A, 0x1A,
        0x0C, 0x1C, 0x0E, 0x1E );
   It is built at run time: vec_lvsl on a NULL address yields 0x00..0x0F and
   adding 0x000F to every 16-bit lane turns each odd byte n into 0x0F + n.
*/
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* four 32-bit lanes holding 24 (12 + 12) */
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* 0xFFFFFFFF shifted left by 24 in every 32-bit lane: a mask for the top
   (first) byte of each pixel */
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/*
 * Permute vector used to realign data read with two vec_ld calls
 * (vec_ld(0, p) and vec_ld(15, p)).  For an unaligned pointer the usual
 * vec_lvsl shift pattern is used; for an aligned pointer the result is
 * 0x10..0x1F, which selects the second ("overflow") vector only.
 */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, src) \
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
icculus@1047
   731
slouken@1895
   732
icculus@1047
   733
/*
 * VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16)
 *
 * Blends four 32-bit pixels: for every byte
 *     t  = vs * valpha + vd * (255 - valpha) + 1
 *     vd = high byte of (t + (t >> 8))
 * The even and odd bytes are multiplied separately into 16-bit lanes
 * (vec_mule / vec_mulo) and mergePermute interleaves the high bytes of the
 * two results back into pixel order.  The result is written to vd.
 *
 *   vs, vd        source / destination pixels (vector unsigned char)
 *   valpha        per-byte alpha multiplier (vector unsigned char)
 *   mergePermute  permute vector as produced by VEC_MERGE_PERMUTE()
 *   v1_16, v8_16  16-bit lanes holding 1 and 8
 */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
    /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
    /* valpha2 is 255-alpha */ \
    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
    /* add source and dest */ \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
    vtemp1 = vec_add(vtemp1, v1_16); \
    vtemp3 = vec_sr(vtemp1, v8_16); \
    vtemp1 = vec_add(vtemp1, vtemp3); \
    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
    vtemp2 = vec_add(vtemp2, v1_16); \
    vtemp4 = vec_sr(vtemp2, v8_16); \
    vtemp2 = vec_add(vtemp2, vtemp4); \
    /* (>>8) and get ARGBARGBARGBARGB */ \
    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
} while (0)
slouken@1895
   758
icculus@1047
   759
/* Calculate the permute vector used for 32->32 swizzling */
slouken@1895
   760
static vector unsigned char
slouken@1895
   761
calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
icculus@1047
   762
{
icculus@1047
   763
    /*
icculus@1047
   764
     * We have to assume that the bits that aren't used by other
icculus@1047
   765
     *  colors is alpha, and it's one complete byte, since some formats
icculus@1047
   766
     *  leave alpha with a zero mask, but we should still swizzle the bits.
icculus@1047
   767
     */
icculus@1047
   768
    /* ARGB */
icculus@1047
   769
    const static struct SDL_PixelFormat default_pixel_format = {
icculus@1047
   770
        NULL, 0, 0,
icculus@1047
   771
        0, 0, 0, 0,
icculus@1047
   772
        16, 8, 0, 24,
icculus@1047
   773
        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
slouken@1895
   774
        0, 0
slouken@1895
   775
    };
icculus@1047
   776
    if (!srcfmt) {
icculus@1047
   777
        srcfmt = &default_pixel_format;
icculus@1047
   778
    }
icculus@1047
   779
    if (!dstfmt) {
icculus@1047
   780
        dstfmt = &default_pixel_format;
icculus@1047
   781
    }
slouken@1895
   782
    const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
slouken@1895
   783
                                                       0x04, 0x04, 0x04, 0x04,
slouken@1895
   784
                                                       0x08, 0x08, 0x08, 0x08,
slouken@1895
   785
                                                       0x0C, 0x0C, 0x0C,
slouken@1895
   786
                                                       0x0C);
icculus@1047
   787
    vector unsigned char vswiz;
icculus@1047
   788
    vector unsigned int srcvec;
icculus@1047
   789
#define RESHIFT(X) (3 - ((X) >> 3))
icculus@1047
   790
    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
icculus@1047
   791
    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
icculus@1047
   792
    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
icculus@1047
   793
    Uint32 amask;
icculus@1047
   794
    /* Use zero for alpha if either surface doesn't have alpha */
icculus@1047
   795
    if (dstfmt->Amask) {
slouken@1895
   796
        amask =
slouken@1895
   797
            ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->
slouken@1895
   798
                                                                   Ashift);
icculus@1047
   799
    } else {
slouken@1895
   800
        amask =
slouken@1895
   801
            0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
slouken@1895
   802
                          0xFFFFFFFF);
icculus@1047
   803
    }
slouken@1895
   804
#undef RESHIFT
slouken@1895
   805
    ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
slouken@1895
   806
    vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
slouken@1895
   807
    return (vswiz);
icculus@1047
   808
}
icculus@1047
   809
slouken@1895
   810
static void
slouken@1895
   811
Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
icculus@1047
   812
{
icculus@1047
   813
    int height = info->d_height;
slouken@1895
   814
    Uint8 *src = (Uint8 *) info->s_pixels;
icculus@1047
   815
    int srcskip = info->s_skip;
slouken@1895
   816
    Uint8 *dst = (Uint8 *) info->d_pixels;
icculus@1047
   817
    int dstskip = info->d_skip;
icculus@1047
   818
    SDL_PixelFormat *srcfmt = info->src;
icculus@1047
   819
icculus@1047
   820
    vector unsigned char v0 = vec_splat_u8(0);
icculus@1047
   821
    vector unsigned short v8_16 = vec_splat_u16(8);
icculus@1047
   822
    vector unsigned short v1_16 = vec_splat_u16(1);
icculus@1047
   823
    vector unsigned short v2_16 = vec_splat_u16(2);
icculus@1047
   824
    vector unsigned short v3_16 = vec_splat_u16(3);
icculus@1047
   825
    vector unsigned int v8_32 = vec_splat_u32(8);
icculus@1047
   826
    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
slouken@1895
   827
    vector unsigned short v3f =
slouken@1895
   828
        VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
slouken@1895
   829
                          0x003f, 0x003f, 0x003f, 0x003f);
slouken@1895
   830
    vector unsigned short vfc =
slouken@1895
   831
        VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
slouken@1895
   832
                          0x00fc, 0x00fc, 0x00fc, 0x00fc);
icculus@1047
   833
icculus@1047
   834
    /* 
slouken@1895
   835
       0x10 - 0x1f is the alpha
slouken@1895
   836
       0x00 - 0x0e evens are the red
slouken@1895
   837
       0x01 - 0x0f odds are zero
slouken@1895
   838
     */
slouken@1895
   839
    vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
slouken@1895
   840
                                                       0x10, 0x02, 0x01, 0x01,
slouken@1895
   841
                                                       0x10, 0x04, 0x01, 0x01,
slouken@1895
   842
                                                       0x10, 0x06, 0x01,
slouken@1895
   843
                                                       0x01);
slouken@1895
   844
    vector unsigned char vredalpha2 =
slouken@1895
   845
        (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
slouken@1895
   846
                                        vec_sl(v8_32, v16_32))
slouken@1895
   847
        );
icculus@1047
   848
    /*
slouken@1895
   849
       0x00 - 0x0f is ARxx ARxx ARxx ARxx
slouken@1895
   850
       0x11 - 0x0f odds are blue
slouken@1895
   851
     */
slouken@1895
   852
    vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
slouken@1895
   853
                                                   0x04, 0x05, 0x06, 0x13,
slouken@1895
   854
                                                   0x08, 0x09, 0x0a, 0x15,
slouken@1895
   855
                                                   0x0c, 0x0d, 0x0e, 0x17);
slouken@1895
   856
    vector unsigned char vblue2 =
slouken@1895
   857
        (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
slouken@1895
   858
        );
icculus@1047
   859
    /*
slouken@1895
   860
       0x00 - 0x0f is ARxB ARxB ARxB ARxB
slouken@1895
   861
       0x10 - 0x0e evens are green
slouken@1895
   862
     */
slouken@1895
   863
    vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
slouken@1895
   864
                                                    0x04, 0x05, 0x12, 0x07,
slouken@1895
   865
                                                    0x08, 0x09, 0x14, 0x0b,
slouken@1895
   866
                                                    0x0c, 0x0d, 0x16, 0x0f);
slouken@1895
   867
    vector unsigned char vgreen2 =
slouken@1895
   868
        (vector unsigned
slouken@1895
   869
         char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8_32, v8_32))
slouken@1895
   870
        );
slouken@1895
   871
    vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
slouken@1895
   872
                                                    0x00, 0x0a, 0x00, 0x0e,
slouken@1895
   873
                                                    0x00, 0x12, 0x00, 0x16,
slouken@1895
   874
                                                    0x00, 0x1a, 0x00, 0x1e);
icculus@1047
   875
    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
   876
    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
slouken@1895
   877
    vector unsigned char valphaPermute =
slouken@1895
   878
        vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
icculus@1047
   879
slouken@1895
   880
    vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
icculus@1047
   881
    vf800 = vec_sl(vf800, vec_splat_u16(8));
icculus@1047
   882
slouken@1895
   883
    while (height--) {
icculus@1047
   884
        int extrawidth;
icculus@1047
   885
        vector unsigned char valigner;
icculus@1047
   886
        vector unsigned char vsrc;
icculus@1047
   887
        vector unsigned char voverflow;
icculus@1047
   888
        int width = info->d_width;
icculus@1047
   889
icculus@1047
   890
#define ONE_PIXEL_BLEND(condition, widthvar) \
icculus@1047
   891
        while (condition) { \
icculus@1162
   892
            Uint32 Pixel; \
icculus@1047
   893
            unsigned sR, sG, sB, dR, dG, dB, sA; \
icculus@1162
   894
            DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
icculus@1047
   895
            if(sA) { \
icculus@1047
   896
                unsigned short dstpixel = *((unsigned short *)dst); \
icculus@1047
   897
                dR = (dstpixel >> 8) & 0xf8; \
icculus@1047
   898
                dG = (dstpixel >> 3) & 0xfc; \
icculus@1047
   899
                dB = (dstpixel << 3) & 0xf8; \
icculus@1047
   900
                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
icculus@1047
   901
                *((unsigned short *)dst) = ( \
icculus@1047
   902
                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
icculus@1047
   903
                ); \
icculus@1047
   904
            } \
icculus@1047
   905
            src += 4; \
icculus@1047
   906
            dst += 2; \
icculus@1047
   907
            widthvar--; \
icculus@1047
   908
        }
icculus@1047
   909
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
icculus@1047
   910
        extrawidth = (width % 8);
icculus@1047
   911
        valigner = VEC_ALIGNER(src);
slouken@1895
   912
        vsrc = (vector unsigned char) vec_ld(0, src);
icculus@1047
   913
        width -= extrawidth;
icculus@1047
   914
        while (width) {
icculus@1047
   915
            vector unsigned char valpha;
icculus@1047
   916
            vector unsigned char vsrc1, vsrc2;
icculus@1047
   917
            vector unsigned char vdst1, vdst2;
icculus@1047
   918
            vector unsigned short vR, vG, vB;
icculus@1047
   919
            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
icculus@1047
   920
icculus@1047
   921
            /* Load 8 pixels from src as ARGB */
slouken@1895
   922
            voverflow = (vector unsigned char) vec_ld(15, src);
icculus@1047
   923
            vsrc = vec_perm(vsrc, voverflow, valigner);
icculus@1047
   924
            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
icculus@1047
   925
            src += 16;
slouken@1895
   926
            vsrc = (vector unsigned char) vec_ld(15, src);
icculus@1047
   927
            voverflow = vec_perm(voverflow, vsrc, valigner);
icculus@1047
   928
            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
icculus@1047
   929
            src += 16;
icculus@1047
   930
icculus@1047
   931
            /* Load 8 pixels from dst as XRGB */
icculus@1047
   932
            voverflow = vec_ld(0, dst);
slouken@1895
   933
            vR = vec_and((vector unsigned short) voverflow, vf800);
slouken@1895
   934
            vB = vec_sl((vector unsigned short) voverflow, v3_16);
icculus@1047
   935
            vG = vec_sl(vB, v2_16);
slouken@1895
   936
            vdst1 =
slouken@1895
   937
                (vector unsigned char) vec_perm((vector unsigned char) vR,
slouken@1895
   938
                                                (vector unsigned char) vR,
slouken@1895
   939
                                                vredalpha1);
slouken@1895
   940
            vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
slouken@1895
   941
            vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
slouken@1895
   942
            vdst2 =
slouken@1895
   943
                (vector unsigned char) vec_perm((vector unsigned char) vR,
slouken@1895
   944
                                                (vector unsigned char) vR,
slouken@1895
   945
                                                vredalpha2);
slouken@1895
   946
            vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
slouken@1895
   947
            vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
icculus@1047
   948
icculus@1047
   949
            /* Alpha blend 8 pixels as ARGB */
icculus@1047
   950
            valpha = vec_perm(vsrc1, v0, valphaPermute);
slouken@1895
   951
            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
slouken@1895
   952
                               v8_16);
icculus@1047
   953
            valpha = vec_perm(vsrc2, v0, valphaPermute);
slouken@1895
   954
            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
slouken@1895
   955
                               v8_16);
icculus@1047
   956
icculus@1047
   957
            /* Convert 8 pixels to 565 */
slouken@1895
   958
            vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
slouken@1895
   959
                                                        vdst1,
slouken@1895
   960
                                                        (vector unsigned int)
slouken@1895
   961
                                                        vdst2);
slouken@1895
   962
            vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
icculus@1047
   963
            vgpixel = vec_and(vgpixel, vfc);
icculus@1047
   964
            vgpixel = vec_sl(vgpixel, v3_16);
icculus@1047
   965
            vrpixel = vec_sl(vpixel, v1_16);
icculus@1047
   966
            vrpixel = vec_and(vrpixel, vf800);
icculus@1047
   967
            vbpixel = vec_and(vpixel, v3f);
slouken@1895
   968
            vdst1 =
slouken@1895
   969
                vec_or((vector unsigned char) vrpixel,
slouken@1895
   970
                       (vector unsigned char) vgpixel);
slouken@1895
   971
            vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);
slouken@1895
   972
icculus@1047
   973
            /* Store 8 pixels */
icculus@1047
   974
            vec_st(vdst1, 0, dst);
icculus@1047
   975
icculus@1047
   976
            width -= 8;
icculus@1047
   977
            dst += 16;
icculus@1047
   978
        }
icculus@1047
   979
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
   980
#undef ONE_PIXEL_BLEND
icculus@1047
   981
        src += srcskip;
icculus@1047
   982
        dst += dstskip;
icculus@1047
   983
    }
icculus@1047
   984
}
icculus@1047
   985
slouken@1895
   986
static void
slouken@1895
   987
Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
icculus@1047
   988
{
icculus@1047
   989
    unsigned alpha = info->src->alpha;
icculus@1047
   990
    int height = info->d_height;
slouken@1895
   991
    Uint32 *srcp = (Uint32 *) info->s_pixels;
icculus@1047
   992
    int srcskip = info->s_skip >> 2;
slouken@1895
   993
    Uint32 *dstp = (Uint32 *) info->d_pixels;
icculus@1047
   994
    int dstskip = info->d_skip >> 2;
icculus@1047
   995
    SDL_PixelFormat *srcfmt = info->src;
icculus@1047
   996
    SDL_PixelFormat *dstfmt = info->dst;
icculus@1047
   997
    unsigned sA = srcfmt->alpha;
icculus@1047
   998
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
icculus@1047
   999
    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
icculus@1047
  1000
    Uint32 ckey = info->src->colorkey;
icculus@1047
  1001
    vector unsigned char mergePermute;
icculus@1047
  1002
    vector unsigned char vsrcPermute;
icculus@1047
  1003
    vector unsigned char vdstPermute;
icculus@1047
  1004
    vector unsigned char vsdstPermute;
icculus@1047
  1005
    vector unsigned char valpha;
icculus@1047
  1006
    vector unsigned char valphamask;
icculus@1047
  1007
    vector unsigned char vbits;
icculus@1047
  1008
    vector unsigned char v0;
icculus@1047
  1009
    vector unsigned short v1;
icculus@1047
  1010
    vector unsigned short v8;
icculus@1047
  1011
    vector unsigned int vckey;
icculus@1047
  1012
    vector unsigned int vrgbmask;
icculus@1047
  1013
icculus@1047
  1014
    mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
  1015
    v0 = vec_splat_u8(0);
icculus@1047
  1016
    v1 = vec_splat_u16(1);
icculus@1047
  1017
    v8 = vec_splat_u16(8);
icculus@1047
  1018
icculus@1047
  1019
    /* set the alpha to 255 on the destination surf */
icculus@1047
  1020
    valphamask = VEC_ALPHA_MASK();
icculus@1047
  1021
icculus@1047
  1022
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
icculus@1047
  1023
    vdstPermute = calc_swizzle32(NULL, dstfmt);
icculus@1047
  1024
    vsdstPermute = calc_swizzle32(dstfmt, NULL);
icculus@1047
  1025
icculus@1047
  1026
    /* set a vector full of alpha and 255-alpha */
slouken@1895
  1027
    ((unsigned char *) &valpha)[0] = alpha;
icculus@1047
  1028
    valpha = vec_splat(valpha, 0);
slouken@1895
  1029
    vbits = (vector unsigned char) vec_splat_s8(-1);
icculus@1047
  1030
icculus@1047
  1031
    ckey &= rgbmask;
slouken@1895
  1032
    ((unsigned int *) (char *) &vckey)[0] = ckey;
icculus@1047
  1033
    vckey = vec_splat(vckey, 0);
slouken@1895
  1034
    ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
icculus@1047
  1035
    vrgbmask = vec_splat(vrgbmask, 0);
icculus@1047
  1036
slouken@1895
  1037
    while (height--) {
icculus@1047
  1038
        int width = info->d_width;
icculus@1047
  1039
#define ONE_PIXEL_BLEND(condition, widthvar) \
icculus@1047
  1040
        while (condition) { \
icculus@1162
  1041
            Uint32 Pixel; \
icculus@1047
  1042
            unsigned sR, sG, sB, dR, dG, dB; \
icculus@1162
  1043
            RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
icculus@1162
  1044
            if(sA && Pixel != ckey) { \
icculus@1162
  1045
                RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
icculus@1162
  1046
                DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
icculus@1047
  1047
                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
icculus@1047
  1048
                ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
icculus@1047
  1049
            } \
icculus@1162
  1050
            dstp++; \
icculus@1162
  1051
            srcp++; \
icculus@1047
  1052
            widthvar--; \
icculus@1047
  1053
        }
icculus@1047
  1054
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
icculus@1047
  1055
        if (width > 0) {
icculus@1047
  1056
            int extrawidth = (width % 4);
icculus@1047
  1057
            vector unsigned char valigner = VEC_ALIGNER(srcp);
slouken@1895
  1058
            vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
icculus@1047
  1059
            width -= extrawidth;
icculus@1047
  1060
            while (width) {
icculus@1047
  1061
                vector unsigned char vsel;
icculus@1047
  1062
                vector unsigned char voverflow;
icculus@1047
  1063
                vector unsigned char vd;
icculus@1047
  1064
                vector unsigned char vd_orig;
icculus@1047
  1065
icculus@1047
  1066
                /* s = *srcp */
slouken@1895
  1067
                voverflow = (vector unsigned char) vec_ld(15, srcp);
icculus@1047
  1068
                vs = vec_perm(vs, voverflow, valigner);
slouken@1895
  1069
icculus@1047
  1070
                /* vsel is set for items that match the key */
slouken@1895
  1071
                vsel =
slouken@1895
  1072
                    (vector unsigned char) vec_and((vector unsigned int) vs,
slouken@1895
  1073
                                                   vrgbmask);
slouken@1895
  1074
                vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
slouken@1895
  1075
                                                        vsel, vckey);
icculus@1047
  1076
icculus@1047
  1077
                /* permute to source format */
icculus@1047
  1078
                vs = vec_perm(vs, valpha, vsrcPermute);
icculus@1047
  1079
icculus@1047
  1080
                /* d = *dstp */
slouken@1895
  1081
                vd = (vector unsigned char) vec_ld(0, dstp);
icculus@1047
  1082
                vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
icculus@1047
  1083
icculus@1047
  1084
                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
icculus@1047
  1085
icculus@1047
  1086
                /* set the alpha channel to full on */
icculus@1047
  1087
                vd = vec_or(vd, valphamask);
icculus@1047
  1088
icculus@1047
  1089
                /* mask out color key */
icculus@1047
  1090
                vd = vec_sel(vd, vd_orig, vsel);
slouken@1895
  1091
icculus@1047
  1092
                /* permute to dest format */
icculus@1047
  1093
                vd = vec_perm(vd, vbits, vdstPermute);
icculus@1047
  1094
icculus@1047
  1095
                /* *dstp = res */
slouken@1895
  1096
                vec_st((vector unsigned int) vd, 0, dstp);
slouken@1895
  1097
icculus@1047
  1098
                srcp += 4;
icculus@1047
  1099
                dstp += 4;
icculus@1047
  1100
                width -= 4;
icculus@1047
  1101
                vs = voverflow;
icculus@1047
  1102
            }
icculus@1047
  1103
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
  1104
        }
icculus@1047
  1105
#undef ONE_PIXEL_BLEND
slouken@1895
  1106
icculus@1047
  1107
        srcp += srcskip;
icculus@1047
  1108
        dstp += dstskip;
icculus@1047
  1109
    }
icculus@1047
  1110
}
icculus@1047
  1111
icculus@1047
  1112
slouken@1895
  1113
static void
slouken@1895
  1114
Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)
icculus@1047
  1115
{
icculus@1047
  1116
    int width = info->d_width;
icculus@1047
  1117
    int height = info->d_height;
slouken@1895
  1118
    Uint32 *srcp = (Uint32 *) info->s_pixels;
icculus@1047
  1119
    int srcskip = info->s_skip >> 2;
slouken@1895
  1120
    Uint32 *dstp = (Uint32 *) info->d_pixels;
icculus@1047
  1121
    int dstskip = info->d_skip >> 2;
icculus@1047
  1122
    SDL_PixelFormat *srcfmt = info->src;
icculus@1047
  1123
    SDL_PixelFormat *dstfmt = info->dst;
icculus@1047
  1124
    vector unsigned char mergePermute;
icculus@1047
  1125
    vector unsigned char valphaPermute;
icculus@1047
  1126
    vector unsigned char vsrcPermute;
icculus@1047
  1127
    vector unsigned char vdstPermute;
icculus@1047
  1128
    vector unsigned char vsdstPermute;
icculus@1047
  1129
    vector unsigned char valphamask;
icculus@1047
  1130
    vector unsigned char vpixelmask;
icculus@1047
  1131
    vector unsigned char v0;
icculus@1047
  1132
    vector unsigned short v1;
icculus@1047
  1133
    vector unsigned short v8;
icculus@1047
  1134
icculus@1047
  1135
    v0 = vec_splat_u8(0);
icculus@1047
  1136
    v1 = vec_splat_u16(1);
icculus@1047
  1137
    v8 = vec_splat_u16(8);
icculus@1047
  1138
    mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
  1139
    valphamask = VEC_ALPHA_MASK();
slouken@1895
  1140
    valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
icculus@1047
  1141
    vpixelmask = vec_nor(valphamask, v0);
icculus@1047
  1142
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
icculus@1047
  1143
    vdstPermute = calc_swizzle32(NULL, dstfmt);
icculus@1047
  1144
    vsdstPermute = calc_swizzle32(dstfmt, NULL);
icculus@1047
  1145
slouken@1895
  1146
    while (height--) {
icculus@1047
  1147
        width = info->d_width;
icculus@1047
  1148
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
icculus@1162
  1149
            Uint32 Pixel; \
icculus@1047
  1150
            unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
icculus@1162
  1151
            DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
icculus@1047
  1152
            if(sA) { \
icculus@1162
  1153
              DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
icculus@1047
  1154
              ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
icculus@1047
  1155
              ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
icculus@1047
  1156
            } \
icculus@1047
  1157
            ++srcp; \
icculus@1047
  1158
            ++dstp; \
icculus@1047
  1159
            widthvar--; \
icculus@1047
  1160
        }
icculus@1047
  1161
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
icculus@1047
  1162
        if (width > 0) {
slouken@1487
  1163
            /* vsrcPermute */
slouken@1487
  1164
            /* vdstPermute */
icculus@1047
  1165
            int extrawidth = (width % 4);
icculus@1047
  1166
            vector unsigned char valigner = VEC_ALIGNER(srcp);
slouken@1895
  1167
            vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
icculus@1047
  1168
            width -= extrawidth;
icculus@1047
  1169
            while (width) {
icculus@1047
  1170
                vector unsigned char voverflow;
icculus@1047
  1171
                vector unsigned char vd;
icculus@1047
  1172
                vector unsigned char valpha;
icculus@1047
  1173
                vector unsigned char vdstalpha;
icculus@1047
  1174
                /* s = *srcp */
slouken@1895
  1175
                voverflow = (vector unsigned char) vec_ld(15, srcp);
icculus@1047
  1176
                vs = vec_perm(vs, voverflow, valigner);
icculus@1047
  1177
                vs = vec_perm(vs, v0, vsrcPermute);
icculus@1047
  1178
icculus@1047
  1179
                valpha = vec_perm(vs, v0, valphaPermute);
slouken@1895
  1180
icculus@1047
  1181
                /* d = *dstp */
slouken@1895
  1182
                vd = (vector unsigned char) vec_ld(0, dstp);
icculus@1047
  1183
                vd = vec_perm(vd, v0, vsdstPermute);
icculus@1047
  1184
                vdstalpha = vec_and(vd, valphamask);
icculus@1047
  1185
icculus@1047
  1186
                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
icculus@1047
  1187
icculus@1047
  1188
                /* set the alpha to the dest alpha */
icculus@1047
  1189
                vd = vec_and(vd, vpixelmask);
icculus@1047
  1190
                vd = vec_or(vd, vdstalpha);
icculus@1047
  1191
                vd = vec_perm(vd, v0, vdstPermute);
icculus@1047
  1192
icculus@1047
  1193
                /* *dstp = res */
slouken@1895
  1194
                vec_st((vector unsigned int) vd, 0, dstp);
slouken@1895
  1195
icculus@1047
  1196
                srcp += 4;
icculus@1047
  1197
                dstp += 4;
icculus@1047
  1198
                width -= 4;
icculus@1047
  1199
                vs = voverflow;
icculus@1047
  1200
icculus@1047
  1201
            }
icculus@1047
  1202
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
  1203
        }
slouken@1895
  1204
        srcp += srcskip;
slouken@1895
  1205
        dstp += dstskip;
icculus@1047
  1206
#undef ONE_PIXEL_BLEND
slouken@1895
  1207
    }
icculus@1047
  1208
}
icculus@1047
  1209
icculus@1047
  1210
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
  1211
static void
slouken@1895
  1212
BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
icculus@1047
  1213
{
slouken@1895
  1214
    int width = info->d_width;
slouken@1895
  1215
    int height = info->d_height;
slouken@1895
  1216
    Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
  1217
    int srcskip = info->s_skip >> 2;
slouken@1895
  1218
    Uint32 *dstp = (Uint32 *) info->d_pixels;
slouken@1895
  1219
    int dstskip = info->d_skip >> 2;
icculus@1047
  1220
    vector unsigned char mergePermute;
icculus@1047
  1221
    vector unsigned char valphaPermute;
icculus@1047
  1222
    vector unsigned char valphamask;
icculus@1047
  1223
    vector unsigned char vpixelmask;
icculus@1047
  1224
    vector unsigned char v0;
icculus@1047
  1225
    vector unsigned short v1;
icculus@1047
  1226
    vector unsigned short v8;
icculus@1047
  1227
    v0 = vec_splat_u8(0);
icculus@1047
  1228
    v1 = vec_splat_u16(1);
icculus@1047
  1229
    v8 = vec_splat_u16(8);
icculus@1047
  1230
    mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
  1231
    valphamask = VEC_ALPHA_MASK();
slouken@1895
  1232
    valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
slouken@1895
  1233
slouken@1895
  1234
icculus@1047
  1235
    vpixelmask = vec_nor(valphamask, v0);
slouken@1895
  1236
    while (height--) {
icculus@1047
  1237
        width = info->d_width;
icculus@1047
  1238
#define ONE_PIXEL_BLEND(condition, widthvar) \
icculus@1047
  1239
        while ((condition)) { \
icculus@1047
  1240
            Uint32 dalpha; \
icculus@1047
  1241
            Uint32 d; \
icculus@1047
  1242
            Uint32 s1; \
icculus@1047
  1243
            Uint32 d1; \
icculus@1047
  1244
            Uint32 s = *srcp; \
icculus@1047
  1245
            Uint32 alpha = s >> 24; \
icculus@1047
  1246
            if(alpha) { \
icculus@1047
  1247
              if(alpha == SDL_ALPHA_OPAQUE) { \
icculus@1047
  1248
                *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
icculus@1047
  1249
              } else { \
icculus@1047
  1250
                d = *dstp; \
icculus@1047
  1251
                dalpha = d & 0xff000000; \
icculus@1047
  1252
                s1 = s & 0xff00ff; \
icculus@1047
  1253
                d1 = d & 0xff00ff; \
icculus@1047
  1254
                d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
icculus@1047
  1255
                s &= 0xff00; \
icculus@1047
  1256
                d &= 0xff00; \
icculus@1047
  1257
                d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
icculus@1047
  1258
                *dstp = d1 | d | dalpha; \
icculus@1047
  1259
              } \
icculus@1047
  1260
            } \
icculus@1047
  1261
            ++srcp; \
icculus@1047
  1262
            ++dstp; \
icculus@1047
  1263
            widthvar--; \
icculus@1047
  1264
	    }
icculus@1047
  1265
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
icculus@1047
  1266
        if (width > 0) {
icculus@1047
  1267
            int extrawidth = (width % 4);
icculus@1047
  1268
            vector unsigned char valigner = VEC_ALIGNER(srcp);
slouken@1895
  1269
            vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
icculus@1047
  1270
            width -= extrawidth;
icculus@1047
  1271
            while (width) {
icculus@1047
  1272
                vector unsigned char voverflow;
icculus@1047
  1273
                vector unsigned char vd;
icculus@1047
  1274
                vector unsigned char valpha;
icculus@1047
  1275
                vector unsigned char vdstalpha;
icculus@1047
  1276
                /* s = *srcp */
slouken@1895
  1277
                voverflow = (vector unsigned char) vec_ld(15, srcp);
icculus@1047
  1278
                vs = vec_perm(vs, voverflow, valigner);
icculus@1047
  1279
icculus@1047
  1280
                valpha = vec_perm(vs, v0, valphaPermute);
slouken@1895
  1281
icculus@1047
  1282
                /* d = *dstp */
slouken@1895
  1283
                vd = (vector unsigned char) vec_ld(0, dstp);
icculus@1047
  1284
                vdstalpha = vec_and(vd, valphamask);
icculus@1047
  1285
icculus@1047
  1286
                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
icculus@1047
  1287
icculus@1047
  1288
                /* set the alpha to the dest alpha */
icculus@1047
  1289
                vd = vec_and(vd, vpixelmask);
icculus@1047
  1290
                vd = vec_or(vd, vdstalpha);
icculus@1047
  1291
icculus@1047
  1292
                /* *dstp = res */
slouken@1895
  1293
                vec_st((vector unsigned int) vd, 0, dstp);
slouken@1895
  1294
icculus@1047
  1295
                srcp += 4;
icculus@1047
  1296
                dstp += 4;
icculus@1047
  1297
                width -= 4;
icculus@1047
  1298
                vs = voverflow;
icculus@1047
  1299
            }
icculus@1047
  1300
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
  1301
        }
slouken@1895
  1302
        srcp += srcskip;
slouken@1895
  1303
        dstp += dstskip;
slouken@1895
  1304
    }
icculus@1047
  1305
#undef ONE_PIXEL_BLEND
icculus@1047
  1306
}
icculus@1047
  1307
slouken@1895
  1308
static void
slouken@1895
  1309
Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info)
icculus@1047
  1310
{
icculus@1047
  1311
    /* XXX : 6 */
slouken@1895
  1312
    unsigned alpha = info->src->alpha;
icculus@1047
  1313
    int height = info->d_height;
slouken@1895
  1314
    Uint32 *srcp = (Uint32 *) info->s_pixels;
icculus@1047
  1315
    int srcskip = info->s_skip >> 2;
slouken@1895
  1316
    Uint32 *dstp = (Uint32 *) info->d_pixels;
icculus@1047
  1317
    int dstskip = info->d_skip >> 2;
icculus@1047
  1318
    SDL_PixelFormat *srcfmt = info->src;
icculus@1047
  1319
    SDL_PixelFormat *dstfmt = info->dst;
slouken@1895
  1320
    unsigned sA = srcfmt->alpha;
slouken@1895
  1321
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
icculus@1047
  1322
    vector unsigned char mergePermute;
icculus@1047
  1323
    vector unsigned char vsrcPermute;
icculus@1047
  1324
    vector unsigned char vdstPermute;
icculus@1047
  1325
    vector unsigned char vsdstPermute;
icculus@1047
  1326
    vector unsigned char valpha;
icculus@1047
  1327
    vector unsigned char valphamask;
icculus@1047
  1328
    vector unsigned char vbits;
icculus@1047
  1329
    vector unsigned short v1;
icculus@1047
  1330
    vector unsigned short v8;
icculus@1047
  1331
icculus@1047
  1332
    mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
  1333
    v1 = vec_splat_u16(1);
icculus@1047
  1334
    v8 = vec_splat_u16(8);
icculus@1047
  1335
icculus@1047
  1336
    /* set the alpha to 255 on the destination surf */
icculus@1047
  1337
    valphamask = VEC_ALPHA_MASK();
icculus@1047
  1338
icculus@1047
  1339
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
icculus@1047
  1340
    vdstPermute = calc_swizzle32(NULL, dstfmt);
icculus@1047
  1341
    vsdstPermute = calc_swizzle32(dstfmt, NULL);
icculus@1047
  1342
icculus@1047
  1343
    /* set a vector full of alpha and 255-alpha */
slouken@1895
  1344
    ((unsigned char *) &valpha)[0] = alpha;
icculus@1047
  1345
    valpha = vec_splat(valpha, 0);
slouken@1895
  1346
    vbits = (vector unsigned char) vec_splat_s8(-1);
icculus@1047
  1347
slouken@1895
  1348
    while (height--) {
icculus@1047
  1349
        int width = info->d_width;
icculus@1047
  1350
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
icculus@1162
  1351
            Uint32 Pixel; \
icculus@1047
  1352
            unsigned sR, sG, sB, dR, dG, dB; \
icculus@1162
  1353
            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
icculus@1162
  1354
            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
icculus@1047
  1355
            ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
icculus@1047
  1356
            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
icculus@1047
  1357
            ++srcp; \
icculus@1047
  1358
            ++dstp; \
icculus@1047
  1359
            widthvar--; \
icculus@1047
  1360
        }
icculus@1047
  1361
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
icculus@1047
  1362
        if (width > 0) {
icculus@1047
  1363
            int extrawidth = (width % 4);
icculus@2086
  1364
            vector unsigned char valigner = VEC_ALIGNER(srcp);
slouken@1895
  1365
            vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
icculus@1047
  1366
            width -= extrawidth;
icculus@1047
  1367
            while (width) {
icculus@1047
  1368
                vector unsigned char voverflow;
icculus@1047
  1369
                vector unsigned char vd;
icculus@1047
  1370
icculus@1047
  1371
                /* s = *srcp */
slouken@1895
  1372
                voverflow = (vector unsigned char) vec_ld(15, srcp);
icculus@1047
  1373
                vs = vec_perm(vs, voverflow, valigner);
icculus@1047
  1374
                vs = vec_perm(vs, valpha, vsrcPermute);
slouken@1895
  1375
icculus@1047
  1376
                /* d = *dstp */
slouken@1895
  1377
                vd = (vector unsigned char) vec_ld(0, dstp);
icculus@1047
  1378
                vd = vec_perm(vd, vd, vsdstPermute);
icculus@1047
  1379
icculus@1047
  1380
                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
icculus@1047
  1381
icculus@1047
  1382
                /* set the alpha channel to full on */
icculus@1047
  1383
                vd = vec_or(vd, valphamask);
icculus@1047
  1384
                vd = vec_perm(vd, vbits, vdstPermute);
icculus@1047
  1385
icculus@1047
  1386
                /* *dstp = res */
slouken@1895
  1387
                vec_st((vector unsigned int) vd, 0, dstp);
slouken@1895
  1388
icculus@1047
  1389
                srcp += 4;
icculus@1047
  1390
                dstp += 4;
icculus@1047
  1391
                width -= 4;
icculus@1047
  1392
                vs = voverflow;
icculus@1047
  1393
            }
icculus@1047
  1394
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
  1395
        }
icculus@1047
  1396
#undef ONE_PIXEL_BLEND
slouken@1895
  1397
icculus@1047
  1398
        srcp += srcskip;
icculus@1047
  1399
        dstp += dstskip;
icculus@1047
  1400
    }
icculus@1047
  1401
icculus@1047
  1402
}
icculus@1047
  1403
icculus@1047
  1404
icculus@1047
  1405
/* fast RGB888->(A)RGB888 blending */
slouken@1895
  1406
static void
slouken@1895
  1407
BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
icculus@1047
  1408
{
slouken@1895
  1409
    unsigned alpha = info->src->alpha;
icculus@1047
  1410
    int height = info->d_height;
slouken@1895
  1411
    Uint32 *srcp = (Uint32 *) info->s_pixels;
icculus@1047
  1412
    int srcskip = info->s_skip >> 2;
slouken@1895
  1413
    Uint32 *dstp = (Uint32 *) info->d_pixels;
icculus@1047
  1414
    int dstskip = info->d_skip >> 2;
icculus@1047
  1415
    vector unsigned char mergePermute;
icculus@1047
  1416
    vector unsigned char valpha;
icculus@1047
  1417
    vector unsigned char valphamask;
icculus@1047
  1418
    vector unsigned short v1;
icculus@1047
  1419
    vector unsigned short v8;
icculus@1047
  1420
icculus@1047
  1421
    mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
  1422
    v1 = vec_splat_u16(1);
icculus@1047
  1423
    v8 = vec_splat_u16(8);
icculus@1047
  1424
icculus@1047
  1425
    /* set the alpha to 255 on the destination surf */
icculus@1047
  1426
    valphamask = VEC_ALPHA_MASK();
icculus@1047
  1427
icculus@1047
  1428
    /* set a vector full of alpha and 255-alpha */
slouken@1895
  1429
    ((unsigned char *) &valpha)[0] = alpha;
icculus@1047
  1430
    valpha = vec_splat(valpha, 0);
icculus@1047
  1431
slouken@1895
  1432
    while (height--) {
icculus@1047
  1433
        int width = info->d_width;
icculus@1047
  1434
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
icculus@1047
  1435
            Uint32 s = *srcp; \
icculus@1047
  1436
            Uint32 d = *dstp; \
icculus@1047
  1437
            Uint32 s1 = s & 0xff00ff; \
icculus@1047
  1438
            Uint32 d1 = d & 0xff00ff; \
icculus@1047
  1439
            d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
icculus@1047
  1440
                 & 0xff00ff; \
icculus@1047
  1441
            s &= 0xff00; \
icculus@1047
  1442
            d &= 0xff00; \
icculus@1047
  1443
            d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
icculus@1047
  1444
            *dstp = d1 | d | 0xff000000; \
icculus@1047
  1445
            ++srcp; \
icculus@1047
  1446
            ++dstp; \
icculus@1047
  1447
            widthvar--; \
icculus@1047
  1448
        }
icculus@1047
  1449
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
icculus@1047
  1450
        if (width > 0) {
icculus@1047
  1451
            int extrawidth = (width % 4);
icculus@1047
  1452
            vector unsigned char valigner = VEC_ALIGNER(srcp);
slouken@1895
  1453
            vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
icculus@1047
  1454
            width -= extrawidth;
icculus@1047
  1455
            while (width) {
icculus@1047
  1456
                vector unsigned char voverflow;
icculus@1047
  1457
                vector unsigned char vd;
icculus@1047
  1458
icculus@1047
  1459
                /* s = *srcp */
slouken@1895
  1460
                voverflow = (vector unsigned char) vec_ld(15, srcp);
icculus@1047
  1461
                vs = vec_perm(vs, voverflow, valigner);
slouken@1895
  1462
icculus@1047
  1463
                /* d = *dstp */
slouken@1895
  1464
                vd = (vector unsigned char) vec_ld(0, dstp);
icculus@1047
  1465
icculus@1047
  1466
                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
icculus@1047
  1467
icculus@1047
  1468
                /* set the alpha channel to full on */
icculus@1047
  1469
                vd = vec_or(vd, valphamask);
icculus@1047
  1470
icculus@1047
  1471
                /* *dstp = res */
slouken@1895
  1472
                vec_st((vector unsigned int) vd, 0, dstp);
slouken@1895
  1473
icculus@1047
  1474
                srcp += 4;
icculus@1047
  1475
                dstp += 4;
icculus@1047
  1476
                width -= 4;
icculus@1047
  1477
                vs = voverflow;
icculus@1047
  1478
            }
icculus@1047
  1479
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
  1480
        }
icculus@1047
  1481
#undef ONE_PIXEL_BLEND
slouken@1895
  1482
icculus@1047
  1483
        srcp += srcskip;
icculus@1047
  1484
        dstp += dstskip;
icculus@1047
  1485
    }
icculus@1047
  1486
}
slouken@1895
  1487
slouken@1795
  1488
#if __MWERKS__
slouken@1795
  1489
#pragma altivec_model off
slouken@1795
  1490
#endif
slouken@1361
  1491
#endif /* SDL_ALTIVEC_BLITTERS */
icculus@1047
  1492
slouken@1
  1493
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1895
  1494
static void
slouken@1895
  1495
BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
slouken@0
  1496
{
slouken@1895
  1497
    int width = info->d_width;
slouken@1895
  1498
    int height = info->d_height;
slouken@1895
  1499
    Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
  1500
    int srcskip = info->s_skip >> 2;
slouken@1895
  1501
    Uint32 *dstp = (Uint32 *) info->d_pixels;
slouken@1895
  1502
    int dstskip = info->d_skip >> 2;
slouken@0
  1503
slouken@1895
  1504
    while (height--) {
slouken@1895
  1505
	    /* *INDENT-OFF* */
slouken@0
  1506
	    DUFFS_LOOP4({
slouken@1
  1507
		    Uint32 s = *srcp++;
slouken@1
  1508
		    Uint32 d = *dstp;
slouken@1
  1509
		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1
  1510
			       + (s & d & 0x00010101)) | 0xff000000;
slouken@0
  1511
	    }, width);
slouken@1895
  1512
	    /* *INDENT-ON* */
slouken@1895
  1513
        srcp += srcskip;
slouken@1895
  1514
        dstp += dstskip;
slouken@1895
  1515
    }
slouken@0
  1516
}
slouken@0
  1517
slouken@1
  1518
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1895
  1519
static void
slouken@1895
  1520
BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
slouken@1
  1521
{
slouken@1895
  1522
    unsigned alpha = info->src->alpha;
slouken@1895
  1523
    if (alpha == 128) {
slouken@1895
  1524
        BlitRGBtoRGBSurfaceAlpha128(info);
slouken@1895
  1525
    } else {
slouken@1895
  1526
        int width = info->d_width;
slouken@1895
  1527
        int height = info->d_height;
slouken@1895
  1528
        Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
  1529
        int srcskip = info->s_skip >> 2;
slouken@1895
  1530
        Uint32 *dstp = (Uint32 *) info->d_pixels;
slouken@1895
  1531
        int dstskip = info->d_skip >> 2;
slouken@1895
  1532
        Uint32 s;
slouken@1895
  1533
        Uint32 d;
slouken@1895
  1534
        Uint32 s1;
slouken@1895
  1535
        Uint32 d1;
slouken@1
  1536
slouken@1895
  1537
        while (height--) {
slouken@1895
  1538
			/* *INDENT-OFF* */
slouken@689
  1539
			DUFFS_LOOP_DOUBLE2({
slouken@689
  1540
				/* One Pixel Blend */
slouken@1
  1541
				s = *srcp;
slouken@1
  1542
				d = *dstp;
slouken@1
  1543
				s1 = s & 0xff00ff;
slouken@1
  1544
				d1 = d & 0xff00ff;
slouken@1
  1545
				d1 = (d1 + ((s1 - d1) * alpha >> 8))
slouken@1
  1546
				     & 0xff00ff;
slouken@1
  1547
				s &= 0xff00;
slouken@1
  1548
				d &= 0xff00;
slouken@1
  1549
				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@1
  1550
				*dstp = d1 | d | 0xff000000;
slouken@1
  1551
				++srcp;
slouken@1
  1552
				++dstp;
slouken@689
  1553
			},{
slouken@689
  1554
			        /* Two Pixels Blend */
slouken@689
  1555
				s = *srcp;
slouken@689
  1556
				d = *dstp;
slouken@689
  1557
				s1 = s & 0xff00ff;
slouken@689
  1558
				d1 = d & 0xff00ff;
slouken@689
  1559
				d1 += (s1 - d1) * alpha >> 8;
slouken@689
  1560
				d1 &= 0xff00ff;
slouken@689
  1561
				     
slouken@689
  1562
				s = ((s & 0xff00) >> 8) | 
slouken@689
  1563
					((srcp[1] & 0xff00) << 8);
slouken@689
  1564
				d = ((d & 0xff00) >> 8) |
slouken@689
  1565
					((dstp[1] & 0xff00) << 8);
slouken@689
  1566
				d += (s - d) * alpha >> 8;
slouken@689
  1567
				d &= 0x00ff00ff;
slouken@689
  1568
				
slouken@689
  1569
				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
slouken@689
  1570
				++srcp;
slouken@689
  1571
				
slouken@689
  1572
			        s1 = *srcp;
slouken@689
  1573
				d1 = *dstp;
slouken@689
  1574
				s1 &= 0xff00ff;
slouken@689
  1575
				d1 &= 0xff00ff;
slouken@689
  1576
				d1 += (s1 - d1) * alpha >> 8;
slouken@689
  1577
				d1 &= 0xff00ff;
slouken@689
  1578
				
slouken@689
  1579
				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
slouken@689
  1580
				++srcp;
slouken@689
  1581
				++dstp;
slouken@1
  1582
			}, width);
slouken@1895
  1583
			/* *INDENT-ON* */
slouken@1895
  1584
            srcp += srcskip;
slouken@1895
  1585
            dstp += dstskip;
slouken@1895
  1586
        }
slouken@1895
  1587
    }
slouken@1
  1588
}
slouken@1
  1589
slouken@0
  1590
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
  1591
static void
slouken@1895
  1592
BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
slouken@0
  1593
{
slouken@1895
  1594
    int width = info->d_width;
slouken@1895
  1595
    int height = info->d_height;
slouken@1895
  1596
    Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
  1597
    int srcskip = info->s_skip >> 2;
slouken@1895
  1598
    Uint32 *dstp = (Uint32 *) info->d_pixels;
slouken@1895
  1599
    int dstskip = info->d_skip >> 2;
slouken@0
  1600
slouken@1895
  1601
    while (height--) {
slouken@1895
  1602
	    /* *INDENT-OFF* */
slouken@0
  1603
	    DUFFS_LOOP4({
slouken@0
  1604
		Uint32 dalpha;
slouken@0
  1605
		Uint32 d;
slouken@0
  1606
		Uint32 s1;
slouken@0
  1607
		Uint32 d1;
slouken@0
  1608
		Uint32 s = *srcp;
slouken@0
  1609
		Uint32 alpha = s >> 24;
slouken@0
  1610
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  1611
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  1612
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  1613
		   Benchmark this! */
slouken@689
  1614
		if(alpha) {   
slouken@689
  1615
		  if(alpha == SDL_ALPHA_OPAQUE) {
slouken@0
  1616
		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
slouken@689
  1617
		  } else {
slouken@0
  1618
		    /*
slouken@0
  1619
		     * take out the middle component (green), and process
slouken@0
  1620
		     * the other two in parallel. One multiply less.
slouken@0
  1621
		     */
slouken@0
  1622
		    d = *dstp;
slouken@0
  1623
		    dalpha = d & 0xff000000;
slouken@0
  1624
		    s1 = s & 0xff00ff;
slouken@0
  1625
		    d1 = d & 0xff00ff;
slouken@0
  1626
		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
slouken@0
  1627
		    s &= 0xff00;
slouken@0
  1628
		    d &= 0xff00;
slouken@0
  1629
		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@0
  1630
		    *dstp = d1 | d | dalpha;
slouken@689
  1631
		  }
slouken@0
  1632
		}
slouken@0
  1633
		++srcp;
slouken@0
  1634
		++dstp;
slouken@0
  1635
	    }, width);
slouken@1895
  1636
	    /* *INDENT-ON* */
slouken@1895
  1637
        srcp += srcskip;
slouken@1895
  1638
        dstp += dstskip;
slouken@1895
  1639
    }
slouken@0
  1640
}
slouken@0
  1641
slouken@1542
  1642
#if GCC_ASMBLIT
slouken@689
  1643
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
slouken@2038
  1644
static void
slouken@1895
  1645
BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
slouken@689
  1646
{
slouken@1895
  1647
    int width = info->d_width;
slouken@1895
  1648
    int height = info->d_height;
slouken@1895
  1649
    Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
  1650
    int srcskip = info->s_skip >> 2;
slouken@1895
  1651
    Uint32 *dstp = (Uint32 *) info->d_pixels;
slouken@1895
  1652
    int dstskip = info->d_skip >> 2;
slouken@1895
  1653
    SDL_PixelFormat *sf = info->src;
slouken@1895
  1654
    Uint32 amask = sf->Amask;
slouken@689
  1655
slouken@1895
  1656
    __asm__(
slouken@1895
  1657
               /* make mm6 all zeros. */
slouken@1895
  1658
               "pxor       %%mm6, %%mm6\n"
slouken@1895
  1659
               /* Make a mask to preserve the alpha. */
slouken@1895
  1660
               "movd      %0, %%mm7\n\t"        /* 0000F000 -> mm7 */
slouken@1895
  1661
               "punpcklbw %%mm7, %%mm7\n\t"     /* FF000000 -> mm7 */
slouken@1895
  1662
               "pcmpeqb   %%mm4, %%mm4\n\t"     /* FFFFFFFF -> mm4 */
slouken@1895
  1663
               "movq      %%mm4, %%mm3\n\t"     /* FFFFFFFF -> mm3 (for later) */
slouken@1895
  1664
               "pxor      %%mm4, %%mm7\n\t"     /* 00FFFFFF -> mm7 (mult mask) */
slouken@1895
  1665
               /* form channel masks */
slouken@1895
  1666
               "movq      %%mm7, %%mm4\n\t"     /* 00FFFFFF -> mm4 */
slouken@1895
  1667
               "packsswb  %%mm6, %%mm4\n\t"     /* 00000FFF -> mm4 (channel mask) */
slouken@1895
  1668
               "packsswb  %%mm6, %%mm3\n\t"     /* 0000FFFF -> mm3 */
slouken@1895
  1669
               "pxor      %%mm4, %%mm3\n\t"     /* 0000F000 -> mm3 (~channel mask) */
slouken@1895
  1670
               /* get alpha channel shift */
slouken@1895
  1671
               "movd      %1, %%mm5\n\t"        /* Ashift -> mm5 */
icculus@2101
  1672
  : /* nothing */ :            "rm"(amask), "rm"((Uint32) sf->Ashift));
slouken@689
  1673
slouken@1895
  1674
    while (height--) {
slouken@1542
  1675
slouken@1895
  1676
	    /* *INDENT-OFF* */
slouken@689
  1677
	    DUFFS_LOOP4({
slouken@1542
  1678
		Uint32 alpha;
slouken@689
  1679
slouken@689
  1680
		__asm__ (
slouken@689
  1681
		"prefetch 64(%0)\n"
slouken@689
  1682
		"prefetch 64(%1)\n"
slouken@689
  1683
			: : "r" (srcp), "r" (dstp) );
slouken@689
  1684
slouken@1542
  1685
		alpha = *srcp & amask;
slouken@689
  1686
		/* FIXME: Here we special-case opaque alpha since the
slouken@689
  1687
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@689
  1688
		   it correctly. Also special-case alpha=0 for speed?
slouken@689
  1689
		   Benchmark this! */
slouken@1542
  1690
		if(alpha == 0) {
slouken@1542
  1691
		    /* do nothing */
slouken@1542
  1692
		}
slouken@1542
  1693
		else if(alpha == amask) {
slouken@1542
  1694
			/* opaque alpha -- copy RGB, keep dst alpha */
slouken@1542
  1695
		    /* using MMX here to free up regular registers for other things */
slouken@1542
  1696
			    __asm__ (
slouken@1542
  1697
		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
slouken@1542
  1698
		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
slouken@1542
  1699
		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
slouken@1542
  1700
		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
slouken@1542
  1701
		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
slouken@1542
  1702
		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
slouken@1542
  1703
slouken@1542
  1704
		     : : "r" (srcp), "r" (dstp) );
slouken@689
  1705
		} 
slouken@689
  1706
slouken@689
  1707
		else {
slouken@689
  1708
			    __asm__ (
slouken@689
  1709
		    /* load in the source, and dst. */
slouken@689
  1710
		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
slouken@689
  1711
		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
slouken@689
  1712
slouken@689
  1713
		    /* Move the src alpha into mm2 */
slouken@689
  1714
slouken@689
  1715
		    /* if supporting pshufw */
slouken@689
  1716
		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
slouken@689
  1717
		    /*"psrlw     $8, %%mm2\n" */
slouken@689
  1718
		    
slouken@689
  1719
		    /* else: */
slouken@1542
  1720
		    "movd       %2,    %%mm2\n"
slouken@1542
  1721
		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
slouken@689
  1722
		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
slouken@689
  1723
		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
slouken@1542
  1724
		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
slouken@689
  1725
slouken@689
  1726
		    /* move the colors into words. */
slouken@689
  1727
		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
slouken@689
  1728
		    "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
slouken@689
  1729
slouken@689
  1730
		    /* src - dst */
slouken@689
  1731
		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
slouken@689
  1732
slouken@689
  1733
		    /* A * (src-dst) */
slouken@1542
  1734
		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
slouken@1542
  1735
		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
slouken@1542
  1736
		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
slouken@689
  1737
slouken@689
  1738
		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
slouken@689
  1739
		    
slouken@689
  1740
		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
slouken@689
  1741
slouken@1542
  1742
		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
slouken@689
  1743
slouken@689
  1744
		}
slouken@689
  1745
		++srcp;
slouken@689
  1746
		++dstp;
slouken@689
  1747
	    }, width);
slouken@1895
  1748
	    /* *INDENT-ON* */
slouken@1895
  1749
        srcp += srcskip;
slouken@1895
  1750
        dstp += dstskip;
slouken@1895
  1751
    }
slouken@689
  1752
slouken@1895
  1753
  __asm__("emms\n":);
slouken@689
  1754
}
slouken@1895
  1755
slouken@1542
  1756
/* End GCC_ASMBLIT*/
slouken@1542
  1757
slouken@1542
  1758
#elif MSVC_ASMBLIT
slouken@1542
  1759
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1895
  1760
static void
slouken@1895
  1761
BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
slouken@1542
  1762
{
slouken@1895
  1763
    int width = info->d_width;
slouken@1895
  1764
    int height = info->d_height;
slouken@1895
  1765
    Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
  1766
    int srcskip = info->s_skip >> 2;
slouken@1895
  1767
    Uint32 *dstp = (Uint32 *) info->d_pixels;
slouken@1895
  1768
    int dstskip = info->d_skip >> 2;
slouken@1895
  1769
    SDL_PixelFormat *sf = info->src;
slouken@1895
  1770
    Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
slouken@1895
  1771
    Uint32 amask = sf->Amask;
slouken@1895
  1772
    Uint32 ashift = sf->Ashift;
slouken@1895
  1773
    Uint64 multmask;
slouken@1542
  1774
slouken@1895
  1775
    __m64 src1, dst1, mm_alpha, mm_zero, dmask;
slouken@1542
  1776
slouken@1895
  1777
    mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
slouken@1895
  1778
	/* *INDENT-OFF* */
slouken@1895
  1779
    multmask = ~(0xFFFFI64 << (ashift * 2));
slouken@1895
  1780
	/* *INDENT-ON* */
slouken@1895
  1781
    dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
slouken@1895
  1782
slouken@1895
  1783
    while (height--) {
slouken@1895
  1784
	    /* *INDENT-OFF* */
slouken@1542
  1785
	    DUFFS_LOOP4({
slouken@1542
  1786
		Uint32 alpha;
slouken@1542
  1787
slouken@1542
  1788
		_m_prefetch(srcp + 16);
slouken@1542
  1789
		_m_prefetch(dstp + 16);
slouken@1542
  1790
slouken@1542
  1791
		alpha = *srcp & amask;
slouken@1542
  1792
		if (alpha == 0) {
slouken@1542
  1793
			/* do nothing */
slouken@1542
  1794
		} else if (alpha == amask) {
slouken@1542
  1795
			/* copy RGB, keep dst alpha */
slouken@1542
  1796
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
slouken@1542
  1797
		} else {
slouken@1542
  1798
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
slouken@1542
  1799
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
slouken@1542
  1800
slouken@1542
  1801
			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
slouken@1542
  1802
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
  1803
slouken@1542
  1804
			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
slouken@1542
  1805
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
slouken@1542
  1806
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
slouken@1542
  1807
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
slouken@1542
  1808
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
slouken@1542
  1809
slouken@1542
  1810
			/* blend */		    
slouken@1542
  1811
			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
slouken@1542
  1812
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
slouken@1542
  1813
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
slouken@1542
  1814
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
slouken@1542
  1815
			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
slouken@1542
  1816
			
slouken@1542
  1817
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
  1818
		}
slouken@1542
  1819
		++srcp;
slouken@1542
  1820
		++dstp;
slouken@1542
  1821
	    }, width);
slouken@1895
  1822
	    /* *INDENT-ON* */
slouken@1895
  1823
        srcp += srcskip;
slouken@1895
  1824
        dstp += dstskip;
slouken@1895
  1825
    }
slouken@1895
  1826
    _mm_empty();
slouken@1542
  1827
}
slouken@1895
  1828
slouken@1542
  1829
/* End MSVC_ASMBLIT */
slouken@1542
  1830
slouken@1542
  1831
#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
slouken@689
  1832
slouken@1
  1833
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
slouken@1
  1834
slouken@1
  1835
/*
 * Blend a single 16 bit pixel at 50%.
 *
 * `mask` selects the bits that take part in the halved sum; the bits it
 * leaves out (within the low 16) are added back only where both `s` and `d`
 * have them set.  Callers in this file pass 0xf7de (RGB565) and 0xfbde
 * (RGB555).
 *
 * Every argument is parenthesized so that expression arguments (for example
 * `a | b`) bind as the caller intended.  Arguments are evaluated more than
 * once: do not pass expressions with side effects.
 */
#define BLEND16_50(d, s, mask)						\
	(((((s) & (mask)) + ((d) & (mask))) >> 1) +			\
	 ((s) & (d) & (~(mask) & 0xffff)))

/*
 * Blend two 16 bit pixels packed in one 32 bit word at 50%.
 *
 * The 16 bit `mask` is replicated into both halves of the word.  It is
 * widened to Uint32 before the shift: shifting a promoted (signed) int such
 * as 0xf7de left by 16 would overflow, which is undefined behaviour.
 *
 * Arguments are evaluated more than once: do not pass expressions with side
 * effects.
 */
#define BLEND2x16_50(d, s, mask)					     \
	((((s) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1) +	     \
	 (((d) & ((Uint32)(mask) | ((Uint32)(mask) << 16))) >> 1) +	     \
	 ((s) & (d) & ~((Uint32)(mask) | ((Uint32)(mask) << 16))))
slouken@1
  1843
slouken@1895
  1844
static void
slouken@1895
  1845
Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
slouken@0
  1846
{
slouken@1895
  1847
    int width = info->d_width;
slouken@1895
  1848
    int height = info->d_height;
slouken@1895
  1849
    Uint16 *srcp = (Uint16 *) info->s_pixels;
slouken@1895
  1850
    int srcskip = info->s_skip >> 1;
slouken@1895
  1851
    Uint16 *dstp = (Uint16 *) info->d_pixels;
slouken@1895
  1852
    int dstskip = info->d_skip >> 1;
slouken@0
  1853
slouken@1895
  1854
    while (height--) {
slouken@1895
  1855
        if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
slouken@1895
  1856
            /*
slouken@1895
  1857
             * Source and destination not aligned, pipeline it.
slouken@1895
  1858
             * This is mostly a win for big blits but no loss for
slouken@1895
  1859
             * small ones
slouken@1895
  1860
             */
slouken@1895
  1861
            Uint32 prev_sw;
slouken@1895
  1862
            int w = width;
slouken@1
  1863
slouken@1895
  1864
            /* handle odd destination */
slouken@1895
  1865
            if ((uintptr_t) dstp & 2) {
slouken@1895
  1866
                Uint16 d = *dstp, s = *srcp;
slouken@1895
  1867
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
  1868
                dstp++;
slouken@1895
  1869
                srcp++;
slouken@1895
  1870
                w--;
slouken@1895
  1871
            }
slouken@1895
  1872
            srcp++;             /* srcp is now 32-bit aligned */
slouken@1
  1873
slouken@1895
  1874
            /* bootstrap pipeline with first halfword */
slouken@1895
  1875
            prev_sw = ((Uint32 *) srcp)[-1];
slouken@1
  1876
slouken@1895
  1877
            while (w > 1) {
slouken@1895
  1878
                Uint32 sw, dw, s;
slouken@1895
  1879
                sw = *(Uint32 *) srcp;
slouken@1895
  1880
                dw = *(Uint32 *) dstp;
slouken@1443
  1881
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
slouken@1895
  1882
                s = (prev_sw << 16) + (sw >> 16);
slouken@1443
  1883
#else
slouken@1895
  1884
                s = (prev_sw >> 16) + (sw << 16);
slouken@1443
  1885
#endif
slouken@1895
  1886
                prev_sw = sw;
slouken@1895
  1887
                *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
slouken@1895
  1888
                dstp += 2;
slouken@1895
  1889
                srcp += 2;
slouken@1895
  1890
                w -= 2;
slouken@1895
  1891
            }
slouken@1
  1892
slouken@1895
  1893
            /* final pixel if any */
slouken@1895
  1894
            if (w) {
slouken@1895
  1895
                Uint16 d = *dstp, s;
slouken@1443
  1896
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
slouken@1895
  1897
                s = (Uint16) prev_sw;
slouken@1443
  1898
#else
slouken@1895
  1899
                s = (Uint16) (prev_sw >> 16);
slouken@1443
  1900
#endif
slouken@1895
  1901
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
  1902
                srcp++;
slouken@1895
  1903
                dstp++;
slouken@1895
  1904
            }
slouken@1895
  1905
            srcp += srcskip - 1;
slouken@1895
  1906
            dstp += dstskip;
slouken@1895
  1907
        } else {
slouken@1895
  1908
            /* source and destination are aligned */
slouken@1895
  1909
            int w = width;
slouken@1
  1910
slouken@1895
  1911
            /* first odd pixel? */
slouken@1895
  1912
            if ((uintptr_t) srcp & 2) {
slouken@1895
  1913
                Uint16 d = *dstp, s = *srcp;
slouken@1895
  1914
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
  1915
                srcp++;
slouken@1895
  1916
                dstp++;
slouken@1895
  1917
                w--;
slouken@1895
  1918
            }
slouken@1895
  1919
            /* srcp and dstp are now 32-bit aligned */
slouken@1
  1920
slouken@1895
  1921
            while (w > 1) {
slouken@1895
  1922
                Uint32 sw = *(Uint32 *) srcp;
slouken@1895
  1923
                Uint32 dw = *(Uint32 *) dstp;
slouken@1895
  1924
                *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
slouken@1895
  1925
                srcp += 2;
slouken@1895
  1926
                dstp += 2;
slouken@1895
  1927
                w -= 2;
slouken@1895
  1928
            }
slouken@1
  1929
slouken@1895
  1930
            /* last odd pixel? */
slouken@1895
  1931
            if (w) {
slouken@1895
  1932
                Uint16 d = *dstp, s = *srcp;
slouken@1895
  1933
                *dstp = BLEND16_50(d, s, mask);
slouken@1895
  1934
                srcp++;
slouken@1895
  1935
                dstp++;
slouken@1895
  1936
            }
slouken@1895
  1937
            srcp += srcskip;
slouken@1895
  1938
            dstp += dstskip;
slouken@1895
  1939
        }
slouken@1895
  1940
    }
slouken@1
  1941
}
slouken@1
  1942
slouken@1542
  1943
#if GCC_ASMBLIT
slouken@689
  1944
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1895
  1945
static void
slouken@1895
  1946
Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@689
  1947
{
slouken@1895
  1948
    unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
slouken@1895
  1949
    if (alpha == 128) {
slouken@1895
  1950
        Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1895
  1951
    } else {
slouken@1895
  1952
        int width = info->d_width;
slouken@1895
  1953
        int height = info->d_height;
slouken@1895
  1954
        Uint16 *srcp = (Uint16 *) info->s_pixels;
slouken@1895
  1955
        int srcskip = info->s_skip >> 1;
slouken@1895
  1956
        Uint16 *dstp = (Uint16 *) info->d_pixels;
slouken@1895
  1957
        int dstskip = info->d_skip >> 1;
slouken@1895
  1958
        Uint32 s, d;
slouken@1895
  1959
        Uint8 load[8];
slouken@689
  1960
slouken@1895
  1961
        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
slouken@1895
  1962
        *(Uint64 *) load = alpha;
slouken@1895
  1963
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1895
  1964
slouken@1895
  1965
        movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
slouken@1895
  1966
        punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
slouken@1895
  1967
        punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
slouken@1895
  1968
        /* position alpha to allow for mullo and mulhi on diff channels
slouken@1895
  1969
           to reduce the number of operations */
slouken@1895
  1970
        psllq_i2r(3, mm0);
slouken@1895
  1971
slouken@1895
  1972
        /* Setup the 565 color channel masks */
slouken@1895
  1973
        *(Uint64 *) load = 0x07E007E007E007E0ULL;
slouken@1895
  1974
        movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
slouken@1895
  1975
        *(Uint64 *) load = 0x001F001F001F001FULL;
slouken@1895
  1976
        movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
slouken@1895
  1977
        while (height--) {
slouken@1895
  1978
			/* *INDENT-OFF* */
slouken@1542
  1979
			DUFFS_LOOP_QUATRO2(
slouken@1542
  1980
			{
slouken@1542
  1981
				s = *srcp++;
slouken@689
  1982
				d = *dstp;
slouken@689
  1983
				/*
slouken@689
  1984
				 * shift out the middle component (green) to
slouken@689
  1985
				 * the high 16 bits, and process all three RGB
slouken@689
  1986
				 * components at the same time.
slouken@689
  1987
				 */
slouken@689
  1988
				s = (s | s << 16) & 0x07e0f81f;
slouken@689
  1989
				d = (d | d << 16) & 0x07e0f81f;
slouken@689
  1990
				d += (s - d) * alpha >> 5;
slouken@689
  1991
				d &= 0x07e0f81f;
slouken@689
  1992
				*dstp++ = d | d >> 16;
slouken@1542
  1993
			},{
slouken@1542
  1994
				s = *srcp++;
slouken@689
  1995
				d = *dstp;
slouken@689
  1996
				/*
slouken@689
  1997
				 * shift out the middle component (green) to
slouken@689
  1998
				 * the high 16 bits, and process all three RGB
slouken@689
  1999
				 * components at the same time.
slouken@689
  2000
				 */
slouken@689
  2001
				s = (s | s << 16) & 0x07e0f81f;
slouken@689
  2002
				d = (d | d << 16) & 0x07e0f81f;
slouken@689
  2003
				d += (s - d) * alpha >> 5;
slouken@689
  2004
				d &= 0x07e0f81f;
slouken@689
  2005
				*dstp++ = d | d >> 16;
slouken@1542
  2006
				s = *srcp++;
slouken@689
  2007
				d = *dstp;
slouken@689
  2008
				/*
slouken@689
  2009
				 * shift out the middle component (green) to
slouken@689
  2010
				 * the high 16 bits, and process all three RGB
slouken@689
  2011
				 * components at the same time.
slouken@689
  2012
				 */
slouken@689
  2013
				s = (s | s << 16) & 0x07e0f81f;
slouken@689
  2014
				d = (d | d << 16) & 0x07e0f81f;
slouken@689
  2015
				d += (s - d) * alpha >> 5;
slouken@689
  2016
				d &= 0x07e0f81f;
slouken@689
  2017
				*dstp++ = d | d >> 16;
slouken@1542
  2018
			},{
slouken@1542
  2019
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
slouken@1542
  2020
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
slouken@1542
  2021
slouken@1542
  2022
				/* red -- does not need a mask since the right shift clears
slouken@1542
  2023
				   the uninteresting bits */
slouken@1542
  2024
				movq_r2r(mm2, mm5); /* src -> mm5 */
slouken@1542
  2025
				movq_r2r(mm3, mm6); /* dst -> mm6 */
slouken@1542
  2026
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
slouken@1542
  2027
				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
slouken@1542
  2028
slouken@1542
  2029
				/* blend */
slouken@1542
  2030
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
slouken@1542
  2031
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
slouken@1542
  2032
				/* alpha used is actually 11 bits
slouken@1542
  2033
				   11 + 5 = 16 bits, so the sign bits are lost */
slouken@1542
  2034
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
slouken@1542
  2035
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
slouken@1542
  2036
				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
slouken@1542
  2037
slouken@1542
  2038
				movq_r2r(mm6, mm1); /* save new reds in dsts */
slouken@1542
  2039
slouken@1542
  2040
				/* green -- process the bits in place */
slouken@1542
  2041
				movq_r2r(mm2, mm5); /* src -> mm5 */
slouken@1542
  2042
				movq_r2r(mm3, mm6); /* dst -> mm6 */
slouken@1542
  2043
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
slouken@1542
  2044
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
slouken@1542
  2045
slouken@1542
  2046
				/* blend */
slouken@1542
  2047
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
slouken@1542
  2048
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
slouken@1542
  2049
				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
slouken@1542
  2050
				   bits are gone and the sign bits present */
slouken@1542
  2051
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
slouken@1542
  2052
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
slouken@1542
  2053
slouken@1542
  2054
				por_r2r(mm6, mm1); /* save new greens in dsts */
slouken@1542
  2055
slouken@1542
  2056
				/* blue */
slouken@1542
  2057
				movq_r2r(mm2, mm5); /* src -> mm5 */
slouken@1542
  2058
				movq_r2r(mm3, mm6); /* dst -> mm6 */
slouken@1542
  2059
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
slouken@1542
  2060
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
slouken@1542
  2061
slouken@1542
  2062
				/* blend */
slouken@1542
  2063
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
slouken@1542
  2064
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
slouken@1542
  2065
				/* 11 + 5 = 16 bits, so the sign bits are lost and
slouken@1542
  2066
				   the interesting bits will need to be MASKed */
slouken@1542
  2067
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
slouken@1542
  2068
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
slouken@1542
  2069
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
slouken@1542
  2070
slouken@1542
  2071
				por_r2r(mm6, mm1); /* save new blues in dsts */
slouken@1542
  2072
slouken@1542
  2073
				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
slouken@1542
  2074
slouken@1542
  2075
				srcp += 4;
slouken@1542
  2076
				dstp += 4;
slouken@1542
  2077
			}, width);			
slouken@1895
  2078
			/* *INDENT-ON* */
slouken@1895
  2079
            srcp += srcskip;
slouken@1895
  2080
            dstp += dstskip;
slouken@1895
  2081
        }
slouken@1895
  2082
        emms();
slouken@1895
  2083
    }
slouken@689
  2084
}
slouken@689
  2085
slouken@689
  2086
/* fast RGB555->RGB555 blending with surface alpha */
slouken@1895
  2087
static void
slouken@1895
  2088
Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@689
  2089
{
slouken@1895
  2090
    unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
slouken@1895
  2091
    if (alpha == 128) {
slouken@1895
  2092
        Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1895
  2093
    } else {
slouken@1895
  2094
        int width = info->d_width;
slouken@1895
  2095
        int height = info->d_height;
slouken@1895
  2096
        Uint16 *srcp = (Uint16 *) info->s_pixels;
slouken@1895
  2097
        int srcskip = info->s_skip >> 1;
slouken@1895
  2098
        Uint16 *dstp = (Uint16 *) info->d_pixels;
slouken@1895
  2099
        int dstskip = info->d_skip >> 1;
slouken@1895
  2100
        Uint32 s, d;
slouken@1895
  2101
        Uint8 load[8];
slouken@689
  2102
slouken@1895
  2103
        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
slouken@1895
  2104
        *(Uint64 *) load = alpha;
slouken@1895
  2105
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1542
  2106
slouken@1895
  2107
        movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
slouken@1895
  2108
        punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
slouken@1895
  2109
        punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
slouken@1895
  2110
        /* position alpha to allow for mullo and mulhi on diff channels
slouken@1895
  2111
           to reduce the number of operations */
slouken@1895
  2112
        psllq_i2r(3, mm0);
slouken@1895
  2113
slouken@1895
  2114
        /* Setup the 555 color channel masks */
slouken@1895
  2115
        *(Uint64 *) load = 0x03E003E003E003E0ULL;
slouken@1895
  2116
        movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
slouken@1895
  2117
        *(Uint64 *) load = 0x001F001F001F001FULL;
slouken@1895
  2118
        movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
slouken@1895
  2119
        while (height--) {
slouken@1895
  2120
			/* *INDENT-OFF* */
slouken@1542
  2121
			DUFFS_LOOP_QUATRO2(
slouken@1542
  2122
			{
slouken@1542
  2123
				s = *srcp++;
slouken@689
  2124
				d = *dstp;
slouken@689
  2125
				/*
slouken@689
  2126
				 * shift out the middle component (green) to
slouken@689
  2127
				 * the high 16 bits, and process all three RGB
slouken@689
  2128
				 * components at the same time.
slouken@689
  2129
				 */
slouken@689
  2130
				s = (s | s << 16) & 0x03e07c1f;
slouken@689
  2131
				d = (d | d << 16) & 0x03e07c1f;
slouken@689
  2132
				d += (s - d) * alpha >> 5;
slouken@689
  2133
				d &= 0x03e07c1f;
slouken@689
  2134
				*dstp++ = d | d >> 16;
slouken@1542
  2135
			},{
slouken@1542
  2136
				s = *srcp++;
slouken@689
  2137
				d = *dstp;
slouken@689
  2138
				/*
slouken@689
  2139
				 * shift out the middle component (green) to
slouken@689
  2140
				 * the high 16 bits, and process all three RGB
slouken@689
  2141
				 * components at the same time.
slouken@689
  2142
				 */
slouken@689
  2143
				s = (s | s << 16) & 0x03e07c1f;
slouken@689
  2144
				d = (d | d << 16) & 0x03e07c1f;
slouken@689
  2145
				d += (s - d) * alpha >> 5;
slouken@689
  2146
				d &= 0x03e07c1f;
slouken@689
  2147
				*dstp++ = d | d >> 16;
slouken@689
  2148
			        s = *srcp++;
slouken@689
  2149
				d = *dstp;
slouken@689
  2150
				/*
slouken@689
  2151
				 * shift out the middle component (green) to
slouken@689
  2152
				 * the high 16 bits, and process all three RGB
slouken@689
  2153
				 * components at the same time.
slouken@689
  2154
				 */
slouken@689
  2155
				s = (s | s << 16) & 0x03e07c1f;
slouken@689
  2156
				d = (d | d << 16) & 0x03e07c1f;
slouken@689
  2157
				d += (s - d) * alpha >> 5;
slouken@689
  2158
				d &= 0x03e07c1f;
slouken@689
  2159
				*dstp++ = d | d >> 16;
slouken@1542
  2160
			},{
slouken@1542
  2161
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
slouken@1542
  2162
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
slouken@1542
  2163
slouken@1542
  2164
				/* red -- process the bits in place */
slouken@1542
  2165
				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
slouken@1542
  2166
					/* by reusing the GREEN mask we free up another mmx
slouken@1542
  2167
					   register to accumulate the result */
slouken@1542
  2168
slouken@1542
  2169
				movq_r2r(mm2, mm5); /* src -> mm5 */
slouken@1542
  2170
				movq_r2r(mm3, mm6); /* dst -> mm6 */
slouken@1542
  2171
				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
slouken@1542
  2172
				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
slouken@1542
  2173
slouken@1542
  2174
				/* blend */
slouken@1542
  2175
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
slouken@1542
  2176
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
slouken@1542
  2177
				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
slouken@1542
  2178
				   cleared by a MASK below */
slouken@1542
  2179
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
slouken@1542
  2180
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
slouken@1542
  2181
				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
slouken@1542
  2182
slouken@1542
  2183
				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
slouken@1542
  2184
slouken@1542
  2185
				movq_r2r(mm6, mm1); /* save new reds in dsts */
slouken@1542
  2186
slouken@1542
  2187
				/* green -- process the bits in place */
slouken@1542
  2188
				movq_r2r(mm2, mm5); /* src -> mm5 */
slouken@1542
  2189
				movq_r2r(mm3, mm6); /* dst -> mm6 */
slouken@1542
  2190
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
slouken@1542
  2191
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
slouken@1542
  2192
slouken@1542
  2193
				/* blend */
slouken@1542
  2194
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
slouken@1542
  2195
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
slouken@1542
  2196
				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
slouken@1542
  2197
				   bits are gone and the sign bits present */
slouken@1542
  2198
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
slouken@1542
  2199
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
slouken@1542
  2200
slouken@1542
  2201
				por_r2r(mm6, mm1); /* save new greens in dsts */
slouken@1542
  2202
slouken@1542
  2203
				/* blue */
slouken@1542
  2204
				movq_r2r(mm2, mm5); /* src -> mm5 */
slouken@1542
  2205
				movq_r2r(mm3, mm6); /* dst -> mm6 */
slouken@1542
  2206
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
slouken@1542
  2207
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
slouken@1542
  2208
slouken@1542
  2209
				/* blend */
slouken@1542
  2210
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
slouken@1542
  2211
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
slouken@1542
  2212
				/* 11 + 5 = 16 bits, so the sign bits are lost and
slouken@1542
  2213
				   the interesting bits will need to be MASKed */
slouken@1542
  2214
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
slouken@1542
  2215
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
slouken@1542
  2216
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
slouken@1542
  2217
slouken@1542
  2218
				por_r2r(mm6, mm1); /* save new blues in dsts */
slouken@1542
  2219
slouken@1542
  2220
				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
slouken@1542
  2221
slouken@1542
  2222
				srcp += 4;
slouken@1542
  2223
				dstp += 4;
slouken@1895
  2224
			}, width);
slouken@1895
  2225
			/* *INDENT-ON* */
slouken@1895
  2226
            srcp += srcskip;
slouken@1895
  2227
            dstp += dstskip;
slouken@1895
  2228
        }
slouken@1895
  2229
        emms();
slouken@1895
  2230
    }
slouken@689
  2231
}
slouken@1895
  2232
slouken@1542
  2233
/* End GCC_ASMBLIT */
slouken@1542
  2234
slouken@1542
  2235
#elif MSVC_ASMBLIT
slouken@1542
  2236
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1895
  2237
static void
slouken@1895
  2238
Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
  2239
{
slouken@1895
  2240
    unsigned alpha = info->src->alpha;
slouken@1895
  2241
    if (alpha == 128) {
slouken@1895
  2242
        Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1895
  2243
    } else {
slouken@1895
  2244
        int width = info->d_width;
slouken@1895
  2245
        int height = info->d_height;
slouken@1895
  2246
        Uint16 *srcp = (Uint16 *) info->s_pixels;
slouken@1895
  2247
        int srcskip = info->s_skip >> 1;
slouken@1895
  2248
        Uint16 *dstp = (Uint16 *) info->d_pixels;
slouken@1895
  2249
        int dstskip = info->d_skip >> 1;
slouken@1895
  2250
        Uint32 s, d;
slouken@1895
  2251
slouken@1895
  2252
        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
slouken@1542
  2253
slouken@1895
  2254
        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
slouken@1895
  2255
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
slouken@1895
  2256
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1542
  2257
slouken@1895
  2258
        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
slouken@1895
  2259
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
slouken@1895
  2260
        /* position alpha to allow for mullo and mulhi on diff channels
slouken@1895
  2261
           to reduce the number of operations */
slouken@1895
  2262
        mm_alpha = _mm_slli_si64(mm_alpha, 3);
slouken@1895
  2263
slouken@1895
  2264
        /* Setup the 565 color channel masks */
slouken@1895
  2265
        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
slouken@1895
  2266
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
slouken@1895
  2267
slouken@1895
  2268
        while (height--) {
slouken@1895
  2269
			/* *INDENT-OFF* */
slouken@1542
  2270
			DUFFS_LOOP_QUATRO2(
slouken@1542
  2271
			{
slouken@1542
  2272
				s = *srcp++;
slouken@1542
  2273
				d = *dstp;
slouken@1542
  2274
				/*
slouken@1542
  2275
				 * shift out the middle component (green) to
slouken@1542
  2276
				 * the high 16 bits, and process all three RGB
slouken@1542
  2277
				 * components at the same time.
slouken@1542
  2278
				 */
slouken@1542
  2279
				s = (s | s << 16) & 0x07e0f81f;
slouken@1542
  2280
				d = (d | d << 16) & 0x07e0f81f;
slouken@1542
  2281
				d += (s - d) * alpha >> 5;
slouken@1542
  2282
				d &= 0x07e0f81f;
slouken@1546
  2283
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
  2284
			},{
slouken@1542
  2285
				s = *srcp++;
slouken@1542
  2286
				d = *dstp;
slouken@1542
  2287
				/*
slouken@1542
  2288
				 * shift out the middle component (green) to
slouken@1542
  2289
				 * the high 16 bits, and process all three RGB
slouken@1542
  2290
				 * components at the same time.
slouken@1542
  2291
				 */
slouken@1542
  2292
				s = (s | s << 16) & 0x07e0f81f;
slouken@1542
  2293
				d = (d | d << 16) & 0x07e0f81f;
slouken@1542
  2294
				d += (s - d) * alpha >> 5;
slouken@1542
  2295
				d &= 0x07e0f81f;
slouken@1546
  2296
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
  2297
				s = *srcp++;
slouken@1542
  2298
				d = *dstp;
slouken@1542
  2299
				/*
slouken@1542
  2300
				 * shift out the middle component (green) to
slouken@1542
  2301
				 * the high 16 bits, and process all three RGB
slouken@1542
  2302
				 * components at the same time.
slouken@1542
  2303
				 */
slouken@1542
  2304
				s = (s | s << 16) & 0x07e0f81f;
slouken@1542
  2305
				d = (d | d << 16) & 0x07e0f81f;
slouken@1542
  2306
				d += (s - d) * alpha >> 5;
slouken@1542
  2307
				d &= 0x07e0f81f;
slouken@1546
  2308
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
  2309
			},{
slouken@1542
  2310
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
slouken@1542
  2311
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
slouken@1542
  2312
slouken@1542
  2313
				/* red */
slouken@1542
  2314
				src2 = src1;
slouken@1542
  2315
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
slouken@1542
  2316
slouken@1542
  2317
				dst2 = dst1;
slouken@1542
  2318
				dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
slouken@1542
  2319
slouken@1542
  2320
				/* blend */
slouken@1542
  2321
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
  2322
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
  2323
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@1542
  2324
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
  2325
				dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
slouken@1542
  2326
slouken@1542
  2327
				mm_res = dst2; /* RED -> mm_res */
slouken@1542
  2328
slouken@1542
  2329
				/* green -- process the bits in place */
slouken@1542
  2330
				src2 = src1;
slouken@1542
  2331
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
slouken@1542
  2332
slouken@1542
  2333
				dst2 = dst1;
slouken@1542
  2334
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
slouken@1542
  2335
slouken@1542
  2336
				/* blend */
slouken@1542
  2337
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
  2338
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
  2339
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@1542
  2340
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
  2341
slouken@1542
  2342
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
slouken@1542
  2343
slouken@1542
  2344
				/* blue */
slouken@1542
  2345
				src2 = src1;
slouken@1542
  2346
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
slouken@1542
  2347
slouken@1542
  2348
				dst2 = dst1;
slouken@1542
  2349
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
slouken@1542
  2350
slouken@1542
  2351
				/* blend */
slouken@1542
  2352
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
  2353
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
  2354
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@1542
  2355
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
  2356
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
slouken@1542
  2357
slouken@1542
  2358
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
slouken@1542
  2359
slouken@1542
  2360
				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
slouken@1542
  2361
slouken@1542
  2362
				srcp += 4;
slouken@1542
  2363
				dstp += 4;
slouken@1895
  2364
			}, width);
slouken@1895
  2365
			/* *INDENT-ON* */
slouken@1895
  2366
            srcp += srcskip;
slouken@1895
  2367
            dstp += dstskip;
slouken@1895
  2368
        }
slouken@1895
  2369
        _mm_empty();
slouken@1895
  2370
    }
slouken@1542
  2371
}
slouken@1542
  2372
slouken@1542
  2373
/* fast RGB555->RGB555 blending with surface alpha */
slouken@1895
  2374
static void
slouken@1895
  2375
Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
slouken@1542
  2376
{
slouken@1895
  2377
    unsigned alpha = info->src->alpha;
slouken@1895
  2378
    if (alpha == 128) {
slouken@1895
  2379
        Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1895
  2380
    } else {
slouken@1895
  2381
        int width = info->d_width;
slouken@1895
  2382
        int height = info->d_height;
slouken@1895
  2383
        Uint16 *srcp = (Uint16 *) info->s_pixels;
slouken@1895
  2384
        int srcskip = info->s_skip >> 1;
slouken@1895
  2385
        Uint16 *dstp = (Uint16 *) info->d_pixels;
slouken@1895
  2386
        int dstskip = info->d_skip >> 1;
slouken@1895
  2387
        Uint32 s, d;
slouken@1895
  2388
slouken@1895
  2389
        __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
slouken@1542
  2390
slouken@1895
  2391
        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
slouken@1895
  2392
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
slouken@1895
  2393
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1542
  2394
slouken@1895
  2395
        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
slouken@1895
  2396
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
slouken@1895
  2397
        /* position alpha to allow for mullo and mulhi on diff channels
slouken@1895
  2398
           to reduce the number of operations */
slouken@1895
  2399
        mm_alpha = _mm_slli_si64(mm_alpha, 3);
slouken@1542
  2400
slouken@1895
  2401
        /* Setup the 555 color channel masks */
slouken@1895
  2402
        rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
slouken@1895
  2403
        gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
slouken@1895
  2404
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
slouken@1895
  2405
slouken@1895
  2406
        while (height--) {
slouken@1895
  2407
			/* *INDENT-OFF* */
slouken@1542
  2408
			DUFFS_LOOP_QUATRO2(
slouken@1542
  2409
			{
slouken@1542
  2410
				s = *srcp++;
slouken@1542
  2411
				d = *dstp;
slouken@1542
  2412
				/*
slouken@1542
  2413
				 * shift out the middle component (green) to
slouken@1542
  2414
				 * the high 16 bits, and process all three RGB
slouken@1542
  2415
				 * components at the same time.
slouken@1542
  2416
				 */
slouken@1542
  2417
				s = (s | s << 16) & 0x03e07c1f;
slouken@1542
  2418
				d = (d | d << 16) & 0x03e07c1f;
slouken@1542
  2419
				d += (s - d) * alpha >> 5;
slouken@1542
  2420
				d &= 0x03e07c1f;
slouken@1546
  2421
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
  2422
			},{
slouken@1542
  2423
				s = *srcp++;
slouken@1542
  2424
				d = *dstp;
slouken@1542
  2425
				/*
slouken@1542
  2426
				 * shift out the middle component (green) to
slouken@1542
  2427
				 * the high 16 bits, and process all three RGB
slouken@1542
  2428
				 * components at the same time.
slouken@1542
  2429
				 */
slouken@1542
  2430
				s = (s | s << 16) & 0x03e07c1f;
slouken@1542
  2431
				d = (d | d << 16) & 0x03e07c1f;
slouken@1542
  2432
				d += (s - d) * alpha >> 5;
slouken@1542
  2433
				d &= 0x03e07c1f;
slouken@1546
  2434
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
  2435
			        s = *srcp++;
slouken@1542
  2436
				d = *dstp;
slouken@1542
  2437
				/*
slouken@1542
  2438
				 * shift out the middle component (green) to
slouken@1542
  2439
				 * the high 16 bits, and process all three RGB
slouken@1542
  2440
				 * components at the same time.
slouken@1542
  2441
				 */
slouken@1542
  2442
				s = (s | s << 16) & 0x03e07c1f;
slouken@1542
  2443
				d = (d | d << 16) & 0x03e07c1f;
slouken@1542
  2444
				d += (s - d) * alpha >> 5;
slouken@1542
  2445
				d &= 0x03e07c1f;
slouken@1546
  2446
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1542
  2447
			},{
slouken@1542
  2448
				src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
slouken@1542
  2449
				dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
slouken@1542
  2450
slouken@1542
  2451
				/* red -- process the bits in place */
slouken@1542
  2452
				src2 = src1;
slouken@1542
  2453
				src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
slouken@1542
  2454
slouken@1542
  2455
				dst2 = dst1;
slouken@1542
  2456
				dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
slouken@1542
  2457
slouken@1542
  2458
				/* blend */
slouken@1542
  2459
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
  2460
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
  2461
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@1542
  2462
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
  2463
				dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
slouken@1542
  2464
slouken@1542
  2465
				mm_res = dst2; /* RED -> mm_res */
slouken@1542
  2466
				
slouken@1542
  2467
				/* green -- process the bits in place */
slouken@1542
  2468
				src2 = src1;
slouken@1542
  2469
				src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
slouken@1542
  2470
slouken@1542
  2471
				dst2 = dst1;
slouken@1542
  2472
				dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
slouken@1542
  2473
slouken@1542
  2474
				/* blend */
slouken@1542
  2475
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
  2476
				src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
  2477
				src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
slouken@1542
  2478
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
  2479
slouken@1542
  2480
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
slouken@1542
  2481
slouken@1542
  2482
				/* blue */
slouken@1542
  2483
				src2 = src1; /* src -> src2 */
slouken@1542
  2484
				src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
slouken@1542
  2485
slouken@1542
  2486
				dst2 = dst1; /* dst -> dst2 */
slouken@1542
  2487
				dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
slouken@1542
  2488
slouken@1542
  2489
				/* blend */
slouken@1542
  2490
				src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
slouken@1542
  2491
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
  2492
				src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
slouken@1542
  2493
				dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
slouken@1542
  2494
				dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
slouken@1542
  2495
slouken@1542
  2496
				mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
slouken@1542
  2497
slouken@1542
  2498
				*(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
slouken@1542
  2499
slouken@1542
  2500
				srcp += 4;
slouken@1542
  2501
				dstp += 4;
slouken@1895
  2502
			}, width);
slouken@1895
  2503
			/* *INDENT-ON* */
slouken@1895
  2504
            srcp += srcskip;
slouken@1895
  2505
            dstp += dstskip;
slouken@1895
  2506
        }
slouken@1895
  2507
        _mm_empty();
slouken@1895
  2508
    }
slouken@1542
  2509
}
slouken@1542
  2510
#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
slouken@689
  2511
slouken@1
  2512
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1895
  2513
static void
slouken@1895
  2514
Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
slouken@1
  2515
{
slouken@1895
  2516
    unsigned alpha = info->src->alpha;
slouken@1895
  2517
    if (alpha == 128) {
slouken@1895
  2518
        Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1895
  2519
    } else {
slouken@1895
  2520
        int width = info->d_width;
slouken@1895
  2521
        int height = info->d_height;
slouken@1895
  2522
        Uint16 *srcp = (Uint16 *) info->s_pixels;
slouken@1895
  2523
        int srcskip = info->s_skip >> 1;
slouken@1895
  2524
        Uint16 *dstp = (Uint16 *) info->d_pixels;
slouken@1895
  2525
        int dstskip = info->d_skip >> 1;
slouken@1895
  2526
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@1
  2527
slouken@1895
  2528
        while (height--) {
slouken@1895
  2529
			/* *INDENT-OFF* */
slouken@1
  2530
			DUFFS_LOOP4({
slouken@1
  2531
				Uint32 s = *srcp++;
slouken@1
  2532
				Uint32 d = *dstp;
slouken@1
  2533
				/*
slouken@1
  2534
				 * shift out the middle component (green) to
slouken@1
  2535
				 * the high 16 bits, and process all three RGB
slouken@1
  2536
				 * components at the same time.
slouken@1
  2537
				 */
slouken@1
  2538
				s = (s | s << 16) & 0x07e0f81f;
slouken@1
  2539
				d = (d | d << 16) & 0x07e0f81f;
slouken@1
  2540
				d += (s - d) * alpha >> 5;
slouken@1
  2541
				d &= 0x07e0f81f;
slouken@1428
  2542
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1
  2543
			}, width);
slouken@1895
  2544
			/* *INDENT-ON* */
slouken@1895
  2545
            srcp += srcskip;
slouken@1895
  2546
            dstp += dstskip;
slouken@1895
  2547
        }
slouken@1895
  2548
    }
slouken@0
  2549
}
slouken@0
  2550
slouken@0
  2551
/* fast RGB555->RGB555 blending with surface alpha */
slouken@1895
  2552
static void
slouken@1895
  2553
Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
slouken@0
  2554
{
slouken@1895
  2555
    unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
slouken@1895
  2556
    if (alpha == 128) {
slouken@1895
  2557
        Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1895
  2558
    } else {
slouken@1895
  2559
        int width = info->d_width;
slouken@1895
  2560
        int height = info->d_height;
slouken@1895
  2561
        Uint16 *srcp = (Uint16 *) info->s_pixels;
slouken@1895
  2562
        int srcskip = info->s_skip >> 1;
slouken@1895
  2563
        Uint16 *dstp = (Uint16 *) info->d_pixels;
slouken@1895
  2564
        int dstskip = info->d_skip >> 1;
slouken@1895
  2565
        alpha >>= 3;            /* downscale alpha to 5 bits */
slouken@0
  2566
slouken@1895
  2567
        while (height--) {
slouken@1895
  2568
			/* *INDENT-OFF* */
slouken@1
  2569
			DUFFS_LOOP4({
slouken@1
  2570
				Uint32 s = *srcp++;
slouken@1
  2571
				Uint32 d = *dstp;
slouken@1
  2572
				/*
slouken@1
  2573
				 * shift out the middle component (green) to
slouken@1
  2574
				 * the high 16 bits, and process all three RGB
slouken@1
  2575
				 * components at the same time.
slouken@1
  2576
				 */
slouken@1
  2577
				s = (s | s << 16) & 0x03e07c1f;
slouken@1
  2578
				d = (d | d << 16) & 0x03e07c1f;
slouken@1
  2579
				d += (s - d) * alpha >> 5;
slouken@1
  2580
				d &= 0x03e07c1f;
slouken@1428
  2581
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1
  2582
			}, width);
slouken@1895
  2583
			/* *INDENT-ON* */
slouken@1895
  2584
            srcp += srcskip;
slouken@1895
  2585
            dstp += dstskip;
slouken@1895
  2586
        }
slouken@1895
  2587
    }
slouken@0
  2588
}
slouken@0
  2589
slouken@0
  2590
/* fast ARGB8888->RGB565 blending with pixel alpha */
slouken@1895
  2591
static void
slouken@1895
  2592
BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
slouken@0
  2593
{
slouken@1895
  2594
    int width = info->d_width;
slouken@1895
  2595
    int height = info->d_height;
slouken@1895
  2596
    Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
  2597
    int srcskip = info->s_skip >> 2;
slouken@1895
  2598
    Uint16 *dstp = (Uint16 *) info->d_pixels;
slouken@1895
  2599
    int dstskip = info->d_skip >> 1;
slouken@0
  2600
slouken@1895
  2601
    while (height--) {
slouken@1895
  2602
	    /* *INDENT-OFF* */
slouken@0
  2603
	    DUFFS_LOOP4({
slouken@0
  2604
		Uint32 s = *srcp;
slouken@0
  2605
		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  2606
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  2607
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  2608
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  2609
		   Benchmark this! */
slouken@689
  2610
		if(alpha) {   
slouken@689
  2611
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@1428
  2612
		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
slouken@689
  2613
		  } else {
slouken@0
  2614
		    Uint32 d = *dstp;
slouken@0
  2615
		    /*
slouken@0
  2616
		     * convert source and destination to G0RAB65565
slouken@0
  2617
		     * and blend all components at the same time
slouken@0
  2618
		     */
slouken@0
  2619
		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
slouken@0
  2620
		      + (s >> 3 & 0x1f);
slouken@0
  2621
		    d = (d | d << 16) & 0x07e0f81f;
slouken@0
  2622
		    d += (s - d) * alpha >> 5;
slouken@0
  2623
		    d &= 0x07e0f81f;
slouken@1428
  2624
		    *dstp = (Uint16)(d | d >> 16);
slouken@689
  2625
		  }
slouken@0
  2626
		}
slouken@0
  2627
		srcp++;
slouken@0
  2628
		dstp++;
slouken@0
  2629
	    }, width);
slouken@1895
  2630
	    /* *INDENT-ON* */
slouken@1895
  2631
        srcp += srcskip;
slouken@1895
  2632
        dstp += dstskip;
slouken@1895
  2633
    }
slouken@0
  2634
}
slouken@0
  2635
slouken@0
  2636
/* fast ARGB8888->RGB555 blending with pixel alpha */
slouken@1895
  2637
static void
slouken@1895
  2638
BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
slouken@0
  2639
{
slouken@1895
  2640
    int width = info->d_width;
slouken@1895
  2641
    int height = info->d_height;
slouken@1895
  2642
    Uint32 *srcp = (Uint32 *) info->s_pixels;
slouken@1895
  2643
    int srcskip = info->s_skip >> 2;
slouken@1895
  2644
    Uint16 *dstp = (Uint16 *) info->d_pixels;
slouken@1895
  2645
    int dstskip = info->d_skip >> 1;
slouken@0
  2646
slouken@1895
  2647
    while (height--) {
slouken@1895
  2648
	    /* *INDENT-OFF* */
slouken@0
  2649
	    DUFFS_LOOP4({
slouken@0
  2650
		unsigned alpha;
slouken@0
  2651
		Uint32 s = *srcp;
slouken@0
  2652
		alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  2653
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  2654
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  2655
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  2656
		   Benchmark this! */
slouken@689
  2657
		if(alpha) {   
slouken@689
  2658
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@1428
  2659
		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
slouken@689
  2660
		  } else {
slouken@0
  2661
		    Uint32 d = *dstp;
slouken@0
  2662
		    /*
slouken@0
  2663
		     * convert source and destination to G0RAB65565
slouken@0
  2664
		     * and blend all components at the same time
slouken@0
  2665
		     */
slouken@0
  2666
		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
slouken@0
  2667
		      + (s >> 3 & 0x1f);
slouken@0
  2668
		    d = (d | d << 16) & 0x03e07c1f;
slouken@0
  2669
		    d += (s - d) * alpha >> 5;
slouken@0
  2670
		    d &= 0x03e07c1f;
slouken@1428
  2671
		    *dstp = (Uint16)(d | d >> 16);
slouken@689
  2672
		  }
slouken@0
  2673
		}
slouken@0
  2674
		srcp++;
slouken@0
  2675
		dstp++;
slouken@0
  2676
	    }, width);
slouken@1895
  2677
	    /* *INDENT-ON* */
slouken@1895
  2678
        srcp += srcskip;
slouken@1895
  2679
        dstp += dstskip;
slouken@1895
  2680
    }
slouken@0
  2681
}
slouken@0
  2682
slouken@0
  2683
/* General (slow) N->N blending with per-surface alpha */
slouken@1895
  2684
static void
slouken@1895
  2685
BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
slouken@0
  2686
{
slouken@1895
  2687
    int width = info->d_width;
slouken@1895
  2688
    int height = info->d_height;
slouken@1895
  2689
    Uint8 *src = info->s_pixels;
slouken@1895
  2690
    int srcskip = info->s_skip;
slouken@1895
  2691
    Uint8 *dst = info->d_pixels;
slouken@1895
  2692
    int dstskip = info->d_skip;
slouken@1895
  2693
    SDL_PixelFormat *srcfmt = info->src;
slouken@1895
  2694
    SDL_PixelFormat *dstfmt = info->dst;
slouken@1895
  2695
    int srcbpp = srcfmt->BytesPerPixel;
slouken@1895
  2696
    int dstbpp = dstfmt->BytesPerPixel;
slouken@1895
  2697
    unsigned sA = srcfmt->alpha;
slouken@1895
  2698
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
slouken@0
  2699
slouken@1895
  2700
    if (sA) {
slouken@1895
  2701
        while (height--) {
slouken@1895
  2702
	    /* *INDENT-OFF* */
slouken@0
  2703
	    DUFFS_LOOP4(
slouken@0
  2704
	    {
icculus@1162
  2705
		Uint32 Pixel;
slouken@0
  2706
		unsigned sR;
slouken@0
  2707
		unsigned sG;
slouken@0
  2708
		unsigned sB;
slouken@0
  2709
		unsigned dR;
slouken@0
  2710
		unsigned dG;
slouken@0
  2711
		unsigned dB;
icculus@1162
  2712
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
icculus@1162
  2713
		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
slouken@0
  2714
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
  2715
		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  2716
		src += srcbpp;
slouken@0
  2717
		dst += dstbpp;
slouken@0
  2718
	    },
slouken@0
  2719
	    width);
slouken@1895
  2720
	    /* *INDENT-ON* */
slouken@1895
  2721
            src += srcskip;
slouken@1895
  2722
            dst += dstskip;
slouken@1895
  2723
        }
slouken@1895
  2724
    }
slouken@0
  2725
}
slouken@0
  2726
slouken@0
  2727
/* General (slow) colorkeyed N->N blending with per-surface alpha */
slouken@1895
  2728
static void
slouken@1895
  2729
BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
slouken@0
  2730
{
slouken@1895
  2731
    int width = info->d_width;
slouken@1895
  2732
    int height = info->d_height;
slouken@1895
  2733
    Uint8 *src = info->s_pixels;
slouken@1895
  2734
    int srcskip = info->s_skip;