src/video/SDL_blit_A.c
author Sam Lantinga <slouken@libsdl.org>
Tue, 09 May 2006 15:09:47 +0000
changeset 1795 398ac0f88e4d
parent 1617 b255b4058d37
child 1662 782fd950bd46
child 1895 c121d94672cb
child 3870 571c75f3d093
permissions -rw-r--r--
Fixed bug #220

The AltiVec blitters don't compile, since they require __VEC__ to be enabled in
order for the compiler to understand "vector" and friends (i.e. do AltiVec)
But you don't want to turn AltiVec on globally, since then the code would only
run on a G4 (there are already runtime tests, before using the AltiVec
variants)

The solution here is to enable AltiVec locally, for the actual AltiVec code.
slouken@0
     1
/*
slouken@0
     2
    SDL - Simple DirectMedia Layer
slouken@1312
     3
    Copyright (C) 1997-2006 Sam Lantinga
slouken@0
     4
slouken@0
     5
    This library is free software; you can redistribute it and/or
slouken@1312
     6
    modify it under the terms of the GNU Lesser General Public
slouken@0
     7
    License as published by the Free Software Foundation; either
slouken@1312
     8
    version 2.1 of the License, or (at your option) any later version.
slouken@0
     9
slouken@0
    10
    This library is distributed in the hope that it will be useful,
slouken@0
    11
    but WITHOUT ANY WARRANTY; without even the implied warranty of
slouken@0
    12
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
slouken@1312
    13
    Lesser General Public License for more details.
slouken@0
    14
slouken@1312
    15
    You should have received a copy of the GNU Lesser General Public
slouken@1312
    16
    License along with this library; if not, write to the Free Software
slouken@1312
    17
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
slouken@0
    18
slouken@0
    19
    Sam Lantinga
slouken@252
    20
    slouken@libsdl.org
slouken@0
    21
*/
slouken@1402
    22
#include "SDL_config.h"
slouken@0
    23
slouken@0
    24
#include "SDL_video.h"
slouken@0
    25
#include "SDL_blit.h"
slouken@0
    26
slouken@1542
    27
#if SDL_ASSEMBLY_ROUTINES
slouken@1542
    28
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
slouken@1361
    29
#define MMX_ASMBLIT 1
slouken@1542
    30
#define GCC_ASMBLIT 1
slouken@1542
    31
#elif defined(_MSC_VER) && (_MSC_VER >= 1200) && defined(_M_IX86)
slouken@1542
    32
#define MMX_ASMBLIT 1
slouken@1542
    33
#define MSVC_ASMBLIT 1
slouken@880
    34
#endif
slouken@1542
    35
#endif /* SDL_ASSEMBLY_ROUTINES */
slouken@880
    36
slouken@739
    37
/* Function to check the CPU flags */
slouken@739
    38
#include "SDL_cpuinfo.h"
slouken@1542
    39
#if GCC_ASMBLIT
slouken@689
    40
#include "mmx.h"
slouken@1542
    41
#elif MSVC_ASMBLIT
slouken@1542
    42
#include <mmintrin.h>
slouken@1542
    43
#include <mm3dnow.h>
slouken@689
    44
#endif
slouken@689
    45
slouken@0
    46
/* Functions to perform alpha blended blitting */
slouken@0
    47
slouken@0
    48
/* N->1 blending with per-surface alpha */
slouken@0
    49
static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
slouken@0
    50
{
slouken@0
    51
	int width = info->d_width;
slouken@0
    52
	int height = info->d_height;
slouken@0
    53
	Uint8 *src = info->s_pixels;
slouken@0
    54
	int srcskip = info->s_skip;
slouken@0
    55
	Uint8 *dst = info->d_pixels;
slouken@0
    56
	int dstskip = info->d_skip;
slouken@0
    57
	Uint8 *palmap = info->table;
slouken@0
    58
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
    59
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
    60
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
    61
slouken@0
    62
	const unsigned A = srcfmt->alpha;
slouken@0
    63
slouken@0
    64
	while ( height-- ) {
slouken@0
    65
	    DUFFS_LOOP4(
slouken@0
    66
	    {
icculus@1162
    67
		Uint32 Pixel;
slouken@0
    68
		unsigned sR;
slouken@0
    69
		unsigned sG;
slouken@0
    70
		unsigned sB;
slouken@0
    71
		unsigned dR;
slouken@0
    72
		unsigned dG;
slouken@0
    73
		unsigned dB;
icculus@1162
    74
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
slouken@0
    75
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
    76
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
    77
		dB = dstfmt->palette->colors[*dst].b;
slouken@0
    78
		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
slouken@0
    79
		dR &= 0xff;
slouken@0
    80
		dG &= 0xff;
slouken@0
    81
		dB &= 0xff;
slouken@0
    82
		/* Pack RGB into 8bit pixel */
slouken@0
    83
		if ( palmap == NULL ) {
slouken@0
    84
		    *dst =((dR>>5)<<(3+2))|
slouken@0
    85
			  ((dG>>5)<<(2))|
slouken@0
    86
			  ((dB>>6)<<(0));
slouken@0
    87
		} else {
slouken@0
    88
		    *dst = palmap[((dR>>5)<<(3+2))|
slouken@0
    89
				  ((dG>>5)<<(2))  |
slouken@0
    90
				  ((dB>>6)<<(0))];
slouken@0
    91
		}
slouken@0
    92
		dst++;
slouken@0
    93
		src += srcbpp;
slouken@0
    94
	    },
slouken@0
    95
	    width);
slouken@0
    96
	    src += srcskip;
slouken@0
    97
	    dst += dstskip;
slouken@0
    98
	}
slouken@0
    99
}
slouken@0
   100
slouken@0
   101
/* N->1 blending with pixel alpha */
slouken@0
   102
static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
slouken@0
   103
{
slouken@0
   104
	int width = info->d_width;
slouken@0
   105
	int height = info->d_height;
slouken@0
   106
	Uint8 *src = info->s_pixels;
slouken@0
   107
	int srcskip = info->s_skip;
slouken@0
   108
	Uint8 *dst = info->d_pixels;
slouken@0
   109
	int dstskip = info->d_skip;
slouken@0
   110
	Uint8 *palmap = info->table;
slouken@0
   111
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
   112
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
   113
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
   114
slouken@0
   115
	/* FIXME: fix alpha bit field expansion here too? */
slouken@0
   116
	while ( height-- ) {
slouken@0
   117
	    DUFFS_LOOP4(
slouken@0
   118
	    {
icculus@1162
   119
		Uint32 Pixel;
slouken@0
   120
		unsigned sR;
slouken@0
   121
		unsigned sG;
slouken@0
   122
		unsigned sB;
slouken@0
   123
		unsigned sA;
slouken@0
   124
		unsigned dR;
slouken@0
   125
		unsigned dG;
slouken@0
   126
		unsigned dB;
icculus@1162
   127
		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
slouken@0
   128
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
   129
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
   130
		dB = dstfmt->palette->colors[*dst].b;
slouken@0
   131
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
   132
		dR &= 0xff;
slouken@0
   133
		dG &= 0xff;
slouken@0
   134
		dB &= 0xff;
slouken@0
   135
		/* Pack RGB into 8bit pixel */
slouken@0
   136
		if ( palmap == NULL ) {
slouken@0
   137
		    *dst =((dR>>5)<<(3+2))|
slouken@0
   138
			  ((dG>>5)<<(2))|
slouken@0
   139
			  ((dB>>6)<<(0));
slouken@0
   140
		} else {
slouken@0
   141
		    *dst = palmap[((dR>>5)<<(3+2))|
slouken@0
   142
				  ((dG>>5)<<(2))  |
slouken@0
   143
				  ((dB>>6)<<(0))  ];
slouken@0
   144
		}
slouken@0
   145
		dst++;
slouken@0
   146
		src += srcbpp;
slouken@0
   147
	    },
slouken@0
   148
	    width);
slouken@0
   149
	    src += srcskip;
slouken@0
   150
	    dst += dstskip;
slouken@0
   151
	}
slouken@0
   152
}
slouken@0
   153
slouken@0
   154
/* colorkeyed N->1 blending with per-surface alpha */
slouken@0
   155
static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
slouken@0
   156
{
slouken@0
   157
	int width = info->d_width;
slouken@0
   158
	int height = info->d_height;
slouken@0
   159
	Uint8 *src = info->s_pixels;
slouken@0
   160
	int srcskip = info->s_skip;
slouken@0
   161
	Uint8 *dst = info->d_pixels;
slouken@0
   162
	int dstskip = info->d_skip;
slouken@0
   163
	Uint8 *palmap = info->table;
slouken@0
   164
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
   165
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
   166
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
   167
	Uint32 ckey = srcfmt->colorkey;
slouken@0
   168
slouken@0
   169
	const int A = srcfmt->alpha;
slouken@0
   170
slouken@0
   171
	while ( height-- ) {
slouken@0
   172
	    DUFFS_LOOP(
slouken@0
   173
	    {
icculus@1162
   174
		Uint32 Pixel;
slouken@0
   175
		unsigned sR;
slouken@0
   176
		unsigned sG;
slouken@0
   177
		unsigned sB;
slouken@0
   178
		unsigned dR;
slouken@0
   179
		unsigned dG;
slouken@0
   180
		unsigned dB;
icculus@1162
   181
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
icculus@1162
   182
		if ( Pixel != ckey ) {
slouken@0
   183
		    dR = dstfmt->palette->colors[*dst].r;
slouken@0
   184
		    dG = dstfmt->palette->colors[*dst].g;
slouken@0
   185
		    dB = dstfmt->palette->colors[*dst].b;
slouken@0
   186
		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
slouken@0
   187
		    dR &= 0xff;
slouken@0
   188
		    dG &= 0xff;
slouken@0
   189
		    dB &= 0xff;
slouken@0
   190
		    /* Pack RGB into 8bit pixel */
slouken@0
   191
		    if ( palmap == NULL ) {
slouken@0
   192
			*dst =((dR>>5)<<(3+2))|
slouken@0
   193
			      ((dG>>5)<<(2)) |
slouken@0
   194
			      ((dB>>6)<<(0));
slouken@0
   195
		    } else {
slouken@0
   196
			*dst = palmap[((dR>>5)<<(3+2))|
slouken@0
   197
				      ((dG>>5)<<(2))  |
slouken@0
   198
				      ((dB>>6)<<(0))  ];
slouken@0
   199
		    }
slouken@0
   200
		}
slouken@0
   201
		dst++;
slouken@0
   202
		src += srcbpp;
slouken@0
   203
	    },
slouken@0
   204
	    width);
slouken@0
   205
	    src += srcskip;
slouken@0
   206
	    dst += dstskip;
slouken@0
   207
	}
slouken@0
   208
}
slouken@0
   209
slouken@1542
   210
#if GCC_ASMBLIT
slouken@689
   211
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@689
   212
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
slouken@689
   213
{
slouken@689
   214
	int width = info->d_width;
slouken@689
   215
	int height = info->d_height;
slouken@689
   216
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@689
   217
	int srcskip = info->s_skip >> 2;
slouken@689
   218
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@689
   219
	int dstskip = info->d_skip >> 2;
slouken@1542
   220
	Uint32 dalpha = info->dst->Amask;
slouken@1542
   221
	Uint8 load[8];
slouken@1542
   222
slouken@1542
   223
	*(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */
slouken@1542
   224
	movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
slouken@1542
   225
	*(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */
slouken@1542
   226
	movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
slouken@1542
   227
	movd_m2r(dalpha, mm7); /* dst alpha mask */
slouken@1542
   228
	punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
slouken@689
   229
	while(height--) {
slouken@1542
   230
		DUFFS_LOOP_DOUBLE2(
slouken@1542
   231
		{
slouken@1542
   232
			Uint32 s = *srcp++;
slouken@1542
   233
			Uint32 d = *dstp;
slouken@1542
   234
			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1542
   235
				   + (s & d & 0x00010101)) | dalpha;
slouken@1542
   236
		},{
slouken@1542
   237
			movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
slouken@1542
   238
			movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
slouken@1542
   239
slouken@1542
   240
			movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
slouken@1542
   241
			movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
slouken@1542
   242
slouken@1542
   243
			pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
slouken@1542
   244
			pand_r2r(mm4, mm5); /* src & mask -> mm5 */
slouken@1542
   245
			paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
slouken@1542
   246
			pand_r2r(mm1, mm2); /* src & dst -> mm2 */
slouken@1542
   247
			psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
slouken@1542
   248
			pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
slouken@1542
   249
			paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
slouken@1542
   250
			
slouken@1542
   251
			por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
slouken@1542
   252
			movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
slouken@1542
   253
			dstp += 2;
slouken@1542
   254
			srcp += 2;
slouken@1542
   255
		}, width);
slouken@1542
   256
		srcp += srcskip;
slouken@1542
   257
		dstp += dstskip;
slouken@689
   258
	}
slouken@689
   259
	emms();
slouken@689
   260
}
slouken@689
   261
slouken@689
   262
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@689
   263
static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
slouken@689
   264
{
slouken@1542
   265
	SDL_PixelFormat* df = info->dst;
slouken@689
   266
	unsigned alpha = info->src->alpha;
slouken@1542
   267
slouken@1542
   268
	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
slouken@1542
   269
			/* only call a128 version when R,G,B occupy lower bits */
slouken@689
   270
		BlitRGBtoRGBSurfaceAlpha128MMX(info);
slouken@689
   271
	} else {
slouken@689
   272
		int width = info->d_width;
slouken@689
   273
		int height = info->d_height;
slouken@689
   274
		Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@689
   275
		int srcskip = info->s_skip >> 2;
slouken@689
   276
		Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@689
   277
		int dstskip = info->d_skip >> 2;
slouken@1542
   278
slouken@1542
   279
		pxor_r2r(mm5, mm5); /* 0 -> mm5 */
slouken@1542
   280
		/* form the alpha mult */
slouken@1542
   281
		movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
slouken@1542
   282
		punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
slouken@1542
   283
		punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
slouken@1542
   284
		alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
slouken@1542
   285
		movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
slouken@1542
   286
		punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
slouken@1542
   287
		pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
slouken@1542
   288
			/* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
slouken@1542
   289
		movd_m2r(df->Amask, mm7); /* dst alpha mask */
slouken@1542
   290
		punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
slouken@689
   291
		
slouken@689
   292
		while(height--) {
slouken@689
   293
			DUFFS_LOOP_DOUBLE2({
slouken@689
   294
				/* One Pixel Blend */
slouken@1542
   295
				movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
slouken@1542
   296
				movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
slouken@1542
   297
				punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
slouken@1542
   298
				punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
slouken@1542
   299
slouken@1542
   300
				psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
slouken@1542
   301
				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
slouken@1542
   302
				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
slouken@1542
   303
				paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
slouken@1542
   304
slouken@1542
   305
				packuswb_r2r(mm5, mm2);  /* ARGBARGB -> mm2 */
slouken@1542
   306
				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
slouken@1542
   307
				movd_r2m(mm2, *dstp);/* mm2 -> pixel */
slouken@689
   308
				++srcp;
slouken@689
   309
				++dstp;
slouken@689
   310
			},{
slouken@1542
   311
				/* Two Pixels Blend */
slouken@689
   312
				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
slouken@1542
   313
				movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
slouken@1542
   314
				movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
slouken@1542
   315
				movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
slouken@1542
   316
slouken@1542
   317
				punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
slouken@1542
   318
				punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
slouken@1542
   319
				punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
slouken@1542
   320
				punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
slouken@1542
   321
slouken@1542
   322
				psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
slouken@1542
   323
				pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
slouken@1542
   324
				psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
slouken@1542
   325
				paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
slouken@1542
   326
slouken@1542
   327
				psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
slouken@1542
   328
				pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
slouken@1542
   329
				psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
slouken@1542
   330
				paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
slouken@1542
   331
slouken@1542
   332
				packuswb_r2r(mm6, mm2);  /* ARGBARGB -> mm2 */
slouken@1542
   333
				por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
slouken@1542
   334
				
slouken@1542
   335
				movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
slouken@1542
   336
slouken@1542
   337
  				srcp += 2;
slouken@1542
   338
  				dstp += 2;
slouken@1542
   339
  			}, width);
slouken@689
   340
			srcp += srcskip;
slouken@689
   341
			dstp += dstskip;
slouken@689
   342
		}
slouken@689
   343
		emms();
slouken@689
   344
	}
slouken@689
   345
}
slouken@689
   346
slouken@689
   347
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@689
   348
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
slouken@689
   349
{
slouken@689
   350
	int width = info->d_width;
slouken@689
   351
	int height = info->d_height;
slouken@689
   352
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@689
   353
	int srcskip = info->s_skip >> 2;
slouken@689
   354
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@689
   355
	int dstskip = info->d_skip >> 2;
slouken@1542
   356
	SDL_PixelFormat* sf = info->src;
slouken@1542
   357
	Uint32 amask = sf->Amask;
slouken@1542
   358
slouken@1542
   359
	pxor_r2r(mm6, mm6); /* 0 -> mm6 */
slouken@1542
   360
	/* form multiplication mask */
slouken@1542
   361
	movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
slouken@1542
   362
	punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
slouken@1542
   363
	pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
slouken@1542
   364
	movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
slouken@1542
   365
	pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
slouken@1542
   366
	/* form channel masks */
slouken@1542
   367
	movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
slouken@1542
   368
	packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
slouken@1542
   369
	packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
slouken@1542
   370
	pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
slouken@1542
   371
	/* get alpha channel shift */
slouken@1542
   372
	movd_m2r(sf->Ashift, mm5); /* Ashift -> mm5 */
slouken@1542
   373
slouken@689
   374
	while(height--) {
slouken@689
   375
	    DUFFS_LOOP4({
slouken@1542
   376
		Uint32 alpha = *srcp & amask;
slouken@689
   377
		/* FIXME: Here we special-case opaque alpha since the
slouken@1542
   378
			compositioning used (>>8 instead of /255) doesn't handle
slouken@1542
   379
			it correctly. Also special-case alpha=0 for speed?
slouken@1542
   380
			Benchmark this! */
slouken@1542
   381
		if(alpha == 0) {
slouken@1542
   382
			/* do nothing */
slouken@1542
   383
		} else if(alpha == amask) {
slouken@1542
   384
			/* opaque alpha -- copy RGB, keep dst alpha */
slouken@1542
   385
			/* using MMX here to free up regular registers for other things */
slouken@1542
   386
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
slouken@1542
   387
			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
slouken@1542
   388
			pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
slouken@1542
   389
			pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
slouken@1542
   390
			por_r2r(mm1, mm2); /* src | dst -> mm2 */
slouken@1542
   391
			movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
slouken@1542
   392
		} else {
slouken@1542
   393
			movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
slouken@1542
   394
			punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
slouken@1542
   395
slouken@1542
   396
			movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
slouken@1542
   397
			punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
slouken@1542
   398
slouken@1542
   399
			__asm__ __volatile__ (
slouken@1542
   400
				"movd %0, %%mm4"
slouken@1542
   401
				: : "r" (alpha) ); /* 0000A000 -> mm4 */
slouken@1542
   402
			psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
slouken@1542
   403
			punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
slouken@1542
   404
			punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
slouken@1542
   405
			pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
slouken@1542
   406
slouken@1542
   407
			/* blend */		    
slouken@1542
   408
			psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
slouken@1542
   409
			pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
slouken@1542
   410
			psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
slouken@1542
   411
			paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
slouken@1542
   412
			
slouken@1542
   413
			packuswb_r2r(mm6, mm2);  /* 0000ARGB -> mm2 */
slouken@1542
   414
			movd_r2m(mm2, *dstp);/* mm2 -> dst */
slouken@689
   415
		}
slouken@689
   416
		++srcp;
slouken@689
   417
		++dstp;
slouken@689
   418
	    }, width);
slouken@689
   419
	    srcp += srcskip;
slouken@689
   420
	    dstp += dstskip;
slouken@689
   421
	}
slouken@689
   422
	emms();
slouken@689
   423
}
slouken@1542
   424
/* End GCC_ASMBLIT */
slouken@1542
   425
slouken@1542
   426
#elif MSVC_ASMBLIT
slouken@1542
   427
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1542
   428
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
slouken@1542
   429
{
slouken@1542
   430
	int width = info->d_width;
slouken@1542
   431
	int height = info->d_height;
slouken@1542
   432
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@1542
   433
	int srcskip = info->s_skip >> 2;
slouken@1542
   434
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@1542
   435
	int dstskip = info->d_skip >> 2;
slouken@1542
   436
	Uint32 dalpha = info->dst->Amask;
slouken@1542
   437
slouken@1542
   438
	__m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
slouken@1542
   439
	
slouken@1542
   440
	hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
slouken@1542
   441
	lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
slouken@1542
   442
	dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
slouken@1542
   443
slouken@1542
   444
	while (height--) {
slouken@1542
   445
		int n = width;
slouken@1542
   446
		if ( n & 1 ) {
slouken@1542
   447
			Uint32 s = *srcp++;
slouken@1542
   448
			Uint32 d = *dstp;
slouken@1542
   449
			*dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1542
   450
				   + (s & d & 0x00010101)) | dalpha;
slouken@1542
   451
			n--;
slouken@1542
   452
		}
slouken@1542
   453
		
slouken@1542
   454
		for (n >>= 1; n > 0; --n) {
slouken@1542
   455
			dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
slouken@1542
   456
			dst2 = dst1;   /* 2 x dst -> dst2(ARGBARGB) */
slouken@1542
   457
slouken@1542
   458
			src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
slouken@1542
   459
			src2 = src1; /* 2 x src -> src2(ARGBARGB) */
slouken@1542
   460
slouken@1542
   461
			dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
slouken@1542
   462
			src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
slouken@1542
   463
			src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
slouken@1542
   464
			src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
slouken@1542
   465
slouken@1542
   466
			dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
slouken@1542
   467
			dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
slouken@1542
   468
			dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
slouken@1542
   469
			dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
slouken@1542
   470
			
slouken@1542
   471
			*(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
slouken@1542
   472
			dstp += 2;
slouken@1542
   473
			srcp += 2;
slouken@1542
   474
		}
slouken@1542
   475
		
slouken@1542
   476
		srcp += srcskip;
slouken@1542
   477
		dstp += dstskip;
slouken@1542
   478
	}
slouken@1542
   479
	_mm_empty();
slouken@1542
   480
}
slouken@1542
   481
slouken@1542
   482
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1542
   483
static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
slouken@1542
   484
{
slouken@1542
   485
	SDL_PixelFormat* df = info->dst;
slouken@1542
   486
	Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
slouken@1542
   487
	unsigned alpha = info->src->alpha;
slouken@1542
   488
slouken@1542
   489
	if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
slouken@1542
   490
			/* only call a128 version when R,G,B occupy lower bits */
slouken@1542
   491
		BlitRGBtoRGBSurfaceAlpha128MMX(info);
slouken@1542
   492
	} else {
slouken@1542
   493
		int width = info->d_width;
slouken@1542
   494
		int height = info->d_height;
slouken@1542
   495
		Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@1542
   496
		int srcskip = info->s_skip >> 2;
slouken@1542
   497
		Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@1542
   498
		int dstskip = info->d_skip >> 2;
slouken@1542
   499
		Uint32 dalpha = df->Amask;
slouken@1542
   500
		Uint32 amult;
slouken@1542
   501
slouken@1542
   502
		__m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
slouken@1542
   503
		
slouken@1542
   504
		mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
slouken@1542
   505
		/* form the alpha mult */
slouken@1542
   506
		amult = alpha | (alpha << 8);
slouken@1542
   507
		amult = amult | (amult << 16);
slouken@1542
   508
		chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
slouken@1542
   509
		mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
slouken@1542
   510
		mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
slouken@1542
   511
			/* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
slouken@1542
   512
		dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
slouken@1542
   513
		
slouken@1542
   514
		while (height--) {
slouken@1542
   515
			int n = width;
slouken@1542
   516
			if (n & 1) {
slouken@1542
   517
				/* One Pixel Blend */
slouken@1542
   518
				src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
slouken@1542
   519
				src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
slouken@1542
   520
slouken@1542
   521
				dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
slouken@1542
   522
				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
   523
slouken@1542
   524
				src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
slouken@1542
   525
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   526
				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
slouken@1542
   527
				dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
slouken@1542
   528
				
slouken@1542
   529
				dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
slouken@1542
   530
				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
slouken@1542
   531
				*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
   532
slouken@1542
   533
				++srcp;
slouken@1542
   534
				++dstp;
slouken@1542
   535
				
slouken@1542
   536
				n--;
slouken@1542
   537
			}
slouken@1542
   538
slouken@1542
   539
			for (n >>= 1; n > 0; --n) {
slouken@1542
   540
				/* Two Pixels Blend */
slouken@1542
   541
				src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
slouken@1542
   542
				src2 = src1; /* 2 x src -> src2(ARGBARGB) */
slouken@1542
   543
				src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
slouken@1542
   544
				src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
slouken@1542
   545
slouken@1542
   546
				dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
slouken@1542
   547
				dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
slouken@1542
   548
				dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
slouken@1542
   549
				dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
slouken@1542
   550
slouken@1542
   551
				src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
slouken@1542
   552
				src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
slouken@1542
   553
				src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
slouken@1542
   554
				dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
slouken@1542
   555
slouken@1542
   556
				src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
slouken@1542
   557
				src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
slouken@1542
   558
				src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
slouken@1542
   559
				dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
slouken@1542
   560
				
slouken@1542
   561
				dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
slouken@1542
   562
				dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
slouken@1542
   563
slouken@1542
   564
				*(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
slouken@1542
   565
slouken@1542
   566
				srcp += 2;
slouken@1542
   567
				dstp += 2;
slouken@1542
   568
			}
slouken@1542
   569
			srcp += srcskip;
slouken@1542
   570
			dstp += dstskip;
slouken@1542
   571
		}
slouken@1542
   572
		_mm_empty();
slouken@1542
   573
	}
slouken@1542
   574
}
slouken@1542
   575
slouken@1542
   576
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1542
   577
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
slouken@1542
   578
{
slouken@1542
   579
	int width = info->d_width;
slouken@1542
   580
	int height = info->d_height;
slouken@1542
   581
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@1542
   582
	int srcskip = info->s_skip >> 2;
slouken@1542
   583
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@1542
   584
	int dstskip = info->d_skip >> 2;
slouken@1542
   585
	SDL_PixelFormat* sf = info->src;
slouken@1542
   586
	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
slouken@1542
   587
	Uint32 amask = sf->Amask;
slouken@1542
   588
	Uint32 ashift = sf->Ashift;
slouken@1542
   589
	Uint64 multmask;
slouken@1542
   590
slouken@1542
   591
	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
slouken@1542
   592
slouken@1542
   593
	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
slouken@1542
   594
	multmask = ~(0xFFFFi64 << (ashift * 2));
slouken@1542
   595
	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
slouken@1542
   596
slouken@1542
   597
	while(height--) {
slouken@1542
   598
		DUFFS_LOOP4({
slouken@1542
   599
		Uint32 alpha = *srcp & amask;
slouken@1542
   600
		if (alpha == 0) {
slouken@1542
   601
			/* do nothing */
slouken@1542
   602
		} else if (alpha == amask) {
slouken@1542
   603
			/* opaque alpha -- copy RGB, keep dst alpha */
slouken@1542
   604
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
slouken@1542
   605
		} else {
slouken@1542
   606
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
slouken@1542
   607
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
slouken@1542
   608
slouken@1542
   609
			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
slouken@1542
   610
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
   611
slouken@1542
   612
			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
slouken@1542
   613
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
slouken@1542
   614
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
slouken@1542
   615
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
slouken@1542
   616
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
slouken@1542
   617
slouken@1542
   618
			/* blend */		    
slouken@1542
   619
			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
slouken@1542
   620
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
slouken@1542
   621
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
slouken@1542
   622
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
slouken@1542
   623
			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
slouken@1542
   624
			
slouken@1542
   625
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
   626
		}
slouken@1542
   627
		++srcp;
slouken@1542
   628
		++dstp;
slouken@1542
   629
	    }, width);
slouken@1542
   630
	    srcp += srcskip;
slouken@1542
   631
	    dstp += dstskip;
slouken@1542
   632
	}
slouken@1542
   633
	_mm_empty();
slouken@1542
   634
}
slouken@1542
   635
/* End MSVC_ASMBLIT */
slouken@1542
   636
slouken@1542
   637
#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
slouken@689
   638
slouken@1361
   639
#if SDL_ALTIVEC_BLITTERS
slouken@1795
   640
#if __MWERKS__
slouken@1795
   641
#pragma altivec_model on
slouken@1795
   642
#endif
slouken@1361
   643
#if HAVE_ALTIVEC_H
icculus@1162
   644
#include <altivec.h>
icculus@1175
   645
#endif
icculus@1047
   646
#include <assert.h>
icculus@1162
   647
slouken@1402
   648
/* Vector literal helpers.  When __MACOSX__ is defined and __GNUC__ < 4 the
   element list is written in parentheses; otherwise it is brace-enclosed. */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
icculus@1162
   649
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
icculus@1162
   650
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
icculus@1162
   651
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
icculus@1162
   652
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
icculus@1162
   653
#else
icculus@1162
   654
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
icculus@1162
   655
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
icculus@1162
   656
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
icculus@1162
   657
        (vector unsigned short) { a,b,c,d,e,f,g,h }
icculus@1162
   658
#endif
icculus@1162
   659
icculus@1047
   660
/* Nonzero when any of the low four address bits of x is set,
   i.e. x is not aligned to a 16-byte boundary. */
#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
icculus@1047
   661
/* Debug helper: prints msg followed by the four 32-bit words of v in hex. */
#define VECPRINT(msg, v) do { \
icculus@1047
   662
    vector unsigned int tmpvec = (vector unsigned int)(v); \
icculus@1047
   663
    unsigned int *vp = (unsigned int *)&tmpvec; \
icculus@1047
   664
    printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
icculus@1047
   665
} while (0)
icculus@1047
   666
icculus@1047
   667
/* the permutation vector that takes the high bytes out of all the appropriate shorts 
icculus@1047
   668
    (vector unsigned char)(
icculus@1047
   669
        0x00, 0x10, 0x02, 0x12,
icculus@1047
   670
        0x04, 0x14, 0x06, 0x16,
icculus@1047
   671
        0x08, 0x18, 0x0A, 0x1A,
icculus@1047
   672
        0x0C, 0x1C, 0x0E, 0x1E );
icculus@1047
   673
*/
icculus@1047
   674
/* Built at run time as vec_lvsl(0, NULL) plus a vector of 16-bit 0x000F
   lanes reinterpreted as bytes. */
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
icculus@1047
   675
/* Four 32-bit lanes each holding 12 + 12 = 24. */
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
icculus@1047
   676
/* All-ones 32-bit lanes shifted left by 24, i.e. 0xFF000000 in every lane,
   viewed as bytes. */
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
icculus@1047
   677
/* Permute vector used with a pair of loads (vec_ld(0, p) / vec_ld(15, p)) to
   extract 16 bytes starting at src.  For an unaligned src it is
   vec_lvsl(0, src); for an aligned src it is vec_lvsl(8, src) + 8. */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
icculus@1047
   678
    ? vec_lvsl(0, src) \
icculus@1047
   679
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
icculus@1047
   680
icculus@1047
   681
   
icculus@1047
   682
/*
 * Alpha-blend vs over vd, byte by byte, and store the result back into vd.
 *
 *   vs, vd        source and destination pixels (vector unsigned char);
 *                 vd is overwritten with the blended result.
 *   valpha        per-byte alpha factor applied to vs; vd is weighted by
 *                 its bitwise complement (255 - alpha).
 *   mergePermute  permute vector applied to the two 16-bit intermediate
 *                 vectors to produce the final bytes; callers in this file
 *                 pass VEC_MERGE_PERMUTE().
 *   v1_16, v8_16  16-bit lane constants; callers in this file pass
 *                 vec_splat_u16(1) and vec_splat_u16(8).
 *
 * Each 16-bit sum x = s*alpha + d*(255-alpha) is adjusted to
 * (x + 1) + ((x + 1) >> 8) before the final permute.
 */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
icculus@1047
   683
    /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
icculus@1047
   684
    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
icculus@1047
   685
    /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
icculus@1047
   686
    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
icculus@1047
   687
    /* valpha2 is 255-alpha */ \
icculus@1047
   688
    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
icculus@1047
   689
    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
icculus@1047
   690
    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
icculus@1047
   691
    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
icculus@1047
   692
    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
icculus@1047
   693
    /* add source and dest */ \
icculus@1047
   694
    vtemp1 = vec_add(vtemp1, vtemp3); \
icculus@1047
   695
    vtemp2 = vec_add(vtemp2, vtemp4); \
icculus@1047
   696
    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
icculus@1047
   697
    vtemp1 = vec_add(vtemp1, v1_16); \
icculus@1047
   698
    vtemp3 = vec_sr(vtemp1, v8_16); \
icculus@1047
   699
    vtemp1 = vec_add(vtemp1, vtemp3); \
icculus@1047
   700
    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
icculus@1047
   701
    vtemp2 = vec_add(vtemp2, v1_16); \
icculus@1047
   702
    vtemp4 = vec_sr(vtemp2, v8_16); \
icculus@1047
   703
    vtemp2 = vec_add(vtemp2, vtemp4); \
icculus@1047
   704
    /* (>>8) and get ARGBARGBARGBARGB */ \
icculus@1047
   705
    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
icculus@1047
   706
} while (0)
icculus@1047
   707
 
icculus@1047
   708
/*
 * Calculate the permute vector used for 32->32 swizzling.
 *
 * srcfmt / dstfmt describe the source and destination channel layouts;
 * passing NULL for either selects the built-in ARGB layout
 * (Rshift 16, Gshift 8, Bshift 0, Ashift 24).
 *
 * Returns a byte permute vector: one 32-bit word of per-channel byte
 * indices, replicated to all four pixels and offset by 0, 4, 8 and 12.
 */
icculus@1047
   709
static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
icculus@1047
   710
                                  const SDL_PixelFormat *dstfmt)
icculus@1047
   711
{
icculus@1047
   712
    /*
icculus@1047
   713
     * We have to assume that the bits that aren't used by other
icculus@1047
   714
     *  colors are alpha, and that alpha is one complete byte, since some
icculus@1047
   715
     *  formats leave alpha with a zero mask, but we should still swizzle the bits.
icculus@1047
   716
     */
icculus@1047
   717
    /* ARGB */
icculus@1047
   718
    const static struct SDL_PixelFormat default_pixel_format = {
icculus@1047
   719
        NULL, 0, 0,
icculus@1047
   720
        0, 0, 0, 0,
icculus@1047
   721
        16, 8, 0, 24,
icculus@1047
   722
        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
icculus@1047
   723
        0, 0};
icculus@1047
   724
    if (!srcfmt) {
icculus@1047
   725
        srcfmt = &default_pixel_format;
icculus@1047
   726
    }
icculus@1047
   727
    if (!dstfmt) {
icculus@1047
   728
        dstfmt = &default_pixel_format;
icculus@1047
   729
    }
slouken@1487
   730
    /* per-pixel byte offsets (0, 4, 8, 12) added to the splatted indices below */
    const vector unsigned char plus = VECUINT8_LITERAL
icculus@1047
   731
                                            ( 0x00, 0x00, 0x00, 0x00,
icculus@1047
   732
                                              0x04, 0x04, 0x04, 0x04,
icculus@1047
   733
                                              0x08, 0x08, 0x08, 0x08,
icculus@1047
   734
                                              0x0C, 0x0C, 0x0C, 0x0C );
icculus@1047
   735
    vector unsigned char vswiz;
icculus@1047
   736
    vector unsigned int srcvec;
icculus@1047
   737
/* RESHIFT turns a channel bit shift into an index: 24 -> 0, 16 -> 1, 8 -> 2, 0 -> 3 */
#define RESHIFT(X) (3 - ((X) >> 3))
icculus@1047
   738
    /* place each source channel's index in the byte the destination uses for that channel */
    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
icculus@1047
   739
    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
icculus@1047
   740
    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
icculus@1047
   741
    Uint32 amask;
icculus@1047
   742
    /* Use zero for alpha if either surface doesn't have alpha */
    /* (index 0x10 refers to the second operand of the caller's vec_perm) */
icculus@1047
   743
    if (dstfmt->Amask) {
icculus@1047
   744
        amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
icculus@1047
   745
    } else {
icculus@1047
   746
        /* no destination alpha mask: put 0x10 in every byte not covered by the RGB masks */
        amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
icculus@1047
   747
    }
icculus@1047
   748
#undef RESHIFT  
icculus@1162
   749
    /* only element 0 is written; the vec_splat below replicates it to all four lanes */
    ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
icculus@1047
   750
    vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
icculus@1047
   751
    return(vswiz);
icculus@1047
   752
}
icculus@1047
   753
icculus@1047
   754
/*
 * Blend a 32-bit source with per-pixel alpha onto a 16-bit RGB565
 * destination.
 *
 * For each row: pixels are blended one at a time until dst is 16-byte
 * aligned, then eight at a time with AltiVec, then the remaining
 * (width % 8) pixels one at a time.  Reads geometry, pixel pointers and
 * the source format from info; writes only through info->d_pixels.
 */
static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
icculus@1047
   755
{
icculus@1047
   756
    int height = info->d_height;
icculus@1047
   757
    Uint8 *src = (Uint8 *)info->s_pixels;
icculus@1047
   758
    int srcskip = info->s_skip;
icculus@1047
   759
    Uint8 *dst = (Uint8 *)info->d_pixels;
icculus@1047
   760
    int dstskip = info->d_skip;
icculus@1047
   761
    SDL_PixelFormat *srcfmt = info->src;
icculus@1047
   762
icculus@1047
   763
    vector unsigned char v0 = vec_splat_u8(0);
icculus@1047
   764
    vector unsigned short v8_16 = vec_splat_u16(8);
icculus@1047
   765
    vector unsigned short v1_16 = vec_splat_u16(1);
icculus@1047
   766
    vector unsigned short v2_16 = vec_splat_u16(2);
icculus@1047
   767
    vector unsigned short v3_16 = vec_splat_u16(3);
icculus@1047
   768
    vector unsigned int v8_32 = vec_splat_u32(8);
icculus@1047
   769
    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
icculus@1162
   770
    vector unsigned short v3f = VECUINT16_LITERAL(
icculus@1047
   771
        0x003f, 0x003f, 0x003f, 0x003f,
icculus@1047
   772
        0x003f, 0x003f, 0x003f, 0x003f);
icculus@1162
   773
    vector unsigned short vfc = VECUINT16_LITERAL(
icculus@1047
   774
        0x00fc, 0x00fc, 0x00fc, 0x00fc,
icculus@1047
   775
        0x00fc, 0x00fc, 0x00fc, 0x00fc);
icculus@1047
   776
icculus@1047
   777
    /* 
icculus@1047
   778
        0x10 - 0x1f is the alpha
icculus@1047
   779
        0x00 - 0x0e evens are the red
icculus@1047
   780
        0x01 - 0x0f odds are zero
icculus@1047
   781
    */
icculus@1162
   782
    vector unsigned char vredalpha1 = VECUINT8_LITERAL(
icculus@1047
   783
        0x10, 0x00, 0x01, 0x01,
icculus@1047
   784
        0x10, 0x02, 0x01, 0x01,
icculus@1047
   785
        0x10, 0x04, 0x01, 0x01,
icculus@1047
   786
        0x10, 0x06, 0x01, 0x01
icculus@1047
   787
    );
icculus@1047
   788
    /* same pattern for pixels 4-7: adds 8 << 16 to each word, i.e. 8 to the red index */
    vector unsigned char vredalpha2 = (vector unsigned char)(
icculus@1047
   789
        vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
icculus@1047
   790
    );
icculus@1047
   791
    /*
icculus@1047
   792
        0x00 - 0x0f is ARxx ARxx ARxx ARxx
icculus@1047
   793
        0x11 - 0x1f odds are blue
icculus@1047
   794
    */
icculus@1162
   795
    vector unsigned char vblue1 = VECUINT8_LITERAL(
icculus@1047
   796
        0x00, 0x01, 0x02, 0x11,
icculus@1047
   797
        0x04, 0x05, 0x06, 0x13,
icculus@1047
   798
        0x08, 0x09, 0x0a, 0x15,
icculus@1047
   799
        0x0c, 0x0d, 0x0e, 0x17
icculus@1047
   800
    );
icculus@1047
   801
    /* pixels 4-7: adds 8 to each word, i.e. 8 to the blue index (0x19 .. 0x1f) */
    vector unsigned char vblue2 = (vector unsigned char)(
icculus@1047
   802
        vec_add((vector unsigned int)vblue1, v8_32)
icculus@1047
   803
    );
icculus@1047
   804
    /*
icculus@1047
   805
        0x00 - 0x0f is ARxB ARxB ARxB ARxB
icculus@1047
   806
        0x10 - 0x1e evens are green
icculus@1047
   807
    */
icculus@1162
   808
    vector unsigned char vgreen1 = VECUINT8_LITERAL(
icculus@1047
   809
        0x00, 0x01, 0x10, 0x03,
icculus@1047
   810
        0x04, 0x05, 0x12, 0x07,
icculus@1047
   811
        0x08, 0x09, 0x14, 0x0b,
icculus@1047
   812
        0x0c, 0x0d, 0x16, 0x0f
icculus@1047
   813
    );
icculus@1047
   814
    /* pixels 4-7: adds 8 << 8 to each word, i.e. 8 to the green index (0x18 .. 0x1e) */
    vector unsigned char vgreen2 = (vector unsigned char)(
icculus@1047
   815
        vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
icculus@1047
   816
    );
icculus@1162
   817
    vector unsigned char vgmerge = VECUINT8_LITERAL(
icculus@1047
   818
        0x00, 0x02, 0x00, 0x06,
icculus@1047
   819
        0x00, 0x0a, 0x00, 0x0e,
icculus@1047
   820
        0x00, 0x12, 0x00, 0x16,
icculus@1047
   821
        0x00, 0x1a, 0x00, 0x1e);
icculus@1047
   822
    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
   823
    /* swizzle from the source format to the default ARGB layout */
    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
icculus@1047
   824
    /* indices 0,0,0,0,4,4,4,4,...: replicates the first byte of each pixel across that pixel */
    vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
icculus@1047
   825
icculus@1047
   826
    /* NOTE(review): vec_splat_u8(-7) gives 0xF9 bytes, so after the shift each
       16-bit lane is 0xF900 rather than the 0xF800 the name suggests -- confirm
       whether -8 was intended. */
    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
icculus@1047
   827
    vf800 = vec_sl(vf800, vec_splat_u16(8));
icculus@1047
   828
icculus@1047
   829
    while(height--) {
icculus@1047
   830
        int extrawidth;
icculus@1047
   831
        vector unsigned char valigner;
icculus@1047
   832
        vector unsigned char vsrc;
icculus@1047
   833
        vector unsigned char voverflow;
icculus@1047
   834
        int width = info->d_width;
icculus@1047
   835
icculus@1047
   836
#define ONE_PIXEL_BLEND(condition, widthvar) \
icculus@1047
   837
        while (condition) { \
icculus@1162
   838
            Uint32 Pixel; \
icculus@1047
   839
            unsigned sR, sG, sB, dR, dG, dB, sA; \
icculus@1162
   840
            DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
icculus@1047
   841
            if(sA) { \
icculus@1047
   842
                unsigned short dstpixel = *((unsigned short *)dst); \
icculus@1047
   843
                dR = (dstpixel >> 8) & 0xf8; \
icculus@1047
   844
                dG = (dstpixel >> 3) & 0xfc; \
icculus@1047
   845
                dB = (dstpixel << 3) & 0xf8; \
icculus@1047
   846
                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
icculus@1047
   847
                *((unsigned short *)dst) = ( \
icculus@1047
   848
                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
icculus@1047
   849
                ); \
icculus@1047
   850
            } \
icculus@1047
   851
            src += 4; \
icculus@1047
   852
            dst += 2; \
icculus@1047
   853
            widthvar--; \
icculus@1047
   854
        }
icculus@1047
   855
        /* scalar blend until dst reaches a 16-byte boundary (or the row ends) */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
icculus@1047
   856
        /* NOTE(review): the 32->32 blitters in this file wrap their vector section
           in `if (width > 0)`; here the vec_ld below runs even when no pixels
           remain in the row -- confirm that is harmless. */
        extrawidth = (width % 8);
icculus@1047
   857
        valigner = VEC_ALIGNER(src);
icculus@1047
   858
        vsrc = (vector unsigned char)vec_ld(0, src);
icculus@1047
   859
        width -= extrawidth;
icculus@1047
   860
        while (width) {
icculus@1047
   861
            vector unsigned char valpha;
icculus@1047
   862
            vector unsigned char vsrc1, vsrc2;
icculus@1047
   863
            vector unsigned char vdst1, vdst2;
icculus@1047
   864
            vector unsigned short vR, vG, vB;
icculus@1047
   865
            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
icculus@1047
   866
icculus@1047
   867
            /* Load 8 pixels from src as ARGB */
icculus@1047
   868
            voverflow = (vector unsigned char)vec_ld(15, src);
icculus@1047
   869
            vsrc = vec_perm(vsrc, voverflow, valigner);
icculus@1047
   870
            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
icculus@1047
   871
            src += 16;
icculus@1047
   872
            vsrc = (vector unsigned char)vec_ld(15, src);
icculus@1047
   873
            voverflow = vec_perm(voverflow, vsrc, valigner);
icculus@1047
   874
            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
icculus@1047
   875
            src += 16;
icculus@1047
   876
icculus@1047
   877
            /* Load 8 pixels from dst as XRGB */
icculus@1047
   878
            voverflow = vec_ld(0, dst);
icculus@1047
   879
            vR = vec_and((vector unsigned short)voverflow, vf800);
icculus@1047
   880
            vB = vec_sl((vector unsigned short)voverflow, v3_16);
icculus@1047
   881
            vG = vec_sl(vB, v2_16);
icculus@1047
   882
            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
icculus@1047
   883
            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
icculus@1047
   884
            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
icculus@1047
   885
            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
icculus@1047
   886
            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
icculus@1047
   887
            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
icculus@1047
   888
icculus@1047
   889
            /* Alpha blend 8 pixels as ARGB */
icculus@1047
   890
            valpha = vec_perm(vsrc1, v0, valphaPermute);
icculus@1047
   891
            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
icculus@1047
   892
            valpha = vec_perm(vsrc2, v0, valphaPermute);
icculus@1047
   893
            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
icculus@1047
   894
icculus@1047
   895
            /* Convert 8 pixels to 565 */
icculus@1047
   896
            vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
icculus@1047
   897
            vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
icculus@1047
   898
            vgpixel = vec_and(vgpixel, vfc);
icculus@1047
   899
            vgpixel = vec_sl(vgpixel, v3_16);
icculus@1047
   900
            vrpixel = vec_sl(vpixel, v1_16);
icculus@1047
   901
            vrpixel = vec_and(vrpixel, vf800);
icculus@1047
   902
            vbpixel = vec_and(vpixel, v3f);
icculus@1047
   903
            vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
icculus@1047
   904
            vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
icculus@1047
   905
            
icculus@1047
   906
            /* Store 8 pixels */
icculus@1047
   907
            vec_st(vdst1, 0, dst);
icculus@1047
   908
icculus@1047
   909
            width -= 8;
icculus@1047
   910
            dst += 16;
icculus@1047
   911
        }
icculus@1047
   912
        /* scalar blend of the trailing (width % 8) pixels */
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
   913
#undef ONE_PIXEL_BLEND
icculus@1047
   914
        src += srcskip;
icculus@1047
   915
        dst += dstskip;
icculus@1047
   916
    }
icculus@1047
   917
}
icculus@1047
   918
icculus@1047
   919
/*
 * Blend a 32-bit source onto a 32-bit destination using the per-surface
 * alpha (info->src->alpha), leaving the destination untouched wherever
 * the source pixel matches the colorkey (info->src->colorkey).
 *
 * For each row: pixels are blended one at a time until dstp is 16-byte
 * aligned, then four at a time with AltiVec, then the remaining
 * (width % 4) pixels one at a time.
 */
static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
icculus@1047
   920
{
icculus@1047
   921
    unsigned alpha = info->src->alpha;
icculus@1047
   922
    int height = info->d_height;
icculus@1047
   923
    Uint32 *srcp = (Uint32 *)info->s_pixels;
icculus@1047
   924
    /* skips are converted from bytes to 32-bit pixels */
    int srcskip = info->s_skip >> 2;
icculus@1047
   925
    Uint32 *dstp = (Uint32 *)info->d_pixels;
icculus@1047
   926
    int dstskip = info->d_skip >> 2;
icculus@1047
   927
    SDL_PixelFormat *srcfmt = info->src;
icculus@1047
   928
    SDL_PixelFormat *dstfmt = info->dst;
icculus@1047
   929
    /* same value as `alpha` above; used by the scalar ONE_PIXEL_BLEND path */
    unsigned sA = srcfmt->alpha;
icculus@1047
   930
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
icculus@1047
   931
    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
icculus@1047
   932
    Uint32 ckey = info->src->colorkey;
icculus@1047
   933
    vector unsigned char mergePermute;
icculus@1047
   934
    vector unsigned char vsrcPermute;
icculus@1047
   935
    vector unsigned char vdstPermute;
icculus@1047
   936
    vector unsigned char vsdstPermute;
icculus@1047
   937
    vector unsigned char valpha;
icculus@1047
   938
    vector unsigned char valphamask;
icculus@1047
   939
    vector unsigned char vbits;
icculus@1047
   940
    vector unsigned char v0;
icculus@1047
   941
    vector unsigned short v1;
icculus@1047
   942
    vector unsigned short v8;
icculus@1047
   943
    vector unsigned int vckey;
icculus@1047
   944
    vector unsigned int vrgbmask;
icculus@1047
   945
icculus@1047
   946
    mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
   947
    v0 = vec_splat_u8(0);
icculus@1047
   948
    v1 = vec_splat_u16(1);
icculus@1047
   949
    v8 = vec_splat_u16(8);
icculus@1047
   950
icculus@1047
   951
    /* set the alpha to 255 on the destination surf */
icculus@1047
   952
    valphamask = VEC_ALPHA_MASK();
icculus@1047
   953
icculus@1047
   954
    /* src format -> ARGB, ARGB -> dst format, dst format -> ARGB */
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
icculus@1047
   955
    vdstPermute = calc_swizzle32(NULL, dstfmt);
icculus@1047
   956
    vsdstPermute = calc_swizzle32(dstfmt, NULL);
icculus@1047
   957
icculus@1047
   958
    /* fill every byte of valpha with the surface alpha
       (255-alpha is derived inside VEC_MULTIPLY_ALPHA) */
icculus@1047
   959
    ((unsigned char *)&valpha)[0] = alpha;
icculus@1047
   960
    valpha = vec_splat(valpha, 0);
icculus@1047
   961
    vbits = (vector unsigned char)vec_splat_s8(-1);
icculus@1047
   962
icculus@1047
   963
    /* colorkey and RGB mask, replicated into all four 32-bit lanes */
    ckey &= rgbmask;
icculus@1162
   964
    ((unsigned int *)(char*)&vckey)[0] = ckey;
icculus@1047
   965
    vckey = vec_splat(vckey, 0);
icculus@1162
   966
    ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
icculus@1047
   967
    vrgbmask = vec_splat(vrgbmask, 0);
icculus@1047
   968
icculus@1047
   969
    while(height--) {
icculus@1047
   970
        int width = info->d_width;
icculus@1047
   971
/* NOTE(review): this scalar path compares the raw Pixel against ckey, which
   has been masked with rgbmask, whereas the vector path below masks the
   source with vrgbmask before comparing -- confirm the two agree for
   sources whose non-RGB bits can be nonzero. */
#define ONE_PIXEL_BLEND(condition, widthvar) \
icculus@1047
   972
        while (condition) { \
icculus@1162
   973
            Uint32 Pixel; \
icculus@1047
   974
            unsigned sR, sG, sB, dR, dG, dB; \
icculus@1162
   975
            RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
icculus@1162
   976
            if(sA && Pixel != ckey) { \
icculus@1162
   977
                RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
icculus@1162
   978
                DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
icculus@1047
   979
                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
icculus@1047
   980
                ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
icculus@1047
   981
            } \
icculus@1162
   982
            dstp++; \
icculus@1162
   983
            srcp++; \
icculus@1047
   984
            widthvar--; \
icculus@1047
   985
        }
icculus@1047
   986
        /* scalar blend until dstp reaches a 16-byte boundary (or the row ends) */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
icculus@1047
   987
        if (width > 0) {
icculus@1047
   988
            int extrawidth = (width % 4);
icculus@1047
   989
            vector unsigned char valigner = VEC_ALIGNER(srcp);
icculus@1047
   990
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
icculus@1047
   991
            width -= extrawidth;
icculus@1047
   992
            while (width) {
icculus@1047
   993
                vector unsigned char vsel;
icculus@1047
   994
                vector unsigned char voverflow;
icculus@1047
   995
                vector unsigned char vd;
icculus@1047
   996
                vector unsigned char vd_orig;
icculus@1047
   997
icculus@1047
   998
                /* s = *srcp */
icculus@1047
   999
                voverflow = (vector unsigned char)vec_ld(15, srcp);
icculus@1047
  1000
                vs = vec_perm(vs, voverflow, valigner);
icculus@1047
  1001
                
icculus@1047
  1002
                /* vsel is set for items that match the key */
icculus@1047
  1003
                vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
icculus@1047
  1004
                vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
icculus@1047
  1005
icculus@1047
  1006
                /* permute to source format */
icculus@1047
  1007
                vs = vec_perm(vs, valpha, vsrcPermute);
icculus@1047
  1008
icculus@1047
  1009
                /* d = *dstp */
icculus@1047
  1010
                vd = (vector unsigned char)vec_ld(0, dstp);
icculus@1047
  1011
                vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
icculus@1047
  1012
icculus@1047
  1013
                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
icculus@1047
  1014
icculus@1047
  1015
                /* set the alpha channel to full on */
icculus@1047
  1016
                vd = vec_or(vd, valphamask);
icculus@1047
  1017
icculus@1047
  1018
                /* mask out color key */
icculus@1047
  1019
                vd = vec_sel(vd, vd_orig, vsel);
icculus@1047
  1020
                
icculus@1047
  1021
                /* permute to dest format */
icculus@1047
  1022
                vd = vec_perm(vd, vbits, vdstPermute);
icculus@1047
  1023
icculus@1047
  1024
                /* *dstp = res */
icculus@1047
  1025
                vec_st((vector unsigned int)vd, 0, dstp);
icculus@1047
  1026
                
icculus@1047
  1027
                srcp += 4;
icculus@1047
  1028
                dstp += 4;
icculus@1047
  1029
                width -= 4;
icculus@1047
  1030
                /* the second load becomes the first half of the next aligned pair */
                vs = voverflow;
icculus@1047
  1031
            }
icculus@1047
  1032
            /* scalar blend of the trailing (width % 4) pixels */
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
  1033
        }
icculus@1047
  1034
#undef ONE_PIXEL_BLEND
icculus@1047
  1035
 
icculus@1047
  1036
        srcp += srcskip;
icculus@1047
  1037
        dstp += dstskip;
icculus@1047
  1038
    }
icculus@1047
  1039
}
icculus@1047
  1040
icculus@1047
  1041
icculus@1047
  1042
/*
 * Blend a 32-bit source with per-pixel alpha onto a 32-bit destination,
 * keeping the destination's own alpha value.
 *
 * For each row: pixels are blended one at a time until dstp is 16-byte
 * aligned, then four at a time with AltiVec, then the remaining
 * (width % 4) pixels one at a time.
 */
static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
icculus@1047
  1043
{
icculus@1047
  1044
    /* width is reloaded from info->d_width at the start of every row */
    int width = info->d_width;
icculus@1047
  1045
    int height = info->d_height;
icculus@1047
  1046
    Uint32 *srcp = (Uint32 *)info->s_pixels;
icculus@1047
  1047
    /* skips are converted from bytes to 32-bit pixels */
    int srcskip = info->s_skip >> 2;
icculus@1047
  1048
    Uint32 *dstp = (Uint32 *)info->d_pixels;
icculus@1047
  1049
    int dstskip = info->d_skip >> 2;
icculus@1047
  1050
    SDL_PixelFormat *srcfmt = info->src;
icculus@1047
  1051
    SDL_PixelFormat *dstfmt = info->dst;
icculus@1047
  1052
    vector unsigned char mergePermute;
icculus@1047
  1053
    vector unsigned char valphaPermute;
icculus@1047
  1054
    vector unsigned char vsrcPermute;
icculus@1047
  1055
    vector unsigned char vdstPermute;
icculus@1047
  1056
    vector unsigned char vsdstPermute;
icculus@1047
  1057
    vector unsigned char valphamask;
icculus@1047
  1058
    vector unsigned char vpixelmask;
icculus@1047
  1059
    vector unsigned char v0;
icculus@1047
  1060
    vector unsigned short v1;
icculus@1047
  1061
    vector unsigned short v8;
icculus@1047
  1062
icculus@1047
  1063
    v0 = vec_splat_u8(0);
icculus@1047
  1064
    v1 = vec_splat_u16(1);
icculus@1047
  1065
    v8 = vec_splat_u16(8);
icculus@1047
  1066
    mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
  1067
    valphamask = VEC_ALPHA_MASK();
icculus@1047
  1068
    /* indices 0,0,0,0,4,4,4,4,...: replicates the first byte of each pixel across that pixel */
    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
icculus@1047
  1069
    /* complement of the alpha mask: selects the non-alpha bytes */
    vpixelmask = vec_nor(valphamask, v0);
icculus@1047
  1070
    /* src format -> ARGB, ARGB -> dst format, dst format -> ARGB */
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
icculus@1047
  1071
    vdstPermute = calc_swizzle32(NULL, dstfmt);
icculus@1047
  1072
    vsdstPermute = calc_swizzle32(dstfmt, NULL);
icculus@1047
  1073
icculus@1047
  1074
	while ( height-- ) {
icculus@1047
  1075
        width = info->d_width;
icculus@1047
  1076
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
icculus@1162
  1077
            Uint32 Pixel; \
icculus@1047
  1078
            unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
icculus@1162
  1079
            DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
icculus@1047
  1080
            if(sA) { \
icculus@1162
  1081
              DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
icculus@1047
  1082
              ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
icculus@1047
  1083
              ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
icculus@1047
  1084
            } \
icculus@1047
  1085
            ++srcp; \
icculus@1047
  1086
            ++dstp; \
icculus@1047
  1087
            widthvar--; \
icculus@1047
  1088
        }
icculus@1047
  1089
        /* scalar blend until dstp reaches a 16-byte boundary (or the row ends) */
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
icculus@1047
  1090
        if (width > 0) {
slouken@1487
  1091
            /* vsrcPermute */
slouken@1487
  1092
            /* vdstPermute */
icculus@1047
  1093
            int extrawidth = (width % 4);
icculus@1047
  1094
            vector unsigned char valigner = VEC_ALIGNER(srcp);
icculus@1047
  1095
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
icculus@1047
  1096
            width -= extrawidth;
icculus@1047
  1097
            while (width) {
icculus@1047
  1098
                vector unsigned char voverflow;
icculus@1047
  1099
                vector unsigned char vd;
icculus@1047
  1100
                vector unsigned char valpha;
icculus@1047
  1101
                vector unsigned char vdstalpha;
icculus@1047
  1102
                /* s = *srcp */
icculus@1047
  1103
                voverflow = (vector unsigned char)vec_ld(15, srcp);
icculus@1047
  1104
                vs = vec_perm(vs, voverflow, valigner);
icculus@1047
  1105
                vs = vec_perm(vs, v0, vsrcPermute);
icculus@1047
  1106
icculus@1047
  1107
                valpha = vec_perm(vs, v0, valphaPermute);
icculus@1047
  1108
                
icculus@1047
  1109
                /* d = *dstp */
icculus@1047
  1110
                vd = (vector unsigned char)vec_ld(0, dstp);
icculus@1047
  1111
                vd = vec_perm(vd, v0, vsdstPermute);
icculus@1047
  1112
                /* remember the destination alpha so it can be restored after blending */
                vdstalpha = vec_and(vd, valphamask);
icculus@1047
  1113
icculus@1047
  1114
                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
icculus@1047
  1115
icculus@1047
  1116
                /* set the alpha to the dest alpha */
icculus@1047
  1117
                vd = vec_and(vd, vpixelmask);
icculus@1047
  1118
                vd = vec_or(vd, vdstalpha);
icculus@1047
  1119
                vd = vec_perm(vd, v0, vdstPermute);
icculus@1047
  1120
icculus@1047
  1121
                /* *dstp = res */
icculus@1047
  1122
                vec_st((vector unsigned int)vd, 0, dstp);
icculus@1047
  1123
                
icculus@1047
  1124
                srcp += 4;
icculus@1047
  1125
                dstp += 4;
icculus@1047
  1126
                width -= 4;
icculus@1047
  1127
                /* the second load becomes the first half of the next aligned pair */
                vs = voverflow;
icculus@1047
  1128
icculus@1047
  1129
            }
icculus@1047
  1130
            /* scalar blend of the trailing (width % 4) pixels */
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
  1131
        }
icculus@1047
  1132
	    srcp += srcskip;
icculus@1047
  1133
	    dstp += dstskip;
icculus@1047
  1134
#undef ONE_PIXEL_BLEND
icculus@1047
  1135
	}
icculus@1047
  1136
}
icculus@1047
  1137
icculus@1047
  1138
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
icculus@1047
  1139
static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
icculus@1047
  1140
{
icculus@1047
  1141
	int width = info->d_width;
icculus@1047
  1142
	int height = info->d_height;
icculus@1047
  1143
	Uint32 *srcp = (Uint32 *)info->s_pixels;
icculus@1047
  1144
	int srcskip = info->s_skip >> 2;
icculus@1047
  1145
	Uint32 *dstp = (Uint32 *)info->d_pixels;
icculus@1047
  1146
	int dstskip = info->d_skip >> 2;
icculus@1047
  1147
    vector unsigned char mergePermute;
icculus@1047
  1148
    vector unsigned char valphaPermute;
icculus@1047
  1149
    vector unsigned char valphamask;
icculus@1047
  1150
    vector unsigned char vpixelmask;
icculus@1047
  1151
    vector unsigned char v0;
icculus@1047
  1152
    vector unsigned short v1;
icculus@1047
  1153
    vector unsigned short v8;
icculus@1047
  1154
    v0 = vec_splat_u8(0);
icculus@1047
  1155
    v1 = vec_splat_u16(1);
icculus@1047
  1156
    v8 = vec_splat_u16(8);
icculus@1047
  1157
    mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
  1158
    valphamask = VEC_ALPHA_MASK();
icculus@1047
  1159
    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
icculus@1047
  1160
    
icculus@1047
  1161
 
icculus@1047
  1162
    vpixelmask = vec_nor(valphamask, v0);
icculus@1047
  1163
	while(height--) {
icculus@1047
  1164
        width = info->d_width;
icculus@1047
  1165
#define ONE_PIXEL_BLEND(condition, widthvar) \
icculus@1047
  1166
        while ((condition)) { \
icculus@1047
  1167
            Uint32 dalpha; \
icculus@1047
  1168
            Uint32 d; \
icculus@1047
  1169
            Uint32 s1; \
icculus@1047
  1170
            Uint32 d1; \
icculus@1047
  1171
            Uint32 s = *srcp; \
icculus@1047
  1172
            Uint32 alpha = s >> 24; \
icculus@1047
  1173
            if(alpha) { \
icculus@1047
  1174
              if(alpha == SDL_ALPHA_OPAQUE) { \
icculus@1047
  1175
                *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
icculus@1047
  1176
              } else { \
icculus@1047
  1177
                d = *dstp; \
icculus@1047
  1178
                dalpha = d & 0xff000000; \
icculus@1047
  1179
                s1 = s & 0xff00ff; \
icculus@1047
  1180
                d1 = d & 0xff00ff; \
icculus@1047
  1181
                d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
icculus@1047
  1182
                s &= 0xff00; \
icculus@1047
  1183
                d &= 0xff00; \
icculus@1047
  1184
                d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
icculus@1047
  1185
                *dstp = d1 | d | dalpha; \
icculus@1047
  1186
              } \
icculus@1047
  1187
            } \
icculus@1047
  1188
            ++srcp; \
icculus@1047
  1189
            ++dstp; \
icculus@1047
  1190
            widthvar--; \
icculus@1047
  1191
	    }
icculus@1047
  1192
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
icculus@1047
  1193
        if (width > 0) {
icculus@1047
  1194
            int extrawidth = (width % 4);
icculus@1047
  1195
            vector unsigned char valigner = VEC_ALIGNER(srcp);
icculus@1047
  1196
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
icculus@1047
  1197
            width -= extrawidth;
icculus@1047
  1198
            while (width) {
icculus@1047
  1199
                vector unsigned char voverflow;
icculus@1047
  1200
                vector unsigned char vd;
icculus@1047
  1201
                vector unsigned char valpha;
icculus@1047
  1202
                vector unsigned char vdstalpha;
icculus@1047
  1203
                /* s = *srcp */
icculus@1047
  1204
                voverflow = (vector unsigned char)vec_ld(15, srcp);
icculus@1047
  1205
                vs = vec_perm(vs, voverflow, valigner);
icculus@1047
  1206
icculus@1047
  1207
                valpha = vec_perm(vs, v0, valphaPermute);
icculus@1047
  1208
                
icculus@1047
  1209
                /* d = *dstp */
icculus@1047
  1210
                vd = (vector unsigned char)vec_ld(0, dstp);
icculus@1047
  1211
                vdstalpha = vec_and(vd, valphamask);
icculus@1047
  1212
icculus@1047
  1213
                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
icculus@1047
  1214
icculus@1047
  1215
                /* set the alpha to the dest alpha */
icculus@1047
  1216
                vd = vec_and(vd, vpixelmask);
icculus@1047
  1217
                vd = vec_or(vd, vdstalpha);
icculus@1047
  1218
icculus@1047
  1219
                /* *dstp = res */
icculus@1047
  1220
                vec_st((vector unsigned int)vd, 0, dstp);
icculus@1047
  1221
                
icculus@1047
  1222
                srcp += 4;
icculus@1047
  1223
                dstp += 4;
icculus@1047
  1224
                width -= 4;
icculus@1047
  1225
                vs = voverflow;
icculus@1047
  1226
            }
icculus@1047
  1227
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
  1228
        }
icculus@1047
  1229
	    srcp += srcskip;
icculus@1047
  1230
	    dstp += dstskip;
icculus@1047
  1231
	}
icculus@1047
  1232
#undef ONE_PIXEL_BLEND
icculus@1047
  1233
}
icculus@1047
  1234
icculus@1047
  1235
/*
 * 32bpp -> 32bpp blending with per-surface alpha (AltiVec path) between
 * formats whose channel order is described by info->src / info->dst.
 *
 * Each row is handled in three phases: scalar blending while
 * UNALIGNED_PTR(dstp) is true, four pixels per iteration using vector
 * loads/stores, then scalar blending of the (width % 4) leftover pixels.
 * The scalar path writes dA (SDL_ALPHA_OPAQUE when the destination format
 * has an alpha mask, otherwise 0) as the destination alpha.
 */
static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
{
    /* XXX : 6 */
    unsigned alpha = info->src->alpha;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;    /* bytes -> pixels */
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;    /* bytes -> pixels */
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    unsigned sA = srcfmt->alpha;
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
    vector unsigned char mergePermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valpha;
    vector unsigned char valphamask;
    vector unsigned char vbits;
    vector unsigned short v1;
    vector unsigned short v8;

    mergePermute = VEC_MERGE_PERMUTE();
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);

    /* set the alpha to 255 on the destination surf */
    valphamask = VEC_ALPHA_MASK();

    /* byte-shuffle tables between the surface formats and the layout used
       for the vector arithmetic */
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
    vdstPermute = calc_swizzle32(NULL, dstfmt);
    vsdstPermute = calc_swizzle32(dstfmt, NULL);

    /* replicate the surface alpha into every byte of valpha */
    ((unsigned char *)&valpha)[0] = alpha;
    valpha = vec_splat(valpha, 0);
    vbits = (vector unsigned char)vec_splat_s8(-1);

    while(height--) {
        int width = info->d_width;
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB; \
            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
            ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
            ++srcp; \
            ++dstp; \
            widthvar--; \
        }
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            /*
             * Use VEC_ALIGNER like the other AltiVec blitters in this file
             * rather than a bare vec_lvsl(0, srcp).  With a 16-byte aligned
             * source, vec_lvsl yields the identity permute and
             * vec_ld(15, srcp) reloads the same vector as vec_ld(0, srcp),
             * so from the second iteration on the permute would keep
             * selecting the stale vector carried over in `vs`.
             * NOTE(review): VEC_ALIGNER's definition is outside this
             * section; it is presumed to cover the aligned case - confirm.
             */
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;

                /* s = *srcp */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                vs = vec_perm(vs, valpha, vsrcPermute);

                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd = vec_perm(vd, vd, vsdstPermute);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha channel to full on */
                vd = vec_or(vd, valphamask);
                vd = vec_perm(vd, vbits, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);

                srcp += 4;
                dstp += 4;
                width -= 4;
                /* carry the last loaded vector into the next iteration */
                vs = voverflow;
            }
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
#undef ONE_PIXEL_BLEND

        srcp += srcskip;
        dstp += dstskip;
    }

}
icculus@1047
  1329
icculus@1047
  1330
icculus@1047
  1331
/* fast RGB888->(A)RGB888 blending */
icculus@1047
  1332
static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
icculus@1047
  1333
{
icculus@1047
  1334
	unsigned alpha = info->src->alpha;
icculus@1047
  1335
    int height = info->d_height;
icculus@1047
  1336
    Uint32 *srcp = (Uint32 *)info->s_pixels;
icculus@1047
  1337
    int srcskip = info->s_skip >> 2;
icculus@1047
  1338
    Uint32 *dstp = (Uint32 *)info->d_pixels;
icculus@1047
  1339
    int dstskip = info->d_skip >> 2;
icculus@1047
  1340
    vector unsigned char mergePermute;
icculus@1047
  1341
    vector unsigned char valpha;
icculus@1047
  1342
    vector unsigned char valphamask;
icculus@1047
  1343
    vector unsigned short v1;
icculus@1047
  1344
    vector unsigned short v8;
icculus@1047
  1345
icculus@1047
  1346
    mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
  1347
    v1 = vec_splat_u16(1);
icculus@1047
  1348
    v8 = vec_splat_u16(8);
icculus@1047
  1349
icculus@1047
  1350
    /* set the alpha to 255 on the destination surf */
icculus@1047
  1351
    valphamask = VEC_ALPHA_MASK();
icculus@1047
  1352
icculus@1047
  1353
    /* set a vector full of alpha and 255-alpha */
icculus@1047
  1354
    ((unsigned char *)&valpha)[0] = alpha;
icculus@1047
  1355
    valpha = vec_splat(valpha, 0);
icculus@1047
  1356
icculus@1047
  1357
    while(height--) {
icculus@1047
  1358
        int width = info->d_width;
icculus@1047
  1359
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
icculus@1047
  1360
            Uint32 s = *srcp; \
icculus@1047
  1361
            Uint32 d = *dstp; \
icculus@1047
  1362
            Uint32 s1 = s & 0xff00ff; \
icculus@1047
  1363
            Uint32 d1 = d & 0xff00ff; \
icculus@1047
  1364
            d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
icculus@1047
  1365
                 & 0xff00ff; \
icculus@1047
  1366
            s &= 0xff00; \
icculus@1047
  1367
            d &= 0xff00; \
icculus@1047
  1368
            d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
icculus@1047
  1369
            *dstp = d1 | d | 0xff000000; \
icculus@1047
  1370
            ++srcp; \
icculus@1047
  1371
            ++dstp; \
icculus@1047
  1372
            widthvar--; \
icculus@1047
  1373
        }
icculus@1047
  1374
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
icculus@1047
  1375
        if (width > 0) {
icculus@1047
  1376
            int extrawidth = (width % 4);
icculus@1047
  1377
            vector unsigned char valigner = VEC_ALIGNER(srcp);
icculus@1047
  1378
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
icculus@1047
  1379
            width -= extrawidth;
icculus@1047
  1380
            while (width) {
icculus@1047
  1381
                vector unsigned char voverflow;
icculus@1047
  1382
                vector unsigned char vd;
icculus@1047
  1383
icculus@1047
  1384
                /* s = *srcp */
icculus@1047
  1385
                voverflow = (vector unsigned char)vec_ld(15, srcp);
icculus@1047
  1386
                vs = vec_perm(vs, voverflow, valigner);
icculus@1047
  1387
                
icculus@1047
  1388
                /* d = *dstp */
icculus@1047
  1389
                vd = (vector unsigned char)vec_ld(0, dstp);
icculus@1047
  1390
icculus@1047
  1391
                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
icculus@1047
  1392
icculus@1047
  1393
                /* set the alpha channel to full on */
icculus@1047
  1394
                vd = vec_or(vd, valphamask);
icculus@1047
  1395
icculus@1047
  1396
                /* *dstp = res */
icculus@1047
  1397
                vec_st((vector unsigned int)vd, 0, dstp);
icculus@1047
  1398
                
icculus@1047
  1399
                srcp += 4;
icculus@1047
  1400
                dstp += 4;
icculus@1047
  1401
                width -= 4;
icculus@1047
  1402
                vs = voverflow;
icculus@1047
  1403
            }
icculus@1047
  1404
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
  1405
        }
icculus@1047
  1406
#undef ONE_PIXEL_BLEND
icculus@1047
  1407
 
icculus@1047
  1408
        srcp += srcskip;
icculus@1047
  1409
        dstp += dstskip;
icculus@1047
  1410
    }
icculus@1047
  1411
}
slouken@1795
  1412
#if __MWERKS__
slouken@1795
  1413
#pragma altivec_model off
slouken@1795
  1414
#endif
slouken@1361
  1415
#endif /* SDL_ALTIVEC_BLITTERS */
icculus@1047
  1416
slouken@1
  1417
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1
  1418
static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
slouken@0
  1419
{
slouken@0
  1420
	int width = info->d_width;
slouken@0
  1421
	int height = info->d_height;
slouken@0
  1422
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@0
  1423
	int srcskip = info->s_skip >> 2;
slouken@0
  1424
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@0
  1425
	int dstskip = info->d_skip >> 2;
slouken@0
  1426
slouken@0
  1427
	while(height--) {
slouken@0
  1428
	    DUFFS_LOOP4({
slouken@1
  1429
		    Uint32 s = *srcp++;
slouken@1
  1430
		    Uint32 d = *dstp;
slouken@1
  1431
		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1
  1432
			       + (s & d & 0x00010101)) | 0xff000000;
slouken@0
  1433
	    }, width);
slouken@0
  1434
	    srcp += srcskip;
slouken@0
  1435
	    dstp += dstskip;
slouken@0
  1436
	}
slouken@0
  1437
}
slouken@0
  1438
slouken@1
  1439
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1
  1440
static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
slouken@1
  1441
{
slouken@1
  1442
	unsigned alpha = info->src->alpha;
slouken@1
  1443
	if(alpha == 128) {
slouken@1
  1444
		BlitRGBtoRGBSurfaceAlpha128(info);
slouken@1
  1445
	} else {
slouken@1
  1446
		int width = info->d_width;
slouken@1
  1447
		int height = info->d_height;
slouken@1
  1448
		Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@1
  1449
		int srcskip = info->s_skip >> 2;
slouken@1
  1450
		Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@1
  1451
		int dstskip = info->d_skip >> 2;
slouken@689
  1452
		Uint32 s;
slouken@689
  1453
		Uint32 d;
slouken@689
  1454
		Uint32 s1;
slouken@689
  1455
		Uint32 d1;
slouken@1
  1456
slouken@1
  1457
		while(height--) {
slouken@689
  1458
			DUFFS_LOOP_DOUBLE2({
slouken@689
  1459
				/* One Pixel Blend */
slouken@1
  1460
				s = *srcp;
slouken@1
  1461
				d = *dstp;
slouken@1
  1462
				s1 = s & 0xff00ff;
slouken@1
  1463
				d1 = d & 0xff00ff;
slouken@1
  1464
				d1 = (d1 + ((s1 - d1) * alpha >> 8))
slouken@1
  1465
				     & 0xff00ff;
slouken@1
  1466
				s &= 0xff00;
slouken@1
  1467
				d &= 0xff00;
slouken@1
  1468
				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@1
  1469
				*dstp = d1 | d | 0xff000000;
slouken@1
  1470
				++srcp;
slouken@1
  1471
				++dstp;
slouken@689
  1472
			},{
slouken@689
  1473
			        /* Two Pixels Blend */
slouken@689
  1474
				s = *srcp;
slouken@689
  1475
				d = *dstp;
slouken@689
  1476
				s1 = s & 0xff00ff;
slouken@689
  1477
				d1 = d & 0xff00ff;
slouken@689
  1478
				d1 += (s1 - d1) * alpha >> 8;
slouken@689
  1479
				d1 &= 0xff00ff;
slouken@689
  1480
				     
slouken@689
  1481
				s = ((s & 0xff00) >> 8) | 
slouken@689
  1482
					((srcp[1] & 0xff00) << 8);
slouken@689
  1483
				d = ((d & 0xff00) >> 8) |
slouken@689
  1484
					((dstp[1] & 0xff00) << 8);
slouken@689
  1485
				d += (s - d) * alpha >> 8;
slouken@689
  1486
				d &= 0x00ff00ff;
slouken@689
  1487
				
slouken@689
  1488
				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
slouken@689
  1489
				++srcp;
slouken@689
  1490
				
slouken@689
  1491
			        s1 = *srcp;
slouken@689
  1492
				d1 = *dstp;
slouken@689
  1493
				s1 &= 0xff00ff;
slouken@689
  1494
				d1 &= 0xff00ff;
slouken@689
  1495
				d1 += (s1 - d1) * alpha >> 8;
slouken@689
  1496
				d1 &= 0xff00ff;
slouken@689
  1497
				
slouken@689
  1498
				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
slouken@689
  1499
				++srcp;
slouken@689
  1500
				++dstp;
slouken@1
  1501
			}, width);
slouken@1
  1502
			srcp += srcskip;
slouken@1
  1503
			dstp += dstskip;
slouken@1
  1504
		}
slouken@1
  1505
	}
slouken@1
  1506
}
slouken@1
  1507
slouken@0
  1508
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@0
  1509
static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
slouken@0
  1510
{
slouken@0
  1511
	int width = info->d_width;
slouken@0
  1512
	int height = info->d_height;
slouken@0
  1513
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@0
  1514
	int srcskip = info->s_skip >> 2;
slouken@0
  1515
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@0
  1516
	int dstskip = info->d_skip >> 2;
slouken@0
  1517
slouken@0
  1518
	while(height--) {
slouken@0
  1519
	    DUFFS_LOOP4({
slouken@0
  1520
		Uint32 dalpha;
slouken@0
  1521
		Uint32 d;
slouken@0
  1522
		Uint32 s1;
slouken@0
  1523
		Uint32 d1;
slouken@0
  1524
		Uint32 s = *srcp;
slouken@0
  1525
		Uint32 alpha = s >> 24;
slouken@0
  1526
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  1527
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  1528
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  1529
		   Benchmark this! */
slouken@689
  1530
		if(alpha) {   
slouken@689
  1531
		  if(alpha == SDL_ALPHA_OPAQUE) {
slouken@0
  1532
		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
slouken@689
  1533
		  } else {
slouken@0
  1534
		    /*
slouken@0
  1535
		     * take out the middle component (green), and process
slouken@0
  1536
		     * the other two in parallel. One multiply less.
slouken@0
  1537
		     */
slouken@0
  1538
		    d = *dstp;
slouken@0
  1539
		    dalpha = d & 0xff000000;
slouken@0
  1540
		    s1 = s & 0xff00ff;
slouken@0
  1541
		    d1 = d & 0xff00ff;
slouken@0
  1542
		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
slouken@0
  1543
		    s &= 0xff00;
slouken@0
  1544
		    d &= 0xff00;
slouken@0
  1545
		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@0
  1546
		    *dstp = d1 | d | dalpha;
slouken@689
  1547
		  }
slouken@0
  1548
		}
slouken@0
  1549
		++srcp;
slouken@0
  1550
		++dstp;
slouken@0
  1551
	    }, width);
slouken@0
  1552
	    srcp += srcskip;
slouken@0
  1553
	    dstp += dstskip;
slouken@0
  1554
	}
slouken@0
  1555
}
slouken@0
  1556
slouken@1542
  1557
#if GCC_ASMBLIT
slouken@689
  1558
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
slouken@689
  1559
inline static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
slouken@689
  1560
{
slouken@689
  1561
	int width = info->d_width;
slouken@689
  1562
	int height = info->d_height;
slouken@689
  1563
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@689
  1564
	int srcskip = info->s_skip >> 2;
slouken@689
  1565
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@689
  1566
	int dstskip = info->d_skip >> 2;
slouken@1542
  1567
	SDL_PixelFormat* sf = info->src;
slouken@1542
  1568
	Uint32 amask = sf->Amask;
slouken@689
  1569
slouken@689
  1570
	__asm__ (
slouken@689
  1571
	/* make mm6 all zeros. */
slouken@689
  1572
	"pxor       %%mm6, %%mm6\n"
slouken@689
  1573
	
slouken@689
  1574
	/* Make a mask to preserve the alpha. */
slouken@1542
  1575
	"movd      %0, %%mm7\n\t"           /* 0000F000 -> mm7 */
slouken@1542
  1576
	"punpcklbw %%mm7, %%mm7\n\t"        /* FF000000 -> mm7 */
slouken@1542
  1577
	"pcmpeqb   %%mm4, %%mm4\n\t"        /* FFFFFFFF -> mm4 */
slouken@1542
  1578
	"movq      %%mm4, %%mm3\n\t"        /* FFFFFFFF -> mm3 (for later) */
slouken@1542
  1579
	"pxor      %%mm4, %%mm7\n\t"        /* 00FFFFFF -> mm7 (mult mask) */
slouken@689
  1580
slouken@1542
  1581
	/* form channel masks */
slouken@1542
  1582
	"movq      %%mm7, %%mm4\n\t"        /* 00FFFFFF -> mm4 */
slouken@1542
  1583
	"packsswb  %%mm6, %%mm4\n\t"        /* 00000FFF -> mm4 (channel mask) */
slouken@1542
  1584
	"packsswb  %%mm6, %%mm3\n\t"        /* 0000FFFF -> mm3 */
slouken@1542
  1585
	"pxor      %%mm4, %%mm3\n\t"        /* 0000F000 -> mm3 (~channel mask) */
slouken@1542
  1586
	
slouken@1542
  1587
	/* get alpha channel shift */
slouken@1542
  1588
	"movd      %1, %%mm5\n\t" /* Ashift -> mm5 */
slouken@1542
  1589
slouken@1542
  1590
	  : /* nothing */ : "m" (sf->Amask), "m" (sf->Ashift) );
slouken@689
  1591
slouken@689
  1592
	while(height--) {
slouken@689
  1593
slouken@689
  1594
	    DUFFS_LOOP4({
slouken@1542
  1595
		Uint32 alpha;
slouken@689
  1596
slouken@689
  1597
		__asm__ (
slouken@689
  1598
		"prefetch 64(%0)\n"
slouken@689
  1599
		"prefetch 64(%1)\n"
slouken@689
  1600
			: : "r" (srcp), "r" (dstp) );
slouken@689
  1601
slouken@1542
  1602
		alpha = *srcp & amask;
slouken@689
  1603
		/* FIXME: Here we special-case opaque alpha since the
slouken@689
  1604
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@689
  1605
		   it correctly. Also special-case alpha=0 for speed?
slouken@689
  1606
		   Benchmark this! */
slouken@1542
  1607
		if(alpha == 0) {
slouken@1542
  1608
		    /* do nothing */
slouken@1542
  1609
		}
slouken@1542
  1610
		else if(alpha == amask) {
slouken@1542
  1611
			/* opaque alpha -- copy RGB, keep dst alpha */
slouken@1542
  1612
		    /* using MMX here to free up regular registers for other things */
slouken@1542
  1613
			    __asm__ (
slouken@1542
  1614
		    "movd      (%0),  %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
slouken@1542
  1615
		    "movd      (%1),  %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
slouken@1542
  1616
		    "pand      %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
slouken@1542
  1617
		    "pand      %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
slouken@1542
  1618
		    "por       %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
slouken@1542
  1619
		    "movd      %%mm1, (%1) \n\t" /* mm1 -> dst */
slouken@1542
  1620
slouken@1542
  1621
		     : : "r" (srcp), "r" (dstp) );
slouken@689
  1622
		} 
slouken@689
  1623
slouken@689
  1624
		else {
slouken@689
  1625
			    __asm__ (
slouken@689
  1626
		    /* load in the source, and dst. */
slouken@689
  1627
		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
slouken@689
  1628
		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
slouken@689
  1629
slouken@689
  1630
		    /* Move the src alpha into mm2 */
slouken@689
  1631
slouken@689
  1632
		    /* if supporting pshufw */
slouken@689
  1633
		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
slouken@689
  1634
		    /*"psrlw     $8, %%mm2\n" */
slouken@689
  1635
		    
slouken@689
  1636
		    /* else: */
slouken@1542
  1637
		    "movd       %2,    %%mm2\n"
slouken@1542
  1638
		    "psrld      %%mm5, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
slouken@689
  1639
		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
slouken@689
  1640
		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
slouken@1542
  1641
		    "pand       %%mm7, %%mm2\n"              /* to preserve dest alpha */
slouken@689
  1642
slouken@689
  1643
		    /* move the colors into words. */
slouken@689
  1644
		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
slouken@689
  1645
		    "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
slouken@689
  1646
slouken@689
  1647
		    /* src - dst */
slouken@689
  1648
		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
slouken@689
  1649
slouken@689
  1650
		    /* A * (src-dst) */
slouken@1542
  1651
		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = 0*As-d As*Rs-d | As*Gs-d  As*Bs-d */
slouken@1542
  1652
		    "psrlw     $8,    %%mm0\n"		    /* mm0 = 0>>8 Rc>>8 | Gc>>8  Bc>>8 */
slouken@1542
  1653
		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = 0+Ad Rc+Rd | Gc+Gd  Bc+Bd */
slouken@689
  1654
slouken@689
  1655
		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
slouken@689
  1656
		    
slouken@689
  1657
		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
slouken@689
  1658
slouken@1542
  1659
		     : : "r" (srcp), "r" (dstp), "r" (alpha) );
slouken@689
  1660
slouken@689
  1661
		}
slouken@689
  1662
		++srcp;
slouken@689
  1663
		++dstp;
slouken@689
  1664
	    }, width);
slouken@689
  1665
	    srcp += srcskip;
slouken@689
  1666
	    dstp += dstskip;
slouken@689
  1667
	}
slouken@689
  1668
slouken@689
  1669
	__asm__ (
slouken@689
  1670
	"emms\n"
slouken@689
  1671
		:   );
slouken@689
  1672
}
slouken@1542
  1673
/* End GCC_ASMBLIT*/
slouken@1542
  1674
slouken@1542
  1675
#elif MSVC_ASMBLIT
slouken@1542
  1676
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
slouken@1542
  1677
static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
slouken@1542
  1678
{
slouken@1542
  1679
	int width = info->d_width;
slouken@1542
  1680
	int height = info->d_height;
slouken@1542
  1681
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@1542
  1682
	int srcskip = info->s_skip >> 2;
slouken@1542
  1683
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@1542
  1684
	int dstskip = info->d_skip >> 2;
slouken@1542
  1685
	SDL_PixelFormat* sf = info->src;
slouken@1542
  1686
	Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
slouken@1542
  1687
	Uint32 amask = sf->Amask;
slouken@1542
  1688
	Uint32 ashift = sf->Ashift;
slouken@1542
  1689
	Uint64 multmask;
slouken@1542
  1690
	
slouken@1542
  1691
	__m64 src1, dst1, mm_alpha, mm_zero, dmask;
slouken@1542
  1692
slouken@1542
  1693
	mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
slouken@1542
  1694
	multmask = ~(0xFFFFi64 << (ashift * 2));
slouken@1542
  1695
	dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
slouken@1542
  1696
slouken@1542
  1697
	while(height--) {
slouken@1542
  1698
	    DUFFS_LOOP4({
slouken@1542
  1699
		Uint32 alpha;
slouken@1542
  1700
slouken@1542
  1701
		_m_prefetch(srcp + 16);
slouken@1542
  1702
		_m_prefetch(dstp + 16);
slouken@1542
  1703
slouken@1542
  1704
		alpha = *srcp & amask;
slouken@1542
  1705
		if (alpha == 0) {
slouken@1542
  1706
			/* do nothing */
slouken@1542
  1707
		} else if (alpha == amask) {
slouken@1542
  1708
			/* copy RGB, keep dst alpha */
slouken@1542
  1709
			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
slouken@1542
  1710
		} else {
slouken@1542
  1711
			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
slouken@1542
  1712
			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
slouken@1542
  1713
slouken@1542
  1714
			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
slouken@1542
  1715
			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
slouken@1542
  1716
slouken@1542
  1717
			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
slouken@1542
  1718
			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
slouken@1542
  1719
			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
slouken@1542
  1720
			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
slouken@1542
  1721
			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
slouken@1542
  1722
slouken@1542
  1723
			/* blend */		    
slouken@1542
  1724
			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
slouken@1542
  1725
			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
slouken@1542
  1726
			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
slouken@1542
  1727
			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
slouken@1542
  1728
			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
slouken@1542
  1729
			
slouken@1542
  1730
			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
slouken@1542
  1731
		}
slouken@1542
  1732
		++srcp;
slouken@1542
  1733
		++dstp;
slouken@1542
  1734
	    }, width);
slouken@1542
  1735
	    srcp += srcskip;
slouken@1542
  1736
	    dstp += dstskip;
slouken@1542
  1737
	}
slouken@1542
  1738
	_mm_empty();
slouken@1542
  1739
}
slouken@1542
  1740
/* End MSVC_ASMBLIT */
slouken@1542
  1741
slouken@1542
  1742
#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
slouken@689
  1743
slouken@1
  1744
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
slouken@1
  1745
slouken@1
  1746
/*
 * blend a single 16 bit pixel at 50%
 *
 * `mask` selects the bits of each colour field that survive the halving
 * shift; the bits outside `mask` (the low bit of each field) are added
 * back only when set in both pixels.  Every argument is parenthesised so
 * that compound expressions can be passed safely.
 */
#define BLEND16_50(d, s, mask)						\
	(((((s) & (mask)) + ((d) & (mask))) >> 1) +			\
	 ((s) & (d) & (~(mask) & 0xffff)))

/*
 * `mask` replicated into both 16-bit halves of a 32-bit word.  The cast to
 * unsigned keeps the shift from being done in (signed) int, where a mask
 * with bit 15 set would be shifted into the sign bit.
 */
#define BLEND2x16_MASK(mask)						\
	((unsigned)(mask) | ((unsigned)(mask) << 16))

/* blend two 16 bit pixels (packed in one 32 bit word) at 50% */
#define BLEND2x16_50(d, s, mask)					     \
	((((s) & BLEND2x16_MASK(mask)) >> 1) +				     \
	 (((d) & BLEND2x16_MASK(mask)) >> 1) +				     \
	 ((s) & (d) & ~BLEND2x16_MASK(mask)))
slouken@1
  1754
slouken@1
  1755
/*
 * 16bpp -> 16bpp blending at 50%, two pixels per 32-bit word where possible.
 *
 * `mask` is handed to BLEND16_50 / BLEND2x16_50.  For every row the code
 * checks whether source and destination pointers agree in bit 1 of their
 * address.  If they do, both can be walked with 32-bit accesses after at
 * most one leading pixel.  If they do not, the destination is brought to a
 * 32-bit boundary and each source pair is assembled from two consecutive
 * 32-bit source words.
 */
static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
{
    int width = info->d_width;
    int height = info->d_height;
    Uint16 *srcp = (Uint16 *)info->s_pixels;
    Uint16 *dstp = (Uint16 *)info->d_pixels;
    int srcskip = info->s_skip >> 1;    /* bytes -> pixels */
    int dstskip = info->d_skip >> 1;    /* bytes -> pixels */

    while (height--) {
        int remaining = width;

        if ((((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) == 0) {
            /* source and destination are aligned */

            /* first odd pixel? */
            if ((uintptr_t)srcp & 2) {
                Uint16 dpx = *dstp;
                Uint16 spx = *srcp;
                *dstp = BLEND16_50(dpx, spx, mask);
                ++srcp;
                ++dstp;
                --remaining;
            }
            /* srcp and dstp are now 32-bit aligned */

            for (; remaining > 1; remaining -= 2) {
                Uint32 spair = *(Uint32 *)srcp;
                Uint32 dpair = *(Uint32 *)dstp;
                *(Uint32 *)dstp = BLEND2x16_50(dpair, spair, mask);
                srcp += 2;
                dstp += 2;
            }

            /* last odd pixel? */
            if (remaining) {
                Uint16 dpx = *dstp;
                Uint16 spx = *srcp;
                *dstp = BLEND16_50(dpx, spx, mask);
                ++srcp;
                ++dstp;
            }
            srcp += srcskip;
            dstp += dstskip;
        } else {
            /*
             * Source and destination not aligned, pipeline it.
             * This is mostly a win for big blits but no loss for
             * small ones
             */
            Uint32 carried;    /* most recently loaded 32-bit source word */

            /* handle odd destination */
            if ((uintptr_t)dstp & 2) {
                Uint16 dpx = *dstp;
                Uint16 spx = *srcp;
                *dstp = BLEND16_50(dpx, spx, mask);
                ++dstp;
                ++srcp;
                --remaining;
            }
            ++srcp;    /* srcp is now 32-bit aligned */

            /* bootstrap pipeline with first halfword */
            carried = ((Uint32 *)srcp)[-1];

            for (; remaining > 1; remaining -= 2) {
                Uint32 sword = *(Uint32 *)srcp;
                Uint32 dpair = *(Uint32 *)dstp;
                Uint32 spair;
                /* second half of the previous word + first half of this one */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                spair = (carried << 16) + (sword >> 16);
#else
                spair = (carried >> 16) + (sword << 16);
#endif
                carried = sword;
                *(Uint32 *)dstp = BLEND2x16_50(dpair, spair, mask);
                dstp += 2;
                srcp += 2;
            }

            /* final pixel if any */
            if (remaining) {
                Uint16 dpx = *dstp;
                Uint16 spx;
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                spx = (Uint16)carried;
#else
                spx = (Uint16)(carried >> 16);
#endif
                *dstp = BLEND16_50(dpx, spx, mask);
                ++srcp;
                ++dstp;
            }
            /* -1 undoes the extra srcp increment made for the pipeline */
            srcp += srcskip - 1;
            dstp += dstskip;
        }
    }
}
slouken@1
  1852
slouken@1542
  1853
#if GCC_ASMBLIT
slouken@689
  1854
/* fast RGB565->RGB565 blending with surface alpha */
slouken@689
  1855
/*
 * RGB565 -> RGB565 blit with a single per-surface alpha, GCC inline-asm
 * MMX variant.
 *
 * info->src->alpha is the blend factor.  An alpha of exactly 128 is
 * delegated to Blit16to16SurfaceAlpha128().  Otherwise d_height rows of
 * d_width pixels are blended from s_pixels into d_pixels; s_skip and d_skip
 * are halved before being added to the 16-bit pixel pointers at the end of
 * each row.
 *
 * DUFFS_LOOP_QUATRO2 is handed three bodies that consume one, two and four
 * pixels respectively: the first two are scalar, the last one uses MMX.
 */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
		/*
		 * Staging slot for the 64-bit constants loaded into MMX registers.
		 * It is a genuine Uint64 (not a byte array written through a cast
		 * pointer) so that the object handed to movq_m2r has the same size
		 * and type as the 8 bytes movq reads.
		 * NOTE(review): movq_m2r presumably uses its first argument as an
		 * asm memory operand -- confirm against the mmx macro header.
		 */
		Uint64 load;

		/*
		 * Clear the low three bits so the MMX factor equals
		 * (alpha >> 3) << 3, i.e. the same 5-bit alpha the scalar bodies
		 * below use.
		 */
		alpha &= ~(1+2+4);
		load = alpha;
		alpha >>= 3;		/* downscale alpha to 5 bits */

		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);

		/* Setup the 565 color channel masks */
		load = 0x07E007E007E007E0ULL;
		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
		load = 0x001F001F001F001FULL;
		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				/*
				 * NOTE(review): (*srcp) and (*dstp) are Uint16 lvalues
				 * while movq moves 8 bytes -- confirm the operand
				 * constraints in the mmx macros cover the full quadword.
				 */
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- does not need a mask since the right shift clears
				   the uninteresting bits */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
				psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* alpha used is actually 11 bits
				   11 + 5 = 16 bits, so the sign bits are lost */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			/* step both pointers over the row padding */
			srcp += srcskip;
			dstp += dstskip;
		}
		/* leave MMX state before returning to code that may use the FPU */
		emms();
	}
}
slouken@689
  1992
slouken@689
  1993
/* fast RGB555->RGB555 blending with surface alpha */
slouken@689
  1994
/*
 * RGB555 -> RGB555 blit with a single per-surface alpha, GCC inline-asm
 * MMX variant.
 *
 * info->src->alpha is the blend factor.  An alpha of exactly 128 is
 * delegated to Blit16to16SurfaceAlpha128().  Otherwise d_height rows of
 * d_width pixels are blended from s_pixels into d_pixels; s_skip and d_skip
 * are halved before being added to the 16-bit pixel pointers at the end of
 * each row.
 *
 * DUFFS_LOOP_QUATRO2 is handed three bodies that consume one, two and four
 * pixels respectively: the first two are scalar, the last one uses MMX.
 */
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	unsigned alpha = info->src->alpha;
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
		/*
		 * Staging slot for the 64-bit constants loaded into MMX registers.
		 * It is a genuine Uint64 (not a byte array written through a cast
		 * pointer) so that the object handed to movq_m2r has the same size
		 * and type as the 8 bytes movq reads.
		 * NOTE(review): movq_m2r presumably uses its first argument as an
		 * asm memory operand -- confirm against the mmx macro header.
		 */
		Uint64 load;

		/*
		 * Clear the low three bits so the MMX factor equals
		 * (alpha >> 3) << 3, i.e. the same 5-bit alpha the scalar bodies
		 * below use.
		 */
		alpha &= ~(1+2+4);
		load = alpha;
		alpha >>= 3;		/* downscale alpha to 5 bits */

		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
		/* position alpha to allow for mullo and mulhi on diff channels
		   to reduce the number of operations */
		psllq_i2r(3, mm0);

		/* Setup the 555 color channel masks */
		load = 0x03E003E003E003E0ULL;
		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
		load = 0x001F001F001F001FULL;
		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = (Uint16)(d | d >> 16);
			},{
				/*
				 * NOTE(review): (*srcp) and (*dstp) are Uint16 lvalues
				 * while movq moves 8 bytes -- confirm the operand
				 * constraints in the mmx macros cover the full quadword.
				 */
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */

				/* red -- process the bits in place */
				psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
					/* by reusing the GREEN mask we free up another mmx
					   register to accumulate the result */

				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 15 - 16 = 10 bits, uninteresting bits will be
				   cleared by a MASK below */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */

				psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */

				movq_r2r(mm6, mm1); /* save new reds in dsts */

				/* green -- process the bits in place */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
				pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 10 - 16 = 5 bits,  so all the lower uninteresting
				   bits are gone and the sign bits present */
				psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */

				por_r2r(mm6, mm1); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
				pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				/* 11 + 5 = 16 bits, so the sign bits are lost and
				   the interesting bits will need to be MASKed */
				psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */

				por_r2r(mm6, mm1); /* save new blues in dsts */

				movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			/* step both pointers over the row padding */
			srcp += srcskip;
			dstp += dstskip;
		}
		/* leave MMX state before returning to code that may use the FPU */
		emms();
	}
}
slouken@1542
  2136
/* End GCC_ASMBLIT */
slouken@1542
  2137
slouken@1542
  2138
#elif MSVC_ASMBLIT
slouken@1542
  2139
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1542
  2140
/*
 * RGB565 -> RGB565 blit with a single per-surface alpha, MMX intrinsics
 * variant.
 *
 * An alpha of exactly 128 is handed to Blit16to16SurfaceAlpha128().  For any
 * other value, d_height rows of d_width pixels are blended from s_pixels
 * into d_pixels.  DUFFS_LOOP_QUATRO2 receives three bodies that consume one,
 * two and four pixels: the first two are scalar, the third uses __m64.
 */
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	/* 565 layout with green moved to the upper half-word: 0x07e0 | 0xf81f */
	const Uint32 split565 = 0x07e0f81f;
	unsigned alpha = info->src->alpha;
	int width = info->d_width;
	int rows = info->d_height;
	Uint16 *srcp = (Uint16 *)info->s_pixels;
	Uint16 *dstp = (Uint16 *)info->d_pixels;
	/* row padding, converted from the skip fields to Uint16 units */
	int src_gap = info->s_skip >> 1;
	int dst_gap = info->d_skip >> 1;
	Uint32 s, d;
	__m64 src_px, dst_px, sc, dc, green_bits, blue_bits, blended, alpha_x4;

	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xf7de);
		return;
	}

	/*
	 * Drop the low three alpha bits so the vector factor matches the
	 * 5-bit alpha used by the scalar bodies.
	 */
	alpha &= ~(1+2+4);
	alpha_x4 = _mm_set_pi32(0, alpha);			/* 0000000A */
	alpha_x4 = _mm_unpacklo_pi16(alpha_x4, alpha_x4);	/* 00000A0A */
	alpha_x4 = _mm_unpacklo_pi32(alpha_x4, alpha_x4);	/* 0A0A0A0A */
	/* pre-shift so both mullo and mulhi can be used on different channels */
	alpha_x4 = _mm_slli_si64(alpha_x4, 3);
	alpha >>= 3;	/* 5-bit alpha for the scalar bodies */

	/* per-word channel masks for 565 */
	green_bits = _mm_set_pi32(0x07E007E0, 0x07E007E0);
	blue_bits = _mm_set_pi32(0x001F001F, 0x001F001F);

	for(; rows != 0; --rows) {
		DUFFS_LOOP_QUATRO2(
		{
			/* one pixel: spread the channels apart and blend all three at once */
			s = *srcp;
			d = *dstp;
			++srcp;
			s = ((s << 16) | s) & split565;
			d = ((d << 16) | d) & split565;
			d = (d + (((s - d) * alpha) >> 5)) & split565;
			*dstp = (Uint16)((d >> 16) | d);
			++dstp;
		},{
			/* two pixels: the one-pixel blend, twice in a row */
			s = *srcp;
			d = *dstp;
			++srcp;
			s = ((s << 16) | s) & split565;
			d = ((d << 16) | d) & split565;
			d = (d + (((s - d) * alpha) >> 5)) & split565;
			*dstp = (Uint16)((d >> 16) | d);
			++dstp;

			s = *srcp;
			d = *dstp;
			++srcp;
			s = ((s << 16) | s) & split565;
			d = ((d << 16) | d) & split565;
			d = (d + (((s - d) * alpha) >> 5)) & split565;
			*dstp = (Uint16)((d >> 16) | d);
			++dstp;
		},{
			/* four pixels at a time */
			src_px = *(__m64*)srcp;
			dst_px = *(__m64*)dstp;

			/* red: shifted down to the low bits, so no mask is needed */
			sc = _mm_srli_pi16(src_px, 11);
			dc = _mm_srli_pi16(dst_px, 11);
			sc = _mm_srli_pi16(_mm_mullo_pi16(_mm_sub_pi16(sc, dc), alpha_x4), 11);
			blended = _mm_slli_pi16(_mm_add_pi16(sc, dc), 11);

			/* green: blended in place */
			sc = _mm_and_si64(src_px, green_bits);
			dc = _mm_and_si64(dst_px, green_bits);
			sc = _mm_slli_pi16(_mm_mulhi_pi16(_mm_sub_pi16(sc, dc), alpha_x4), 5);
			blended = _mm_or_si64(blended, _mm_add_pi16(sc, dc));

			/* blue: low bits, re-masked after the add */
			sc = _mm_and_si64(src_px, blue_bits);
			dc = _mm_and_si64(dst_px, blue_bits);
			sc = _mm_srli_pi16(_mm_mullo_pi16(_mm_sub_pi16(sc, dc), alpha_x4), 11);
			dc = _mm_and_si64(_mm_add_pi16(sc, dc), blue_bits);
			blended = _mm_or_si64(blended, dc);

			*(__m64*)dstp = blended;

			srcp += 4;
			dstp += 4;
		}, width);
		srcp += src_gap;
		dstp += dst_gap;
	}
	_mm_empty();
}
slouken@1542
  2272
slouken@1542
  2273
/* fast RGB555->RGB555 blending with surface alpha */
slouken@1542
  2274
/*
 * RGB555 -> RGB555 blit with a single per-surface alpha, MMX intrinsics
 * variant.
 *
 * An alpha of exactly 128 is handed to Blit16to16SurfaceAlpha128().  For any
 * other value, d_height rows of d_width pixels are blended from s_pixels
 * into d_pixels.  DUFFS_LOOP_QUATRO2 receives three bodies that consume one,
 * two and four pixels: the first two are scalar, the third uses __m64.
 */
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	/* 555 layout with green moved to the upper half-word: 0x03e0 | 0x7c1f */
	const Uint32 split555 = 0x03e07c1f;
	unsigned alpha = info->src->alpha;
	int width = info->d_width;
	int rows = info->d_height;
	Uint16 *srcp = (Uint16 *)info->s_pixels;
	Uint16 *dstp = (Uint16 *)info->d_pixels;
	/* row padding, converted from the skip fields to Uint16 units */
	int src_gap = info->s_skip >> 1;
	int dst_gap = info->d_skip >> 1;
	Uint32 s, d;
	__m64 src_px, dst_px, sc, dc, red_bits, green_bits, blue_bits, blended, alpha_x4;

	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xfbde);
		return;
	}

	/*
	 * Drop the low three alpha bits so the vector factor matches the
	 * 5-bit alpha used by the scalar bodies.
	 */
	alpha &= ~(1+2+4);
	alpha_x4 = _mm_set_pi32(0, alpha);			/* 0000000A */
	alpha_x4 = _mm_unpacklo_pi16(alpha_x4, alpha_x4);	/* 00000A0A */
	alpha_x4 = _mm_unpacklo_pi32(alpha_x4, alpha_x4);	/* 0A0A0A0A */
	/* pre-shift so both mullo and mulhi can be used on different channels */
	alpha_x4 = _mm_slli_si64(alpha_x4, 3);
	alpha >>= 3;	/* 5-bit alpha for the scalar bodies */

	/* per-word channel masks for 555 */
	red_bits = _mm_set_pi32(0x7C007C00, 0x7C007C00);
	green_bits = _mm_set_pi32(0x03E003E0, 0x03E003E0);
	blue_bits = _mm_set_pi32(0x001F001F, 0x001F001F);

	for(; rows != 0; --rows) {
		DUFFS_LOOP_QUATRO2(
		{
			/* one pixel: spread the channels apart and blend all three at once */
			s = *srcp;
			d = *dstp;
			++srcp;
			s = ((s << 16) | s) & split555;
			d = ((d << 16) | d) & split555;
			d = (d + (((s - d) * alpha) >> 5)) & split555;
			*dstp = (Uint16)((d >> 16) | d);
			++dstp;
		},{
			/* two pixels: the one-pixel blend, twice in a row */
			s = *srcp;
			d = *dstp;
			++srcp;
			s = ((s << 16) | s) & split555;
			d = ((d << 16) | d) & split555;
			d = (d + (((s - d) * alpha) >> 5)) & split555;
			*dstp = (Uint16)((d >> 16) | d);
			++dstp;

			s = *srcp;
			d = *dstp;
			++srcp;
			s = ((s << 16) | s) & split555;
			d = ((d << 16) | d) & split555;
			d = (d + (((s - d) * alpha) >> 5)) & split555;
			*dstp = (Uint16)((d >> 16) | d);
			++dstp;
		},{
			/* four pixels at a time */
			src_px = *(__m64*)srcp;
			dst_px = *(__m64*)dstp;

			/* red: blended in place, then re-masked */
			sc = _mm_and_si64(src_px, red_bits);
			dc = _mm_and_si64(dst_px, red_bits);
			sc = _mm_slli_pi16(_mm_mulhi_pi16(_mm_sub_pi16(sc, dc), alpha_x4), 5);
			blended = _mm_and_si64(_mm_add_pi16(sc, dc), red_bits);

			/* green: blended in place */
			sc = _mm_and_si64(src_px, green_bits);
			dc = _mm_and_si64(dst_px, green_bits);
			sc = _mm_slli_pi16(_mm_mulhi_pi16(_mm_sub_pi16(sc, dc), alpha_x4), 5);
			blended = _mm_or_si64(blended, _mm_add_pi16(sc, dc));

			/* blue: low bits, re-masked after the add */
			sc = _mm_and_si64(src_px, blue_bits);
			dc = _mm_and_si64(dst_px, blue_bits);
			sc = _mm_srli_pi16(_mm_mullo_pi16(_mm_sub_pi16(sc, dc), alpha_x4), 11);
			dc = _mm_and_si64(_mm_add_pi16(sc, dc), blue_bits);
			blended = _mm_or_si64(blended, dc);

			*(__m64*)dstp = blended;

			srcp += 4;
			dstp += 4;
		}, width);
		srcp += src_gap;
		dstp += dst_gap;
	}
	_mm_empty();
}
slouken@1542
  2407
#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
slouken@689
  2408
slouken@1
  2409
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1
  2410
/*
 * RGB565 -> RGB565 blit with a single per-surface alpha (plain C).
 *
 * An alpha of exactly 128 is handed to Blit16to16SurfaceAlpha128().  For any
 * other value the alpha is reduced to 5 bits and d_height rows of d_width
 * pixels are blended from s_pixels into d_pixels, one pixel per
 * DUFFS_LOOP4 body.
 */
static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
{
	/* 565 layout with green moved to the upper half-word: 0x07e0 | 0xf81f */
	const Uint32 split565 = 0x07e0f81f;
	unsigned alpha = info->src->alpha;
	int width = info->d_width;
	int rows = info->d_height;
	Uint16 *srcp = (Uint16 *)info->s_pixels;
	Uint16 *dstp = (Uint16 *)info->d_pixels;
	/* row padding, converted from the skip fields to Uint16 units */
	int src_gap = info->s_skip >> 1;
	int dst_gap = info->d_skip >> 1;

	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xf7de);
		return;
	}

	alpha >>= 3;	/* keep the top 5 bits of alpha */

	for(; rows != 0; --rows) {
		DUFFS_LOOP4({
			Uint32 spix = *srcp;
			Uint32 dpix = *dstp;
			/*
			 * Move green into the high 16 bits so that red, green
			 * and blue each have headroom and can be blended with
			 * one multiply.
			 */
			spix = ((spix << 16) | spix) & split565;
			dpix = ((dpix << 16) | dpix) & split565;
			dpix = (dpix + (((spix - dpix) * alpha) >> 5)) & split565;
			/* fold green back down into a 16-bit pixel */
			*dstp = (Uint16)((dpix >> 16) | dpix);
			++srcp;
			++dstp;
		}, width);
		srcp += src_gap;
		dstp += dst_gap;
	}
}
slouken@0
  2444
slouken@0
  2445
/* fast RGB555->RGB555 blending with surface alpha */
slouken@0
  2446
static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
slouken@0
  2447
{
slouken@1
  2448
	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
slouken@1
  2449
	if(alpha == 128) {
slouken@1
  2450
		Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1
  2451
	} else {
slouken@1
  2452
		int width = info->d_width;
slouken@1
  2453
		int height = info->d_height;
slouken@1
  2454
		Uint16 *srcp = (Uint16 *)info->s_pixels;
slouken@1
  2455
		int srcskip = info->s_skip >> 1;
slouken@1
  2456
		Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@1
  2457
		int dstskip = info->d_skip >> 1;
slouken@1
  2458
		alpha >>= 3;		/* downscale alpha to 5 bits */
slouken@0
  2459
slouken@1
  2460
		while(height--) {
slouken@1
  2461
			DUFFS_LOOP4({
slouken@1
  2462
				Uint32 s = *srcp++;
slouken@1
  2463
				Uint32 d = *dstp;
slouken@1
  2464
				/*
slouken@1
  2465
				 * shift out the middle component (green) to
slouken@1
  2466
				 * the high 16 bits, and process all three RGB
slouken@1
  2467
				 * components at the same time.
slouken@1
  2468
				 */
slouken@1
  2469
				s = (s | s << 16) & 0x03e07c1f;
slouken@1
  2470
				d = (d | d << 16) & 0x03e07c1f;
slouken@1
  2471
				d += (s - d) * alpha >> 5;
slouken@1
  2472
				d &= 0x03e07c1f;
slouken@1428
  2473
				*dstp++ = (Uint16)(d | d >> 16);
slouken@1
  2474
			}, width);
slouken@1
  2475
			srcp += srcskip;
slouken@1
  2476
			dstp += dstskip;
slouken@1
  2477
		}
slouken@0
  2478
	}
slouken@0
  2479
}
slouken@0
  2480
slouken@0
  2481
/* fast ARGB8888->RGB565 blending with pixel alpha */
slouken@0
  2482
static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
slouken@0
  2483
{
slouken@0
  2484
	int width = info->d_width;
slouken@0
  2485
	int height = info->d_height;
slouken@0
  2486
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@0
  2487
	int srcskip = info->s_skip >> 2;
slouken@0
  2488
	Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@0
  2489
	int dstskip = info->d_skip >> 1;
slouken@0
  2490
slouken@0
  2491
	while(height--) {
slouken@0
  2492
	    DUFFS_LOOP4({
slouken@0
  2493
		Uint32 s = *srcp;
slouken@0
  2494
		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  2495
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  2496
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  2497
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  2498
		   Benchmark this! */
slouken@689
  2499
		if(alpha) {   
slouken@689
  2500
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@1428
  2501
		    *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
slouken@689
  2502
		  } else {
slouken@0
  2503
		    Uint32 d = *dstp;
slouken@0
  2504
		    /*
slouken@0
  2505
		     * convert source and destination to G0RAB65565
slouken@0
  2506
		     * and blend all components at the same time
slouken@0
  2507
		     */
slouken@0
  2508
		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
slouken@0
  2509
		      + (s >> 3 & 0x1f);
slouken@0
  2510
		    d = (d | d << 16) & 0x07e0f81f;
slouken@0
  2511
		    d += (s - d) * alpha >> 5;
slouken@0
  2512
		    d &= 0x07e0f81f;
slouken@1428
  2513
		    *dstp = (Uint16)(d | d >> 16);
slouken@689
  2514
		  }
slouken@0
  2515
		}
slouken@0
  2516
		srcp++;
slouken@0
  2517
		dstp++;
slouken@0
  2518
	    }, width);
slouken@0
  2519
	    srcp += srcskip;
slouken@0
  2520
	    dstp += dstskip;
slouken@0
  2521
	}
slouken@0
  2522
}
slouken@0
  2523
slouken@0
  2524
/* fast ARGB8888->RGB555 blending with pixel alpha */
slouken@0
  2525
static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
slouken@0
  2526
{
slouken@0
  2527
	int width = info->d_width;
slouken@0
  2528
	int height = info->d_height;
slouken@0
  2529
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@0
  2530
	int srcskip = info->s_skip >> 2;
slouken@0
  2531
	Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@0
  2532
	int dstskip = info->d_skip >> 1;
slouken@0
  2533
slouken@0
  2534
	while(height--) {
slouken@0
  2535
	    DUFFS_LOOP4({
slouken@0
  2536
		unsigned alpha;
slouken@0
  2537
		Uint32 s = *srcp;
slouken@0
  2538
		alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  2539
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  2540
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  2541
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  2542
		   Benchmark this! */
slouken@689
  2543
		if(alpha) {   
slouken@689
  2544
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@1428
  2545
		    *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
slouken@689
  2546
		  } else {
slouken@0
  2547
		    Uint32 d = *dstp;
slouken@0
  2548
		    /*
slouken@0
  2549
		     * convert source and destination to G0RAB65565
slouken@0
  2550
		     * and blend all components at the same time
slouken@0
  2551
		     */
slouken@0
  2552
		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
slouken@0
  2553
		      + (s >> 3 & 0x1f);
slouken@0
  2554
		    d = (d | d << 16) & 0x03e07c1f;
slouken@0
  2555
		    d += (s - d) * alpha >> 5;
slouken@0
  2556
		    d &= 0x03e07c1f;
slouken@1428
  2557
		    *dstp = (Uint16)(d | d >> 16);
slouken@689
  2558
		  }
slouken@0
  2559
		}
slouken@0
  2560
		srcp++;
slouken@0
  2561
		dstp++;
slouken@0
  2562
	    }, width);
slouken@0
  2563
	    srcp += srcskip;
slouken@0
  2564
	    dstp += dstskip;
slouken@0
  2565
	}
slouken@0
  2566
}
slouken@0
  2567
slouken@0
  2568
/* General (slow) N->N blending with per-surface alpha */
slouken@0
  2569
static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
slouken@0
  2570
{
slouken@0
  2571
	int width = info->d_width;
slouken@0
  2572
	int height = info->d_height;
slouken@0
  2573
	Uint8 *src = info->s_pixels;
slouken@0
  2574
	int srcskip = info->s_skip;
slouken@0
  2575
	Uint8 *dst = info->d_pixels;
slouken@0
  2576
	int dstskip = info->d_skip;
slouken@0
  2577
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
  2578
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
  2579
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
  2580
	int dstbpp = dstfmt->BytesPerPixel;
slouken@0
  2581
	unsigned sA = srcfmt->alpha;
slouken@0
  2582
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
slouken@0
  2583
slouken@689
  2584
	if(sA) {
slouken@689
  2585
	  while ( height-- ) {
slouken@0
  2586
	    DUFFS_LOOP4(
slouken@0
  2587
	    {
icculus@1162
  2588
		Uint32 Pixel;
slouken@0
  2589
		unsigned sR;
slouken@0
  2590
		unsigned sG;
slouken@0
  2591
		unsigned sB;
slouken@0
  2592
		unsigned dR;
slouken@0
  2593
		unsigned dG;
slouken@0
  2594
		unsigned dB;
icculus@1162
  2595
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
icculus@1162
  2596
		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
slouken@0
  2597
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
  2598
		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  2599
		src += srcbpp;
slouken@0
  2600
		dst += dstbpp;
slouken@0
  2601
	    },
slouken@0
  2602
	    width);
slouken@0
  2603
	    src += srcskip;
slouken@0
  2604
	    dst += dstskip;
slouken@689
  2605
	  }
slouken@0
  2606
	}
slouken@0
  2607
}
slouken@0
  2608
slouken@0
  2609
/* General (slow) colorkeyed N->N blending with per-surface alpha */
slouken@0
  2610
static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
slouken@0
  2611
{
slouken@0
  2612
	int width = info->d_width;
slouken@0
  2613
	int height = info->d_height;
slouken@0
  2614
	Uint8 *src = info->s_pixels;
slouken@0
  2615
	int srcskip = info->s_skip;
slouken@0
  2616
	Uint8 *dst = info->d_pixels;
slouken@0
  2617
	int dstskip = info->d_skip;
slouken@0
  2618
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
  2619
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
  2620
	Uint32 ckey = srcfmt->colorkey;
slouken@0
  2621
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
  2622
	int dstbpp = dstfmt->BytesPerPixel;
slouken@0
  2623
	unsigned sA = srcfmt->alpha;
slouken@0
  2624
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
slouken@0
  2625
slouken@0
  2626
	while ( height-- ) {
slouken@0
  2627
	    DUFFS_LOOP4(
slouken@0
  2628
	    {
icculus@1162
  2629
		Uint32 Pixel;
slouken@0
  2630
		unsigned sR;
slouken@0
  2631
		unsigned sG;
slouken@0
  2632
		unsigned sB;
slouken@0
  2633
		unsigned dR;
slouken@0
  2634
		unsigned dG;
slouken@0
  2635
		unsigned dB;
icculus@1162
  2636
		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
icculus@1162
  2637
		if(sA && Pixel != ckey) {
icculus@1162
  2638
		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
icculus@1162
  2639
		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
slouken@0
  2640
		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
  2641
		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  2642
		}
slouken@0
  2643
		src += srcbpp;
slouken@0
  2644
		dst += dstbpp;
slouken@0
  2645
	    },
slouken@0
  2646
	    width);
slouken@0
  2647
	    src += srcskip;
slouken@0
  2648
	    dst += dstskip;
slouken@0
  2649
	}
slouken@0
  2650
}
slouken@0
  2651
slouken@0
  2652
/* General (slow) N->N blending with pixel alpha */
slouken@0
  2653
static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
slouken@0
  2654
{
slouken@0
  2655
	int width = info->d_width;
slouken@0
  2656
	int height = info->d_height;
slouken@0
  2657
	Uint8 *src = info->s_pixels;
slouken@0
  2658
	int srcskip = info->s_skip;
slouken@0
  2659
	Uint8 *dst = info->d_pixels;
slouken@0
  2660
	int dstskip = info->d_skip;
slouken@0
  2661
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
  2662
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
  2663
slouken@0
  2664
	int  srcbpp;
slouken@0
  2665
	int  dstbpp;
slouken@0
  2666
slouken@0
  2667
	/* Set up some basic variables */
slouken@0
  2668
	srcbpp = srcfmt->BytesPerPixel;
slouken@0
  2669
	dstbpp = dstfmt->BytesPerPixel;
slouken@0
  2670
slouken@0
  2671
	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
slouken@0
  2672
	   quite right. for <8bpp source alpha, it gets them very wrong
slouken@0
  2673
	   (check all macros!)
slouken@0
  2674
	   It is unclear whether there is a good general solution that doesn't
slouken@0
  2675
	   need a branch (or a divide). */
slouken@0
  2676
	while ( height-- ) {
slouken@0
  2677
	    DUFFS_LOOP4(
slouken@0
  2678
	    {
icculus@1162
  2679
		Uint32 Pixel;
slouken@0
  2680
		unsigned sR;
slouken@0
  2681
		unsigned sG;
slouken@0
  2682
		unsigned sB;
slouken@0
  2683
		unsigned dR;
slouken@0
  2684
		unsigned dG;
slouken@0
  2685
		unsigned dB;
slouken@0
  2686
		unsigned sA;
slouken@0
  2687
		unsigned dA;
icculus@1162
  2688
		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
slouken@689
  2689
		if(sA) {
icculus@1162
  2690
		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
slouken@689
  2691
		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@689
  2692
		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@689
  2693
		}
slouken@0
  2694
		src += srcbpp;
slouken@0
  2695
		dst += dstbpp;
slouken@0
  2696
	    },
slouken@0
  2697
	    width);
slouken@0
  2698
	    src += srcskip;
slouken@0
  2699
	    dst += dstskip;
slouken@0
  2700
	}
slouken@0
  2701
}
slouken@0
  2702
slouken@0
  2703
slouken@0
  2704
/*
 * Pick the alpha blitter for `surface` and the destination it is mapped to.
 *
 * The choice depends on three things:
 *   1. whether the source format has an alpha mask (per-pixel alpha) or
 *      not (per-surface alpha, optionally combined with a colorkey);
 *   2. the destination's bytes per pixel and the channel masks/shifts of
 *      both formats;
 *   3. compile-time and run-time availability of the MMX / 3DNow! /
 *      AltiVec variants.  The AltiVec variants are only chosen when the
 *      destination is not an SDL_HWSURFACE.
 *
 * Every path returns a blitter; the generic BlitNtoN* routines are the
 * fallback.  `blit_index` is not used by this function.
 */
SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
{
    SDL_PixelFormat *sf = surface->format;
    SDL_PixelFormat *df = surface->map->dst->format;

    /* ---- Source carries its own alpha channel: per-pixel alpha ---- */
    if(sf->Amask != 0) {
        switch(df->BytesPerPixel) {
        case 1:
            return BlitNto1PixelAlpha;

        case 2:
#if SDL_ALTIVEC_BLITTERS
            if(sf->BytesPerPixel == 4
               && !(surface->map->dst->flags & SDL_HWSURFACE)
               && df->Gmask == 0x7e0
               && df->Bmask == 0x1f
               && SDL_HasAltiVec()) {
                return Blit32to565PixelAlphaAltivec;
            }
#endif
            /* 8888 source with alpha on top and green in the middle,
               whose low channel matches the low channel of the target */
            if(sf->BytesPerPixel == 4
               && sf->Amask == 0xff000000
               && sf->Gmask == 0xff00
               && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
                   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
                if(df->Gmask == 0x7e0) {
                    return BlitARGBto565PixelAlpha;
                }
                if(df->Gmask == 0x3e0) {
                    return BlitARGBto555PixelAlpha;
                }
            }
            return BlitNtoNPixelAlpha;

        case 4:
            if(sf->Rmask == df->Rmask
               && sf->Gmask == df->Gmask
               && sf->Bmask == df->Bmask
               && sf->BytesPerPixel == 4) {
#if MMX_ASMBLIT
                /* byte-aligned channels with a full 8-bit alpha */
                if(sf->Rshift % 8 == 0
                   && sf->Gshift % 8 == 0
                   && sf->Bshift % 8 == 0
                   && sf->Ashift % 8 == 0
                   && sf->Aloss == 0) {
                    if(SDL_Has3DNow()) {
                        return BlitRGBtoRGBPixelAlphaMMX3DNOW;
                    }
                    if(SDL_HasMMX()) {
                        return BlitRGBtoRGBPixelAlphaMMX;
                    }
                }
#endif
                if(sf->Amask == 0xff000000) {
#if SDL_ALTIVEC_BLITTERS
                    if(!(surface->map->dst->flags & SDL_HWSURFACE)
                       && SDL_HasAltiVec()) {
                        return BlitRGBtoRGBPixelAlphaAltivec;
                    }
#endif
                    return BlitRGBtoRGBPixelAlpha;
                }
            }
#if SDL_ALTIVEC_BLITTERS
            if(sf->Amask
               && sf->BytesPerPixel == 4
               && !(surface->map->dst->flags & SDL_HWSURFACE)
               && SDL_HasAltiVec()) {
                return Blit32to32PixelAlphaAltivec;
            }
#endif
            return BlitNtoNPixelAlpha;

        case 3:
        default:
            return BlitNtoNPixelAlpha;
        }
    }

    /* ---- No alpha channel, colorkey set: keyed per-surface alpha ---- */
    if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
        if(df->BytesPerPixel == 1) {
            return BlitNto1SurfaceAlphaKey;
        }
#if SDL_ALTIVEC_BLITTERS
        if(sf->BytesPerPixel == 4
           && df->BytesPerPixel == 4
           && !(surface->map->dst->flags & SDL_HWSURFACE)
           && SDL_HasAltiVec()) {
            return Blit32to32SurfaceAlphaKeyAltivec;
        }
#endif
        return BlitNtoNSurfaceAlphaKey;
    }

    /* ---- No alpha channel, no colorkey: plain per-surface alpha ---- */
    switch(df->BytesPerPixel) {
    case 1:
        return BlitNto1SurfaceAlpha;

    case 2:
        /* the 16-bit fast paths need identical source/target formats */
        if(surface->map->identity) {
            if(df->Gmask == 0x7e0) {
#if MMX_ASMBLIT
                if(SDL_HasMMX()) {
                    return Blit565to565SurfaceAlphaMMX;
                }
#endif
                return Blit565to565SurfaceAlpha;
            }
            if(df->Gmask == 0x3e0) {
#if MMX_ASMBLIT
                if(SDL_HasMMX()) {
                    return Blit555to555SurfaceAlphaMMX;
                }
#endif
                return Blit555to555SurfaceAlpha;
            }
        }
        return BlitNtoNSurfaceAlpha;

    case 4:
        if(sf->Rmask == df->Rmask
           && sf->Gmask == df->Gmask
           && sf->Bmask == df->Bmask
           && sf->BytesPerPixel == 4) {
#if MMX_ASMBLIT
            if(sf->Rshift % 8 == 0
               && sf->Gshift % 8 == 0
               && sf->Bshift % 8 == 0
               && SDL_HasMMX()) {
                return BlitRGBtoRGBSurfaceAlphaMMX;
            }
#endif
            /* colour channels occupy exactly the low 24 bits */
            if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
#if SDL_ALTIVEC_BLITTERS
                if(!(surface->map->dst->flags & SDL_HWSURFACE)
                   && SDL_HasAltiVec()) {
                    return BlitRGBtoRGBSurfaceAlphaAltivec;
                }
#endif
                return BlitRGBtoRGBSurfaceAlpha;
            }
        }
#if SDL_ALTIVEC_BLITTERS
        if((sf->BytesPerPixel == 4)
           && !(surface->map->dst->flags & SDL_HWSURFACE)
           && SDL_HasAltiVec()) {
            return Blit32to32SurfaceAlphaAltivec;
        }
#endif
        return BlitNtoNSurfaceAlpha;

    case 3:
    default:
        return BlitNtoNSurfaceAlpha;
    }
}
slouken@0
  2854