src/video/SDL_blit_A.c
author Ryan C. Gordon <icculus@icculus.org>
Sun, 08 Jan 2006 21:18:15 +0000
changeset 1240 3b8a43c428bb
parent 1175 867f521591e5
child 1312 c9b51268668f
permissions -rw-r--r--
From Bug #36:

There are a couple of issues with the selection of Altivec alpha-blitting
routines in CalculateAlphaBlit() in src/video/SDL_blit_A.c.

1) There's no check for the presence of Altivec when checking if the
Blit32to565PixelAlphaAltivec() routine can be selected.

2) Altivec cannot be used in video memory, and there's no check if the
destination surface is a hardware surface. (Alpha-blitting to a hardware
surface with GPU support is a bad idea, but somebody's bound to do it anyway.)

Patch to fix these attached.
slouken@0
     1
/*
slouken@0
     2
    SDL - Simple DirectMedia Layer
slouken@769
     3
    Copyright (C) 1997-2004 Sam Lantinga
slouken@0
     4
slouken@0
     5
    This library is free software; you can redistribute it and/or
slouken@0
     6
    modify it under the terms of the GNU Library General Public
slouken@0
     7
    License as published by the Free Software Foundation; either
slouken@0
     8
    version 2 of the License, or (at your option) any later version.
slouken@0
     9
slouken@0
    10
    This library is distributed in the hope that it will be useful,
slouken@0
    11
    but WITHOUT ANY WARRANTY; without even the implied warranty of
slouken@0
    12
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
slouken@0
    13
    Library General Public License for more details.
slouken@0
    14
slouken@0
    15
    You should have received a copy of the GNU Library General Public
slouken@0
    16
    License along with this library; if not, write to the Free
slouken@0
    17
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
slouken@0
    18
slouken@0
    19
    Sam Lantinga
slouken@252
    20
    slouken@libsdl.org
slouken@0
    21
*/
slouken@0
    22
slouken@0
    23
#ifdef SAVE_RCSID
slouken@0
    24
/* RCS identification string; only compiled in when SAVE_RCSID is defined.
   Declared as a char array: a plain `char` cannot be initialised from a
   string literal. */
static const char rcsid[] =
 "@(#) $Id$";
slouken@0
    26
#endif
slouken@0
    27
slouken@0
    28
#include <stdio.h>
slouken@0
    29
slouken@0
    30
#include "SDL_types.h"
slouken@0
    31
#include "SDL_video.h"
slouken@0
    32
#include "SDL_blit.h"
slouken@0
    33
slouken@880
    34
#if (defined(i386) || defined(__x86_64__)) && defined(__GNUC__) && defined(USE_ASMBLIT)
slouken@880
    35
#define MMX_ASMBLIT
slouken@880
    36
#endif
slouken@880
    37
slouken@739
    38
/* Function to check the CPU flags */
slouken@739
    39
#include "SDL_cpuinfo.h"
icculus@1047
    40
#ifdef MMX_ASMBLIT
slouken@689
    41
#include "mmx.h"
slouken@689
    42
#endif
slouken@689
    43
slouken@0
    44
/* Functions to perform alpha blended blitting */
slouken@0
    45
slouken@0
    46
/* N->1 blending with per-surface alpha */
slouken@0
    47
static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
slouken@0
    48
{
slouken@0
    49
	int width = info->d_width;
slouken@0
    50
	int height = info->d_height;
slouken@0
    51
	Uint8 *src = info->s_pixels;
slouken@0
    52
	int srcskip = info->s_skip;
slouken@0
    53
	Uint8 *dst = info->d_pixels;
slouken@0
    54
	int dstskip = info->d_skip;
slouken@0
    55
	Uint8 *palmap = info->table;
slouken@0
    56
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
    57
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
    58
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
    59
slouken@0
    60
	const unsigned A = srcfmt->alpha;
slouken@0
    61
slouken@0
    62
	while ( height-- ) {
slouken@0
    63
	    DUFFS_LOOP4(
slouken@0
    64
	    {
icculus@1162
    65
		Uint32 Pixel;
slouken@0
    66
		unsigned sR;
slouken@0
    67
		unsigned sG;
slouken@0
    68
		unsigned sB;
slouken@0
    69
		unsigned dR;
slouken@0
    70
		unsigned dG;
slouken@0
    71
		unsigned dB;
icculus@1162
    72
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
slouken@0
    73
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
    74
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
    75
		dB = dstfmt->palette->colors[*dst].b;
slouken@0
    76
		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
slouken@0
    77
		dR &= 0xff;
slouken@0
    78
		dG &= 0xff;
slouken@0
    79
		dB &= 0xff;
slouken@0
    80
		/* Pack RGB into 8bit pixel */
slouken@0
    81
		if ( palmap == NULL ) {
slouken@0
    82
		    *dst =((dR>>5)<<(3+2))|
slouken@0
    83
			  ((dG>>5)<<(2))|
slouken@0
    84
			  ((dB>>6)<<(0));
slouken@0
    85
		} else {
slouken@0
    86
		    *dst = palmap[((dR>>5)<<(3+2))|
slouken@0
    87
				  ((dG>>5)<<(2))  |
slouken@0
    88
				  ((dB>>6)<<(0))];
slouken@0
    89
		}
slouken@0
    90
		dst++;
slouken@0
    91
		src += srcbpp;
slouken@0
    92
	    },
slouken@0
    93
	    width);
slouken@0
    94
	    src += srcskip;
slouken@0
    95
	    dst += dstskip;
slouken@0
    96
	}
slouken@0
    97
}
slouken@0
    98
slouken@0
    99
/* N->1 blending with pixel alpha */
slouken@0
   100
static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
slouken@0
   101
{
slouken@0
   102
	int width = info->d_width;
slouken@0
   103
	int height = info->d_height;
slouken@0
   104
	Uint8 *src = info->s_pixels;
slouken@0
   105
	int srcskip = info->s_skip;
slouken@0
   106
	Uint8 *dst = info->d_pixels;
slouken@0
   107
	int dstskip = info->d_skip;
slouken@0
   108
	Uint8 *palmap = info->table;
slouken@0
   109
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
   110
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
   111
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
   112
slouken@0
   113
	/* FIXME: fix alpha bit field expansion here too? */
slouken@0
   114
	while ( height-- ) {
slouken@0
   115
	    DUFFS_LOOP4(
slouken@0
   116
	    {
icculus@1162
   117
		Uint32 Pixel;
slouken@0
   118
		unsigned sR;
slouken@0
   119
		unsigned sG;
slouken@0
   120
		unsigned sB;
slouken@0
   121
		unsigned sA;
slouken@0
   122
		unsigned dR;
slouken@0
   123
		unsigned dG;
slouken@0
   124
		unsigned dB;
icculus@1162
   125
		DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
slouken@0
   126
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
   127
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
   128
		dB = dstfmt->palette->colors[*dst].b;
slouken@0
   129
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
   130
		dR &= 0xff;
slouken@0
   131
		dG &= 0xff;
slouken@0
   132
		dB &= 0xff;
slouken@0
   133
		/* Pack RGB into 8bit pixel */
slouken@0
   134
		if ( palmap == NULL ) {
slouken@0
   135
		    *dst =((dR>>5)<<(3+2))|
slouken@0
   136
			  ((dG>>5)<<(2))|
slouken@0
   137
			  ((dB>>6)<<(0));
slouken@0
   138
		} else {
slouken@0
   139
		    *dst = palmap[((dR>>5)<<(3+2))|
slouken@0
   140
				  ((dG>>5)<<(2))  |
slouken@0
   141
				  ((dB>>6)<<(0))  ];
slouken@0
   142
		}
slouken@0
   143
		dst++;
slouken@0
   144
		src += srcbpp;
slouken@0
   145
	    },
slouken@0
   146
	    width);
slouken@0
   147
	    src += srcskip;
slouken@0
   148
	    dst += dstskip;
slouken@0
   149
	}
slouken@0
   150
}
slouken@0
   151
slouken@0
   152
/* colorkeyed N->1 blending with per-surface alpha */
slouken@0
   153
static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
slouken@0
   154
{
slouken@0
   155
	int width = info->d_width;
slouken@0
   156
	int height = info->d_height;
slouken@0
   157
	Uint8 *src = info->s_pixels;
slouken@0
   158
	int srcskip = info->s_skip;
slouken@0
   159
	Uint8 *dst = info->d_pixels;
slouken@0
   160
	int dstskip = info->d_skip;
slouken@0
   161
	Uint8 *palmap = info->table;
slouken@0
   162
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
   163
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
   164
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
   165
	Uint32 ckey = srcfmt->colorkey;
slouken@0
   166
slouken@0
   167
	const int A = srcfmt->alpha;
slouken@0
   168
slouken@0
   169
	while ( height-- ) {
slouken@0
   170
	    DUFFS_LOOP(
slouken@0
   171
	    {
icculus@1162
   172
		Uint32 Pixel;
slouken@0
   173
		unsigned sR;
slouken@0
   174
		unsigned sG;
slouken@0
   175
		unsigned sB;
slouken@0
   176
		unsigned dR;
slouken@0
   177
		unsigned dG;
slouken@0
   178
		unsigned dB;
icculus@1162
   179
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
icculus@1162
   180
		if ( Pixel != ckey ) {
slouken@0
   181
		    dR = dstfmt->palette->colors[*dst].r;
slouken@0
   182
		    dG = dstfmt->palette->colors[*dst].g;
slouken@0
   183
		    dB = dstfmt->palette->colors[*dst].b;
slouken@0
   184
		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
slouken@0
   185
		    dR &= 0xff;
slouken@0
   186
		    dG &= 0xff;
slouken@0
   187
		    dB &= 0xff;
slouken@0
   188
		    /* Pack RGB into 8bit pixel */
slouken@0
   189
		    if ( palmap == NULL ) {
slouken@0
   190
			*dst =((dR>>5)<<(3+2))|
slouken@0
   191
			      ((dG>>5)<<(2)) |
slouken@0
   192
			      ((dB>>6)<<(0));
slouken@0
   193
		    } else {
slouken@0
   194
			*dst = palmap[((dR>>5)<<(3+2))|
slouken@0
   195
				      ((dG>>5)<<(2))  |
slouken@0
   196
				      ((dB>>6)<<(0))  ];
slouken@0
   197
		    }
slouken@0
   198
		}
slouken@0
   199
		dst++;
slouken@0
   200
		src += srcbpp;
slouken@0
   201
	    },
slouken@0
   202
	    width);
slouken@0
   203
	    src += srcskip;
slouken@0
   204
	    dst += dstskip;
slouken@0
   205
	}
slouken@0
   206
}
slouken@0
   207
slouken@880
   208
#ifdef MMX_ASMBLIT
slouken@689
   209
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@689
   210
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
slouken@689
   211
{
slouken@689
   212
	int width = info->d_width;
slouken@689
   213
	int height = info->d_height;
slouken@689
   214
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@689
   215
	int srcskip = info->s_skip >> 2;
slouken@689
   216
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@689
   217
	int dstskip = info->d_skip >> 2;
slouken@689
   218
        Uint8 load[8];
slouken@689
   219
  
slouken@720
   220
        *(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */
slouken@689
   221
        movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
slouken@720
   222
        *(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */
slouken@689
   223
        movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
slouken@720
   224
        *(Uint64 *)load = 0xFF000000FF000000ULL;/* dst alpha mask */
slouken@689
   225
        movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */
slouken@689
   226
	while(height--) {
slouken@689
   227
            DUFFS_LOOP_DOUBLE2(
slouken@689
   228
            {
slouken@689
   229
		    Uint32 s = *srcp++;
slouken@689
   230
		    Uint32 d = *dstp;
slouken@689
   231
		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@689
   232
			       + (s & d & 0x00010101)) | 0xff000000;
slouken@689
   233
            },{
slouken@689
   234
	            movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
slouken@689
   235
	            movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
slouken@689
   236
	      
slouken@689
   237
	            movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
slouken@689
   238
	            movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
slouken@689
   239
		
slouken@689
   240
	            pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
slouken@689
   241
	            pand_r2r(mm4, mm5); /* src & mask -> mm5 */
slouken@689
   242
	            paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
slouken@689
   243
	            psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
slouken@689
   244
	
slouken@689
   245
	            pand_r2r(mm1, mm2); /* src & dst -> mm2 */
slouken@689
   246
	            pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
slouken@689
   247
	            paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
slouken@689
   248
	            por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
slouken@689
   249
	            movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
slouken@689
   250
	            dstp += 2;
slouken@689
   251
	            srcp += 2;
slouken@689
   252
            }, width);
slouken@689
   253
	    srcp += srcskip;
slouken@689
   254
	    dstp += dstskip;
slouken@689
   255
	}
slouken@689
   256
	emms();
slouken@689
   257
}
slouken@689
   258
slouken@689
   259
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@689
   260
static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
slouken@689
   261
{
slouken@689
   262
	unsigned alpha = info->src->alpha;
slouken@689
   263
	if(alpha == 128) {
slouken@689
   264
		BlitRGBtoRGBSurfaceAlpha128MMX(info);
slouken@689
   265
	} else {
slouken@689
   266
		int width = info->d_width;
slouken@689
   267
		int height = info->d_height;
slouken@689
   268
		Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@689
   269
		int srcskip = info->s_skip >> 2;
slouken@689
   270
		Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@689
   271
		int dstskip = info->d_skip >> 2;
slouken@689
   272
                Uint8 load[8] = {alpha, alpha, alpha, alpha,
slouken@689
   273
    					alpha, alpha, alpha, alpha};
slouken@689
   274
					
slouken@689
   275
                movq_m2r(*load, mm4); /* alpha -> mm4 */
slouken@720
   276
		*(Uint64 *)load = 0x00FF00FF00FF00FFULL;
slouken@689
   277
                movq_m2r(*load, mm3); /* mask -> mm3 */
slouken@689
   278
		pand_r2r(mm3, mm4); /* mm4 & mask -> 0A0A0A0A -> mm4 */
slouken@720
   279
		*(Uint64 *)load = 0xFF000000FF000000ULL;/* dst alpha mask */
slouken@689
   280
		movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */
slouken@689
   281
		
slouken@689
   282
		while(height--) {
slouken@689
   283
			DUFFS_LOOP_DOUBLE2({
slouken@689
   284
				/* One Pixel Blend */
slouken@689
   285
	                        movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
slouken@689
   286
                                punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */
slouken@689
   287
                                pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */
slouken@689
   288
			  
slouken@689
   289
	                        movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
slouken@689
   290
			        movq_r2r(mm2, mm6);/* dst(ARGB) -> mm6 (0000ARGB)*/
slouken@689
   291
                                punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */
slouken@689
   292
                                pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
slouken@689
   293
			  
slouken@689
   294
                                psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
slouken@689
   295
	                        pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
slouken@689
   296
	                        psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
slouken@689
   297
	                        paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
slouken@689
   298
	                        pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
slouken@689
   299
	                        packuswb_r2r(mm2, mm2);  /* ARGBARGB -> mm2 */
slouken@689
   300
	                        por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
icculus@1162
   301
			        movd_r2m(mm2, *dstp);/* mm2 -> Pixel */
slouken@689
   302
				++srcp;
slouken@689
   303
				++dstp;
slouken@689
   304
			},{
slouken@689
   305
			        /* Two Pixels Blend */
slouken@689
   306
				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
slouken@689
   307
			        movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
slouken@689
   308
                                punpcklbw_r2r(mm0, mm0); /* low - AARRGGBB -> mm0 */
slouken@689
   309
			        pand_r2r(mm3, mm0); /* 0A0R0G0B -> mm0(src1) */
slouken@689
   310
			        punpckhbw_r2r(mm1, mm1); /* high - AARRGGBB -> mm1 */
slouken@689
   311
	                        pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1(src2) */
slouken@689
   312
	
slouken@689
   313
	                        movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
slouken@689
   314
	                        movq_r2r(mm2, mm5); /* 2 x dst -> mm5(ARGBARGB) */
slouken@689
   315
			        movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
slouken@689
   316
                                punpcklbw_r2r(mm2, mm2); /* low - AARRGGBB -> mm2 */
slouken@689
   317
	                        punpckhbw_r2r(mm6, mm6); /* high - AARRGGBB -> mm6 */
slouken@689
   318
                                pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2(dst1) */
slouken@689
   319
	                  
slouken@689
   320
                                psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
slouken@689
   321
	                        pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
slouken@689
   322
			        pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6(dst2) */
slouken@689
   323
			        psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
slouken@689
   324
			        psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
slouken@689
   325
	                        pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
slouken@689
   326
				paddw_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
slouken@689
   327
	                        psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm0 */
slouken@689
   328
				pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
slouken@689
   329
	                        paddw_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
slouken@689
   330
	                        pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6 */
slouken@689
   331
	                        packuswb_r2r(mm2, mm2);  /* ARGBARGB -> mm2 */
slouken@689
   332
	                        packuswb_r2r(mm6, mm6);  /* ARGBARGB -> mm6 */
slouken@689
   333
	                        psrlq_i2r(32, mm2); /* mm2 >> 32 -> mm2 */
slouken@689
   334
	                        psllq_i2r(32, mm6); /* mm6 << 32 -> mm6 */
slouken@689
   335
	                        por_r2r(mm6, mm2); /* mm6 | mm2 -> mm2 */				
slouken@689
   336
				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
icculus@1162
   337
                                movq_r2m(mm2, *dstp);/* mm2 -> 2 x Pixel */
slouken@689
   338
				srcp += 2;
slouken@689
   339
				dstp += 2;
slouken@689
   340
			}, width);
slouken@689
   341
			srcp += srcskip;
slouken@689
   342
			dstp += dstskip;
slouken@689
   343
		}
slouken@689
   344
		emms();
slouken@689
   345
	}
slouken@689
   346
}
slouken@689
   347
slouken@689
   348
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@689
   349
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
slouken@689
   350
{
slouken@689
   351
	int width = info->d_width;
slouken@689
   352
	int height = info->d_height;
slouken@689
   353
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@689
   354
	int srcskip = info->s_skip >> 2;
slouken@689
   355
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@689
   356
	int dstskip = info->d_skip >> 2;
slouken@689
   357
        Uint32 alpha = 0;
slouken@689
   358
        Uint8 load[8];
slouken@689
   359
	                
slouken@720
   360
	*(Uint64 *)load = 0x00FF00FF00FF00FFULL;
slouken@689
   361
        movq_m2r(*load, mm3); /* mask -> mm2 */
slouken@720
   362
	*(Uint64 *)load = 0x00FF000000000000ULL;
slouken@689
   363
        movq_m2r(*load, mm7); /* dst alpha mask -> mm2 */
slouken@720
   364
        *(Uint64 *)load = 0x00FFFFFF00FFFFFFULL;
slouken@689
   365
        movq_m2r(*load, mm0); /* alpha 255 mask -> mm0 */
slouken@720
   366
        *(Uint64 *)load = 0xFF000000FF000000ULL;
slouken@689
   367
        movq_m2r(*load, mm6); /* alpha 255 !mask -> mm6 */
slouken@689
   368
	while(height--) {
slouken@689
   369
	    DUFFS_LOOP4({
slouken@689
   370
	        alpha = *srcp;
slouken@689
   371
	        alpha >>= 24;
slouken@689
   372
		/* FIXME: Here we special-case opaque alpha since the
slouken@689
   373
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@689
   374
		   it correctly. Also special-case alpha=0 for speed?
slouken@689
   375
		   Benchmark this! */
slouken@689
   376
		if(alpha) {   
slouken@689
   377
		  if(alpha == SDL_ALPHA_OPAQUE) {
slouken@689
   378
		    movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
slouken@689
   379
		    movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
slouken@689
   380
		    pand_r2r(mm0, mm1);
slouken@689
   381
		    pand_r2r(mm6, mm2);
slouken@689
   382
		    por_r2r(mm1, mm2);
slouken@689
   383
		    movd_r2m(mm2, (*dstp));
slouken@689
   384
		  } else {
slouken@689
   385
		    movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
slouken@689
   386
                    punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */
slouken@689
   387
                    pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */
slouken@689
   388
			  
slouken@689
   389
	            movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
slouken@689
   390
                    punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */
slouken@689
   391
                    pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
slouken@689
   392
		
slouken@689
   393
		    movq_r2r(mm2, mm5);/* mm2(0A0R0G0B) -> mm5 */
slouken@689
   394
		    pand_r2r(mm7, mm5); /* mm5 & dst alpha mask -> mm5(0A000000) */
slouken@689
   395
		    psrlq_i2r(24, mm5); /* mm5 >> 24 -> mm5 (0000A000)*/
slouken@689
   396
		    
slouken@689
   397
		    movq_r2r(mm1, mm4);/* mm1(0A0R0G0B) -> mm4 */
slouken@689
   398
		    psrlq_i2r(48, mm4); /* mm4 >> 48 -> mm4(0000000A) */
slouken@689
   399
		    punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
slouken@689
   400
                    punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
slouken@689
   401
		                        		    
slouken@689
   402
                    /* blend */		    
slouken@689
   403
                    psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
slouken@689
   404
	            pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
slouken@689
   405
	            psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
slouken@689
   406
	            paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
slouken@689
   407
	            pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
slouken@689
   408
		    packuswb_r2r(mm2, mm2);  /* ARGBARGB -> mm2 */
slouken@689
   409
		    pand_r2r(mm0, mm2); /* 0RGB0RGB -> mm2 */
slouken@689
   410
		    por_r2r(mm5, mm2); /* dst alpha | mm2 -> mm2 */
slouken@689
   411
		    movd_r2m(mm2, *dstp);/* mm2 -> dst */
slouken@689
   412
		  }
slouken@689
   413
		}
slouken@689
   414
		++srcp;
slouken@689
   415
		++dstp;
slouken@689
   416
	    }, width);
slouken@689
   417
	    srcp += srcskip;
slouken@689
   418
	    dstp += dstskip;
slouken@689
   419
	}
slouken@689
   420
	emms();
slouken@689
   421
}
slouken@689
   422
#endif
slouken@689
   423
icculus@1047
   424
#ifdef USE_ALTIVEC_BLITTERS
icculus@1175
   425
#ifdef HAVE_ALTIVEC_H
icculus@1162
   426
#include <altivec.h>
icculus@1175
   427
#endif
icculus@1047
   428
#include <assert.h>
icculus@1162
   429
icculus@1162
   430
/*
 * Vector literal helpers.
 * VECUINT8_LITERAL takes 16 byte values and VECUINT16_LITERAL takes 8
 * short values.  When MACOSX is defined and __GNUC__ < 4 the element list
 * is emitted in parentheses after the vector cast; in every other case it
 * is emitted in braces.
 */
#if ((defined MACOSX) && (__GNUC__ < 4))
icculus@1162
   431
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
icculus@1162
   432
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
icculus@1162
   433
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
icculus@1162
   434
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
icculus@1162
   435
#else
icculus@1162
   436
    #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
icculus@1162
   437
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
icculus@1162
   438
    #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
icculus@1162
   439
        (vector unsigned short) { a,b,c,d,e,f,g,h }
icculus@1162
   440
#endif
icculus@1162
   441
icculus@1047
   442
/* Non-zero when pointer x is not on a 16-byte boundary (yields the low four
   address bits).  The argument is parenthesised so that an expression such
   as `p + 1` is converted as a whole instead of only its first operand. */
#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)
/* Debug helper: prints msg followed by the four 32-bit words of vector v
   in hexadecimal. */
#define VECPRINT(msg, v) do { \
    vector unsigned int tmpvec = (vector unsigned int)(v); \
    unsigned int *vp = (unsigned int *)&tmpvec; \
    printf("%s = %08X %08X %08X %08X\n", (msg), vp[0], vp[1], vp[2], vp[3]); \
} while (0)
icculus@1047
   448
icculus@1047
   449
/* the permutation vector that takes the high bytes out of all the appropriate shorts
icculus@1047
   450
    (vector unsigned char)(
icculus@1047
   451
        0x00, 0x10, 0x02, 0x12,
icculus@1047
   452
        0x04, 0x14, 0x06, 0x16,
icculus@1047
   453
        0x08, 0x18, 0x0A, 0x1A,
icculus@1047
   454
        0x0C, 0x1C, 0x0E, 0x1E );
icculus@1047
   455
*/
icculus@1047
   456
/*
 * Small AltiVec vector builders:
 *   VEC_MERGE_PERMUTE() - vec_lvsl(0, NULL) plus a 16-bit splat of 0x0F,
 *                         added byte-wise.
 *   VEC_U32_24()        - 12 + 12, i.e. 24 in every 32-bit element
 *                         (presumably built by addition because 24 does not
 *                         fit the splat immediate - confirm against the
 *                         AltiVec reference).
 *   VEC_ALPHA_MASK()    - an all-ones vector shifted left by VEC_U32_24()
 *                         per 32-bit element, viewed as bytes.
 *   VEC_ALIGNER(src)    - vec_lvsl(0, src) when UNALIGNED_PTR(src) is
 *                         non-zero, otherwise vec_lvsl(8, src) with 8 added
 *                         to every byte.  Note that src is expanded more
 *                         than once.
 */
#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
icculus@1047
   457
#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
icculus@1047
   458
#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
icculus@1047
   459
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
icculus@1047
   460
    ? vec_lvsl(0, src) \
icculus@1047
   461
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
icculus@1047
   462
icculus@1047
   463
   
icculus@1047
   464
/*
 * VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16)
 *
 * Blends vs over vd and stores the result back into vd:
 *   - the even and odd bytes of vs are multiplied by valpha, and those of
 *     vd by the bitwise complement of valpha, giving 16-bit products;
 *   - the source and destination products are added;
 *   - each sum t is replaced by (t + v1_16) + ((t + v1_16) >> v8_16);
 *   - the two 16-bit vectors are recombined into bytes with mergePermute.
 * With v1_16 == 1 and v8_16 == 8 the last two steps look like the usual
 * divide-by-255 approximation - NOTE(review): confirm.
 *
 * vs, vd, valpha, v1_16 and v8_16 are each expanded more than once, so pass
 * plain variables.  vd must be an lvalue.
 */
#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
icculus@1047
   465
    /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
icculus@1047
   466
    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
icculus@1047
   467
    /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
icculus@1047
   468
    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
icculus@1047
   469
    /* valpha2 is 255-alpha */ \
icculus@1047
   470
    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
icculus@1047
   471
    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
icculus@1047
   472
    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
icculus@1047
   473
    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
icculus@1047
   474
    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
icculus@1047
   475
    /* add source and dest */ \
icculus@1047
   476
    vtemp1 = vec_add(vtemp1, vtemp3); \
icculus@1047
   477
    vtemp2 = vec_add(vtemp2, vtemp4); \
icculus@1047
   478
    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
icculus@1047
   479
    vtemp1 = vec_add(vtemp1, v1_16); \
icculus@1047
   480
    vtemp3 = vec_sr(vtemp1, v8_16); \
icculus@1047
   481
    vtemp1 = vec_add(vtemp1, vtemp3); \
icculus@1047
   482
    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
icculus@1047
   483
    vtemp2 = vec_add(vtemp2, v1_16); \
icculus@1047
   484
    vtemp4 = vec_sr(vtemp2, v8_16); \
icculus@1047
   485
    vtemp2 = vec_add(vtemp2, vtemp4); \
icculus@1047
   486
    /* (>>8) and get ARGBARGBARGBARGB */ \
icculus@1047
   487
    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
icculus@1047
   488
} while (0)
icculus@1047
   489
 
icculus@1047
   490
/* Calculate the permute vector used for 32->32 swizzling */
icculus@1047
   491
static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
icculus@1047
   492
                                  const SDL_PixelFormat *dstfmt)
icculus@1047
   493
{
icculus@1047
   494
    /*
icculus@1047
   495
     * We have to assume that the bits that aren't used by other
icculus@1047
   496
     *  colors is alpha, and it's one complete byte, since some formats
icculus@1047
   497
     *  leave alpha with a zero mask, but we should still swizzle the bits.
icculus@1047
   498
     */
icculus@1047
   499
    /* ARGB */
icculus@1047
   500
    const static struct SDL_PixelFormat default_pixel_format = {
icculus@1047
   501
        NULL, 0, 0,
icculus@1047
   502
        0, 0, 0, 0,
icculus@1047
   503
        16, 8, 0, 24,
icculus@1047
   504
        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
icculus@1047
   505
        0, 0};
icculus@1047
   506
    if (!srcfmt) {
icculus@1047
   507
        srcfmt = &default_pixel_format;
icculus@1047
   508
    }
icculus@1047
   509
    if (!dstfmt) {
icculus@1047
   510
        dstfmt = &default_pixel_format;
icculus@1047
   511
    }
icculus@1162
   512
    vector unsigned char plus = VECUINT8_LITERAL
icculus@1047
   513
                                            ( 0x00, 0x00, 0x00, 0x00,
icculus@1047
   514
                                              0x04, 0x04, 0x04, 0x04,
icculus@1047
   515
                                              0x08, 0x08, 0x08, 0x08,
icculus@1047
   516
                                              0x0C, 0x0C, 0x0C, 0x0C );
icculus@1047
   517
    vector unsigned char vswiz;
icculus@1047
   518
    vector unsigned int srcvec;
icculus@1047
   519
#define RESHIFT(X) (3 - ((X) >> 3))
icculus@1047
   520
    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
icculus@1047
   521
    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
icculus@1047
   522
    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
icculus@1047
   523
    Uint32 amask;
icculus@1047
   524
    /* Use zero for alpha if either surface doesn't have alpha */
icculus@1047
   525
    if (dstfmt->Amask) {
icculus@1047
   526
        amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
icculus@1047
   527
    } else {
icculus@1047
   528
        amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
icculus@1047
   529
    }
icculus@1047
   530
#undef RESHIFT  
icculus@1162
   531
    ((unsigned int *)(char*)&srcvec)[0] = (rmask | gmask | bmask | amask);
icculus@1047
   532
    vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
icculus@1047
   533
    return(vswiz);
icculus@1047
   534
}
icculus@1047
   535
icculus@1047
   536
static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
icculus@1047
   537
{
icculus@1047
   538
    int height = info->d_height;
icculus@1047
   539
    Uint8 *src = (Uint8 *)info->s_pixels;
icculus@1047
   540
    int srcskip = info->s_skip;
icculus@1047
   541
    Uint8 *dst = (Uint8 *)info->d_pixels;
icculus@1047
   542
    int dstskip = info->d_skip;
icculus@1047
   543
    SDL_PixelFormat *srcfmt = info->src;
icculus@1047
   544
icculus@1047
   545
    vector unsigned char v0 = vec_splat_u8(0);
icculus@1047
   546
    vector unsigned short v8_16 = vec_splat_u16(8);
icculus@1047
   547
    vector unsigned short v1_16 = vec_splat_u16(1);
icculus@1047
   548
    vector unsigned short v2_16 = vec_splat_u16(2);
icculus@1047
   549
    vector unsigned short v3_16 = vec_splat_u16(3);
icculus@1047
   550
    vector unsigned int v8_32 = vec_splat_u32(8);
icculus@1047
   551
    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
icculus@1162
   552
    vector unsigned short v3f = VECUINT16_LITERAL(
icculus@1047
   553
        0x003f, 0x003f, 0x003f, 0x003f,
icculus@1047
   554
        0x003f, 0x003f, 0x003f, 0x003f);
icculus@1162
   555
    vector unsigned short vfc = VECUINT16_LITERAL(
icculus@1047
   556
        0x00fc, 0x00fc, 0x00fc, 0x00fc,
icculus@1047
   557
        0x00fc, 0x00fc, 0x00fc, 0x00fc);
icculus@1047
   558
icculus@1047
   559
    /* 
icculus@1047
   560
        0x10 - 0x1f is the alpha
icculus@1047
   561
        0x00 - 0x0e evens are the red
icculus@1047
   562
        0x01 - 0x0f odds are zero
icculus@1047
   563
    */
icculus@1162
   564
    vector unsigned char vredalpha1 = VECUINT8_LITERAL(
icculus@1047
   565
        0x10, 0x00, 0x01, 0x01,
icculus@1047
   566
        0x10, 0x02, 0x01, 0x01,
icculus@1047
   567
        0x10, 0x04, 0x01, 0x01,
icculus@1047
   568
        0x10, 0x06, 0x01, 0x01
icculus@1047
   569
    );
icculus@1047
   570
    vector unsigned char vredalpha2 = (vector unsigned char)(
icculus@1047
   571
        vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
icculus@1047
   572
    );
icculus@1047
   573
    /*
icculus@1047
   574
        0x00 - 0x0f is ARxx ARxx ARxx ARxx
icculus@1047
   575
        0x11 - 0x0f odds are blue
icculus@1047
   576
    */
icculus@1162
   577
    vector unsigned char vblue1 = VECUINT8_LITERAL(
icculus@1047
   578
        0x00, 0x01, 0x02, 0x11,
icculus@1047
   579
        0x04, 0x05, 0x06, 0x13,
icculus@1047
   580
        0x08, 0x09, 0x0a, 0x15,
icculus@1047
   581
        0x0c, 0x0d, 0x0e, 0x17
icculus@1047
   582
    );
icculus@1047
   583
    vector unsigned char vblue2 = (vector unsigned char)(
icculus@1047
   584
        vec_add((vector unsigned int)vblue1, v8_32)
icculus@1047
   585
    );
icculus@1047
   586
    /*
icculus@1047
   587
        0x00 - 0x0f is ARxB ARxB ARxB ARxB
icculus@1047
   588
        0x10 - 0x0e evens are green
icculus@1047
   589
    */
icculus@1162
   590
    vector unsigned char vgreen1 = VECUINT8_LITERAL(
icculus@1047
   591
        0x00, 0x01, 0x10, 0x03,
icculus@1047
   592
        0x04, 0x05, 0x12, 0x07,
icculus@1047
   593
        0x08, 0x09, 0x14, 0x0b,
icculus@1047
   594
        0x0c, 0x0d, 0x16, 0x0f
icculus@1047
   595
    );
icculus@1047
   596
    vector unsigned char vgreen2 = (vector unsigned char)(
icculus@1047
   597
        vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
icculus@1047
   598
    );
icculus@1162
   599
    vector unsigned char vgmerge = VECUINT8_LITERAL(
icculus@1047
   600
        0x00, 0x02, 0x00, 0x06,
icculus@1047
   601
        0x00, 0x0a, 0x00, 0x0e,
icculus@1047
   602
        0x00, 0x12, 0x00, 0x16,
icculus@1047
   603
        0x00, 0x1a, 0x00, 0x1e);
icculus@1047
   604
    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
   605
    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
icculus@1047
   606
    vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
icculus@1047
   607
icculus@1047
   608
    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
icculus@1047
   609
    vf800 = vec_sl(vf800, vec_splat_u16(8));
icculus@1047
   610
icculus@1047
   611
    while(height--) {
icculus@1047
   612
        int extrawidth;
icculus@1047
   613
        vector unsigned char valigner;
icculus@1047
   614
        vector unsigned char vsrc;
icculus@1047
   615
        vector unsigned char voverflow;
icculus@1047
   616
        int width = info->d_width;
icculus@1047
   617
icculus@1047
   618
#define ONE_PIXEL_BLEND(condition, widthvar) \
icculus@1047
   619
        while (condition) { \
icculus@1162
   620
            Uint32 Pixel; \
icculus@1047
   621
            unsigned sR, sG, sB, dR, dG, dB, sA; \
icculus@1162
   622
            DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
icculus@1047
   623
            if(sA) { \
icculus@1047
   624
                unsigned short dstpixel = *((unsigned short *)dst); \
icculus@1047
   625
                dR = (dstpixel >> 8) & 0xf8; \
icculus@1047
   626
                dG = (dstpixel >> 3) & 0xfc; \
icculus@1047
   627
                dB = (dstpixel << 3) & 0xf8; \
icculus@1047
   628
                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
icculus@1047
   629
                *((unsigned short *)dst) = ( \
icculus@1047
   630
                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
icculus@1047
   631
                ); \
icculus@1047
   632
            } \
icculus@1047
   633
            src += 4; \
icculus@1047
   634
            dst += 2; \
icculus@1047
   635
            widthvar--; \
icculus@1047
   636
        }
icculus@1047
   637
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
icculus@1047
   638
        extrawidth = (width % 8);
icculus@1047
   639
        valigner = VEC_ALIGNER(src);
icculus@1047
   640
        vsrc = (vector unsigned char)vec_ld(0, src);
icculus@1047
   641
        width -= extrawidth;
icculus@1047
   642
        while (width) {
icculus@1047
   643
            vector unsigned char valpha;
icculus@1047
   644
            vector unsigned char vsrc1, vsrc2;
icculus@1047
   645
            vector unsigned char vdst1, vdst2;
icculus@1047
   646
            vector unsigned short vR, vG, vB;
icculus@1047
   647
            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
icculus@1047
   648
icculus@1047
   649
            /* Load 8 pixels from src as ARGB */
icculus@1047
   650
            voverflow = (vector unsigned char)vec_ld(15, src);
icculus@1047
   651
            vsrc = vec_perm(vsrc, voverflow, valigner);
icculus@1047
   652
            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
icculus@1047
   653
            src += 16;
icculus@1047
   654
            vsrc = (vector unsigned char)vec_ld(15, src);
icculus@1047
   655
            voverflow = vec_perm(voverflow, vsrc, valigner);
icculus@1047
   656
            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
icculus@1047
   657
            src += 16;
icculus@1047
   658
icculus@1047
   659
            /* Load 8 pixels from dst as XRGB */
icculus@1047
   660
            voverflow = vec_ld(0, dst);
icculus@1047
   661
            vR = vec_and((vector unsigned short)voverflow, vf800);
icculus@1047
   662
            vB = vec_sl((vector unsigned short)voverflow, v3_16);
icculus@1047
   663
            vG = vec_sl(vB, v2_16);
icculus@1047
   664
            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
icculus@1047
   665
            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
icculus@1047
   666
            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
icculus@1047
   667
            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
icculus@1047
   668
            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
icculus@1047
   669
            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
icculus@1047
   670
icculus@1047
   671
            /* Alpha blend 8 pixels as ARGB */
icculus@1047
   672
            valpha = vec_perm(vsrc1, v0, valphaPermute);
icculus@1047
   673
            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
icculus@1047
   674
            valpha = vec_perm(vsrc2, v0, valphaPermute);
icculus@1047
   675
            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
icculus@1047
   676
icculus@1047
   677
            /* Convert 8 pixels to 565 */
icculus@1047
   678
            vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
icculus@1047
   679
            vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
icculus@1047
   680
            vgpixel = vec_and(vgpixel, vfc);
icculus@1047
   681
            vgpixel = vec_sl(vgpixel, v3_16);
icculus@1047
   682
            vrpixel = vec_sl(vpixel, v1_16);
icculus@1047
   683
            vrpixel = vec_and(vrpixel, vf800);
icculus@1047
   684
            vbpixel = vec_and(vpixel, v3f);
icculus@1047
   685
            vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
icculus@1047
   686
            vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
icculus@1047
   687
            
icculus@1047
   688
            /* Store 8 pixels */
icculus@1047
   689
            vec_st(vdst1, 0, dst);
icculus@1047
   690
icculus@1047
   691
            width -= 8;
icculus@1047
   692
            dst += 16;
icculus@1047
   693
        }
icculus@1047
   694
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
   695
#undef ONE_PIXEL_BLEND
icculus@1047
   696
        src += srcskip;
icculus@1047
   697
        dst += dstskip;
icculus@1047
   698
    }
icculus@1047
   699
}
icculus@1047
   700
icculus@1047
   701
/*
 * Altivec 32bpp -> 32bpp blit with per-surface alpha and a source colorkey.
 *
 * info supplies the source/destination pixel pointers, formats, skips and
 * the blit dimensions; the surface alpha and colorkey are read from
 * info->src.
 *
 * Each row is processed in three phases:
 *   1. one pixel at a time while UNALIGNED_PTR(dstp) is true (presumably
 *      until dstp is 16-byte aligned, which vec_ld/vec_st need -- the macro
 *      is defined outside this block),
 *   2. four pixels at a time with vector operations,
 *   3. the remaining (width % 4) pixels one at a time.
 * Source pixels whose RGB bits equal the colorkey leave the destination
 * unchanged.
 */
static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
{
    unsigned alpha = info->src->alpha;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;    /* skip expressed in Uint32 units */
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;    /* skip expressed in Uint32 units */
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    unsigned sA = srcfmt->alpha;
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
    Uint32 ckey = info->src->colorkey;
    vector unsigned char mergePermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valpha;
    vector unsigned char valphamask;
    vector unsigned char vbits;
    vector unsigned char v0;
    vector unsigned short v1;
    vector unsigned short v8;
    vector unsigned int vckey;
    vector unsigned int vrgbmask;

    mergePermute = VEC_MERGE_PERMUTE();
    v0 = vec_splat_u8(0);
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);

    /* mask used below to set the alpha to 255 on the destination surf */
    valphamask = VEC_ALPHA_MASK();

    /* permutes: source -> working layout, working layout -> destination,
       and destination -> working layout (helper defined elsewhere) */
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
    vdstPermute = calc_swizzle32(NULL, dstfmt);
    vsdstPermute = calc_swizzle32(dstfmt, NULL);

    /* splat the surface alpha into every byte of valpha */
    ((unsigned char *)&valpha)[0] = alpha;
    valpha = vec_splat(valpha, 0);
    vbits = (vector unsigned char)vec_splat_s8(-1);

    /* only the RGB bits take part in the colorkey comparison */
    ckey &= rgbmask;
    ((unsigned int *)(char*)&vckey)[0] = ckey;
    vckey = vec_splat(vckey, 0);
    ((unsigned int *)(char*)&vrgbmask)[0] = rgbmask;
    vrgbmask = vec_splat(vrgbmask, 0);

    while(height--) {
        int width = info->d_width;
        /* Scalar blend of single pixels.  The source pixel is masked with
           rgbmask before the key test so that this path keys out exactly
           the same pixels as the vector path below (ckey is already
           masked). */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB; \
            RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
            if(sA && (Pixel & rgbmask) != ckey) { \
                RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
                DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
                ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
            } \
            dstp++; \
            srcp++; \
            widthvar--; \
        }
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char vsel;
                vector unsigned char voverflow;
                vector unsigned char vd;
                vector unsigned char vd_orig;

                /* s = *srcp (realigned from two adjacent vector loads) */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);

                /* vsel is set for items that match the key */
                vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
                vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);

                /* permute to source format */
                vs = vec_perm(vs, valpha, vsrcPermute);

                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd_orig = vd = vec_perm(vd, v0, vsdstPermute);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha channel to full on */
                vd = vec_or(vd, valphamask);

                /* mask out color key: keyed pixels keep the original dest */
                vd = vec_sel(vd, vd_orig, vsel);

                /* permute to dest format */
                vd = vec_perm(vd, vbits, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);

                srcp += 4;
                dstp += 4;
                width -= 4;
                /* the overflow load becomes the low half of the next read */
                vs = voverflow;
            }
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
#undef ONE_PIXEL_BLEND

        srcp += srcskip;
        dstp += dstskip;
    }
}
icculus@1047
   822
icculus@1047
   823
icculus@1047
   824
/*
 * Altivec 32bpp -> 32bpp blit using per-pixel source alpha, for arbitrary
 * 32-bit source and destination channel layouts.
 *
 * info supplies the pixel pointers, formats, skips and blit dimensions.
 *
 * Each row is blended one pixel at a time while UNALIGNED_PTR(dstp) is true
 * (presumably until dstp is 16-byte aligned -- the macro is defined outside
 * this block), then four pixels at a time with vector operations, then the
 * remaining (width % 4) pixels one at a time.  The destination alpha value
 * is carried through unchanged in both the scalar and the vector path.
 */
static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
{
    int width = info->d_width;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;    /* skip expressed in Uint32 units */
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;    /* skip expressed in Uint32 units */
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    vector unsigned char mergePermute;
    vector unsigned char valphaPermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valphamask;
    vector unsigned char vpixelmask;
    vector unsigned char v0;
    vector unsigned short v1;
    vector unsigned short v8;

    v0 = vec_splat_u8(0);
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);
    mergePermute = VEC_MERGE_PERMUTE();
    valphamask = VEC_ALPHA_MASK();
    /* byte indices 0..15 with the low two bits cleared: replicates byte 0
       of each 32-bit pixel into all four bytes of that pixel */
    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
    /* complement of the alpha mask */
    vpixelmask = vec_nor(valphamask, v0);
    /* permutes: source -> working layout, working layout -> destination,
       and destination -> working layout (helper defined elsewhere) */
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
    vdstPermute = calc_swizzle32(NULL, dstfmt);
    vsdstPermute = calc_swizzle32(dstfmt, NULL);

    while ( height-- ) {
        width = info->d_width;
        /* Scalar blend of single pixels; fully transparent source pixels
           (sA == 0) are skipped. */
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
            DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
            if(sA) { \
              DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
              ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
              ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
            } \
            ++srcp; \
            ++dstp; \
            widthvar--; \
        }
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;
                vector unsigned char valpha;
                vector unsigned char vdstalpha;
                /* s = *srcp (realigned, then permuted to working layout) */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                vs = vec_perm(vs, v0, vsrcPermute);

                /* per-pixel alpha spread over each pixel's four bytes */
                valpha = vec_perm(vs, v0, valphaPermute);

                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd = vec_perm(vd, v0, vsdstPermute);
                vdstalpha = vec_and(vd, valphamask);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha to the dest alpha */
                vd = vec_and(vd, vpixelmask);
                vd = vec_or(vd, vdstalpha);
                vd = vec_perm(vd, v0, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);

                srcp += 4;
                dstp += 4;
                width -= 4;
                /* the overflow load becomes the low half of the next read */
                vs = voverflow;

            }
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
        srcp += srcskip;
        dstp += dstskip;
#undef ONE_PIXEL_BLEND
    }
}
icculus@1047
   919
icculus@1047
   920
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
icculus@1047
   921
static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
icculus@1047
   922
{
icculus@1047
   923
	int width = info->d_width;
icculus@1047
   924
	int height = info->d_height;
icculus@1047
   925
	Uint32 *srcp = (Uint32 *)info->s_pixels;
icculus@1047
   926
	int srcskip = info->s_skip >> 2;
icculus@1047
   927
	Uint32 *dstp = (Uint32 *)info->d_pixels;
icculus@1047
   928
	int dstskip = info->d_skip >> 2;
icculus@1047
   929
    vector unsigned char mergePermute;
icculus@1047
   930
    vector unsigned char valphaPermute;
icculus@1047
   931
    vector unsigned char valphamask;
icculus@1047
   932
    vector unsigned char vpixelmask;
icculus@1047
   933
    vector unsigned char v0;
icculus@1047
   934
    vector unsigned short v1;
icculus@1047
   935
    vector unsigned short v8;
icculus@1047
   936
    v0 = vec_splat_u8(0);
icculus@1047
   937
    v1 = vec_splat_u16(1);
icculus@1047
   938
    v8 = vec_splat_u16(8);
icculus@1047
   939
    mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
   940
    valphamask = VEC_ALPHA_MASK();
icculus@1047
   941
    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
icculus@1047
   942
    
icculus@1047
   943
 
icculus@1047
   944
    vpixelmask = vec_nor(valphamask, v0);
icculus@1047
   945
	while(height--) {
icculus@1047
   946
        width = info->d_width;
icculus@1047
   947
#define ONE_PIXEL_BLEND(condition, widthvar) \
icculus@1047
   948
        while ((condition)) { \
icculus@1047
   949
            Uint32 dalpha; \
icculus@1047
   950
            Uint32 d; \
icculus@1047
   951
            Uint32 s1; \
icculus@1047
   952
            Uint32 d1; \
icculus@1047
   953
            Uint32 s = *srcp; \
icculus@1047
   954
            Uint32 alpha = s >> 24; \
icculus@1047
   955
            if(alpha) { \
icculus@1047
   956
              if(alpha == SDL_ALPHA_OPAQUE) { \
icculus@1047
   957
                *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
icculus@1047
   958
              } else { \
icculus@1047
   959
                d = *dstp; \
icculus@1047
   960
                dalpha = d & 0xff000000; \
icculus@1047
   961
                s1 = s & 0xff00ff; \
icculus@1047
   962
                d1 = d & 0xff00ff; \
icculus@1047
   963
                d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
icculus@1047
   964
                s &= 0xff00; \
icculus@1047
   965
                d &= 0xff00; \
icculus@1047
   966
                d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
icculus@1047
   967
                *dstp = d1 | d | dalpha; \
icculus@1047
   968
              } \
icculus@1047
   969
            } \
icculus@1047
   970
            ++srcp; \
icculus@1047
   971
            ++dstp; \
icculus@1047
   972
            widthvar--; \
icculus@1047
   973
	    }
icculus@1047
   974
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
icculus@1047
   975
        if (width > 0) {
icculus@1047
   976
            int extrawidth = (width % 4);
icculus@1047
   977
            vector unsigned char valigner = VEC_ALIGNER(srcp);
icculus@1047
   978
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
icculus@1047
   979
            width -= extrawidth;
icculus@1047
   980
            while (width) {
icculus@1047
   981
                vector unsigned char voverflow;
icculus@1047
   982
                vector unsigned char vd;
icculus@1047
   983
                vector unsigned char valpha;
icculus@1047
   984
                vector unsigned char vdstalpha;
icculus@1047
   985
                /* s = *srcp */
icculus@1047
   986
                voverflow = (vector unsigned char)vec_ld(15, srcp);
icculus@1047
   987
                vs = vec_perm(vs, voverflow, valigner);
icculus@1047
   988
icculus@1047
   989
                valpha = vec_perm(vs, v0, valphaPermute);
icculus@1047
   990
                
icculus@1047
   991
                /* d = *dstp */
icculus@1047
   992
                vd = (vector unsigned char)vec_ld(0, dstp);
icculus@1047
   993
                vdstalpha = vec_and(vd, valphamask);
icculus@1047
   994
icculus@1047
   995
                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
icculus@1047
   996
icculus@1047
   997
                /* set the alpha to the dest alpha */
icculus@1047
   998
                vd = vec_and(vd, vpixelmask);
icculus@1047
   999
                vd = vec_or(vd, vdstalpha);
icculus@1047
  1000
icculus@1047
  1001
                /* *dstp = res */
icculus@1047
  1002
                vec_st((vector unsigned int)vd, 0, dstp);
icculus@1047
  1003
                
icculus@1047
  1004
                srcp += 4;
icculus@1047
  1005
                dstp += 4;
icculus@1047
  1006
                width -= 4;
icculus@1047
  1007
                vs = voverflow;
icculus@1047
  1008
            }
icculus@1047
  1009
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
  1010
        }
icculus@1047
  1011
	    srcp += srcskip;
icculus@1047
  1012
	    dstp += dstskip;
icculus@1047
  1013
	}
icculus@1047
  1014
#undef ONE_PIXEL_BLEND
icculus@1047
  1015
}
icculus@1047
  1016
icculus@1047
  1017
/*
 * Altivec 32bpp -> 32bpp blit with per-surface alpha, for arbitrary 32-bit
 * source and destination channel layouts.
 *
 * info supplies the pixel pointers, formats, skips and blit dimensions; the
 * surface alpha is read from info->src.
 *
 * Each row is blended one pixel at a time while UNALIGNED_PTR(dstp) is true
 * (presumably until dstp is 16-byte aligned -- the macro is defined outside
 * this block), then four pixels at a time with vector operations, then the
 * remaining (width % 4) pixels one at a time.
 */
static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
{
    /* XXX : 6 */
    unsigned alpha = info->src->alpha;
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *)info->s_pixels;
    int srcskip = info->s_skip >> 2;    /* skip expressed in Uint32 units */
    Uint32 *dstp = (Uint32 *)info->d_pixels;
    int dstskip = info->d_skip >> 2;    /* skip expressed in Uint32 units */
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    unsigned sA = srcfmt->alpha;
    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
    vector unsigned char mergePermute;
    vector unsigned char vsrcPermute;
    vector unsigned char vdstPermute;
    vector unsigned char vsdstPermute;
    vector unsigned char valpha;
    vector unsigned char valphamask;
    vector unsigned char vbits;
    vector unsigned short v1;
    vector unsigned short v8;

    mergePermute = VEC_MERGE_PERMUTE();
    v1 = vec_splat_u16(1);
    v8 = vec_splat_u16(8);

    /* mask used below to set the alpha to 255 on the destination surf */
    valphamask = VEC_ALPHA_MASK();

    /* permutes: source -> working layout, working layout -> destination,
       and destination -> working layout (helper defined elsewhere) */
    vsrcPermute = calc_swizzle32(srcfmt, NULL);
    vdstPermute = calc_swizzle32(NULL, dstfmt);
    vsdstPermute = calc_swizzle32(dstfmt, NULL);

    /* splat the surface alpha into every byte of valpha */
    ((unsigned char *)&valpha)[0] = alpha;
    valpha = vec_splat(valpha, 0);
    vbits = (vector unsigned char)vec_splat_s8(-1);

    while(height--) {
        int width = info->d_width;
        /* Scalar blend of single pixels with the surface alpha. */
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, dR, dG, dB; \
            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
            ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
            ++srcp; \
            ++dstp; \
            widthvar--; \
        }
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        if (width > 0) {
            int extrawidth = (width % 4);
            /* Use VEC_ALIGNER like the sibling blitters.  A bare
               vec_lvsl(0, srcp) is the identity permute when srcp is
               16-byte aligned; combined with the "vs = voverflow" carry
               below, every iteration after the first would then re-select
               the previous source block instead of the current one. */
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
            width -= extrawidth;
            while (width) {
                vector unsigned char voverflow;
                vector unsigned char vd;

                /* s = *srcp (realigned, then permuted to working layout) */
                voverflow = (vector unsigned char)vec_ld(15, srcp);
                vs = vec_perm(vs, voverflow, valigner);
                vs = vec_perm(vs, valpha, vsrcPermute);

                /* d = *dstp */
                vd = (vector unsigned char)vec_ld(0, dstp);
                vd = vec_perm(vd, vd, vsdstPermute);

                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);

                /* set the alpha channel to full on */
                vd = vec_or(vd, valphamask);
                vd = vec_perm(vd, vbits, vdstPermute);

                /* *dstp = res */
                vec_st((vector unsigned int)vd, 0, dstp);

                srcp += 4;
                dstp += 4;
                width -= 4;
                /* the overflow load becomes the low half of the next read */
                vs = voverflow;
            }
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
        }
#undef ONE_PIXEL_BLEND

        srcp += srcskip;
        dstp += dstskip;
    }

}
icculus@1047
  1111
icculus@1047
  1112
icculus@1047
  1113
/* fast RGB888->(A)RGB888 blending */
icculus@1047
  1114
static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
icculus@1047
  1115
{
icculus@1047
  1116
	unsigned alpha = info->src->alpha;
icculus@1047
  1117
    int height = info->d_height;
icculus@1047
  1118
    Uint32 *srcp = (Uint32 *)info->s_pixels;
icculus@1047
  1119
    int srcskip = info->s_skip >> 2;
icculus@1047
  1120
    Uint32 *dstp = (Uint32 *)info->d_pixels;
icculus@1047
  1121
    int dstskip = info->d_skip >> 2;
icculus@1047
  1122
    vector unsigned char mergePermute;
icculus@1047
  1123
    vector unsigned char valpha;
icculus@1047
  1124
    vector unsigned char valphamask;
icculus@1047
  1125
    vector unsigned short v1;
icculus@1047
  1126
    vector unsigned short v8;
icculus@1047
  1127
icculus@1047
  1128
    mergePermute = VEC_MERGE_PERMUTE();
icculus@1047
  1129
    v1 = vec_splat_u16(1);
icculus@1047
  1130
    v8 = vec_splat_u16(8);
icculus@1047
  1131
icculus@1047
  1132
    /* set the alpha to 255 on the destination surf */
icculus@1047
  1133
    valphamask = VEC_ALPHA_MASK();
icculus@1047
  1134
icculus@1047
  1135
    /* set a vector full of alpha and 255-alpha */
icculus@1047
  1136
    ((unsigned char *)&valpha)[0] = alpha;
icculus@1047
  1137
    valpha = vec_splat(valpha, 0);
icculus@1047
  1138
icculus@1047
  1139
    while(height--) {
icculus@1047
  1140
        int width = info->d_width;
icculus@1047
  1141
#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
icculus@1047
  1142
            Uint32 s = *srcp; \
icculus@1047
  1143
            Uint32 d = *dstp; \
icculus@1047
  1144
            Uint32 s1 = s & 0xff00ff; \
icculus@1047
  1145
            Uint32 d1 = d & 0xff00ff; \
icculus@1047
  1146
            d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
icculus@1047
  1147
                 & 0xff00ff; \
icculus@1047
  1148
            s &= 0xff00; \
icculus@1047
  1149
            d &= 0xff00; \
icculus@1047
  1150
            d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
icculus@1047
  1151
            *dstp = d1 | d | 0xff000000; \
icculus@1047
  1152
            ++srcp; \
icculus@1047
  1153
            ++dstp; \
icculus@1047
  1154
            widthvar--; \
icculus@1047
  1155
        }
icculus@1047
  1156
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
icculus@1047
  1157
        if (width > 0) {
icculus@1047
  1158
            int extrawidth = (width % 4);
icculus@1047
  1159
            vector unsigned char valigner = VEC_ALIGNER(srcp);
icculus@1047
  1160
            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
icculus@1047
  1161
            width -= extrawidth;
icculus@1047
  1162
            while (width) {
icculus@1047
  1163
                vector unsigned char voverflow;
icculus@1047
  1164
                vector unsigned char vd;
icculus@1047
  1165
icculus@1047
  1166
                /* s = *srcp */
icculus@1047
  1167
                voverflow = (vector unsigned char)vec_ld(15, srcp);
icculus@1047
  1168
                vs = vec_perm(vs, voverflow, valigner);
icculus@1047
  1169
                
icculus@1047
  1170
                /* d = *dstp */
icculus@1047
  1171
                vd = (vector unsigned char)vec_ld(0, dstp);
icculus@1047
  1172
icculus@1047
  1173
                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
icculus@1047
  1174
icculus@1047
  1175
                /* set the alpha channel to full on */
icculus@1047
  1176
                vd = vec_or(vd, valphamask);
icculus@1047
  1177
icculus@1047
  1178
                /* *dstp = res */
icculus@1047
  1179
                vec_st((vector unsigned int)vd, 0, dstp);
icculus@1047
  1180
                
icculus@1047
  1181
                srcp += 4;
icculus@1047
  1182
                dstp += 4;
icculus@1047
  1183
                width -= 4;
icculus@1047
  1184
                vs = voverflow;
icculus@1047
  1185
            }
icculus@1047
  1186
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
icculus@1047
  1187
        }
icculus@1047
  1188
#undef ONE_PIXEL_BLEND
icculus@1047
  1189
 
icculus@1047
  1190
        srcp += srcskip;
icculus@1047
  1191
        dstp += dstskip;
icculus@1047
  1192
    }
icculus@1047
  1193
}
icculus@1047
  1194
#endif /* USE_ALTIVEC_BLITTERS */
icculus@1047
  1195
slouken@1
  1196
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1
  1197
static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
{
	/*
	 * 50% blend of 32-bit pixels: every destination pixel becomes the
	 * per-channel average of source and destination, with the top byte
	 * forced to 0xff.
	 *
	 * s_skip/d_skip are shifted right by 2 to obtain the per-row pointer
	 * advance in Uint32 units (presumably they are byte counts).
	 */
	const int width = info->d_width;
	const int src_gap = info->s_skip >> 2;
	const int dst_gap = info->d_skip >> 2;
	Uint32 *src = (Uint32 *)info->s_pixels;
	Uint32 *dst = (Uint32 *)info->d_pixels;
	int rows = info->d_height;

	while (rows--) {
		DUFFS_LOOP4({
			const Uint32 s = *src;
			const Uint32 d = *dst;
			/* average the upper 7 bits of each of the three channels */
			const Uint32 half_sum =
				((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1;
			/* add 1 where both low bits were set (carry lost above) */
			const Uint32 low_carry = s & d & 0x00010101;

			*dst = (half_sum + low_carry) | 0xff000000;
			++src;
			++dst;
		}, width);
		src += src_gap;
		dst += dst_gap;
	}
}
slouken@0
  1217
slouken@1
  1218
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1
  1219
static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
{
	/*
	 * Blend 32-bit source pixels onto 32-bit destination pixels using the
	 * single per-surface alpha value info->src->alpha.  The top byte of
	 * every written pixel is set to 0xff.  Alpha 128 is delegated to the
	 * dedicated 50% blitter.
	 */
	const unsigned alpha = info->src->alpha;
	int width;
	int rows;
	int src_gap;
	int dst_gap;
	Uint32 *src;
	Uint32 *dst;

	if (alpha == 128) {
		BlitRGBtoRGBSurfaceAlpha128(info);
		return;
	}

	width = info->d_width;
	rows = info->d_height;
	src = (Uint32 *)info->s_pixels;
	dst = (Uint32 *)info->d_pixels;
	src_gap = info->s_skip >> 2;
	dst_gap = info->d_skip >> 2;

	while (rows--) {
		DUFFS_LOOP_DOUBLE2({
			/*
			 * Single pixel: the 0xff00ff channels share one
			 * multiply, the 0xff00 channel gets its own.
			 */
			const Uint32 s = *src;
			const Uint32 d = *dst;
			Uint32 rb = d & 0xff00ff;
			Uint32 g = d & 0xff00;

			rb = (rb + ((((s & 0xff00ff) - rb) * alpha) >> 8)) & 0xff00ff;
			g = (g + ((((s & 0xff00) - g) * alpha) >> 8)) & 0xff00;
			*dst = rb | g | 0xff000000;
			++src;
			++dst;
		},{
			/*
			 * Two pixels: the middle (0xff00) channels of both
			 * pixels are packed into one word (pixel 0 in bits
			 * 0-7, pixel 1 in bits 16-23) and blended with a
			 * single multiply.
			 */
			const Uint32 s0 = src[0];
			const Uint32 d0 = dst[0];
			Uint32 rb = d0 & 0xff00ff;
			Uint32 s1;
			Uint32 gs;
			Uint32 gd;

			rb += (((s0 & 0xff00ff) - rb) * alpha) >> 8;
			rb &= 0xff00ff;

			gs = ((s0 & 0xff00) >> 8) | ((src[1] & 0xff00) << 8);
			gd = ((d0 & 0xff00) >> 8) | ((dst[1] & 0xff00) << 8);
			gd += ((gs - gd) * alpha) >> 8;
			gd &= 0x00ff00ff;

			dst[0] = rb | ((gd << 8) & 0xff00) | 0xff000000;

			/* second pixel is re-read after the first store */
			s1 = src[1] & 0xff00ff;
			rb = dst[1] & 0xff00ff;
			rb += ((s1 - rb) * alpha) >> 8;
			rb &= 0xff00ff;

			dst[1] = rb | ((gd >> 8) & 0xff00) | 0xff000000;

			src += 2;
			dst += 2;
		}, width);
		src += src_gap;
		dst += dst_gap;
	}
}
slouken@1
  1286
slouken@0
  1287
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@0
  1288
static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
{
	/*
	 * Blend 32-bit source pixels onto 32-bit destination pixels using the
	 * alpha stored in the top byte of each source pixel.  The top byte of
	 * the destination pixel is always preserved.
	 */
	const int width = info->d_width;
	const int src_gap = info->s_skip >> 2;
	const int dst_gap = info->d_skip >> 2;
	Uint32 *src = (Uint32 *)info->s_pixels;
	Uint32 *dst = (Uint32 *)info->d_pixels;
	int rows = info->d_height;

	while (rows--) {
		DUFFS_LOOP4({
			const Uint32 s = *src;
			const Uint32 alpha = s >> 24;

			/* FIXME (from the original author): opaque alpha is
			   special-cased because the >>8 compositing (instead
			   of /255) does not handle it correctly.  alpha == 0
			   leaves the destination untouched. */
			if (alpha == SDL_ALPHA_OPAQUE) {
				*dst = (s & 0x00ffffff) | (*dst & 0xff000000);
			} else if (alpha != 0) {
				/*
				 * Split off the middle channel and blend the
				 * outer two with one multiply.
				 */
				const Uint32 d = *dst;
				Uint32 rb = d & 0xff00ff;
				Uint32 g = d & 0xff00;

				rb = (rb + ((((s & 0xff00ff) - rb) * alpha) >> 8)) & 0xff00ff;
				g = (g + ((((s & 0xff00) - g) * alpha) >> 8)) & 0xff00;
				*dst = rb | g | (d & 0xff000000);
			}
			++src;
			++dst;
		}, width);
		src += src_gap;
		dst += dst_gap;
	}
}
slouken@0
  1335
slouken@880
  1336
#ifdef MMX_ASMBLIT
slouken@689
  1337
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
slouken@689
  1338
inline static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
{
	/*
	 * Per-pixel-alpha blend of 32-bit pixels, one pixel per iteration,
	 * using MMX arithmetic and the `prefetch` instruction.  The top byte
	 * of each destination pixel is preserved.
	 *
	 * mm6 and mm7 are initialised once below and are expected to keep
	 * their values across every later asm statement in this function.
	 */
	int width = info->d_width;
	int height = info->d_height;
	Uint32 *srcp = (Uint32 *)info->s_pixels;
	int srcskip = info->s_skip >> 2;
	Uint32 *dstp = (Uint32 *)info->d_pixels;
	int dstskip = info->d_skip >> 2;

	Uint32 s;
	Uint32 alpha;

	__asm__ (
	/* make mm6 all zeros. */
	"pxor       %%mm6, %%mm6\n\t"

	/* Make a mask to preserve the alpha. */
	"pcmpeqb   %%mm7, %%mm7\n\t"            /* mm7(s) = FF FF FF FF | FF FF FF FF */
	"psrlq     $16, %%mm7\n\t"		    /* mm7(s) = 00 00 FF FF | FF FF FF FF */

		: );

	while(height--) {

	    DUFFS_LOOP4({

		__asm__ (
		"prefetch 64(%0)\n\t"
		"prefetch 64(%1)\n\t"
			: : "r" (srcp), "r" (dstp) );

		s = *srcp;
		alpha = s >> 24;
		/* FIXME: Here we special-case opaque alpha since the
		   compositioning used (>>8 instead of /255) doesn't handle
		   it correctly. Also special-case alpha=0 for speed?
		   Benchmark this! */

		if(alpha == SDL_ALPHA_OPAQUE) {
		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
		}

		else {
		    /*
		     * The asm reads *srcp and *dstp and stores to *dstp
		     * through the register operands, so it must carry a
		     * "memory" clobber: without it the compiler is not told
		     * that memory is read or written here.
		     */
		    __asm__ (
		    /* load in the source, and dst. */
		    "movd      (%0), %%mm0\n\t"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
		    "movd      (%1), %%mm1\n\t"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */

		    /* Move the src alpha into mm2 */

		    /* if supporting pshufw */
		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
		    /*"psrlw     $8, %%mm2\n" */

		    /* else: */
		    "movq      %%mm0, %%mm2\n\t"
		    "psrld     $24, %%mm2\n\t"              /* mm2 = 0 0 0 0 | 0  0  0  As */
		    "punpcklwd	%%mm2, %%mm2\n\t"	    /* mm2 = 0 0 0 0 |  0 As  0  As */
		    "punpckldq	%%mm2, %%mm2\n\t"           /* mm2 = 0 As 0 As |  0 As  0  As */

		    /* move the colors into words. */
		    "punpcklbw %%mm6, %%mm0\n\t"	    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
		    "punpcklbw %%mm6, %%mm1\n\t"            /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */

		    /* src - dst */
		    "psubw    %%mm1, %%mm0\n\t"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */

		    /* A * (src-dst) */
		    "pmullw    %%mm2, %%mm0\n\t"	    /* mm0 = As*(As-Ad) As*(Rs-Rd) | As*(Gs-Gd) As*(Bs-Bd) */
		    "pand      %%mm7, %%mm0\n\t"            /* zero the alpha word: preserves dest alpha */
		    "psrlw     $8,    %%mm0\n\t"	    /* mm0 = 0 Rc>>8 | Gc>>8  Bc>>8 */
		    "paddb     %%mm1, %%mm0\n\t"	    /* mm0 = Ad Rc+Rd | Gc+Gd  Bc+Bd (byte-wise, wraps) */

		    "packuswb  %%mm0, %%mm0\n\t"            /* mm0 =             | Ad Rc Gc Bc */

		    "movd      %%mm0, (%1)\n\t"             /* store the blended pixel */

		     : : "r" (srcp), "r" (dstp) : "memory" );

		}
		++srcp;
		++dstp;
	    }, width);
	    srcp += srcskip;
	    dstp += dstskip;
	}

	/* leave MMX state so that later x87 floating point code works */
	__asm__ (
	"emms\n"
		:   );
}
slouken@689
  1429
#endif
slouken@689
  1430
slouken@1
  1431
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
slouken@1
  1432
slouken@1
  1433
/*
 * Blend a single 16 bit pixel at 50%.
 * `mask` selects every bit except the lowest bit of each channel; the
 * low bits are handled by the (s & d & ~mask) term.
 * Arguments are evaluated more than once: pass side-effect-free values.
 */
#define BLEND16_50(d, s, mask)						\
	(((((s) & (mask)) + ((d) & (mask))) >> 1) +			\
	 ((s) & (d) & (~(mask) & 0xffff)))

/*
 * Blend two 16 bit pixels, packed in one 32 bit word, at 50%.
 * The 16 bit `mask` is replicated into both halves; it is widened to
 * Uint32 before shifting so that a mask with bit 15 set does not
 * overflow a signed int.
 * Arguments are evaluated more than once: pass side-effect-free values.
 */
#define BLEND2x16_50(d, s, mask)					\
	((((s) & ((mask) | ((Uint32)(mask) << 16))) >> 1) +		\
	 (((d) & ((mask) | ((Uint32)(mask) << 16))) >> 1) +		\
	 ((s) & (d) & ~((mask) | ((Uint32)(mask) << 16))))
slouken@1
  1441
slouken@1
  1442
static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
{
	/*
	 * 50% blend of 16-bit pixels.  `mask` is handed to BLEND16_50 /
	 * BLEND2x16_50.  Wherever possible two pixels are blended at once
	 * through 32-bit loads and stores on the destination.
	 */
	const int width = info->d_width;
	const int src_gap = info->s_skip >> 1;
	const int dst_gap = info->d_skip >> 1;
	Uint16 *src = (Uint16 *)info->s_pixels;
	Uint16 *dst = (Uint16 *)info->d_pixels;
	int rows = info->d_height;

	while (rows--) {
		int left = width;

		if ((((unsigned long)src ^ (unsigned long)dst) & 2) == 0) {
			/* source and destination share the same alignment */

			/* leading odd pixel brings both to a 32-bit boundary */
			if ((unsigned long)src & 2) {
				const Uint16 d = *dst;
				const Uint16 s = *src;
				*dst = BLEND16_50(d, s, mask);
				++src;
				++dst;
				--left;
			}

			/* pixel pairs */
			for (; left > 1; left -= 2) {
				const Uint32 sw = *(Uint32 *)src;
				const Uint32 dw = *(Uint32 *)dst;
				*(Uint32 *)dst = BLEND2x16_50(dw, sw, mask);
				src += 2;
				dst += 2;
			}

			/* trailing odd pixel */
			if (left) {
				const Uint16 d = *dst;
				const Uint16 s = *src;
				*dst = BLEND16_50(d, s, mask);
				++src;
				++dst;
			}

			src += src_gap;
			dst += dst_gap;
		} else {
			/*
			 * Source and destination are offset by one pixel
			 * relative to a 32-bit boundary.  Source words are
			 * read aligned and each pair is assembled from the
			 * previous and the current word.
			 */
			Uint32 carry;

			/* leading odd pixel aligns the destination */
			if ((unsigned long)dst & 2) {
				const Uint16 d = *dst;
				const Uint16 s = *src;
				*dst = BLEND16_50(d, s, mask);
				++dst;
				++src;
				--left;
			}

			/* step to the next 32-bit boundary of the source ... */
			++src;
			/* ... and prime the pipeline with the word before it */
			carry = ((Uint32 *)src)[-1];

			for (; left > 1; left -= 2) {
				const Uint32 sw = *(Uint32 *)src;
				const Uint32 dw = *(Uint32 *)dst;
				Uint32 pair;

				if (SDL_BYTEORDER == SDL_BIG_ENDIAN) {
					pair = (carry << 16) + (sw >> 16);
				} else {
					pair = (carry >> 16) + (sw << 16);
				}
				carry = sw;
				*(Uint32 *)dst = BLEND2x16_50(dw, pair, mask);
				dst += 2;
				src += 2;
			}

			/* trailing pixel is still sitting in the carry word */
			if (left) {
				const Uint16 d = *dst;
				Uint16 s;

				if (SDL_BYTEORDER == SDL_BIG_ENDIAN) {
					s = (Uint16)carry;
				} else {
					s = (Uint16)(carry >> 16);
				}
				*dst = BLEND16_50(d, s, mask);
				++src;
				++dst;
			}

			/* undo the extra source step taken for alignment */
			src += src_gap - 1;
			dst += dst_gap;
		}
	}
}
slouken@1
  1537
slouken@880
  1538
#ifdef MMX_ASMBLIT
slouken@689
  1539
/* fast RGB565->RGB565 blending with surface alpha */
slouken@689
  1540
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	/*
	 * RGB565 -> RGB565 blend with the per-surface alpha
	 * info->src->alpha.  Alpha 128 is delegated to the 50% blitter.
	 * Otherwise DUFFS_LOOP_QUATRO2 is given a one-pixel and a two-pixel
	 * scalar body and a four-pixel MMX body.
	 */
	unsigned alpha = info->src->alpha;
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xf7de);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
		/*
		 * 64-bit staging variable for the movq loads.  A real Uint64
		 * (instead of a Uint8[8] written through a cast) is correctly
		 * aligned and makes the asm memory operand cover all 8 bytes.
		 */
		Uint64 load;

		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		load = alpha;			/* 8-bit alpha for the MMX body */
		alpha >>= 3;			/* downscale alpha to 5 bits for the scalar bodies */

		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */

		/* Setup the 565 color channel masks */
		load = 0xF800F800F800F800ULL;
		movq_m2r(load, mm1); /* MASKRED -> mm1 */
		load = 0x07E007E007E007E0ULL;
		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
		load = 0x001F001F001F001FULL;
		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				/* one pixel, scalar */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				/* two pixels, scalar: same blend twice */
				s = *srcp++;
				d = *dstp;
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
				s = *srcp++;
				d = *dstp;
				s = (s | s << 16) & 0x07e0f81f;
				d = (d | d << 16) & 0x07e0f81f;
				d += (s - d) * alpha >> 5;
				d &= 0x07e0f81f;
				*dstp++ = d | d >> 16;
			},{
				/*
				 * four pixels, MMX.
				 * NOTE(review): the operands below are Uint16
				 * lvalues although movq moves 8 bytes - confirm
				 * the mmx.h constraints cope with that.
				 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */

				/* RED */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */
				psrlq_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */

				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */
				psrlq_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				psllq_i2r(11, mm6); /* mm6 << 11 -> mm6 */
				pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */

				movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */
				por_r2r(mm7, mm5);  /* MASKBLUE | mm5 -> mm5 */
				pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */
				por_r2r(mm6, mm3); /* save new reds in dsts */

				/* green */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */
				psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */

				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */
				psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */
				pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */

				movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
				por_r2r(mm7, mm5);  /* MASKBLUE | mm5 -> mm5 */
				pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */
				por_r2r(mm6, mm3); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				pand_r2r(mm7 , mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */

				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */

				movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
				por_r2r(mm4, mm5);  /* MASKGREEN | mm5 -> mm5 */
				pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */
				por_r2r(mm6, mm3); /* save new blues in dsts */

				movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		emms();
	}
}
slouken@689
  1688
slouken@689
  1689
/* fast RGB555->RGB555 blending with surface alpha */
slouken@689
  1690
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
{
	/*
	 * RGB555 -> RGB555 blend with the per-surface alpha
	 * info->src->alpha.  Alpha 128 is delegated to the 50% blitter.
	 * Otherwise DUFFS_LOOP_QUATRO2 is given a one-pixel and a two-pixel
	 * scalar body and a four-pixel MMX body.
	 */
	unsigned alpha = info->src->alpha;
	if(alpha == 128) {
		Blit16to16SurfaceAlpha128(info, 0xfbde);
	} else {
		int width = info->d_width;
		int height = info->d_height;
		Uint16 *srcp = (Uint16 *)info->s_pixels;
		int srcskip = info->s_skip >> 1;
		Uint16 *dstp = (Uint16 *)info->d_pixels;
		int dstskip = info->d_skip >> 1;
		Uint32 s, d;
		/*
		 * 64-bit staging variable for the movq loads.  A real Uint64
		 * (instead of a Uint8[8] written through a cast) is correctly
		 * aligned and makes the asm memory operand cover all 8 bytes.
		 */
		Uint64 load;

		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
		load = alpha;			/* 8-bit alpha for the MMX body */
		alpha >>= 3;			/* downscale alpha to 5 bits for the scalar bodies */

		movq_m2r(load, mm0); /* alpha(0000000A) -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
		punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */

		/* Setup the 555 color channel masks */
		load = 0x7C007C007C007C00ULL;
		movq_m2r(load, mm1); /* MASKRED -> mm1 */
		load = 0x03E003E003E003E0ULL;
		movq_m2r(load, mm4); /* MASKGREEN -> mm4 */
		load = 0x001F001F001F001FULL;
		movq_m2r(load, mm7); /* MASKBLUE -> mm7 */
		while(height--) {
			DUFFS_LOOP_QUATRO2(
			{
				/* one pixel, scalar */
				s = *srcp++;
				d = *dstp;
				/*
				 * shift out the middle component (green) to
				 * the high 16 bits, and process all three RGB
				 * components at the same time.
				 */
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			},{
				/* two pixels, scalar: same blend twice */
				s = *srcp++;
				d = *dstp;
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
				s = *srcp++;
				d = *dstp;
				s = (s | s << 16) & 0x03e07c1f;
				d = (d | d << 16) & 0x03e07c1f;
				d += (s - d) * alpha >> 5;
				d &= 0x03e07c1f;
				*dstp++ = d | d >> 16;
			},{
				/*
				 * four pixels, MMX.
				 * NOTE(review): the operands below are Uint16
				 * lvalues although movq moves 8 bytes - confirm
				 * the mmx.h constraints cope with that.
				 */
				movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
				movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */

				/* RED */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */
				psrlq_i2r(10, mm5); /* mm5 >> 10 -> mm5 [000r 000r 000r 000r] */

				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */
				psrlq_i2r(10, mm6); /* mm6 >> 10 -> mm6 [000r 000r 000r 000r] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				psllq_i2r(10, mm6); /* mm6 << 10 -> mm6 */
				pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */

				movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */
				por_r2r(mm7, mm5);  /* MASKBLUE | mm5 -> mm5 */
				pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */
				por_r2r(mm6, mm3); /* save new reds in dsts */

				/* green */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */
				psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */

				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */
				psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */
				pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */

				movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
				por_r2r(mm7, mm5);  /* MASKBLUE | mm5 -> mm5 */
				pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */
				por_r2r(mm6, mm3); /* save new greens in dsts */

				/* blue */
				movq_r2r(mm2, mm5); /* src -> mm5 */
				pand_r2r(mm7 , mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */

				movq_r2r(mm3, mm6); /* dst -> mm6 */
				pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */

				/* blend */
				psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
				pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
				psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
				paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
				pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */

				movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
				por_r2r(mm4, mm5);  /* MASKGREEN | mm5 -> mm5 */
				pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */
				por_r2r(mm6, mm3); /* save new blues in dsts */

				movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */

				srcp += 4;
				dstp += 4;
			}, width);
			srcp += srcskip;
			dstp += dstskip;
		}
		emms();
	}
}
slouken@689
  1838
#endif
slouken@689
  1839
slouken@1
  1840
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1
  1841
static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
slouken@1
  1842
{
slouken@1
  1843
	unsigned alpha = info->src->alpha;
slouken@1
  1844
	if(alpha == 128) {
slouken@1
  1845
		Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1
  1846
	} else {
slouken@1
  1847
		int width = info->d_width;
slouken@1
  1848
		int height = info->d_height;
slouken@1
  1849
		Uint16 *srcp = (Uint16 *)info->s_pixels;
slouken@1
  1850
		int srcskip = info->s_skip >> 1;
slouken@1
  1851
		Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@1
  1852
		int dstskip = info->d_skip >> 1;
slouken@1
  1853
		alpha >>= 3;	/* downscale alpha to 5 bits */
slouken@1
  1854
slouken@1
  1855
		while(height--) {
slouken@1
  1856
			DUFFS_LOOP4({
slouken@1
  1857
				Uint32 s = *srcp++;
slouken@1
  1858
				Uint32 d = *dstp;
slouken@1
  1859
				/*
slouken@1
  1860
				 * shift out the middle component (green) to
slouken@1
  1861
				 * the high 16 bits, and process all three RGB
slouken@1
  1862
				 * components at the same time.
slouken@1
  1863
				 */
slouken@1
  1864
				s = (s | s << 16) & 0x07e0f81f;
slouken@1
  1865
				d = (d | d << 16) & 0x07e0f81f;
slouken@1
  1866
				d += (s - d) * alpha >> 5;
slouken@1
  1867
				d &= 0x07e0f81f;
slouken@1
  1868
				*dstp++ = d | d >> 16;
slouken@1
  1869
			}, width);
slouken@1
  1870
			srcp += srcskip;
slouken@1
  1871
			dstp += dstskip;
slouken@1
  1872
		}
slouken@0
  1873
	}
slouken@0
  1874
}
slouken@0
  1875
slouken@0
  1876
/* fast RGB555->RGB555 blending with surface alpha */
slouken@0
  1877
static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
slouken@0
  1878
{
slouken@1
  1879
	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
slouken@1
  1880
	if(alpha == 128) {
slouken@1
  1881
		Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1
  1882
	} else {
slouken@1
  1883
		int width = info->d_width;
slouken@1
  1884
		int height = info->d_height;
slouken@1
  1885
		Uint16 *srcp = (Uint16 *)info->s_pixels;
slouken@1
  1886
		int srcskip = info->s_skip >> 1;
slouken@1
  1887
		Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@1
  1888
		int dstskip = info->d_skip >> 1;
slouken@1
  1889
		alpha >>= 3;		/* downscale alpha to 5 bits */
slouken@0
  1890
slouken@1
  1891
		while(height--) {
slouken@1
  1892
			DUFFS_LOOP4({
slouken@1
  1893
				Uint32 s = *srcp++;
slouken@1
  1894
				Uint32 d = *dstp;
slouken@1
  1895
				/*
slouken@1
  1896
				 * shift out the middle component (green) to
slouken@1
  1897
				 * the high 16 bits, and process all three RGB
slouken@1
  1898
				 * components at the same time.
slouken@1
  1899
				 */
slouken@1
  1900
				s = (s | s << 16) & 0x03e07c1f;
slouken@1
  1901
				d = (d | d << 16) & 0x03e07c1f;
slouken@1
  1902
				d += (s - d) * alpha >> 5;
slouken@1
  1903
				d &= 0x03e07c1f;
slouken@1
  1904
				*dstp++ = d | d >> 16;
slouken@1
  1905
			}, width);
slouken@1
  1906
			srcp += srcskip;
slouken@1
  1907
			dstp += dstskip;
slouken@1
  1908
		}
slouken@0
  1909
	}
slouken@0
  1910
}
slouken@0
  1911
slouken@0
  1912
/* fast ARGB8888->RGB565 blending with pixel alpha */
slouken@0
  1913
static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
slouken@0
  1914
{
slouken@0
  1915
	int width = info->d_width;
slouken@0
  1916
	int height = info->d_height;
slouken@0
  1917
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@0
  1918
	int srcskip = info->s_skip >> 2;
slouken@0
  1919
	Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@0
  1920
	int dstskip = info->d_skip >> 1;
slouken@0
  1921
slouken@0
  1922
	while(height--) {
slouken@0
  1923
	    DUFFS_LOOP4({
slouken@0
  1924
		Uint32 s = *srcp;
slouken@0
  1925
		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  1926
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  1927
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  1928
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  1929
		   Benchmark this! */
slouken@689
  1930
		if(alpha) {   
slouken@689
  1931
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@0
  1932
		    *dstp = (s >> 8 & 0xf800) + (s >> 5 & 0x7e0)
slouken@0
  1933
			  + (s >> 3  & 0x1f);
slouken@689
  1934
		  } else {
slouken@0
  1935
		    Uint32 d = *dstp;
slouken@0
  1936
		    /*
slouken@0
  1937
		     * convert source and destination to G0RAB65565
slouken@0
  1938
		     * and blend all components at the same time
slouken@0
  1939
		     */
slouken@0
  1940
		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
slouken@0
  1941
		      + (s >> 3 & 0x1f);
slouken@0
  1942
		    d = (d | d << 16) & 0x07e0f81f;
slouken@0
  1943
		    d += (s - d) * alpha >> 5;
slouken@0
  1944
		    d &= 0x07e0f81f;
slouken@0
  1945
		    *dstp = d | d >> 16;
slouken@689
  1946
		  }
slouken@0
  1947
		}
slouken@0
  1948
		srcp++;
slouken@0
  1949
		dstp++;
slouken@0
  1950
	    }, width);
slouken@0
  1951
	    srcp += srcskip;
slouken@0
  1952
	    dstp += dstskip;
slouken@0
  1953
	}
slouken@0
  1954
}
slouken@0
  1955
slouken@0
  1956
/* fast ARGB8888->RGB555 blending with pixel alpha */
slouken@0
  1957
static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
slouken@0
  1958
{
slouken@0
  1959
	int width = info->d_width;
slouken@0
  1960
	int height = info->d_height;
slouken@0
  1961
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@0
  1962
	int srcskip = info->s_skip >> 2;
slouken@0
  1963
	Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@0
  1964
	int dstskip = info->d_skip >> 1;
slouken@0
  1965
slouken@0
  1966
	while(height--) {
slouken@0
  1967
	    DUFFS_LOOP4({
slouken@0
  1968
		unsigned alpha;
slouken@0
  1969
		Uint32 s = *srcp;
slouken@0
  1970
		alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  1971
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  1972
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  1973
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  1974
		   Benchmark this! */
slouken@689
  1975
		if(alpha) {   
slouken@689
  1976
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@0
  1977
		    *dstp = (s >> 9 & 0x7c00) + (s >> 6 & 0x3e0)
slouken@0
  1978
			  + (s >> 3  & 0x1f);
slouken@689
  1979
		  } else {
slouken@0
  1980
		    Uint32 d = *dstp;
slouken@0
  1981
		    /*
slouken@0
  1982
		     * convert source and destination to G0RAB65565
slouken@0
  1983
		     * and blend all components at the same time
slouken@0
  1984
		     */
slouken@0
  1985
		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
slouken@0
  1986
		      + (s >> 3 & 0x1f);
slouken@0
  1987
		    d = (d | d << 16) & 0x03e07c1f;
slouken@0
  1988
		    d += (s - d) * alpha >> 5;
slouken@0
  1989
		    d &= 0x03e07c1f;
slouken@0
  1990
		    *dstp = d | d >> 16;
slouken@689
  1991
		  }
slouken@0
  1992
		}
slouken@0
  1993
		srcp++;
slouken@0
  1994
		dstp++;
slouken@0
  1995
	    }, width);
slouken@0
  1996
	    srcp += srcskip;
slouken@0
  1997
	    dstp += dstskip;
slouken@0
  1998
	}
slouken@0
  1999
}
slouken@0
  2000
slouken@0
  2001
/* General (slow) N->N blending with per-surface alpha */
slouken@0
  2002
static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
slouken@0
  2003
{
slouken@0
  2004
	int width = info->d_width;
slouken@0
  2005
	int height = info->d_height;
slouken@0
  2006
	Uint8 *src = info->s_pixels;
slouken@0
  2007
	int srcskip = info->s_skip;
slouken@0
  2008
	Uint8 *dst = info->d_pixels;
slouken@0
  2009
	int dstskip = info->d_skip;
slouken@0
  2010
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
  2011
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
  2012
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
  2013
	int dstbpp = dstfmt->BytesPerPixel;
slouken@0
  2014
	unsigned sA = srcfmt->alpha;
slouken@0
  2015
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
slouken@0
  2016
slouken@689
  2017
	if(sA) {
slouken@689
  2018
	  while ( height-- ) {
slouken@0
  2019
	    DUFFS_LOOP4(
slouken@0
  2020
	    {
icculus@1162
  2021
		Uint32 Pixel;
slouken@0
  2022
		unsigned sR;
slouken@0
  2023
		unsigned sG;
slouken@0
  2024
		unsigned sB;
slouken@0
  2025
		unsigned dR;
slouken@0
  2026
		unsigned dG;
slouken@0
  2027
		unsigned dB;
icculus@1162
  2028
		DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
icculus@1162
  2029
		DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
slouken@0
  2030
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
  2031
		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  2032
		src += srcbpp;
slouken@0
  2033
		dst += dstbpp;
slouken@0
  2034
	    },
slouken@0
  2035
	    width);
slouken@0
  2036
	    src += srcskip;
slouken@0
  2037
	    dst += dstskip;
slouken@689
  2038
	  }
slouken@0
  2039
	}
slouken@0
  2040
}
slouken@0
  2041
slouken@0
  2042
/* General (slow) colorkeyed N->N blending with per-surface alpha */
slouken@0
  2043
static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
slouken@0
  2044
{
slouken@0
  2045
	int width = info->d_width;
slouken@0
  2046
	int height = info->d_height;
slouken@0
  2047
	Uint8 *src = info->s_pixels;
slouken@0
  2048
	int srcskip = info->s_skip;
slouken@0
  2049
	Uint8 *dst = info->d_pixels;
slouken@0
  2050
	int dstskip = info->d_skip;
slouken@0
  2051
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
  2052
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
  2053
	Uint32 ckey = srcfmt->colorkey;
slouken@0
  2054
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
  2055
	int dstbpp = dstfmt->BytesPerPixel;
slouken@0
  2056
	unsigned sA = srcfmt->alpha;
slouken@0
  2057
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
slouken@0
  2058
slouken@0
  2059
	while ( height-- ) {
slouken@0
  2060
	    DUFFS_LOOP4(
slouken@0
  2061
	    {
icculus@1162
  2062
		Uint32 Pixel;
slouken@0
  2063
		unsigned sR;
slouken@0
  2064
		unsigned sG;
slouken@0
  2065
		unsigned sB;
slouken@0
  2066
		unsigned dR;
slouken@0
  2067
		unsigned dG;
slouken@0
  2068
		unsigned dB;
icculus@1162
  2069
		RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
icculus@1162
  2070
		if(sA && Pixel != ckey) {
icculus@1162
  2071
		    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
icculus@1162
  2072
		    DISEMBLE_RGB(dst, dstbpp, dstfmt, Pixel, dR, dG, dB);
slouken@0
  2073
		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
  2074
		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  2075
		}
slouken@0
  2076
		src += srcbpp;
slouken@0
  2077
		dst += dstbpp;
slouken@0
  2078
	    },
slouken@0
  2079
	    width);
slouken@0
  2080
	    src += srcskip;
slouken@0
  2081
	    dst += dstskip;
slouken@0
  2082
	}
slouken@0
  2083
}
slouken@0
  2084
slouken@0
  2085
/* General (slow) N->N blending with pixel alpha */
slouken@0
  2086
static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
slouken@0
  2087
{
slouken@0
  2088
	int width = info->d_width;
slouken@0
  2089
	int height = info->d_height;
slouken@0
  2090
	Uint8 *src = info->s_pixels;
slouken@0
  2091
	int srcskip = info->s_skip;
slouken@0
  2092
	Uint8 *dst = info->d_pixels;
slouken@0
  2093
	int dstskip = info->d_skip;
slouken@0
  2094
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
  2095
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
  2096
slouken@0
  2097
	int  srcbpp;
slouken@0
  2098
	int  dstbpp;
slouken@0
  2099
slouken@0
  2100
	/* Set up some basic variables */
slouken@0
  2101
	srcbpp = srcfmt->BytesPerPixel;
slouken@0
  2102
	dstbpp = dstfmt->BytesPerPixel;
slouken@0
  2103
slouken@0
  2104
	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
slouken@0
  2105
	   quite right. for <8bpp source alpha, it gets them very wrong
slouken@0
  2106
	   (check all macros!)
slouken@0
  2107
	   It is unclear whether there is a good general solution that doesn't
slouken@0
  2108
	   need a branch (or a divide). */
slouken@0
  2109
	while ( height-- ) {
slouken@0
  2110
	    DUFFS_LOOP4(
slouken@0
  2111
	    {
icculus@1162
  2112
		Uint32 Pixel;
slouken@0
  2113
		unsigned sR;
slouken@0
  2114
		unsigned sG;
slouken@0
  2115
		unsigned sB;
slouken@0
  2116
		unsigned dR;
slouken@0
  2117
		unsigned dG;
slouken@0
  2118
		unsigned dB;
slouken@0
  2119
		unsigned sA;
slouken@0
  2120
		unsigned dA;
icculus@1162
  2121
		DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
slouken@689
  2122
		if(sA) {
icculus@1162
  2123
		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
slouken@689
  2124
		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@689
  2125
		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@689
  2126
		}
slouken@0
  2127
		src += srcbpp;
slouken@0
  2128
		dst += dstbpp;
slouken@0
  2129
	    },
slouken@0
  2130
	    width);
slouken@0
  2131
	    src += srcskip;
slouken@0
  2132
	    dst += dstskip;
slouken@0
  2133
	}
slouken@0
  2134
}
slouken@0
  2135
slouken@0
  2136
slouken@0
  2137
/*
 * Choose the alpha blitter for blitting `surface` onto its mapped
 * destination (surface->map->dst).
 *
 * The choice depends on whether the source format carries per-pixel alpha
 * (Amask != 0), whether SDL_SRCCOLORKEY is set on the source, the byte
 * depths and channel masks of both formats, and -- when compiled in -- the
 * MMX / 3DNow! / AltiVec capabilities reported at run time.  AltiVec
 * blitters are only chosen when the destination is not an SDL_HWSURFACE.
 *
 * blit_index is not consulted by this function.
 */
SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
{
	SDL_Surface *target = surface->map->dst;
	SDL_PixelFormat *sf = surface->format;
	SDL_PixelFormat *df = target->format;

	if(sf->Amask != 0) {
		/* ---- per-pixel alpha ---- */
		switch(df->BytesPerPixel) {
		case 1:
			return BlitNto1PixelAlpha;

		case 2:
#ifdef USE_ALTIVEC_BLITTERS
			if(sf->BytesPerPixel == 4 &&
			   !(target->flags & SDL_HWSURFACE) &&
			   df->Gmask == 0x7e0 &&
			   df->Bmask == 0x1f && SDL_HasAltiVec()) {
				return Blit32to565PixelAlphaAltivec;
			}
#endif
			/* 8888 source whose low channel lines up with the
			   low 5-bit channel of the destination */
			if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
			   && sf->Gmask == 0xff00
			   && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
			       || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
				if(df->Gmask == 0x7e0) {
					return BlitARGBto565PixelAlpha;
				}
				if(df->Gmask == 0x3e0) {
					return BlitARGBto555PixelAlpha;
				}
			}
			return BlitNtoNPixelAlpha;

		case 4:
			/* same RGB layout on both sides, alpha in top byte */
			if(sf->Amask == 0xff000000
			   && sf->Rmask == df->Rmask
			   && sf->Gmask == df->Gmask
			   && sf->Bmask == df->Bmask
			   && sf->BytesPerPixel == 4) {
#ifdef MMX_ASMBLIT
				if(SDL_Has3DNow()) {
					return BlitRGBtoRGBPixelAlphaMMX3DNOW;
				}
				if(SDL_HasMMX()) {
					return BlitRGBtoRGBPixelAlphaMMX;
				}
#endif
#ifdef USE_ALTIVEC_BLITTERS
				if(!(target->flags & SDL_HWSURFACE) &&
				   SDL_HasAltiVec()) {
					return BlitRGBtoRGBPixelAlphaAltivec;
				}
#endif
				return BlitRGBtoRGBPixelAlpha;
			}
#ifdef USE_ALTIVEC_BLITTERS
			if(sf->Amask && sf->BytesPerPixel == 4 &&
			   !(target->flags & SDL_HWSURFACE) &&
			   SDL_HasAltiVec()) {
				return Blit32to32PixelAlphaAltivec;
			}
#endif
			return BlitNtoNPixelAlpha;

		case 3:
		default:
			return BlitNtoNPixelAlpha;
		}
	}

	/* ---- no per-pixel alpha from here on ---- */

	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
		/* per-surface alpha combined with a colorkey */
		if(df->BytesPerPixel == 1) {
			return BlitNto1SurfaceAlphaKey;
		}
#ifdef USE_ALTIVEC_BLITTERS
		if(sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
		   !(target->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) {
			return Blit32to32SurfaceAlphaKeyAltivec;
		}
#endif
		return BlitNtoNSurfaceAlphaKey;
	}

	/* plain per-surface alpha */
	switch(df->BytesPerPixel) {
	case 1:
		return BlitNto1SurfaceAlpha;

	case 2:
		/* the 16-bit fast paths require an identity mapping */
		if(surface->map->identity) {
			if(df->Gmask == 0x7e0) {
#ifdef MMX_ASMBLIT
				if(SDL_HasMMX()) {
					return Blit565to565SurfaceAlphaMMX;
				}
#endif
				return Blit565to565SurfaceAlpha;
			}
			if(df->Gmask == 0x3e0) {
#ifdef MMX_ASMBLIT
				if(SDL_HasMMX()) {
					return Blit555to555SurfaceAlphaMMX;
				}
#endif
				return Blit555to555SurfaceAlpha;
			}
		}
		return BlitNtoNSurfaceAlpha;

	case 4:
		/* identical 24-bit RGB layout in a 32-bit pixel on both sides */
		if(sf->Rmask == df->Rmask
		   && sf->Gmask == df->Gmask
		   && sf->Bmask == df->Bmask
		   && (sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff
		   && sf->BytesPerPixel == 4) {
#ifdef MMX_ASMBLIT
			if(SDL_HasMMX()) {
				return BlitRGBtoRGBSurfaceAlphaMMX;
			}
#endif
#ifdef USE_ALTIVEC_BLITTERS
			if(!(target->flags & SDL_HWSURFACE) &&
			   SDL_HasAltiVec()) {
				return BlitRGBtoRGBSurfaceAlphaAltivec;
			}
#endif
			return BlitRGBtoRGBSurfaceAlpha;
		}
#ifdef USE_ALTIVEC_BLITTERS
		if((sf->BytesPerPixel == 4) &&
		   !(target->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) {
			return Blit32to32SurfaceAlphaAltivec;
		}
#endif
		return BlitNtoNSurfaceAlpha;

	case 3:
	default:
		return BlitNtoNSurfaceAlpha;
	}
}
slouken@0
  2277