src/video/SDL_blit_A.c
author Sam Lantinga <slouken@libsdl.org>
Fri, 22 Aug 2003 05:51:19 +0000
changeset 689 5bb080d35049
parent 297 f6ffac90895c
child 720 f90d80d68071
permissions -rw-r--r--
Date: Tue, 19 Aug 2003 17:57:00 +0200
From: Stephane Marchesin
Subject: Re: [SDL] [patch] MMX alpha blit patches with MMX detection

I think everything is correct now. I've done as much testing as I could,
but some real-world testing wouldn't hurt, I think.
The patch is here : http://icps.u-strasbg.fr/~marchesin/sdl_mmxblit.patch

If you do a byte-by-byte comparison of the output between the C and MMX
functions, you'll notice that the results for 555 and 565 RGB alpha
blits aren't exactly the same. This is because the MMX functions for 555
and 565 RGB have a higher accuracy. If you want the exact same behaviour,
that's possible by masking the three lower alpha bits in the MMX
functions. Just ask!

I removed one MMX function because, after I fixed it to match its C
equivalent, it turned out to be slower than the C version on a PIII
(although a bit faster on an Athlon XP).

I've also added MMX and PIII replacements for SDL_memcpy. These provide
some speedup in testvidinfo -benchmark (at least for me, under Linux and
X11).
slouken@0
     1
/*
slouken@0
     2
    SDL - Simple DirectMedia Layer
slouken@297
     3
    Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002  Sam Lantinga
slouken@0
     4
slouken@0
     5
    This library is free software; you can redistribute it and/or
slouken@0
     6
    modify it under the terms of the GNU Library General Public
slouken@0
     7
    License as published by the Free Software Foundation; either
slouken@0
     8
    version 2 of the License, or (at your option) any later version.
slouken@0
     9
slouken@0
    10
    This library is distributed in the hope that it will be useful,
slouken@0
    11
    but WITHOUT ANY WARRANTY; without even the implied warranty of
slouken@0
    12
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
slouken@0
    13
    Library General Public License for more details.
slouken@0
    14
slouken@0
    15
    You should have received a copy of the GNU Library General Public
slouken@0
    16
    License along with this library; if not, write to the Free
slouken@0
    17
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
slouken@0
    18
slouken@0
    19
    Sam Lantinga
slouken@252
    20
    slouken@libsdl.org
slouken@0
    21
*/
slouken@0
    22
slouken@0
    23
#ifdef SAVE_RCSID
/*
 * Revision identification string, only compiled in when SAVE_RCSID is
 * defined.  It must be an array of char: a single `char` cannot be
 * initialised from a string literal.
 */
static const char rcsid[] =
 "@(#) $Id$";
#endif
slouken@0
    27
slouken@0
    28
#include <stdio.h>
slouken@0
    29
slouken@0
    30
#include "SDL_types.h"
slouken@0
    31
#include "SDL_video.h"
slouken@0
    32
#include "SDL_blit.h"
slouken@0
    33
slouken@689
    34
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
slouken@689
    35
#include "mmx.h"
slouken@689
    36
/* Function to check the CPU flags */
slouken@689
    37
#define MMX_CPU		0x800000
slouken@689
    38
#define TDNOW_CPU	0x80000000
slouken@689
    39
#define CPU_Flags()	Hermes_X86_CPU()
slouken@689
    40
#define X86_ASSEMBLER
slouken@689
    41
#define HermesConverterInterface	void
slouken@689
    42
#define HermesClearInterface		void
slouken@689
    43
#define STACKCALL
slouken@689
    44
#include "HeadX86.h"
slouken@689
    45
#endif
slouken@689
    46
slouken@0
    47
/* Functions to perform alpha blended blitting */
slouken@0
    48
slouken@0
    49
/* N->1 blending with per-surface alpha */
slouken@0
    50
static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
slouken@0
    51
{
slouken@0
    52
	int width = info->d_width;
slouken@0
    53
	int height = info->d_height;
slouken@0
    54
	Uint8 *src = info->s_pixels;
slouken@0
    55
	int srcskip = info->s_skip;
slouken@0
    56
	Uint8 *dst = info->d_pixels;
slouken@0
    57
	int dstskip = info->d_skip;
slouken@0
    58
	Uint8 *palmap = info->table;
slouken@0
    59
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
    60
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
    61
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
    62
slouken@0
    63
	const unsigned A = srcfmt->alpha;
slouken@0
    64
slouken@0
    65
	while ( height-- ) {
slouken@0
    66
	    DUFFS_LOOP4(
slouken@0
    67
	    {
slouken@0
    68
		Uint32 pixel;
slouken@0
    69
		unsigned sR;
slouken@0
    70
		unsigned sG;
slouken@0
    71
		unsigned sB;
slouken@0
    72
		unsigned dR;
slouken@0
    73
		unsigned dG;
slouken@0
    74
		unsigned dB;
slouken@0
    75
		DISEMBLE_RGB(src, srcbpp, srcfmt, pixel, sR, sG, sB);
slouken@0
    76
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
    77
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
    78
		dB = dstfmt->palette->colors[*dst].b;
slouken@0
    79
		ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
slouken@0
    80
		dR &= 0xff;
slouken@0
    81
		dG &= 0xff;
slouken@0
    82
		dB &= 0xff;
slouken@0
    83
		/* Pack RGB into 8bit pixel */
slouken@0
    84
		if ( palmap == NULL ) {
slouken@0
    85
		    *dst =((dR>>5)<<(3+2))|
slouken@0
    86
			  ((dG>>5)<<(2))|
slouken@0
    87
			  ((dB>>6)<<(0));
slouken@0
    88
		} else {
slouken@0
    89
		    *dst = palmap[((dR>>5)<<(3+2))|
slouken@0
    90
				  ((dG>>5)<<(2))  |
slouken@0
    91
				  ((dB>>6)<<(0))];
slouken@0
    92
		}
slouken@0
    93
		dst++;
slouken@0
    94
		src += srcbpp;
slouken@0
    95
	    },
slouken@0
    96
	    width);
slouken@0
    97
	    src += srcskip;
slouken@0
    98
	    dst += dstskip;
slouken@0
    99
	}
slouken@0
   100
}
slouken@0
   101
slouken@0
   102
/* N->1 blending with pixel alpha */
slouken@0
   103
static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
slouken@0
   104
{
slouken@0
   105
	int width = info->d_width;
slouken@0
   106
	int height = info->d_height;
slouken@0
   107
	Uint8 *src = info->s_pixels;
slouken@0
   108
	int srcskip = info->s_skip;
slouken@0
   109
	Uint8 *dst = info->d_pixels;
slouken@0
   110
	int dstskip = info->d_skip;
slouken@0
   111
	Uint8 *palmap = info->table;
slouken@0
   112
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
   113
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
   114
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
   115
slouken@0
   116
	/* FIXME: fix alpha bit field expansion here too? */
slouken@0
   117
	while ( height-- ) {
slouken@0
   118
	    DUFFS_LOOP4(
slouken@0
   119
	    {
slouken@0
   120
		Uint32 pixel;
slouken@0
   121
		unsigned sR;
slouken@0
   122
		unsigned sG;
slouken@0
   123
		unsigned sB;
slouken@0
   124
		unsigned sA;
slouken@0
   125
		unsigned dR;
slouken@0
   126
		unsigned dG;
slouken@0
   127
		unsigned dB;
slouken@0
   128
		DISEMBLE_RGBA(src,srcbpp,srcfmt,pixel,sR,sG,sB,sA);
slouken@0
   129
		dR = dstfmt->palette->colors[*dst].r;
slouken@0
   130
		dG = dstfmt->palette->colors[*dst].g;
slouken@0
   131
		dB = dstfmt->palette->colors[*dst].b;
slouken@0
   132
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
   133
		dR &= 0xff;
slouken@0
   134
		dG &= 0xff;
slouken@0
   135
		dB &= 0xff;
slouken@0
   136
		/* Pack RGB into 8bit pixel */
slouken@0
   137
		if ( palmap == NULL ) {
slouken@0
   138
		    *dst =((dR>>5)<<(3+2))|
slouken@0
   139
			  ((dG>>5)<<(2))|
slouken@0
   140
			  ((dB>>6)<<(0));
slouken@0
   141
		} else {
slouken@0
   142
		    *dst = palmap[((dR>>5)<<(3+2))|
slouken@0
   143
				  ((dG>>5)<<(2))  |
slouken@0
   144
				  ((dB>>6)<<(0))  ];
slouken@0
   145
		}
slouken@0
   146
		dst++;
slouken@0
   147
		src += srcbpp;
slouken@0
   148
	    },
slouken@0
   149
	    width);
slouken@0
   150
	    src += srcskip;
slouken@0
   151
	    dst += dstskip;
slouken@0
   152
	}
slouken@0
   153
}
slouken@0
   154
slouken@0
   155
/* colorkeyed N->1 blending with per-surface alpha */
slouken@0
   156
static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
slouken@0
   157
{
slouken@0
   158
	int width = info->d_width;
slouken@0
   159
	int height = info->d_height;
slouken@0
   160
	Uint8 *src = info->s_pixels;
slouken@0
   161
	int srcskip = info->s_skip;
slouken@0
   162
	Uint8 *dst = info->d_pixels;
slouken@0
   163
	int dstskip = info->d_skip;
slouken@0
   164
	Uint8 *palmap = info->table;
slouken@0
   165
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
   166
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
   167
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
   168
	Uint32 ckey = srcfmt->colorkey;
slouken@0
   169
slouken@0
   170
	const int A = srcfmt->alpha;
slouken@0
   171
slouken@0
   172
	while ( height-- ) {
slouken@0
   173
	    DUFFS_LOOP(
slouken@0
   174
	    {
slouken@0
   175
		Uint32 pixel;
slouken@0
   176
		unsigned sR;
slouken@0
   177
		unsigned sG;
slouken@0
   178
		unsigned sB;
slouken@0
   179
		unsigned dR;
slouken@0
   180
		unsigned dG;
slouken@0
   181
		unsigned dB;
slouken@0
   182
		DISEMBLE_RGB(src, srcbpp, srcfmt, pixel, sR, sG, sB);
slouken@0
   183
		if ( pixel != ckey ) {
slouken@0
   184
		    dR = dstfmt->palette->colors[*dst].r;
slouken@0
   185
		    dG = dstfmt->palette->colors[*dst].g;
slouken@0
   186
		    dB = dstfmt->palette->colors[*dst].b;
slouken@0
   187
		    ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB);
slouken@0
   188
		    dR &= 0xff;
slouken@0
   189
		    dG &= 0xff;
slouken@0
   190
		    dB &= 0xff;
slouken@0
   191
		    /* Pack RGB into 8bit pixel */
slouken@0
   192
		    if ( palmap == NULL ) {
slouken@0
   193
			*dst =((dR>>5)<<(3+2))|
slouken@0
   194
			      ((dG>>5)<<(2)) |
slouken@0
   195
			      ((dB>>6)<<(0));
slouken@0
   196
		    } else {
slouken@0
   197
			*dst = palmap[((dR>>5)<<(3+2))|
slouken@0
   198
				      ((dG>>5)<<(2))  |
slouken@0
   199
				      ((dB>>6)<<(0))  ];
slouken@0
   200
		    }
slouken@0
   201
		}
slouken@0
   202
		dst++;
slouken@0
   203
		src += srcbpp;
slouken@0
   204
	    },
slouken@0
   205
	    width);
slouken@0
   206
	    src += srcskip;
slouken@0
   207
	    dst += dstskip;
slouken@0
   208
	}
slouken@0
   209
}
slouken@0
   210
slouken@689
   211
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
slouken@689
   212
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@689
   213
static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
slouken@689
   214
{
slouken@689
   215
	int width = info->d_width;
slouken@689
   216
	int height = info->d_height;
slouken@689
   217
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@689
   218
	int srcskip = info->s_skip >> 2;
slouken@689
   219
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@689
   220
	int dstskip = info->d_skip >> 2;
slouken@689
   221
        Uint8 load[8];
slouken@689
   222
  
slouken@689
   223
        *(Uint64 *)load = 0x00fefefe00fefefe;/* alpha128 mask */
slouken@689
   224
        movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
slouken@689
   225
        *(Uint64 *)load = 0x0001010100010101;/* !alpha128 mask */
slouken@689
   226
        movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
slouken@689
   227
        *(Uint64 *)load = 0xFF000000FF000000;/* dst alpha mask */
slouken@689
   228
        movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */
slouken@689
   229
	while(height--) {
slouken@689
   230
            DUFFS_LOOP_DOUBLE2(
slouken@689
   231
            {
slouken@689
   232
		    Uint32 s = *srcp++;
slouken@689
   233
		    Uint32 d = *dstp;
slouken@689
   234
		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@689
   235
			       + (s & d & 0x00010101)) | 0xff000000;
slouken@689
   236
            },{
slouken@689
   237
	            movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
slouken@689
   238
	            movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
slouken@689
   239
	      
slouken@689
   240
	            movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
slouken@689
   241
	            movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
slouken@689
   242
		
slouken@689
   243
	            pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
slouken@689
   244
	            pand_r2r(mm4, mm5); /* src & mask -> mm5 */
slouken@689
   245
	            paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
slouken@689
   246
	            psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
slouken@689
   247
	
slouken@689
   248
	            pand_r2r(mm1, mm2); /* src & dst -> mm2 */
slouken@689
   249
	            pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
slouken@689
   250
	            paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
slouken@689
   251
	            por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
slouken@689
   252
	            movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
slouken@689
   253
	            dstp += 2;
slouken@689
   254
	            srcp += 2;
slouken@689
   255
            }, width);
slouken@689
   256
	    srcp += srcskip;
slouken@689
   257
	    dstp += dstskip;
slouken@689
   258
	}
slouken@689
   259
	emms();
slouken@689
   260
}
slouken@689
   261
slouken@689
   262
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@689
   263
static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
slouken@689
   264
{
slouken@689
   265
	unsigned alpha = info->src->alpha;
slouken@689
   266
	if(alpha == 128) {
slouken@689
   267
		BlitRGBtoRGBSurfaceAlpha128MMX(info);
slouken@689
   268
	} else {
slouken@689
   269
		int width = info->d_width;
slouken@689
   270
		int height = info->d_height;
slouken@689
   271
		Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@689
   272
		int srcskip = info->s_skip >> 2;
slouken@689
   273
		Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@689
   274
		int dstskip = info->d_skip >> 2;
slouken@689
   275
                Uint8 load[8] = {alpha, alpha, alpha, alpha,
slouken@689
   276
    					alpha, alpha, alpha, alpha};
slouken@689
   277
					
slouken@689
   278
                movq_m2r(*load, mm4); /* alpha -> mm4 */
slouken@689
   279
		*(Uint64 *)load = 0x00FF00FF00FF00FF;
slouken@689
   280
                movq_m2r(*load, mm3); /* mask -> mm3 */
slouken@689
   281
		pand_r2r(mm3, mm4); /* mm4 & mask -> 0A0A0A0A -> mm4 */
slouken@689
   282
		*(Uint64 *)load = 0xFF000000FF000000;/* dst alpha mask */
slouken@689
   283
		movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */
slouken@689
   284
		
slouken@689
   285
		while(height--) {
slouken@689
   286
			DUFFS_LOOP_DOUBLE2({
slouken@689
   287
				/* One Pixel Blend */
slouken@689
   288
	                        movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
slouken@689
   289
                                punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */
slouken@689
   290
                                pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */
slouken@689
   291
			  
slouken@689
   292
	                        movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
slouken@689
   293
			        movq_r2r(mm2, mm6);/* dst(ARGB) -> mm6 (0000ARGB)*/
slouken@689
   294
                                punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */
slouken@689
   295
                                pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
slouken@689
   296
			  
slouken@689
   297
                                psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
slouken@689
   298
	                        pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
slouken@689
   299
	                        psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
slouken@689
   300
	                        paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
slouken@689
   301
	                        pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
slouken@689
   302
	                        packuswb_r2r(mm2, mm2);  /* ARGBARGB -> mm2 */
slouken@689
   303
	                        por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
slouken@689
   304
			        movd_r2m(mm2, *dstp);/* mm2 -> pixel */
slouken@689
   305
				++srcp;
slouken@689
   306
				++dstp;
slouken@689
   307
			},{
slouken@689
   308
			        /* Two Pixels Blend */
slouken@689
   309
				movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
slouken@689
   310
			        movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
slouken@689
   311
                                punpcklbw_r2r(mm0, mm0); /* low - AARRGGBB -> mm0 */
slouken@689
   312
			        pand_r2r(mm3, mm0); /* 0A0R0G0B -> mm0(src1) */
slouken@689
   313
			        punpckhbw_r2r(mm1, mm1); /* high - AARRGGBB -> mm1 */
slouken@689
   314
	                        pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1(src2) */
slouken@689
   315
	
slouken@689
   316
	                        movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
slouken@689
   317
	                        movq_r2r(mm2, mm5); /* 2 x dst -> mm5(ARGBARGB) */
slouken@689
   318
			        movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
slouken@689
   319
                                punpcklbw_r2r(mm2, mm2); /* low - AARRGGBB -> mm2 */
slouken@689
   320
	                        punpckhbw_r2r(mm6, mm6); /* high - AARRGGBB -> mm6 */
slouken@689
   321
                                pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2(dst1) */
slouken@689
   322
	                  
slouken@689
   323
                                psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
slouken@689
   324
	                        pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
slouken@689
   325
			        pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6(dst2) */
slouken@689
   326
			        psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
slouken@689
   327
			        psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
slouken@689
   328
	                        pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
slouken@689
   329
				paddw_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
slouken@689
   330
	                        psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm0 */
slouken@689
   331
				pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
slouken@689
   332
	                        paddw_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
slouken@689
   333
	                        pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6 */
slouken@689
   334
	                        packuswb_r2r(mm2, mm2);  /* ARGBARGB -> mm2 */
slouken@689
   335
	                        packuswb_r2r(mm6, mm6);  /* ARGBARGB -> mm6 */
slouken@689
   336
	                        psrlq_i2r(32, mm2); /* mm2 >> 32 -> mm2 */
slouken@689
   337
	                        psllq_i2r(32, mm6); /* mm6 << 32 -> mm6 */
slouken@689
   338
	                        por_r2r(mm6, mm2); /* mm6 | mm2 -> mm2 */				
slouken@689
   339
				por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
slouken@689
   340
                                movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
slouken@689
   341
				srcp += 2;
slouken@689
   342
				dstp += 2;
slouken@689
   343
			}, width);
slouken@689
   344
			srcp += srcskip;
slouken@689
   345
			dstp += dstskip;
slouken@689
   346
		}
slouken@689
   347
		emms();
slouken@689
   348
	}
slouken@689
   349
}
slouken@689
   350
slouken@689
   351
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@689
   352
static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
slouken@689
   353
{
slouken@689
   354
	int width = info->d_width;
slouken@689
   355
	int height = info->d_height;
slouken@689
   356
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@689
   357
	int srcskip = info->s_skip >> 2;
slouken@689
   358
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@689
   359
	int dstskip = info->d_skip >> 2;
slouken@689
   360
        Uint32 alpha = 0;
slouken@689
   361
        Uint8 load[8];
slouken@689
   362
	                
slouken@689
   363
	*(Uint64 *)load = 0x00FF00FF00FF00FF;
slouken@689
   364
        movq_m2r(*load, mm3); /* mask -> mm2 */
slouken@689
   365
	*(Uint64 *)load = 0x00FF000000000000;
slouken@689
   366
        movq_m2r(*load, mm7); /* dst alpha mask -> mm2 */
slouken@689
   367
        *(Uint64 *)load = 0x00FFFFFF00FFFFFF;
slouken@689
   368
        movq_m2r(*load, mm0); /* alpha 255 mask -> mm0 */
slouken@689
   369
        *(Uint64 *)load = 0xFF000000FF000000;
slouken@689
   370
        movq_m2r(*load, mm6); /* alpha 255 !mask -> mm6 */
slouken@689
   371
	while(height--) {
slouken@689
   372
	    DUFFS_LOOP4({
slouken@689
   373
	        alpha = *srcp;
slouken@689
   374
	        alpha >>= 24;
slouken@689
   375
		/* FIXME: Here we special-case opaque alpha since the
slouken@689
   376
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@689
   377
		   it correctly. Also special-case alpha=0 for speed?
slouken@689
   378
		   Benchmark this! */
slouken@689
   379
		if(alpha) {   
slouken@689
   380
		  if(alpha == SDL_ALPHA_OPAQUE) {
slouken@689
   381
		    movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
slouken@689
   382
		    movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
slouken@689
   383
		    pand_r2r(mm0, mm1);
slouken@689
   384
		    pand_r2r(mm6, mm2);
slouken@689
   385
		    por_r2r(mm1, mm2);
slouken@689
   386
		    movd_r2m(mm2, (*dstp));
slouken@689
   387
		  } else {
slouken@689
   388
		    movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
slouken@689
   389
                    punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */
slouken@689
   390
                    pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */
slouken@689
   391
			  
slouken@689
   392
	            movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
slouken@689
   393
                    punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */
slouken@689
   394
                    pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
slouken@689
   395
		
slouken@689
   396
		    movq_r2r(mm2, mm5);/* mm2(0A0R0G0B) -> mm5 */
slouken@689
   397
		    pand_r2r(mm7, mm5); /* mm5 & dst alpha mask -> mm5(0A000000) */
slouken@689
   398
		    psrlq_i2r(24, mm5); /* mm5 >> 24 -> mm5 (0000A000)*/
slouken@689
   399
		    
slouken@689
   400
		    movq_r2r(mm1, mm4);/* mm1(0A0R0G0B) -> mm4 */
slouken@689
   401
		    psrlq_i2r(48, mm4); /* mm4 >> 48 -> mm4(0000000A) */
slouken@689
   402
		    punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
slouken@689
   403
                    punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
slouken@689
   404
		                        		    
slouken@689
   405
                    /* blend */		    
slouken@689
   406
                    psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
slouken@689
   407
	            pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
slouken@689
   408
	            psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
slouken@689
   409
	            paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
slouken@689
   410
	            pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
slouken@689
   411
		    packuswb_r2r(mm2, mm2);  /* ARGBARGB -> mm2 */
slouken@689
   412
		    pand_r2r(mm0, mm2); /* 0RGB0RGB -> mm2 */
slouken@689
   413
		    por_r2r(mm5, mm2); /* dst alpha | mm2 -> mm2 */
slouken@689
   414
		    movd_r2m(mm2, *dstp);/* mm2 -> dst */
slouken@689
   415
		  }
slouken@689
   416
		}
slouken@689
   417
		++srcp;
slouken@689
   418
		++dstp;
slouken@689
   419
	    }, width);
slouken@689
   420
	    srcp += srcskip;
slouken@689
   421
	    dstp += dstskip;
slouken@689
   422
	}
slouken@689
   423
	emms();
slouken@689
   424
}
slouken@689
   425
#endif
slouken@689
   426
slouken@1
   427
/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
slouken@1
   428
static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
slouken@0
   429
{
slouken@0
   430
	int width = info->d_width;
slouken@0
   431
	int height = info->d_height;
slouken@0
   432
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@0
   433
	int srcskip = info->s_skip >> 2;
slouken@0
   434
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@0
   435
	int dstskip = info->d_skip >> 2;
slouken@0
   436
slouken@0
   437
	while(height--) {
slouken@0
   438
	    DUFFS_LOOP4({
slouken@1
   439
		    Uint32 s = *srcp++;
slouken@1
   440
		    Uint32 d = *dstp;
slouken@1
   441
		    *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
slouken@1
   442
			       + (s & d & 0x00010101)) | 0xff000000;
slouken@0
   443
	    }, width);
slouken@0
   444
	    srcp += srcskip;
slouken@0
   445
	    dstp += dstskip;
slouken@0
   446
	}
slouken@0
   447
}
slouken@0
   448
slouken@1
   449
/* fast RGB888->(A)RGB888 blending with surface alpha */
slouken@1
   450
static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
slouken@1
   451
{
slouken@1
   452
	unsigned alpha = info->src->alpha;
slouken@1
   453
	if(alpha == 128) {
slouken@1
   454
		BlitRGBtoRGBSurfaceAlpha128(info);
slouken@1
   455
	} else {
slouken@1
   456
		int width = info->d_width;
slouken@1
   457
		int height = info->d_height;
slouken@1
   458
		Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@1
   459
		int srcskip = info->s_skip >> 2;
slouken@1
   460
		Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@1
   461
		int dstskip = info->d_skip >> 2;
slouken@689
   462
		Uint32 s;
slouken@689
   463
		Uint32 d;
slouken@689
   464
		Uint32 s1;
slouken@689
   465
		Uint32 d1;
slouken@1
   466
slouken@1
   467
		while(height--) {
slouken@689
   468
			DUFFS_LOOP_DOUBLE2({
slouken@689
   469
				/* One Pixel Blend */
slouken@1
   470
				s = *srcp;
slouken@1
   471
				d = *dstp;
slouken@1
   472
				s1 = s & 0xff00ff;
slouken@1
   473
				d1 = d & 0xff00ff;
slouken@1
   474
				d1 = (d1 + ((s1 - d1) * alpha >> 8))
slouken@1
   475
				     & 0xff00ff;
slouken@1
   476
				s &= 0xff00;
slouken@1
   477
				d &= 0xff00;
slouken@1
   478
				d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@1
   479
				*dstp = d1 | d | 0xff000000;
slouken@1
   480
				++srcp;
slouken@1
   481
				++dstp;
slouken@689
   482
			},{
slouken@689
   483
			        /* Two Pixels Blend */
slouken@689
   484
				s = *srcp;
slouken@689
   485
				d = *dstp;
slouken@689
   486
				s1 = s & 0xff00ff;
slouken@689
   487
				d1 = d & 0xff00ff;
slouken@689
   488
				d1 += (s1 - d1) * alpha >> 8;
slouken@689
   489
				d1 &= 0xff00ff;
slouken@689
   490
				     
slouken@689
   491
				s = ((s & 0xff00) >> 8) | 
slouken@689
   492
					((srcp[1] & 0xff00) << 8);
slouken@689
   493
				d = ((d & 0xff00) >> 8) |
slouken@689
   494
					((dstp[1] & 0xff00) << 8);
slouken@689
   495
				d += (s - d) * alpha >> 8;
slouken@689
   496
				d &= 0x00ff00ff;
slouken@689
   497
				
slouken@689
   498
				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
slouken@689
   499
				++srcp;
slouken@689
   500
				
slouken@689
   501
			        s1 = *srcp;
slouken@689
   502
				d1 = *dstp;
slouken@689
   503
				s1 &= 0xff00ff;
slouken@689
   504
				d1 &= 0xff00ff;
slouken@689
   505
				d1 += (s1 - d1) * alpha >> 8;
slouken@689
   506
				d1 &= 0xff00ff;
slouken@689
   507
				
slouken@689
   508
				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
slouken@689
   509
				++srcp;
slouken@689
   510
				++dstp;
slouken@1
   511
			}, width);
slouken@1
   512
			srcp += srcskip;
slouken@1
   513
			dstp += dstskip;
slouken@1
   514
		}
slouken@1
   515
	}
slouken@1
   516
}
slouken@1
   517
slouken@0
   518
/* fast ARGB888->(A)RGB888 blending with pixel alpha */
slouken@0
   519
static void BlitRGBtoRGBPixelAlpha(SDL_BlitInfo *info)
slouken@0
   520
{
slouken@0
   521
	int width = info->d_width;
slouken@0
   522
	int height = info->d_height;
slouken@0
   523
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@0
   524
	int srcskip = info->s_skip >> 2;
slouken@0
   525
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@0
   526
	int dstskip = info->d_skip >> 2;
slouken@0
   527
slouken@0
   528
	while(height--) {
slouken@0
   529
	    DUFFS_LOOP4({
slouken@0
   530
		Uint32 dalpha;
slouken@0
   531
		Uint32 d;
slouken@0
   532
		Uint32 s1;
slouken@0
   533
		Uint32 d1;
slouken@0
   534
		Uint32 s = *srcp;
slouken@0
   535
		Uint32 alpha = s >> 24;
slouken@0
   536
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
   537
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
   538
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
   539
		   Benchmark this! */
slouken@689
   540
		if(alpha) {   
slouken@689
   541
		  if(alpha == SDL_ALPHA_OPAQUE) {
slouken@0
   542
		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
slouken@689
   543
		  } else {
slouken@0
   544
		    /*
slouken@0
   545
		     * take out the middle component (green), and process
slouken@0
   546
		     * the other two in parallel. One multiply less.
slouken@0
   547
		     */
slouken@0
   548
		    d = *dstp;
slouken@0
   549
		    dalpha = d & 0xff000000;
slouken@0
   550
		    s1 = s & 0xff00ff;
slouken@0
   551
		    d1 = d & 0xff00ff;
slouken@0
   552
		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
slouken@0
   553
		    s &= 0xff00;
slouken@0
   554
		    d &= 0xff00;
slouken@0
   555
		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
slouken@0
   556
		    *dstp = d1 | d | dalpha;
slouken@689
   557
		  }
slouken@0
   558
		}
slouken@0
   559
		++srcp;
slouken@0
   560
		++dstp;
slouken@0
   561
	    }, width);
slouken@0
   562
	    srcp += srcskip;
slouken@0
   563
	    dstp += dstskip;
slouken@0
   564
	}
slouken@0
   565
}
slouken@0
   566
slouken@689
   567
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
slouken@689
   568
/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
slouken@689
   569
inline static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
slouken@689
   570
{
slouken@689
   571
	int width = info->d_width;
slouken@689
   572
	int height = info->d_height;
slouken@689
   573
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@689
   574
	int srcskip = info->s_skip >> 2;
slouken@689
   575
	Uint32 *dstp = (Uint32 *)info->d_pixels;
slouken@689
   576
	int dstskip = info->d_skip >> 2;
slouken@689
   577
slouken@689
   578
	Uint32 s;
slouken@689
   579
	Uint32 alpha;
slouken@689
   580
slouken@689
   581
	__asm__ (
slouken@689
   582
	/* make mm6 all zeros. */
slouken@689
   583
	"pxor       %%mm6, %%mm6\n"
slouken@689
   584
	
slouken@689
   585
	/* Make a mask to preserve the alpha. */
slouken@689
   586
	"pcmpeqb   %%mm7, %%mm7\n\t"            /* mm7(s) = FF FF FF FF | FF FF FF FF */
slouken@689
   587
	"psrlq     $16, %%mm7\n\t"		    /* mm7(s) = 00 00 FF FF | FF FF FF FF */
slouken@689
   588
slouken@689
   589
		: );
slouken@689
   590
slouken@689
   591
	while(height--) {
slouken@689
   592
slouken@689
   593
	    DUFFS_LOOP4({
slouken@689
   594
slouken@689
   595
		__asm__ (
slouken@689
   596
		"prefetch 64(%0)\n"
slouken@689
   597
		"prefetch 64(%1)\n"
slouken@689
   598
			: : "r" (srcp), "r" (dstp) );
slouken@689
   599
slouken@689
   600
		s = *srcp;
slouken@689
   601
		alpha = s >> 24;
slouken@689
   602
		/* FIXME: Here we special-case opaque alpha since the
slouken@689
   603
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@689
   604
		   it correctly. Also special-case alpha=0 for speed?
slouken@689
   605
		   Benchmark this! */
slouken@689
   606
		
slouken@689
   607
		if(alpha == SDL_ALPHA_OPAQUE) {
slouken@689
   608
		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
slouken@689
   609
		} 
slouken@689
   610
slouken@689
   611
		else {
slouken@689
   612
			    __asm__ (
slouken@689
   613
		    /* load in the source, and dst. */
slouken@689
   614
		    "movd      (%0), %%mm0\n"		    /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
slouken@689
   615
		    "movd      (%1), %%mm1\n"		    /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
slouken@689
   616
slouken@689
   617
		    /* Move the src alpha into mm2 */
slouken@689
   618
slouken@689
   619
		    /* if supporting pshufw */
slouken@689
   620
		    /*"pshufw     $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As |  0 As  0  As */
slouken@689
   621
		    /*"psrlw     $8, %%mm2\n" */
slouken@689
   622
		    
slouken@689
   623
		    /* else: */
slouken@689
   624
		    "movq      %%mm0, %%mm2\n"
slouken@689
   625
		    "psrld     $24, %%mm2\n"                /* mm2 = 0 0 0 0 | 0  0  0  As */
slouken@689
   626
		    "punpcklwd	%%mm2, %%mm2\n"	            /* mm2 = 0 0 0 0 |  0 As  0  As */
slouken@689
   627
		    "punpckldq	%%mm2, %%mm2\n"             /* mm2 = 0 As 0 As |  0 As  0  As */
slouken@689
   628
slouken@689
   629
		    /* move the colors into words. */
slouken@689
   630
		    "punpcklbw %%mm6, %%mm0\n"		    /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
slouken@689
   631
		    "punpcklbw %%mm6, %%mm1\n"              /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
slouken@689
   632
slouken@689
   633
		    /* src - dst */
slouken@689
   634
		    "psubw    %%mm1, %%mm0\n"		    /* mm0 = As-Ad Rs-Rd | Gs-Gd  Bs-Bd */
slouken@689
   635
slouken@689
   636
		    /* A * (src-dst) */
slouken@689
   637
		    "pmullw    %%mm2, %%mm0\n"		    /* mm0 = As*As-d As*Rs-d | As*Gs-d  As*Bs-d */
slouken@689
   638
		    "pand      %%mm7, %%mm0\n"              /* to preserve dest alpha */
slouken@689
   639
		    "psrlw     $8,    %%mm0\n"		    /* mm0 = Ac>>8 Rc>>8 | Gc>>8  Bc>>8 */
slouken@689
   640
		    "paddb     %%mm1, %%mm0\n"		    /* mm0 = Ac+Ad Rc+Rd | Gc+Gd  Bc+Bd */
slouken@689
   641
slouken@689
   642
		    "packuswb  %%mm0, %%mm0\n"              /* mm0 =             | Ac Rc Gc Bc */
slouken@689
   643
		    
slouken@689
   644
		    "movd      %%mm0, (%1)\n"               /* result in mm0 */
slouken@689
   645
slouken@689
   646
		     : : "r" (srcp), "r" (dstp) );
slouken@689
   647
slouken@689
   648
		}
slouken@689
   649
		++srcp;
slouken@689
   650
		++dstp;
slouken@689
   651
	    }, width);
slouken@689
   652
	    srcp += srcskip;
slouken@689
   653
	    dstp += dstskip;
slouken@689
   654
	}
slouken@689
   655
slouken@689
   656
	__asm__ (
slouken@689
   657
	"emms\n"
slouken@689
   658
		:   );
slouken@689
   659
}
slouken@689
   660
#endif
slouken@689
   661
slouken@1
   662
/* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
slouken@1
   663
slouken@1
   664
/*
 * Blend a single 16 bit pixel at 50%.
 *
 * `mask` selects every bit of the pixel except the lowest bit of each
 * colour channel.  The masked parts are added and halved together; the
 * dropped low bits contribute only when set in both pixels (s & d).
 * All arguments are evaluated more than once: pass side-effect free
 * expressions only.
 */
#define BLEND16_50(d, s, mask)						\
	(((((s) & (mask)) + ((d) & (mask))) >> 1) +			\
	 ((s) & (d) & (~(mask) & 0xffff)))

/*
 * Blend two 16 bit pixels packed in one 32 bit word at 50%.
 *
 * The 16 bit mask is replicated into both halves of the word.  It is
 * widened to Uint32 before shifting so that a mask with bit 15 set is
 * never shifted into the sign bit of a promoted int.  Each operand is
 * halved before the addition so the sum cannot carry out of 32 bits.
 * All arguments are evaluated more than once.
 */
#define BLEND2x16_50(d, s, mask)					\
	((((s) & ((Uint32)(mask) | (Uint32)(mask) << 16)) >> 1) +	\
	 (((d) & ((Uint32)(mask) | (Uint32)(mask) << 16)) >> 1) +	\
	 ((s) & (d) & ~((Uint32)(mask) | (Uint32)(mask) << 16)))
slouken@1
   672
slouken@1
   673
/*
 * Blend a 16bpp source onto a 16bpp destination at exactly 50% alpha.
 *
 * info - blit description; pixel pointers, width/height and the per-row
 *        skips (given in bytes, halved here to count 16 bit pixels).
 * mask - pixel mask with the lowest bit of every colour channel cleared
 *        (the callers in this file pass 0xf7de for 565 and 0xfbde for 555).
 *
 * Wherever possible two pixels are processed per 32 bit access.  When the
 * source and destination rows differ in 32 bit alignment, source words are
 * read aligned and re-paired with the half left over from the previous
 * word, so destination stores stay aligned too.
 */
static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
{
	Uint16 *sp = (Uint16 *)info->s_pixels;
	Uint16 *dp = (Uint16 *)info->d_pixels;
	int src_gap = info->s_skip >> 1;
	int dst_gap = info->d_skip >> 1;
	int cols = info->d_width;
	int rows = info->d_height;

	while(rows--) {
		int left = cols;
		Uint32 carry;

		if((((unsigned long)sp ^ (unsigned long)dp) & 2) == 0) {
			/* Same alignment on both sides: plain word-wise blend. */

			/* leading pixel that is not on a 32 bit boundary */
			if((unsigned long)sp & 2) {
				Uint16 d = *dp;
				Uint16 s = *sp;
				*dp = BLEND16_50(d, s, mask);
				++sp;
				++dp;
				--left;
			}

			/* both pointers are 32 bit aligned from here on */
			for(; left > 1; left -= 2) {
				Uint32 sw = *(Uint32 *)sp;
				Uint32 dw = *(Uint32 *)dp;
				*(Uint32 *)dp = BLEND2x16_50(dw, sw, mask);
				sp += 2;
				dp += 2;
			}

			/* trailing single pixel */
			if(left) {
				Uint16 d = *dp;
				Uint16 s = *sp;
				*dp = BLEND16_50(d, s, mask);
				++sp;
				++dp;
			}

			sp += src_gap;
			dp += dst_gap;
			continue;
		}

		/*
		 * Source and destination disagree in alignment: pipeline the
		 * source words.  Mostly a win for big blits, no loss for
		 * small ones.
		 */

		/* bring the destination onto a 32 bit boundary */
		if((unsigned long)dp & 2) {
			Uint16 d = *dp;
			Uint16 s = *sp;
			*dp = BLEND16_50(d, s, mask);
			++dp;
			++sp;
			--left;
		}

		/* step one pixel ahead so that sp is 32 bit aligned */
		++sp;

		/* prime the pipeline with the word holding the pending pixel */
		carry = ((Uint32 *)sp)[-1];

		while(left > 1) {
			Uint32 fresh = *(Uint32 *)sp;
			Uint32 dw = *(Uint32 *)dp;
			Uint32 pair;

			/* pending half of the old word + first half of the new */
			if(SDL_BYTEORDER == SDL_BIG_ENDIAN) {
				pair = (carry << 16) + (fresh >> 16);
			} else {
				pair = (carry >> 16) + (fresh << 16);
			}
			carry = fresh;

			*(Uint32 *)dp = BLEND2x16_50(dw, pair, mask);
			dp += 2;
			sp += 2;
			left -= 2;
		}

		/* last pixel, still sitting in the carried word */
		if(left) {
			Uint16 d = *dp;
			Uint16 s;
			s = (SDL_BYTEORDER == SDL_BIG_ENDIAN) ? carry : carry >> 16;
			*dp = BLEND16_50(d, s, mask);
			++sp;
			++dp;
		}

		/* undo the one-pixel look-ahead while skipping to the next row */
		sp += src_gap - 1;
		dp += dst_gap;
	}
}
slouken@1
   768
slouken@689
   769
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
slouken@689
   770
/* fast RGB565->RGB565 blending with surface alpha */
slouken@689
   771
static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
slouken@689
   772
{
slouken@689
   773
	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
slouken@689
   774
	if(alpha == 128) {
slouken@689
   775
		Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@689
   776
	} else {
slouken@689
   777
		int width = info->d_width;
slouken@689
   778
		int height = info->d_height;
slouken@689
   779
		Uint16 *srcp = (Uint16 *)info->s_pixels;
slouken@689
   780
		int srcskip = info->s_skip >> 1;
slouken@689
   781
		Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@689
   782
		int dstskip = info->d_skip >> 1;
slouken@689
   783
	        Uint32 s, d;
slouken@689
   784
	        Uint8 load[8];
slouken@689
   785
	  
slouken@689
   786
		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
slouken@689
   787
	        *(Uint64 *)load = alpha;
slouken@689
   788
		alpha >>= 3;		/* downscale alpha to 5 bits */
slouken@689
   789
slouken@689
   790
                movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
slouken@689
   791
                punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
slouken@689
   792
                punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
slouken@689
   793
	  
slouken@689
   794
 	        /* Setup the 565 color channel masks */
slouken@689
   795
	        *(Uint64 *)load = 0xF800F800F800F800;
slouken@689
   796
		movq_m2r(*load, mm1); /* MASKRED -> mm1 */
slouken@689
   797
		*(Uint64 *)load = 0x07E007E007E007E0;
slouken@689
   798
		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
slouken@689
   799
		*(Uint64 *)load = 0x001F001F001F001F;
slouken@689
   800
		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
slouken@689
   801
		while(height--) {
slouken@689
   802
                        DUFFS_LOOP_QUATRO2(
slouken@689
   803
                        {
slouken@689
   804
	                        s = *srcp++;
slouken@689
   805
				d = *dstp;
slouken@689
   806
				/*
slouken@689
   807
				 * shift out the middle component (green) to
slouken@689
   808
				 * the high 16 bits, and process all three RGB
slouken@689
   809
				 * components at the same time.
slouken@689
   810
				 */
slouken@689
   811
				s = (s | s << 16) & 0x07e0f81f;
slouken@689
   812
				d = (d | d << 16) & 0x07e0f81f;
slouken@689
   813
				d += (s - d) * alpha >> 5;
slouken@689
   814
				d &= 0x07e0f81f;
slouken@689
   815
				*dstp++ = d | d >> 16;
slouken@689
   816
                        },{
slouken@689
   817
	                        s = *srcp++;
slouken@689
   818
				d = *dstp;
slouken@689
   819
				/*
slouken@689
   820
				 * shift out the middle component (green) to
slouken@689
   821
				 * the high 16 bits, and process all three RGB
slouken@689
   822
				 * components at the same time.
slouken@689
   823
				 */
slouken@689
   824
				s = (s | s << 16) & 0x07e0f81f;
slouken@689
   825
				d = (d | d << 16) & 0x07e0f81f;
slouken@689
   826
				d += (s - d) * alpha >> 5;
slouken@689
   827
				d &= 0x07e0f81f;
slouken@689
   828
				*dstp++ = d | d >> 16;
slouken@689
   829
			        s = *srcp++;
slouken@689
   830
				d = *dstp;
slouken@689
   831
				/*
slouken@689
   832
				 * shift out the middle component (green) to
slouken@689
   833
				 * the high 16 bits, and process all three RGB
slouken@689
   834
				 * components at the same time.
slouken@689
   835
				 */
slouken@689
   836
				s = (s | s << 16) & 0x07e0f81f;
slouken@689
   837
				d = (d | d << 16) & 0x07e0f81f;
slouken@689
   838
				d += (s - d) * alpha >> 5;
slouken@689
   839
				d &= 0x07e0f81f;
slouken@689
   840
				*dstp++ = d | d >> 16;
slouken@689
   841
                        },{
slouken@689
   842
	                        movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
slouken@689
   843
	                        movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
slouken@689
   844
			  
slouken@689
   845
	                        /* RED */
slouken@689
   846
	                        movq_r2r(mm2, mm5); /* src -> mm5 */
slouken@689
   847
	                        pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */
slouken@689
   848
	                        psrlq_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
slouken@689
   849
	
slouken@689
   850
	                        movq_r2r(mm3, mm6); /* dst -> mm6 */
slouken@689
   851
	                        pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */
slouken@689
   852
	                        psrlq_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
slouken@689
   853
	
slouken@689
   854
	                        /* blend */
slouken@689
   855
	                        psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
slouken@689
   856
	                        pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
slouken@689
   857
	                        psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
slouken@689
   858
	                        paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
slouken@689
   859
	                        psllq_i2r(11, mm6); /* mm6 << 11 -> mm6 */
slouken@689
   860
	                        pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */
slouken@689
   861
	
slouken@689
   862
	                        movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */
slouken@689
   863
	                        por_r2r(mm7, mm5);  /* MASKBLUE | mm5 -> mm5 */
slouken@689
   864
	                        pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */
slouken@689
   865
	                        por_r2r(mm6, mm3); /* save new reds in dsts */
slouken@689
   866
	
slouken@689
   867
	                        /* green */
slouken@689
   868
	                        movq_r2r(mm2, mm5); /* src -> mm5 */
slouken@689
   869
	                        pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */
slouken@689
   870
	                        psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */
slouken@689
   871
	
slouken@689
   872
	                        movq_r2r(mm3, mm6); /* dst -> mm6 */
slouken@689
   873
	                        pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */
slouken@689
   874
	                        psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */
slouken@689
   875
	
slouken@689
   876
	                        /* blend */
slouken@689
   877
	                        psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
slouken@689
   878
	                        pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
slouken@689
   879
	                        psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
slouken@689
   880
	                        paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
slouken@689
   881
	                        psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */
slouken@689
   882
	                        pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */
slouken@689
   883
	
slouken@689
   884
	                        movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
slouken@689
   885
	                        por_r2r(mm7, mm5);  /* MASKBLUE | mm5 -> mm5 */
slouken@689
   886
	                        pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */
slouken@689
   887
	                        por_r2r(mm6, mm3); /* save new greens in dsts */
slouken@689
   888
	
slouken@689
   889
	                        /* blue */
slouken@689
   890
	                        movq_r2r(mm2, mm5); /* src -> mm5 */
slouken@689
   891
	                        pand_r2r(mm7 , mm5); /* src & MASKRED -> mm5[000b 000b 000b 000b] */
slouken@689
   892
		
slouken@689
   893
	                        movq_r2r(mm3, mm6); /* dst -> mm6 */
slouken@689
   894
	                        pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
slouken@689
   895
	
slouken@689
   896
	                        /* blend */
slouken@689
   897
	                        psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
slouken@689
   898
	                        pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
slouken@689
   899
	                        psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
slouken@689
   900
	                        paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
slouken@689
   901
	                        pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */
slouken@689
   902
	
slouken@689
   903
	                        movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
slouken@689
   904
	                        por_r2r(mm4, mm5);  /* MASKGREEN | mm5 -> mm5 */
slouken@689
   905
	                        pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */
slouken@689
   906
	                        por_r2r(mm6, mm3); /* save new blues in dsts */
slouken@689
   907
	
slouken@689
   908
	                        movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */
slouken@689
   909
	
slouken@689
   910
	                        srcp += 4;
slouken@689
   911
	                        dstp += 4;
slouken@689
   912
                        }, width);			
slouken@689
   913
			srcp += srcskip;
slouken@689
   914
			dstp += dstskip;
slouken@689
   915
		}
slouken@689
   916
		emms();
slouken@689
   917
	}
slouken@689
   918
}
slouken@689
   919
slouken@689
   920
/* fast RGB555->RGB555 blending with surface alpha */
slouken@689
   921
static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
slouken@689
   922
{
slouken@689
   923
	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
slouken@689
   924
	if(alpha == 128) {
slouken@689
   925
		Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@689
   926
	} else {
slouken@689
   927
		int width = info->d_width;
slouken@689
   928
		int height = info->d_height;
slouken@689
   929
		Uint16 *srcp = (Uint16 *)info->s_pixels;
slouken@689
   930
		int srcskip = info->s_skip >> 1;
slouken@689
   931
		Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@689
   932
		int dstskip = info->d_skip >> 1;
slouken@689
   933
	        Uint32 s, d;
slouken@689
   934
	        Uint8 load[8];
slouken@689
   935
	  
slouken@689
   936
		alpha &= ~(1+2+4);		/* cut alpha to get the exact same behaviour */
slouken@689
   937
	        *(Uint64 *)load = alpha;
slouken@689
   938
		alpha >>= 3;		/* downscale alpha to 5 bits */
slouken@689
   939
slouken@689
   940
                movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
slouken@689
   941
                punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
slouken@689
   942
                punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
slouken@689
   943
	  
slouken@689
   944
 	        /* Setup the 555 color channel masks */
slouken@689
   945
	        *(Uint64 *)load = 0x7C007C007C007C00;
slouken@689
   946
		movq_m2r(*load, mm1); /* MASKRED -> mm1 */
slouken@689
   947
		*(Uint64 *)load = 0x03E003E003E003E0;
slouken@689
   948
		movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
slouken@689
   949
		*(Uint64 *)load = 0x001F001F001F001F;
slouken@689
   950
		movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
slouken@689
   951
		while(height--) {
slouken@689
   952
                        DUFFS_LOOP_QUATRO2(
slouken@689
   953
                        {
slouken@689
   954
	                        s = *srcp++;
slouken@689
   955
				d = *dstp;
slouken@689
   956
				/*
slouken@689
   957
				 * shift out the middle component (green) to
slouken@689
   958
				 * the high 16 bits, and process all three RGB
slouken@689
   959
				 * components at the same time.
slouken@689
   960
				 */
slouken@689
   961
				s = (s | s << 16) & 0x03e07c1f;
slouken@689
   962
				d = (d | d << 16) & 0x03e07c1f;
slouken@689
   963
				d += (s - d) * alpha >> 5;
slouken@689
   964
				d &= 0x03e07c1f;
slouken@689
   965
				*dstp++ = d | d >> 16;
slouken@689
   966
                        },{
slouken@689
   967
	                        s = *srcp++;
slouken@689
   968
				d = *dstp;
slouken@689
   969
				/*
slouken@689
   970
				 * shift out the middle component (green) to
slouken@689
   971
				 * the high 16 bits, and process all three RGB
slouken@689
   972
				 * components at the same time.
slouken@689
   973
				 */
slouken@689
   974
				s = (s | s << 16) & 0x03e07c1f;
slouken@689
   975
				d = (d | d << 16) & 0x03e07c1f;
slouken@689
   976
				d += (s - d) * alpha >> 5;
slouken@689
   977
				d &= 0x03e07c1f;
slouken@689
   978
				*dstp++ = d | d >> 16;
slouken@689
   979
			        s = *srcp++;
slouken@689
   980
				d = *dstp;
slouken@689
   981
				/*
slouken@689
   982
				 * shift out the middle component (green) to
slouken@689
   983
				 * the high 16 bits, and process all three RGB
slouken@689
   984
				 * components at the same time.
slouken@689
   985
				 */
slouken@689
   986
				s = (s | s << 16) & 0x03e07c1f;
slouken@689
   987
				d = (d | d << 16) & 0x03e07c1f;
slouken@689
   988
				d += (s - d) * alpha >> 5;
slouken@689
   989
				d &= 0x03e07c1f;
slouken@689
   990
				*dstp++ = d | d >> 16;
slouken@689
   991
                        },{
slouken@689
   992
	                        movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
slouken@689
   993
	                        movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
slouken@689
   994
			  
slouken@689
   995
	                        /* RED */
slouken@689
   996
	                        movq_r2r(mm2, mm5); /* src -> mm5 */
slouken@689
   997
	                        pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */
slouken@689
   998
	                        psrlq_i2r(10, mm5); /* mm5 >> 10 -> mm5 [000r 000r 000r 000r] */
slouken@689
   999
	
slouken@689
  1000
	                        movq_r2r(mm3, mm6); /* dst -> mm6 */
slouken@689
  1001
	                        pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */
slouken@689
  1002
	                        psrlq_i2r(10, mm6); /* mm6 >> 10 -> mm6 [000r 000r 000r 000r] */
slouken@689
  1003
	
slouken@689
  1004
	                        /* blend */
slouken@689
  1005
	                        psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
slouken@689
  1006
	                        pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
slouken@689
  1007
	                        psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
slouken@689
  1008
	                        paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
slouken@689
  1009
	                        psllq_i2r(10, mm6); /* mm6 << 10 -> mm6 */
slouken@689
  1010
	                        pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */
slouken@689
  1011
	
slouken@689
  1012
	                        movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */
slouken@689
  1013
	                        por_r2r(mm7, mm5);  /* MASKBLUE | mm5 -> mm5 */
slouken@689
  1014
	                        pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */
slouken@689
  1015
	                        por_r2r(mm6, mm3); /* save new reds in dsts */
slouken@689
  1016
	
slouken@689
  1017
	                        /* green */
slouken@689
  1018
	                        movq_r2r(mm2, mm5); /* src -> mm5 */
slouken@689
  1019
	                        pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */
slouken@689
  1020
	                        psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */
slouken@689
  1021
	
slouken@689
  1022
	                        movq_r2r(mm3, mm6); /* dst -> mm6 */
slouken@689
  1023
	                        pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */
slouken@689
  1024
	                        psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */
slouken@689
  1025
	
slouken@689
  1026
	                        /* blend */
slouken@689
  1027
	                        psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
slouken@689
  1028
	                        pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
slouken@689
  1029
	                        psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
slouken@689
  1030
	                        paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
slouken@689
  1031
	                        psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */
slouken@689
  1032
	                        pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */
slouken@689
  1033
	
slouken@689
  1034
	                        movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
slouken@689
  1035
	                        por_r2r(mm7, mm5);  /* MASKBLUE | mm5 -> mm5 */
slouken@689
  1036
	                        pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */
slouken@689
  1037
	                        por_r2r(mm6, mm3); /* save new greens in dsts */
slouken@689
  1038
	
slouken@689
  1039
	                        /* blue */
slouken@689
  1040
	                        movq_r2r(mm2, mm5); /* src -> mm5 */
slouken@689
  1041
	                        pand_r2r(mm7 , mm5); /* src & MASKRED -> mm5[000b 000b 000b 000b] */
slouken@689
  1042
		
slouken@689
  1043
	                        movq_r2r(mm3, mm6); /* dst -> mm6 */
slouken@689
  1044
	                        pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
slouken@689
  1045
	
slouken@689
  1046
	                        /* blend */
slouken@689
  1047
	                        psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
slouken@689
  1048
	                        pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
slouken@689
  1049
	                        psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
slouken@689
  1050
	                        paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
slouken@689
  1051
	                        pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */
slouken@689
  1052
	
slouken@689
  1053
	                        movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
slouken@689
  1054
	                        por_r2r(mm4, mm5);  /* MASKGREEN | mm5 -> mm5 */
slouken@689
  1055
	                        pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */
slouken@689
  1056
	                        por_r2r(mm6, mm3); /* save new blues in dsts */
slouken@689
  1057
	
slouken@689
  1058
	                        movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */
slouken@689
  1059
	
slouken@689
  1060
	                        srcp += 4;
slouken@689
  1061
	                        dstp += 4;
slouken@689
  1062
                        }, width);			
slouken@689
  1063
			srcp += srcskip;
slouken@689
  1064
			dstp += dstskip;
slouken@689
  1065
		}
slouken@689
  1066
		emms();
slouken@689
  1067
	}
slouken@689
  1068
}
slouken@689
  1069
#endif
slouken@689
  1070
slouken@1
  1071
/* fast RGB565->RGB565 blending with surface alpha */
slouken@1
  1072
static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
slouken@1
  1073
{
slouken@1
  1074
	unsigned alpha = info->src->alpha;
slouken@1
  1075
	if(alpha == 128) {
slouken@1
  1076
		Blit16to16SurfaceAlpha128(info, 0xf7de);
slouken@1
  1077
	} else {
slouken@1
  1078
		int width = info->d_width;
slouken@1
  1079
		int height = info->d_height;
slouken@1
  1080
		Uint16 *srcp = (Uint16 *)info->s_pixels;
slouken@1
  1081
		int srcskip = info->s_skip >> 1;
slouken@1
  1082
		Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@1
  1083
		int dstskip = info->d_skip >> 1;
slouken@1
  1084
		alpha >>= 3;	/* downscale alpha to 5 bits */
slouken@1
  1085
slouken@1
  1086
		while(height--) {
slouken@1
  1087
			DUFFS_LOOP4({
slouken@1
  1088
				Uint32 s = *srcp++;
slouken@1
  1089
				Uint32 d = *dstp;
slouken@1
  1090
				/*
slouken@1
  1091
				 * shift out the middle component (green) to
slouken@1
  1092
				 * the high 16 bits, and process all three RGB
slouken@1
  1093
				 * components at the same time.
slouken@1
  1094
				 */
slouken@1
  1095
				s = (s | s << 16) & 0x07e0f81f;
slouken@1
  1096
				d = (d | d << 16) & 0x07e0f81f;
slouken@1
  1097
				d += (s - d) * alpha >> 5;
slouken@1
  1098
				d &= 0x07e0f81f;
slouken@1
  1099
				*dstp++ = d | d >> 16;
slouken@1
  1100
			}, width);
slouken@1
  1101
			srcp += srcskip;
slouken@1
  1102
			dstp += dstskip;
slouken@1
  1103
		}
slouken@0
  1104
	}
slouken@0
  1105
}
slouken@0
  1106
slouken@0
  1107
/* fast RGB555->RGB555 blending with surface alpha */
slouken@0
  1108
static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
slouken@0
  1109
{
slouken@1
  1110
	unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
slouken@1
  1111
	if(alpha == 128) {
slouken@1
  1112
		Blit16to16SurfaceAlpha128(info, 0xfbde);
slouken@1
  1113
	} else {
slouken@1
  1114
		int width = info->d_width;
slouken@1
  1115
		int height = info->d_height;
slouken@1
  1116
		Uint16 *srcp = (Uint16 *)info->s_pixels;
slouken@1
  1117
		int srcskip = info->s_skip >> 1;
slouken@1
  1118
		Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@1
  1119
		int dstskip = info->d_skip >> 1;
slouken@1
  1120
		alpha >>= 3;		/* downscale alpha to 5 bits */
slouken@0
  1121
slouken@1
  1122
		while(height--) {
slouken@1
  1123
			DUFFS_LOOP4({
slouken@1
  1124
				Uint32 s = *srcp++;
slouken@1
  1125
				Uint32 d = *dstp;
slouken@1
  1126
				/*
slouken@1
  1127
				 * shift out the middle component (green) to
slouken@1
  1128
				 * the high 16 bits, and process all three RGB
slouken@1
  1129
				 * components at the same time.
slouken@1
  1130
				 */
slouken@1
  1131
				s = (s | s << 16) & 0x03e07c1f;
slouken@1
  1132
				d = (d | d << 16) & 0x03e07c1f;
slouken@1
  1133
				d += (s - d) * alpha >> 5;
slouken@1
  1134
				d &= 0x03e07c1f;
slouken@1
  1135
				*dstp++ = d | d >> 16;
slouken@1
  1136
			}, width);
slouken@1
  1137
			srcp += srcskip;
slouken@1
  1138
			dstp += dstskip;
slouken@1
  1139
		}
slouken@0
  1140
	}
slouken@0
  1141
}
slouken@0
  1142
slouken@0
  1143
/* fast ARGB8888->RGB565 blending with pixel alpha */
slouken@0
  1144
static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
slouken@0
  1145
{
slouken@0
  1146
	int width = info->d_width;
slouken@0
  1147
	int height = info->d_height;
slouken@0
  1148
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@0
  1149
	int srcskip = info->s_skip >> 2;
slouken@0
  1150
	Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@0
  1151
	int dstskip = info->d_skip >> 1;
slouken@0
  1152
slouken@0
  1153
	while(height--) {
slouken@0
  1154
	    DUFFS_LOOP4({
slouken@0
  1155
		Uint32 s = *srcp;
slouken@0
  1156
		unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  1157
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  1158
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  1159
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  1160
		   Benchmark this! */
slouken@689
  1161
		if(alpha) {   
slouken@689
  1162
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@0
  1163
		    *dstp = (s >> 8 & 0xf800) + (s >> 5 & 0x7e0)
slouken@0
  1164
			  + (s >> 3  & 0x1f);
slouken@689
  1165
		  } else {
slouken@0
  1166
		    Uint32 d = *dstp;
slouken@0
  1167
		    /*
slouken@0
  1168
		     * convert source and destination to G0RAB65565
slouken@0
  1169
		     * and blend all components at the same time
slouken@0
  1170
		     */
slouken@0
  1171
		    s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
slouken@0
  1172
		      + (s >> 3 & 0x1f);
slouken@0
  1173
		    d = (d | d << 16) & 0x07e0f81f;
slouken@0
  1174
		    d += (s - d) * alpha >> 5;
slouken@0
  1175
		    d &= 0x07e0f81f;
slouken@0
  1176
		    *dstp = d | d >> 16;
slouken@689
  1177
		  }
slouken@0
  1178
		}
slouken@0
  1179
		srcp++;
slouken@0
  1180
		dstp++;
slouken@0
  1181
	    }, width);
slouken@0
  1182
	    srcp += srcskip;
slouken@0
  1183
	    dstp += dstskip;
slouken@0
  1184
	}
slouken@0
  1185
}
slouken@0
  1186
slouken@0
  1187
/* fast ARGB8888->RGB555 blending with pixel alpha */
slouken@0
  1188
static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
slouken@0
  1189
{
slouken@0
  1190
	int width = info->d_width;
slouken@0
  1191
	int height = info->d_height;
slouken@0
  1192
	Uint32 *srcp = (Uint32 *)info->s_pixels;
slouken@0
  1193
	int srcskip = info->s_skip >> 2;
slouken@0
  1194
	Uint16 *dstp = (Uint16 *)info->d_pixels;
slouken@0
  1195
	int dstskip = info->d_skip >> 1;
slouken@0
  1196
slouken@0
  1197
	while(height--) {
slouken@0
  1198
	    DUFFS_LOOP4({
slouken@0
  1199
		unsigned alpha;
slouken@0
  1200
		Uint32 s = *srcp;
slouken@0
  1201
		alpha = s >> 27; /* downscale alpha to 5 bits */
slouken@0
  1202
		/* FIXME: Here we special-case opaque alpha since the
slouken@0
  1203
		   compositioning used (>>8 instead of /255) doesn't handle
slouken@0
  1204
		   it correctly. Also special-case alpha=0 for speed?
slouken@0
  1205
		   Benchmark this! */
slouken@689
  1206
		if(alpha) {   
slouken@689
  1207
		  if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
slouken@0
  1208
		    *dstp = (s >> 9 & 0x7c00) + (s >> 6 & 0x3e0)
slouken@0
  1209
			  + (s >> 3  & 0x1f);
slouken@689
  1210
		  } else {
slouken@0
  1211
		    Uint32 d = *dstp;
slouken@0
  1212
		    /*
slouken@0
  1213
		     * convert source and destination to G0RAB65565
slouken@0
  1214
		     * and blend all components at the same time
slouken@0
  1215
		     */
slouken@0
  1216
		    s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
slouken@0
  1217
		      + (s >> 3 & 0x1f);
slouken@0
  1218
		    d = (d | d << 16) & 0x03e07c1f;
slouken@0
  1219
		    d += (s - d) * alpha >> 5;
slouken@0
  1220
		    d &= 0x03e07c1f;
slouken@0
  1221
		    *dstp = d | d >> 16;
slouken@689
  1222
		  }
slouken@0
  1223
		}
slouken@0
  1224
		srcp++;
slouken@0
  1225
		dstp++;
slouken@0
  1226
	    }, width);
slouken@0
  1227
	    srcp += srcskip;
slouken@0
  1228
	    dstp += dstskip;
slouken@0
  1229
	}
slouken@0
  1230
}
slouken@0
  1231
slouken@0
  1232
/* General (slow) N->N blending with per-surface alpha */
slouken@0
  1233
static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
slouken@0
  1234
{
slouken@0
  1235
	int width = info->d_width;
slouken@0
  1236
	int height = info->d_height;
slouken@0
  1237
	Uint8 *src = info->s_pixels;
slouken@0
  1238
	int srcskip = info->s_skip;
slouken@0
  1239
	Uint8 *dst = info->d_pixels;
slouken@0
  1240
	int dstskip = info->d_skip;
slouken@0
  1241
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
  1242
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
  1243
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
  1244
	int dstbpp = dstfmt->BytesPerPixel;
slouken@0
  1245
	unsigned sA = srcfmt->alpha;
slouken@0
  1246
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
slouken@0
  1247
slouken@689
  1248
	if(sA) {
slouken@689
  1249
	  while ( height-- ) {
slouken@0
  1250
	    DUFFS_LOOP4(
slouken@0
  1251
	    {
slouken@0
  1252
		Uint32 pixel;
slouken@0
  1253
		unsigned sR;
slouken@0
  1254
		unsigned sG;
slouken@0
  1255
		unsigned sB;
slouken@0
  1256
		unsigned dR;
slouken@0
  1257
		unsigned dG;
slouken@0
  1258
		unsigned dB;
slouken@0
  1259
		DISEMBLE_RGB(src, srcbpp, srcfmt, pixel, sR, sG, sB);
slouken@0
  1260
		DISEMBLE_RGB(dst, dstbpp, dstfmt, pixel, dR, dG, dB);
slouken@0
  1261
		ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
  1262
		ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  1263
		src += srcbpp;
slouken@0
  1264
		dst += dstbpp;
slouken@0
  1265
	    },
slouken@0
  1266
	    width);
slouken@0
  1267
	    src += srcskip;
slouken@0
  1268
	    dst += dstskip;
slouken@689
  1269
	  }
slouken@0
  1270
	}
slouken@0
  1271
}
slouken@0
  1272
slouken@0
  1273
/* General (slow) colorkeyed N->N blending with per-surface alpha */
slouken@0
  1274
static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
slouken@0
  1275
{
slouken@0
  1276
	int width = info->d_width;
slouken@0
  1277
	int height = info->d_height;
slouken@0
  1278
	Uint8 *src = info->s_pixels;
slouken@0
  1279
	int srcskip = info->s_skip;
slouken@0
  1280
	Uint8 *dst = info->d_pixels;
slouken@0
  1281
	int dstskip = info->d_skip;
slouken@0
  1282
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
  1283
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
  1284
	Uint32 ckey = srcfmt->colorkey;
slouken@0
  1285
	int srcbpp = srcfmt->BytesPerPixel;
slouken@0
  1286
	int dstbpp = dstfmt->BytesPerPixel;
slouken@0
  1287
	unsigned sA = srcfmt->alpha;
slouken@0
  1288
	unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
slouken@0
  1289
slouken@0
  1290
	while ( height-- ) {
slouken@0
  1291
	    DUFFS_LOOP4(
slouken@0
  1292
	    {
slouken@0
  1293
		Uint32 pixel;
slouken@0
  1294
		unsigned sR;
slouken@0
  1295
		unsigned sG;
slouken@0
  1296
		unsigned sB;
slouken@0
  1297
		unsigned dR;
slouken@0
  1298
		unsigned dG;
slouken@0
  1299
		unsigned dB;
slouken@0
  1300
		RETRIEVE_RGB_PIXEL(src, srcbpp, pixel);
slouken@689
  1301
		if(sA && pixel != ckey) {
slouken@0
  1302
		    RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB);
slouken@0
  1303
		    DISEMBLE_RGB(dst, dstbpp, dstfmt, pixel, dR, dG, dB);
slouken@0
  1304
		    ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@0
  1305
		    ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@0
  1306
		}
slouken@0
  1307
		src += srcbpp;
slouken@0
  1308
		dst += dstbpp;
slouken@0
  1309
	    },
slouken@0
  1310
	    width);
slouken@0
  1311
	    src += srcskip;
slouken@0
  1312
	    dst += dstskip;
slouken@0
  1313
	}
slouken@0
  1314
}
slouken@0
  1315
slouken@0
  1316
/* General (slow) N->N blending with pixel alpha */
slouken@0
  1317
static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
slouken@0
  1318
{
slouken@0
  1319
	int width = info->d_width;
slouken@0
  1320
	int height = info->d_height;
slouken@0
  1321
	Uint8 *src = info->s_pixels;
slouken@0
  1322
	int srcskip = info->s_skip;
slouken@0
  1323
	Uint8 *dst = info->d_pixels;
slouken@0
  1324
	int dstskip = info->d_skip;
slouken@0
  1325
	SDL_PixelFormat *srcfmt = info->src;
slouken@0
  1326
	SDL_PixelFormat *dstfmt = info->dst;
slouken@0
  1327
slouken@0
  1328
	int  srcbpp;
slouken@0
  1329
	int  dstbpp;
slouken@0
  1330
slouken@0
  1331
	/* Set up some basic variables */
slouken@0
  1332
	srcbpp = srcfmt->BytesPerPixel;
slouken@0
  1333
	dstbpp = dstfmt->BytesPerPixel;
slouken@0
  1334
slouken@0
  1335
	/* FIXME: for 8bpp source alpha, this doesn't get opaque values
slouken@0
  1336
	   quite right. for <8bpp source alpha, it gets them very wrong
slouken@0
  1337
	   (check all macros!)
slouken@0
  1338
	   It is unclear whether there is a good general solution that doesn't
slouken@0
  1339
	   need a branch (or a divide). */
slouken@0
  1340
	while ( height-- ) {
slouken@0
  1341
	    DUFFS_LOOP4(
slouken@0
  1342
	    {
slouken@0
  1343
		Uint32 pixel;
slouken@0
  1344
		unsigned sR;
slouken@0
  1345
		unsigned sG;
slouken@0
  1346
		unsigned sB;
slouken@0
  1347
		unsigned dR;
slouken@0
  1348
		unsigned dG;
slouken@0
  1349
		unsigned dB;
slouken@0
  1350
		unsigned sA;
slouken@0
  1351
		unsigned dA;
slouken@0
  1352
		DISEMBLE_RGBA(src, srcbpp, srcfmt, pixel, sR, sG, sB, sA);
slouken@689
  1353
		if(sA) {
slouken@689
  1354
		  DISEMBLE_RGBA(dst, dstbpp, dstfmt, pixel, dR, dG, dB, dA);
slouken@689
  1355
		  ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
slouken@689
  1356
		  ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
slouken@689
  1357
		}
slouken@0
  1358
		src += srcbpp;
slouken@0
  1359
		dst += dstbpp;
slouken@0
  1360
	    },
slouken@0
  1361
	    width);
slouken@0
  1362
	    src += srcskip;
slouken@0
  1363
	    dst += dstskip;
slouken@0
  1364
	}
slouken@0
  1365
}
slouken@0
  1366
slouken@0
  1367
slouken@0
  1368
/*
 * Choose the alpha blitter for blitting `surface` onto its mapped
 * destination (surface->map->dst).
 *
 * The choice depends on whether the source format carries an alpha
 * mask (per-pixel alpha) or not (per-surface alpha, optionally
 * colorkeyed), on the destination bytes per pixel, and on the channel
 * masks of both formats.  The MMX / 3DNow! variants are only considered
 * when built for i386 with GCC and USE_ASMBLIT, and only when
 * CPU_Flags() reports the matching capability.  Any combination with
 * no specialised routine falls back to the generic N->N blitters.
 *
 * blit_index is not referenced by this function.
 */
SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index)
{
    SDL_PixelFormat *sf = surface->format;
    SDL_PixelFormat *df = surface->map->dst->format;

    if(sf->Amask != 0) {
	/* Per-pixel alpha blits */
	switch(df->BytesPerPixel) {
	case 1:
	    return BlitNto1PixelAlpha;

	case 2:
	    /* 32-bit source with alpha in the top byte going to a
	       16-bit destination with a matching channel order */
	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
	       && sf->Gmask == 0xff00
	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
		if(df->Gmask == 0x7e0)
		    return BlitARGBto565PixelAlpha;
		if(df->Gmask == 0x3e0)
		    return BlitARGBto555PixelAlpha;
	    }
	    return BlitNtoNPixelAlpha;

	case 4:
	    /* Same 32-bit colour layout on both sides, alpha in the
	       top byte of the source */
	    if(sf->Amask == 0xff000000
	       && sf->Rmask == df->Rmask
	       && sf->Gmask == df->Gmask
	       && sf->Bmask == df->Bmask
	       && sf->BytesPerPixel == 4) {
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
		Uint32 f = CPU_Flags();

		if((f&(TDNOW_CPU|MMX_CPU))==(TDNOW_CPU|MMX_CPU))
		    return BlitRGBtoRGBPixelAlphaMMX3DNOW;
		if((f&MMX_CPU)!=0)
		    return BlitRGBtoRGBPixelAlphaMMX;
#endif
		return BlitRGBtoRGBPixelAlpha;
	    }
	    return BlitNtoNPixelAlpha;

	case 3:
	default:
	    return BlitNtoNPixelAlpha;
	}
    }

    /* No alpha mask in the source format: per-surface alpha */
    if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
	if(df->BytesPerPixel == 1)
	    return BlitNto1SurfaceAlphaKey;
	return BlitNtoNSurfaceAlphaKey;
    }

    /* Per-surface alpha blits without colorkey */
    switch(df->BytesPerPixel) {
    case 1:
	return BlitNto1SurfaceAlpha;

    case 2:
	/* The 16-bit fast paths need an identity mapping */
	if(surface->map->identity) {
	    if(df->Gmask == 0x7e0) {
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
		if((CPU_Flags()&MMX_CPU)!=0)
		    return Blit565to565SurfaceAlphaMMX;
#endif
		return Blit565to565SurfaceAlpha;
	    }
	    if(df->Gmask == 0x3e0) {
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
		if((CPU_Flags()&MMX_CPU)!=0)
		    return Blit555to555SurfaceAlphaMMX;
#endif
		return Blit555to555SurfaceAlpha;
	    }
	}
	return BlitNtoNSurfaceAlpha;

    case 4:
	/* Same colour masks on both sides, covering the low 24 bits,
	   with a 4-byte source pixel */
	if(sf->Rmask == df->Rmask
	   && sf->Gmask == df->Gmask
	   && sf->Bmask == df->Bmask
	   && (sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff
	   && sf->BytesPerPixel == 4) {
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
	    if((CPU_Flags()&MMX_CPU)!=0)
		return BlitRGBtoRGBSurfaceAlphaMMX;
#endif
	    return BlitRGBtoRGBSurfaceAlpha;
	}
	return BlitNtoNSurfaceAlpha;

    case 3:
    default:
	return BlitNtoNSurfaceAlpha;
    }
}
slouken@0
  1475