Removed Rafal Bursig's MMX RLE code, at his request.
author Sam Lantinga <slouken@libsdl.org>
Tue, 13 Jan 2009 07:20:55 +0000
changeset 3035 ff602fdfdedc
parent 3034 0e821769fc51
child 3036 76a1692fcec5
Removed Rafal Bursig's MMX RLE code, at his request.
src/video/SDL_RLEaccel.c
src/video/SDL_blit.h
src/video/SDL_blit_A.c
     1.1 --- a/src/video/SDL_RLEaccel.c	Tue Jan 13 03:53:22 2009 +0000
     1.2 +++ b/src/video/SDL_RLEaccel.c	Tue Jan 13 07:20:55 2009 +0000
     1.3 @@ -91,15 +91,6 @@
     1.4  #include "SDL_blit.h"
     1.5  #include "SDL_RLEaccel_c.h"
     1.6  
     1.7 -#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
     1.8 -#define MMX_ASMBLIT
     1.9 -#endif
    1.10 -
    1.11 -#ifdef MMX_ASMBLIT
    1.12 -#include "mmx.h"
    1.13 -#include "SDL_cpuinfo.h"
    1.14 -#endif
    1.15 -
    1.16  #ifndef MAX
    1.17  #define MAX(a, b) ((a) > (b) ? (a) : (b))
    1.18  #endif
    1.19 @@ -123,262 +114,6 @@
    1.20  #define OPAQUE_BLIT(to, from, length, bpp, alpha)	\
    1.21      PIXEL_COPY(to, from, length, bpp)
    1.22  
    1.23 -#ifdef MMX_ASMBLIT
    1.24 -
    1.25 -#define ALPHA_BLIT32_888MMX(to, from, length, bpp, alpha)	\
    1.26 -    do {							\
    1.27 -	Uint32 *srcp = (Uint32 *)(from);			\
    1.28 -	Uint32 *dstp = (Uint32 *)(to);				\
    1.29 -        int i = 0x00FF00FF;					\
    1.30 -        movd_m2r(*(&i), mm3);					\
    1.31 -        punpckldq_r2r(mm3, mm3);				\
    1.32 -        i = 0xFF000000;						\
    1.33 -        movd_m2r(*(&i), mm7);					\
    1.34 -        punpckldq_r2r(mm7, mm7);				\
    1.35 -        i = alpha | alpha << 16;				\
    1.36 -        movd_m2r(*(&i), mm4);					\
    1.37 -        punpckldq_r2r(mm4, mm4);				\
    1.38 -	pcmpeqd_r2r(mm5,mm5); /* set mm5 to "1" */		\
    1.39 -	pxor_r2r(mm7, mm5); /* make clear alpha mask */		\
    1.40 -        i = length;						\
    1.41 -	if(i & 1) {						\
    1.42 -          movd_m2r((*srcp), mm1); /* src -> mm1 */		\
    1.43 -          punpcklbw_r2r(mm1, mm1);				\
    1.44 -          pand_r2r(mm3, mm1);					\
    1.45 -	  movd_m2r((*dstp), mm2); /* dst -> mm2 */		\
    1.46 -          punpcklbw_r2r(mm2, mm2);				\
    1.47 -          pand_r2r(mm3, mm2);					\
    1.48 -	  psubw_r2r(mm2, mm1);					\
    1.49 -	  pmullw_r2r(mm4, mm1);					\
    1.50 -	  psrlw_i2r(8, mm1);					\
    1.51 -	  paddw_r2r(mm1, mm2);					\
    1.52 -	  pand_r2r(mm3, mm2);					\
    1.53 -	  packuswb_r2r(mm2, mm2);				\
    1.54 -	  pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */		\
    1.55 -	  movd_r2m(mm2, *dstp);					\
    1.56 -	  ++srcp;						\
    1.57 -	  ++dstp;						\
    1.58 -	  i--;							\
    1.59 -	}							\
    1.60 -	for(; i > 0; --i) {					\
    1.61 -          movq_m2r((*srcp), mm0);				\
    1.62 -	  movq_r2r(mm0, mm1);					\
    1.63 -          punpcklbw_r2r(mm0, mm0);				\
    1.64 -	  movq_m2r((*dstp), mm2);				\
    1.65 -	  punpckhbw_r2r(mm1, mm1);				\
    1.66 -	  movq_r2r(mm2, mm6);					\
    1.67 -          pand_r2r(mm3, mm0);					\
    1.68 -          punpcklbw_r2r(mm2, mm2);				\
    1.69 -	  pand_r2r(mm3, mm1);					\
    1.70 -	  punpckhbw_r2r(mm6, mm6);				\
    1.71 -          pand_r2r(mm3, mm2);					\
    1.72 -	  psubw_r2r(mm2, mm0);					\
    1.73 -	  pmullw_r2r(mm4, mm0);					\
    1.74 -	  pand_r2r(mm3, mm6);					\
    1.75 -	  psubw_r2r(mm6, mm1);					\
    1.76 -	  pmullw_r2r(mm4, mm1);					\
    1.77 -	  psrlw_i2r(8, mm0);					\
    1.78 -	  paddw_r2r(mm0, mm2);					\
    1.79 -	  psrlw_i2r(8, mm1);					\
    1.80 -	  paddw_r2r(mm1, mm6);					\
    1.81 -	  pand_r2r(mm3, mm2);					\
    1.82 -	  pand_r2r(mm3, mm6);					\
    1.83 -	  packuswb_r2r(mm2, mm2);				\
    1.84 -	  packuswb_r2r(mm6, mm6);				\
    1.85 -	  psrlq_i2r(32, mm2);					\
    1.86 -	  psllq_i2r(32, mm6);					\
    1.87 -	  por_r2r(mm6, mm2);					\
    1.88 -	  pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */		\
    1.89 -         movq_r2m(mm2, *dstp);					\
    1.90 -	  srcp += 2;						\
    1.91 -	  dstp += 2;						\
    1.92 -	  i--;							\
    1.93 -	}							\
    1.94 -	emms();							\
    1.95 -    } while(0)
    1.96 -
    1.97 -#define ALPHA_BLIT16_565MMX(to, from, length, bpp, alpha)	\
    1.98 -    do {						\
    1.99 -        int i, n = 0;					\
   1.100 -	Uint16 *srcp = (Uint16 *)(from);		\
   1.101 -	Uint16 *dstp = (Uint16 *)(to);			\
   1.102 -        Uint32 ALPHA = 0xF800;				\
   1.103 -	movd_m2r(*(&ALPHA), mm1);			\
   1.104 -        punpcklwd_r2r(mm1, mm1);			\
   1.105 -        punpcklwd_r2r(mm1, mm1);			\
   1.106 -	ALPHA = 0x07E0;					\
   1.107 -	movd_m2r(*(&ALPHA), mm4);			\
   1.108 -        punpcklwd_r2r(mm4, mm4);			\
   1.109 -        punpcklwd_r2r(mm4, mm4);			\
   1.110 -	ALPHA = 0x001F;					\
   1.111 -	movd_m2r(*(&ALPHA), mm7);			\
   1.112 -        punpcklwd_r2r(mm7, mm7);			\
   1.113 -        punpcklwd_r2r(mm7, mm7);			\
   1.114 -	alpha &= ~(1+2+4);				\
   1.115 -        i = (Uint32)alpha | (Uint32)alpha << 16;	\
   1.116 -        movd_m2r(*(&i), mm0);				\
   1.117 -        punpckldq_r2r(mm0, mm0);			\
   1.118 -        ALPHA = alpha >> 3;				\
   1.119 -        i = ((int)(length) & 3);			\
   1.120 -	for(; i > 0; --i) {				\
   1.121 -	    Uint32 s = *srcp++;				\
   1.122 -	    Uint32 d = *dstp;				\
   1.123 -	    s = (s | s << 16) & 0x07e0f81f;		\
   1.124 -	    d = (d | d << 16) & 0x07e0f81f;		\
   1.125 -	    d += (s - d) * ALPHA >> 5;			\
   1.126 -	    d &= 0x07e0f81f;				\
   1.127 -	    *dstp++ = d | d >> 16;			\
   1.128 -	    n++;					\
   1.129 -	}						\
   1.130 -	i = (int)(length) - n;				\
   1.131 -	for(; i > 0; --i) {				\
   1.132 -	  movq_m2r((*dstp), mm3);			\
   1.133 -	  movq_m2r((*srcp), mm2);			\
   1.134 -	  movq_r2r(mm2, mm5);				\
   1.135 -	  pand_r2r(mm1 , mm5);				\
   1.136 -	  psrlq_i2r(11, mm5);				\
   1.137 -	  movq_r2r(mm3, mm6);				\
   1.138 -	  pand_r2r(mm1 , mm6);				\
   1.139 -	  psrlq_i2r(11, mm6);				\
   1.140 -	  psubw_r2r(mm6, mm5);				\
   1.141 -	  pmullw_r2r(mm0, mm5);				\
   1.142 -	  psrlw_i2r(8, mm5);				\
   1.143 -	  paddw_r2r(mm5, mm6);				\
   1.144 -	  psllq_i2r(11, mm6);				\
   1.145 -	  pand_r2r(mm1, mm6);				\
   1.146 -	  movq_r2r(mm4, mm5);				\
   1.147 -	  por_r2r(mm7, mm5);				\
   1.148 -	  pand_r2r(mm5, mm3);				\
   1.149 -	  por_r2r(mm6, mm3);				\
   1.150 -	  movq_r2r(mm2, mm5);				\
   1.151 -	  pand_r2r(mm4 , mm5);				\
   1.152 -	  psrlq_i2r(5, mm5);				\
   1.153 -	  movq_r2r(mm3, mm6);				\
   1.154 -	  pand_r2r(mm4 , mm6);				\
   1.155 -	  psrlq_i2r(5, mm6);				\
   1.156 -	  psubw_r2r(mm6, mm5);				\
   1.157 -	  pmullw_r2r(mm0, mm5);				\
   1.158 -	  psrlw_i2r(8, mm5);				\
   1.159 -	  paddw_r2r(mm5, mm6);				\
   1.160 -	  psllq_i2r(5, mm6);				\
   1.161 -	  pand_r2r(mm4, mm6);				\
   1.162 -	  movq_r2r(mm1, mm5);				\
   1.163 -	  por_r2r(mm7, mm5);				\
   1.164 -	  pand_r2r(mm5, mm3);				\
   1.165 -	  por_r2r(mm6, mm3);				\
   1.166 -	  movq_r2r(mm2, mm5);				\
   1.167 -	  pand_r2r(mm7 , mm5);				\
   1.168 -          movq_r2r(mm3, mm6);				\
   1.169 -	  pand_r2r(mm7 , mm6);				\
   1.170 -	  psubw_r2r(mm6, mm5);				\
   1.171 -	  pmullw_r2r(mm0, mm5);				\
   1.172 -	  psrlw_i2r(8, mm5);				\
   1.173 -	  paddw_r2r(mm5, mm6);				\
   1.174 -	  pand_r2r(mm7, mm6);				\
   1.175 -	  movq_r2r(mm1, mm5);				\
   1.176 -	  por_r2r(mm4, mm5);				\
   1.177 -	  pand_r2r(mm5, mm3);				\
   1.178 -	  por_r2r(mm6, mm3);				\
   1.179 -	  movq_r2m(mm3, *dstp);				\
   1.180 -	  srcp += 4;					\
   1.181 -	  dstp += 4;					\
   1.182 -	  i -= 3;					\
   1.183 -	}						\
   1.184 -	emms();						\
   1.185 -    } while(0)
   1.186 -
   1.187 -#define ALPHA_BLIT16_555MMX(to, from, length, bpp, alpha)	\
   1.188 -    do {						\
   1.189 -        int i, n = 0;					\
   1.190 -	Uint16 *srcp = (Uint16 *)(from);		\
   1.191 -	Uint16 *dstp = (Uint16 *)(to);			\
   1.192 -        Uint32 ALPHA = 0x7C00;				\
   1.193 -	movd_m2r(*(&ALPHA), mm1);			\
   1.194 -        punpcklwd_r2r(mm1, mm1);			\
   1.195 -        punpcklwd_r2r(mm1, mm1);			\
   1.196 -	ALPHA = 0x03E0;					\
   1.197 -        movd_m2r(*(&ALPHA), mm4);			\
   1.198 -        punpcklwd_r2r(mm4, mm4);			\
   1.199 -        punpcklwd_r2r(mm4, mm4);			\
   1.200 -	ALPHA = 0x001F;					\
   1.201 -	movd_m2r(*(&ALPHA), mm7);			\
   1.202 -        punpcklwd_r2r(mm7, mm7);			\
   1.203 -        punpcklwd_r2r(mm7, mm7);			\
   1.204 -	alpha &= ~(1+2+4);				\
   1.205 -        i = (Uint32)alpha | (Uint32)alpha << 16;	\
   1.206 -        movd_m2r(*(&i), mm0);				\
   1.207 -        punpckldq_r2r(mm0, mm0);			\
   1.208 -        i = ((int)(length) & 3);				\
   1.209 -        ALPHA = alpha >> 3;				\
   1.210 -	for(; i > 0; --i) {				\
   1.211 -	    Uint32 s = *srcp++;				\
   1.212 -	    Uint32 d = *dstp;				\
   1.213 -	    s = (s | s << 16) & 0x03e07c1f;		\
   1.214 -	    d = (d | d << 16) & 0x03e07c1f;		\
   1.215 -	    d += (s - d) * ALPHA >> 5;			\
   1.216 -	    d &= 0x03e07c1f;				\
   1.217 -	    *dstp++ = d | d >> 16;			\
   1.218 -	    n++;					\
   1.219 -	}						\
   1.220 -	i = (int)(length) - n;				\
   1.221 -	for(; i > 0; --i) {				\
   1.222 -	  movq_m2r((*dstp), mm3);			\
   1.223 -	  movq_m2r((*srcp), mm2);			\
   1.224 -	  movq_r2r(mm2, mm5);				\
   1.225 -	  pand_r2r(mm1 , mm5);				\
   1.226 -	  psrlq_i2r(10, mm5);				\
   1.227 -	  movq_r2r(mm3, mm6);				\
   1.228 -	  pand_r2r(mm1 , mm6);				\
   1.229 -	  psrlq_i2r(10, mm6);				\
   1.230 -	  psubw_r2r(mm6, mm5);				\
   1.231 -	  pmullw_r2r(mm0, mm5);				\
   1.232 -	  psrlw_i2r(8, mm5);				\
   1.233 -	  paddw_r2r(mm5, mm6);				\
   1.234 -	  psllq_i2r(10, mm6);				\
   1.235 -	  pand_r2r(mm1, mm6);				\
   1.236 -	  movq_r2r(mm4, mm5);				\
   1.237 -	  por_r2r(mm7, mm5);				\
   1.238 -	  pand_r2r(mm5, mm3);				\
   1.239 -	  por_r2r(mm6, mm3);				\
   1.240 -	  movq_r2r(mm2, mm5);				\
   1.241 -	  pand_r2r(mm4 , mm5);				\
   1.242 -	  psrlq_i2r(5, mm5);				\
   1.243 -	  movq_r2r(mm3, mm6);				\
   1.244 -	  pand_r2r(mm4 , mm6);				\
   1.245 -	  psrlq_i2r(5, mm6);				\
   1.246 -	  psubw_r2r(mm6, mm5);				\
   1.247 -	  pmullw_r2r(mm0, mm5);				\
   1.248 -	  psrlw_i2r(8, mm5);				\
   1.249 -	  paddw_r2r(mm5, mm6);				\
   1.250 -	  psllq_i2r(5, mm6);				\
   1.251 -	  pand_r2r(mm4, mm6);				\
   1.252 -	  movq_r2r(mm1, mm5);				\
   1.253 -	  por_r2r(mm7, mm5);				\
   1.254 -	  pand_r2r(mm5, mm3);				\
   1.255 -	  por_r2r(mm6, mm3);				\
   1.256 -	  movq_r2r(mm2, mm5);				\
   1.257 -	  pand_r2r(mm7 , mm5);				\
   1.258 -          movq_r2r(mm3, mm6);				\
   1.259 -	  pand_r2r(mm7 , mm6);				\
   1.260 -	  psubw_r2r(mm6, mm5);				\
   1.261 -	  pmullw_r2r(mm0, mm5);				\
   1.262 -	  psrlw_i2r(8, mm5);				\
   1.263 -	  paddw_r2r(mm5, mm6);				\
   1.264 -	  pand_r2r(mm7, mm6);				\
   1.265 -	  movq_r2r(mm1, mm5);				\
   1.266 -	  por_r2r(mm4, mm5);				\
   1.267 -	  pand_r2r(mm5, mm3);				\
   1.268 -	  por_r2r(mm6, mm3);				\
   1.269 -	  movq_r2m(mm3, *dstp);				\
   1.270 -	  srcp += 4;					\
   1.271 -	  dstp += 4;					\
   1.272 -	  i -= 3;					\
   1.273 -	}						\
   1.274 -	emms();						\
   1.275 -    } while(0)
   1.276 -
   1.277 -#endif
   1.278 -
   1.279  /*
   1.280   * For 32bpp pixels on the form 0x00rrggbb:
   1.281   * If we treat the middle component separately, we can process the two
   1.282 @@ -504,48 +239,6 @@
   1.283  	}								\
   1.284      } while(0)
   1.285  
   1.286 -#ifdef MMX_ASMBLIT
   1.287 -
   1.288 -#define ALPHA_BLIT32_888_50MMX(to, from, length, bpp, alpha)		\
   1.289 -    do {								\
   1.290 -	Uint32 *srcp = (Uint32 *)(from);				\
   1.291 -	Uint32 *dstp = (Uint32 *)(to);					\
   1.292 -        int i = 0x00fefefe;						\
   1.293 -        movd_m2r(*(&i), mm4);						\
   1.294 -        punpckldq_r2r(mm4, mm4);					\
   1.295 -        i = 0x00010101;							\
   1.296 -        movd_m2r(*(&i), mm3);						\
   1.297 -        punpckldq_r2r(mm3, mm3);					\
   1.298 -        i = (int)(length);						\
   1.299 -        if( i & 1 ) {							\
   1.300 -	  Uint32 s = *srcp++;						\
   1.301 -	  Uint32 d = *dstp;						\
   1.302 -	  *dstp++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)	\
   1.303 -		     + (s & d & 0x00010101);				\
   1.304 -	  i--;								\
   1.305 -	}								\
   1.306 -	for(; i > 0; --i) {						\
   1.307 -	    movq_m2r((*dstp), mm2); /* dst -> mm2 */			\
   1.308 -	    movq_r2r(mm2, mm6);	/* dst -> mm6 */			\
   1.309 -	    movq_m2r((*srcp), mm1); /* src -> mm1 */			\
   1.310 -	    movq_r2r(mm1, mm5);	/* src -> mm5 */			\
   1.311 -	    pand_r2r(mm4, mm6);	/* dst & 0x00fefefe -> mm6 */		\
   1.312 -	    pand_r2r(mm4, mm5); /* src & 0x00fefefe -> mm5 */		\
   1.313 -	    paddd_r2r(mm6, mm5); /* (dst & 0x00fefefe) + (dst & 0x00fefefe) -> mm5 */	\
   1.314 -	    psrld_i2r(1, mm5);						\
   1.315 -	    pand_r2r(mm1, mm2);	/* s & d -> mm2 */			\
   1.316 -	    pand_r2r(mm3, mm2);	/* s & d & 0x00010101 -> mm2 */		\
   1.317 -	    paddd_r2r(mm5, mm2);					\
   1.318 -	    movq_r2m(mm2, (*dstp));					\
   1.319 -	    dstp += 2;							\
   1.320 -	    srcp += 2;							\
   1.321 -	    i--;							\
   1.322 -	}								\
   1.323 -	emms();								\
   1.324 -    } while(0)
   1.325 -
   1.326 -#endif
   1.327 -
   1.328  /*
   1.329   * Special case: 50% alpha (alpha=128)
   1.330   * This is treated specially because it can be optimized very well, and
   1.331 @@ -617,94 +310,6 @@
   1.332  #define ALPHA_BLIT16_555_50(to, from, length, bpp, alpha)	\
   1.333      ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xfbde)
   1.334  
   1.335 -#ifdef MMX_ASMBLIT
   1.336 -
   1.337 -#define CHOOSE_BLIT(blitter, alpha, fmt)				\
   1.338 -    do {								\
   1.339 -        if(alpha == 255) {						\
   1.340 -	    switch(fmt->BytesPerPixel) {				\
   1.341 -	    case 1: blitter(1, Uint8, OPAQUE_BLIT); break;		\
   1.342 -	    case 2: blitter(2, Uint8, OPAQUE_BLIT); break;		\
   1.343 -	    case 3: blitter(3, Uint8, OPAQUE_BLIT); break;		\
   1.344 -	    case 4: blitter(4, Uint16, OPAQUE_BLIT); break;		\
   1.345 -	    }								\
   1.346 -	} else {							\
   1.347 -	    switch(fmt->BytesPerPixel) {				\
   1.348 -	    case 1:							\
   1.349 -		/* No 8bpp alpha blitting */				\
   1.350 -		break;							\
   1.351 -									\
   1.352 -	    case 2:							\
   1.353 -		switch(fmt->Rmask | fmt->Gmask | fmt->Bmask) {		\
   1.354 -		case 0xffff:						\
   1.355 -		    if(fmt->Gmask == 0x07e0				\
   1.356 -		       || fmt->Rmask == 0x07e0				\
   1.357 -		       || fmt->Bmask == 0x07e0) {			\
   1.358 -			if(alpha == 128)				\
   1.359 -			    blitter(2, Uint8, ALPHA_BLIT16_565_50);	\
   1.360 -			else {						\
   1.361 -			    if(SDL_HasMMX())				\
   1.362 -				blitter(2, Uint8, ALPHA_BLIT16_565MMX);	\
   1.363 -			    else					\
   1.364 -				blitter(2, Uint8, ALPHA_BLIT16_565);	\
   1.365 -			}						\
   1.366 -		    } else						\
   1.367 -			goto general16;					\
   1.368 -		    break;						\
   1.369 -									\
   1.370 -		case 0x7fff:						\
   1.371 -		    if(fmt->Gmask == 0x03e0				\
   1.372 -		       || fmt->Rmask == 0x03e0				\
   1.373 -		       || fmt->Bmask == 0x03e0) {			\
   1.374 -			if(alpha == 128)				\
   1.375 -			    blitter(2, Uint8, ALPHA_BLIT16_555_50);	\
   1.376 -			else {						\
   1.377 -			    if(SDL_HasMMX())				\
   1.378 -				blitter(2, Uint8, ALPHA_BLIT16_555MMX);	\
   1.379 -			    else					\
   1.380 -				blitter(2, Uint8, ALPHA_BLIT16_555);	\
   1.381 -			}						\
   1.382 -			break;						\
   1.383 -		    }							\
   1.384 -		    /* fallthrough */					\
   1.385 -									\
   1.386 -		default:						\
   1.387 -		general16:						\
   1.388 -		    blitter(2, Uint8, ALPHA_BLIT_ANY);			\
   1.389 -		}							\
   1.390 -		break;							\
   1.391 -									\
   1.392 -	    case 3:							\
   1.393 -		blitter(3, Uint8, ALPHA_BLIT_ANY);			\
   1.394 -		break;							\
   1.395 -									\
   1.396 -	    case 4:							\
   1.397 -		if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff	\
   1.398 -		   && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00	\
   1.399 -		       || fmt->Bmask == 0xff00)) {			\
   1.400 -		    if(alpha == 128)					\
   1.401 -		    {							\
   1.402 -			if(SDL_HasMMX())				\
   1.403 -				blitter(4, Uint16, ALPHA_BLIT32_888_50MMX);\
   1.404 -			else						\
   1.405 -				blitter(4, Uint16, ALPHA_BLIT32_888_50);\
   1.406 -		    }							\
   1.407 -		    else						\
   1.408 -		    {							\
   1.409 -			if(SDL_HasMMX())				\
   1.410 -				blitter(4, Uint16, ALPHA_BLIT32_888MMX);\
   1.411 -			else						\
   1.412 -				blitter(4, Uint16, ALPHA_BLIT32_888);	\
   1.413 -		    }							\
   1.414 -		} else							\
   1.415 -		    blitter(4, Uint16, ALPHA_BLIT_ANY);			\
   1.416 -		break;							\
   1.417 -	    }								\
   1.418 -	}								\
   1.419 -    } while(0)
   1.420 -
   1.421 -#else
   1.422 -
   1.423  #define CHOOSE_BLIT(blitter, alpha, fmt)				\
   1.424      do {								\
   1.425          if(alpha == 255) {						\
   1.426 @@ -773,8 +378,6 @@
   1.427  	}								\
   1.428      } while(0)
   1.429  
   1.430 -#endif
   1.431 -
   1.432  /*
   1.433   * This takes care of the case when the surface is clipped on the left and/or
   1.434   * right. Top clipping has already been taken care of.
     2.1 --- a/src/video/SDL_blit.h	Tue Jan 13 03:53:22 2009 +0000
     2.2 +++ b/src/video/SDL_blit.h	Tue Jan 13 07:20:55 2009 +0000
     2.3 @@ -476,48 +476,7 @@
     2.4  	case 3:		pixel_copy_increment;				\
     2.5  	case 2:		pixel_copy_increment;				\
     2.6  	case 1:		pixel_copy_increment;				\
     2.7 -		} while ( --n > 0 );					\
     2.8 -	}								\
     2.9 -}
    2.10 -
    2.11 -/* 2 - times unrolled loop */
    2.12 -#define DUFFS_LOOP_DOUBLE2(pixel_copy_increment,			\
    2.13 -				double_pixel_copy_increment, width)	\
    2.14 -{ int n, w = width;							\
    2.15 -	if( w & 1 ) {							\
    2.16 -	    pixel_copy_increment;					\
    2.17 -	    w--;							\
    2.18 -	}								\
    2.19 -	if ( w > 0 )	{						\
    2.20 -	    n = ( w + 2) / 4;						\
    2.21 -	    switch( w & 2 ) {						\
    2.22 -	    case 0: do {	double_pixel_copy_increment;		\
    2.23 -	    case 2:		double_pixel_copy_increment;		\
    2.24 -		    } while ( --n > 0 );					\
    2.25 -	    }								\
    2.26 -	}								\
    2.27 -}
    2.28 -
    2.29 -/* 2 - times unrolled loop 4 pixels */
    2.30 -#define DUFFS_LOOP_QUATRO2(pixel_copy_increment,			\
    2.31 -				double_pixel_copy_increment,		\
    2.32 -				quatro_pixel_copy_increment, width)	\
    2.33 -{ int n, w = width;								\
    2.34 -        if(w & 1) {							\
    2.35 -	  pixel_copy_increment;						\
    2.36 -	  w--;								\
    2.37 -	}								\
    2.38 -	if(w & 2) {							\
    2.39 -	  double_pixel_copy_increment;					\
    2.40 -	  w -= 2;							\
    2.41 -	}								\
    2.42 -	if ( w > 0 ) {							\
    2.43 -	    n = ( w + 7 ) / 8;						\
    2.44 -	    switch( w & 4 ) {						\
    2.45 -	    case 0: do {	quatro_pixel_copy_increment;		\
    2.46 -	    case 4:		quatro_pixel_copy_increment;		\
    2.47 -		    } while ( --n > 0 );					\
    2.48 -	    }								\
    2.49 +		} while (--n > 0);					\
    2.50  	}								\
    2.51  }
    2.52  
    2.53 @@ -525,40 +484,28 @@
    2.54  #define DUFFS_LOOP(pixel_copy_increment, width)				\
    2.55  	DUFFS_LOOP8(pixel_copy_increment, width)
    2.56  
    2.57 -#else
    2.58 -
    2.59 -/* Don't use Duff's device to unroll loops */
    2.60 -#define DUFFS_LOOP_DOUBLE2(pixel_copy_increment,			\
    2.61 -			 double_pixel_copy_increment, width)		\
    2.62 -{ int n = width;								\
    2.63 -    if( n & 1 ) {							\
    2.64 -	pixel_copy_increment;						\
    2.65 -	n--;								\
    2.66 -    }									\
    2.67 -    n=n>>1;								\
    2.68 -    for(; n > 0; --n) {   						\
    2.69 -	double_pixel_copy_increment;					\
    2.70 -    }									\
    2.71 +/* Duff's device, 4 pixels per step; 1- and 2-pixel remainders are peeled off first */
    2.72 +#define DUFFS_LOOP_124(pixel_copy_increment1,				\
    2.73 +                       pixel_copy_increment2,				\
    2.74 +                       pixel_copy_increment4, width)			\
    2.75 +{ int n, w = width;							\
    2.76 +	if (w & 1) {							\
    2.77 +		pixel_copy_increment1; w -= 1;				\
    2.78 +	}								\
    2.79 +	if (w & 2) {							\
    2.80 +		pixel_copy_increment2; w -= 2;				\
    2.81 +	}								\
    2.82 +	if (w > 0) {	/* w is now a multiple of 4 */			\
    2.83 +		n = (w+7)/ 8;	/* passes of the 2x unrolled loop */	\
    2.84 +		switch (w & 4) { /* test pixels left, not passes */	\
    2.85 +		case 0: do {	pixel_copy_increment4;			\
    2.86 +		case 4:		pixel_copy_increment4;			\
    2.87 +			} while (--n > 0);				\
    2.88 +		}							\
    2.89 +	}								\
    2.90  }
    2.91  
    2.92 -/* Don't use Duff's device to unroll loops */
    2.93 -#define DUFFS_LOOP_QUATRO2(pixel_copy_increment,			\
    2.94 -				double_pixel_copy_increment,		\
    2.95 -				quatro_pixel_copy_increment, width)	\
    2.96 -{ int n = width;								\
    2.97 -        if(n & 1) {							\
    2.98 -	  pixel_copy_increment;						\
    2.99 -	  n--;								\
   2.100 -	}								\
   2.101 -	if(n & 2) {							\
   2.102 -	  double_pixel_copy_increment;					\
   2.103 -	  n -= 2;							\
   2.104 -	}								\
   2.105 -	n=n>>2;								\
   2.106 -	for(; n > 0; --n) {   						\
   2.107 -	  quatro_pixel_copy_increment;					\
   2.108 -        }								\
   2.109 -}
   2.110 +#else
   2.111  
   2.112  /* Don't use Duff's device to unroll loops */
   2.113  #define DUFFS_LOOP(pixel_copy_increment, width)				\
   2.114 @@ -571,6 +518,10 @@
   2.115  	DUFFS_LOOP(pixel_copy_increment, width)
   2.116  #define DUFFS_LOOP4(pixel_copy_increment, width)			\
   2.117  	DUFFS_LOOP(pixel_copy_increment, width)
   2.118 +#define DUFFS_LOOP_124(pixel_copy_increment1,				\
   2.119 +                       pixel_copy_increment2,				\
   2.120 +                       pixel_copy_increment4, width)			\
   2.121 +	DUFFS_LOOP(pixel_copy_increment1, width)
   2.122  
   2.123  #endif /* USE_DUFFS_LOOP */
   2.124  
     3.1 --- a/src/video/SDL_blit_A.c	Tue Jan 13 03:53:22 2009 +0000
     3.2 +++ b/src/video/SDL_blit_A.c	Tue Jan 13 07:20:55 2009 +0000
     3.3 @@ -1266,8 +1266,7 @@
     3.4  
     3.5          while (height--) {
     3.6  			/* *INDENT-OFF* */
     3.7 -			DUFFS_LOOP_DOUBLE2({
     3.8 -				/* One Pixel Blend */
     3.9 +			DUFFS_LOOP4({
    3.10  				s = *srcp;
    3.11  				d = *dstp;
    3.12  				s1 = s & 0xff00ff;
    3.13 @@ -1280,35 +1279,6 @@
    3.14  				*dstp = d1 | d | 0xff000000;
    3.15  				++srcp;
    3.16  				++dstp;
    3.17 -			},{
    3.18 -			        /* Two Pixels Blend */
    3.19 -				s = *srcp;
    3.20 -				d = *dstp;
    3.21 -				s1 = s & 0xff00ff;
    3.22 -				d1 = d & 0xff00ff;
    3.23 -				d1 += (s1 - d1) * alpha >> 8;
    3.24 -				d1 &= 0xff00ff;
    3.25 -				     
    3.26 -				s = ((s & 0xff00) >> 8) | 
    3.27 -					((srcp[1] & 0xff00) << 8);
    3.28 -				d = ((d & 0xff00) >> 8) |
    3.29 -					((dstp[1] & 0xff00) << 8);
    3.30 -				d += (s - d) * alpha >> 8;
    3.31 -				d &= 0x00ff00ff;
    3.32 -				
    3.33 -				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
    3.34 -				++srcp;
    3.35 -				
    3.36 -			        s1 = *srcp;
    3.37 -				d1 = *dstp;
    3.38 -				s1 &= 0xff00ff;
    3.39 -				d1 &= 0xff00ff;
    3.40 -				d1 += (s1 - d1) * alpha >> 8;
    3.41 -				d1 &= 0xff00ff;
    3.42 -				
    3.43 -				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
    3.44 -				++srcp;
    3.45 -				++dstp;
    3.46  			}, width);
    3.47  			/* *INDENT-ON* */
    3.48              srcp += srcskip;
    3.49 @@ -1588,7 +1558,7 @@
    3.50  
    3.51          while (height--) {
    3.52  			/* *INDENT-OFF* */
    3.53 -			DUFFS_LOOP_QUATRO2(
    3.54 +			DUFFS_LOOP_124(
    3.55  			{
    3.56  				s = *srcp++;
    3.57  				d = *dstp;
    3.58 @@ -1726,7 +1696,7 @@
    3.59  
    3.60          while (height--) {
    3.61  			/* *INDENT-OFF* */
    3.62 -			DUFFS_LOOP_QUATRO2(
    3.63 +			DUFFS_LOOP_124(
    3.64  			{
    3.65  				s = *srcp++;
    3.66  				d = *dstp;