From ea79a5f19c5e6821336c1810558d9e51e0543a0d Mon Sep 17 00:00:00 2001
From: Sam Lantinga <slouken@libsdl.org>
Date: Tue, 13 Jan 2009 07:20:55 +0000
Subject: [PATCH] Removed Rafal Bursig's MMX RLE code, at his request.

---
 src/video/SDL_RLEaccel.c | 397 ---------------------------------------
 src/video/SDL_blit.h     |  97 +++-------
 src/video/SDL_blit_A.c   |  36 +---
 3 files changed, 27 insertions(+), 503 deletions(-)

diff --git a/src/video/SDL_RLEaccel.c b/src/video/SDL_RLEaccel.c
index a5e39799f..4be519c8e 100644
--- a/src/video/SDL_RLEaccel.c
+++ b/src/video/SDL_RLEaccel.c
@@ -91,15 +91,6 @@
 #include "SDL_blit.h"
 #include "SDL_RLEaccel_c.h"
 
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
-#define MMX_ASMBLIT
-#endif
-
-#ifdef MMX_ASMBLIT
-#include "mmx.h"
-#include "SDL_cpuinfo.h"
-#endif
-
 #ifndef MAX
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #endif
@@ -123,262 +114,6 @@ do {							\
 #define OPAQUE_BLIT(to, from, length, bpp, alpha)	\
     PIXEL_COPY(to, from, length, bpp)
 
-#ifdef MMX_ASMBLIT
-
-#define ALPHA_BLIT32_888MMX(to, from, length, bpp, alpha)	\
-    do {							\
-	Uint32 *srcp = (Uint32 *)(from);			\
-	Uint32 *dstp = (Uint32 *)(to);				\
-        int i = 0x00FF00FF;					\
-        movd_m2r(*(&i), mm3);					\
-        punpckldq_r2r(mm3, mm3);				\
-        i = 0xFF000000;						\
-        movd_m2r(*(&i), mm7);					\
-        punpckldq_r2r(mm7, mm7);				\
-        i = alpha | alpha << 16;				\
-        movd_m2r(*(&i), mm4);					\
-        punpckldq_r2r(mm4, mm4);				\
-	pcmpeqd_r2r(mm5,mm5); /* set mm5 to "1" */		\
-	pxor_r2r(mm7, mm5); /* make clear alpha mask */		\
-        i = length;						\
-	if(i & 1) {						\
-          movd_m2r((*srcp), mm1); /* src -> mm1 */		\
-          punpcklbw_r2r(mm1, mm1);				\
-          pand_r2r(mm3, mm1);					\
-	  movd_m2r((*dstp), mm2); /* dst -> mm2 */		\
-          punpcklbw_r2r(mm2, mm2);				\
-          pand_r2r(mm3, mm2);					\
-	  psubw_r2r(mm2, mm1);					\
-	  pmullw_r2r(mm4, mm1);					\
-	  psrlw_i2r(8, mm1);					\
-	  paddw_r2r(mm1, mm2);					\
-	  pand_r2r(mm3, mm2);					\
-	  packuswb_r2r(mm2, mm2);				\
-	  pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */		\
-	  movd_r2m(mm2, *dstp);					\
-	  ++srcp;						\
-	  ++dstp;						\
-	  i--;							\
-	}							\
-	for(; i > 0; --i) {					\
-          movq_m2r((*srcp), mm0);				\
-	  movq_r2r(mm0, mm1);					\
-          punpcklbw_r2r(mm0, mm0);				\
-	  movq_m2r((*dstp), mm2);				\
-	  punpckhbw_r2r(mm1, mm1);				\
-	  movq_r2r(mm2, mm6);					\
-          pand_r2r(mm3, mm0);					\
-          punpcklbw_r2r(mm2, mm2);				\
-	  pand_r2r(mm3, mm1);					\
-	  punpckhbw_r2r(mm6, mm6);				\
-          pand_r2r(mm3, mm2);					\
-	  psubw_r2r(mm2, mm0);					\
-	  pmullw_r2r(mm4, mm0);					\
-	  pand_r2r(mm3, mm6);					\
-	  psubw_r2r(mm6, mm1);					\
-	  pmullw_r2r(mm4, mm1);					\
-	  psrlw_i2r(8, mm0);					\
-	  paddw_r2r(mm0, mm2);					\
-	  psrlw_i2r(8, mm1);					\
-	  paddw_r2r(mm1, mm6);					\
-	  pand_r2r(mm3, mm2);					\
-	  pand_r2r(mm3, mm6);					\
-	  packuswb_r2r(mm2, mm2);				\
-	  packuswb_r2r(mm6, mm6);				\
-	  psrlq_i2r(32, mm2);					\
-	  psllq_i2r(32, mm6);					\
-	  por_r2r(mm6, mm2);					\
-	  pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */		\
-         movq_r2m(mm2, *dstp);					\
-	  srcp += 2;						\
-	  dstp += 2;						\
-	  i--;							\
-	}							\
-	emms();							\
-    } while(0)
-
-#define ALPHA_BLIT16_565MMX(to, from, length, bpp, alpha)	\
-    do {						\
-        int i, n = 0;					\
-	Uint16 *srcp = (Uint16 *)(from);		\
-	Uint16 *dstp = (Uint16 *)(to);			\
-        Uint32 ALPHA = 0xF800;				\
-	movd_m2r(*(&ALPHA), mm1);			\
-        punpcklwd_r2r(mm1, mm1);			\
-        punpcklwd_r2r(mm1, mm1);			\
-	ALPHA = 0x07E0;					\
-	movd_m2r(*(&ALPHA), mm4);			\
-        punpcklwd_r2r(mm4, mm4);			\
-        punpcklwd_r2r(mm4, mm4);			\
-	ALPHA = 0x001F;					\
-	movd_m2r(*(&ALPHA), mm7);			\
-        punpcklwd_r2r(mm7, mm7);			\
-        punpcklwd_r2r(mm7, mm7);			\
-	alpha &= ~(1+2+4);				\
-        i = (Uint32)alpha | (Uint32)alpha << 16;	\
-        movd_m2r(*(&i), mm0);				\
-        punpckldq_r2r(mm0, mm0);			\
-        ALPHA = alpha >> 3;				\
-        i = ((int)(length) & 3);			\
-	for(; i > 0; --i) {				\
-	    Uint32 s = *srcp++;				\
-	    Uint32 d = *dstp;				\
-	    s = (s | s << 16) & 0x07e0f81f;		\
-	    d = (d | d << 16) & 0x07e0f81f;		\
-	    d += (s - d) * ALPHA >> 5;			\
-	    d &= 0x07e0f81f;				\
-	    *dstp++ = d | d >> 16;			\
-	    n++;					\
-	}						\
-	i = (int)(length) - n;				\
-	for(; i > 0; --i) {				\
-	  movq_m2r((*dstp), mm3);			\
-	  movq_m2r((*srcp), mm2);			\
-	  movq_r2r(mm2, mm5);				\
-	  pand_r2r(mm1 , mm5);				\
-	  psrlq_i2r(11, mm5);				\
-	  movq_r2r(mm3, mm6);				\
-	  pand_r2r(mm1 , mm6);				\
-	  psrlq_i2r(11, mm6);				\
-	  psubw_r2r(mm6, mm5);				\
-	  pmullw_r2r(mm0, mm5);				\
-	  psrlw_i2r(8, mm5);				\
-	  paddw_r2r(mm5, mm6);				\
-	  psllq_i2r(11, mm6);				\
-	  pand_r2r(mm1, mm6);				\
-	  movq_r2r(mm4, mm5);				\
-	  por_r2r(mm7, mm5);				\
-	  pand_r2r(mm5, mm3);				\
-	  por_r2r(mm6, mm3);				\
-	  movq_r2r(mm2, mm5);				\
-	  pand_r2r(mm4 , mm5);				\
-	  psrlq_i2r(5, mm5);				\
-	  movq_r2r(mm3, mm6);				\
-	  pand_r2r(mm4 , mm6);				\
-	  psrlq_i2r(5, mm6);				\
-	  psubw_r2r(mm6, mm5);				\
-	  pmullw_r2r(mm0, mm5);				\
-	  psrlw_i2r(8, mm5);				\
-	  paddw_r2r(mm5, mm6);				\
-	  psllq_i2r(5, mm6);				\
-	  pand_r2r(mm4, mm6);				\
-	  movq_r2r(mm1, mm5);				\
-	  por_r2r(mm7, mm5);				\
-	  pand_r2r(mm5, mm3);				\
-	  por_r2r(mm6, mm3);				\
-	  movq_r2r(mm2, mm5);				\
-	  pand_r2r(mm7 , mm5);				\
-          movq_r2r(mm3, mm6);				\
-	  pand_r2r(mm7 , mm6);				\
-	  psubw_r2r(mm6, mm5);				\
-	  pmullw_r2r(mm0, mm5);				\
-	  psrlw_i2r(8, mm5);				\
-	  paddw_r2r(mm5, mm6);				\
-	  pand_r2r(mm7, mm6);				\
-	  movq_r2r(mm1, mm5);				\
-	  por_r2r(mm4, mm5);				\
-	  pand_r2r(mm5, mm3);				\
-	  por_r2r(mm6, mm3);				\
-	  movq_r2m(mm3, *dstp);				\
-	  srcp += 4;					\
-	  dstp += 4;					\
-	  i -= 3;					\
-	}						\
-	emms();						\
-    } while(0)
-
-#define ALPHA_BLIT16_555MMX(to, from, length, bpp, alpha)	\
-    do {						\
-        int i, n = 0;					\
-	Uint16 *srcp = (Uint16 *)(from);		\
-	Uint16 *dstp = (Uint16 *)(to);			\
-        Uint32 ALPHA = 0x7C00;				\
-	movd_m2r(*(&ALPHA), mm1);			\
-        punpcklwd_r2r(mm1, mm1);			\
-        punpcklwd_r2r(mm1, mm1);			\
-	ALPHA = 0x03E0;					\
-        movd_m2r(*(&ALPHA), mm4);			\
-        punpcklwd_r2r(mm4, mm4);			\
-        punpcklwd_r2r(mm4, mm4);			\
-	ALPHA = 0x001F;					\
-	movd_m2r(*(&ALPHA), mm7);			\
-        punpcklwd_r2r(mm7, mm7);			\
-        punpcklwd_r2r(mm7, mm7);			\
-	alpha &= ~(1+2+4);				\
-        i = (Uint32)alpha | (Uint32)alpha << 16;	\
-        movd_m2r(*(&i), mm0);				\
-        punpckldq_r2r(mm0, mm0);			\
-        i = ((int)(length) & 3);				\
-        ALPHA = alpha >> 3;				\
-	for(; i > 0; --i) {				\
-	    Uint32 s = *srcp++;				\
-	    Uint32 d = *dstp;				\
-	    s = (s | s << 16) & 0x03e07c1f;		\
-	    d = (d | d << 16) & 0x03e07c1f;		\
-	    d += (s - d) * ALPHA >> 5;			\
-	    d &= 0x03e07c1f;				\
-	    *dstp++ = d | d >> 16;			\
-	    n++;					\
-	}						\
-	i = (int)(length) - n;				\
-	for(; i > 0; --i) {				\
-	  movq_m2r((*dstp), mm3);			\
-	  movq_m2r((*srcp), mm2);			\
-	  movq_r2r(mm2, mm5);				\
-	  pand_r2r(mm1 , mm5);				\
-	  psrlq_i2r(10, mm5);				\
-	  movq_r2r(mm3, mm6);				\
-	  pand_r2r(mm1 , mm6);				\
-	  psrlq_i2r(10, mm6);				\
-	  psubw_r2r(mm6, mm5);				\
-	  pmullw_r2r(mm0, mm5);				\
-	  psrlw_i2r(8, mm5);				\
-	  paddw_r2r(mm5, mm6);				\
-	  psllq_i2r(10, mm6);				\
-	  pand_r2r(mm1, mm6);				\
-	  movq_r2r(mm4, mm5);				\
-	  por_r2r(mm7, mm5);				\
-	  pand_r2r(mm5, mm3);				\
-	  por_r2r(mm6, mm3);				\
-	  movq_r2r(mm2, mm5);				\
-	  pand_r2r(mm4 , mm5);				\
-	  psrlq_i2r(5, mm5);				\
-	  movq_r2r(mm3, mm6);				\
-	  pand_r2r(mm4 , mm6);				\
-	  psrlq_i2r(5, mm6);				\
-	  psubw_r2r(mm6, mm5);				\
-	  pmullw_r2r(mm0, mm5);				\
-	  psrlw_i2r(8, mm5);				\
-	  paddw_r2r(mm5, mm6);				\
-	  psllq_i2r(5, mm6);				\
-	  pand_r2r(mm4, mm6);				\
-	  movq_r2r(mm1, mm5);				\
-	  por_r2r(mm7, mm5);				\
-	  pand_r2r(mm5, mm3);				\
-	  por_r2r(mm6, mm3);				\
-	  movq_r2r(mm2, mm5);				\
-	  pand_r2r(mm7 , mm5);				\
-          movq_r2r(mm3, mm6);				\
-	  pand_r2r(mm7 , mm6);				\
-	  psubw_r2r(mm6, mm5);				\
-	  pmullw_r2r(mm0, mm5);				\
-	  psrlw_i2r(8, mm5);				\
-	  paddw_r2r(mm5, mm6);				\
-	  pand_r2r(mm7, mm6);				\
-	  movq_r2r(mm1, mm5);				\
-	  por_r2r(mm4, mm5);				\
-	  pand_r2r(mm5, mm3);				\
-	  por_r2r(mm6, mm3);				\
-	  movq_r2m(mm3, *dstp);				\
-	  srcp += 4;					\
-	  dstp += 4;					\
-	  i -= 3;					\
-	}						\
-	emms();						\
-    } while(0)
-
-#endif
-
 /*
  * For 32bpp pixels on the form 0x00rrggbb:
  * If we treat the middle component separately, we can process the two
@@ -504,48 +239,6 @@ do {							\
 	}								\
     } while(0)
 
-#ifdef MMX_ASMBLIT
-
-#define ALPHA_BLIT32_888_50MMX(to, from, length, bpp, alpha)		\
-    do {								\
-	Uint32 *srcp = (Uint32 *)(from);				\
-	Uint32 *dstp = (Uint32 *)(to);					\
-        int i = 0x00fefefe;						\
-        movd_m2r(*(&i), mm4);						\
-        punpckldq_r2r(mm4, mm4);					\
-        i = 0x00010101;							\
-        movd_m2r(*(&i), mm3);						\
-        punpckldq_r2r(mm3, mm3);					\
-        i = (int)(length);						\
-        if( i & 1 ) {							\
-	  Uint32 s = *srcp++;						\
-	  Uint32 d = *dstp;						\
-	  *dstp++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)	\
-		     + (s & d & 0x00010101);				\
-	  i--;								\
-	}								\
-	for(; i > 0; --i) {						\
-	    movq_m2r((*dstp), mm2); /* dst -> mm2 */			\
-	    movq_r2r(mm2, mm6);	/* dst -> mm6 */			\
-	    movq_m2r((*srcp), mm1); /* src -> mm1 */			\
-	    movq_r2r(mm1, mm5);	/* src -> mm5 */			\
-	    pand_r2r(mm4, mm6);	/* dst & 0x00fefefe -> mm6 */		\
-	    pand_r2r(mm4, mm5); /* src & 0x00fefefe -> mm5 */		\
-	    paddd_r2r(mm6, mm5); /* (dst & 0x00fefefe) + (dst & 0x00fefefe) -> mm5 */	\
-	    psrld_i2r(1, mm5);						\
-	    pand_r2r(mm1, mm2);	/* s & d -> mm2 */			\
-	    pand_r2r(mm3, mm2);	/* s & d & 0x00010101 -> mm2 */		\
-	    paddd_r2r(mm5, mm2);					\
-	    movq_r2m(mm2, (*dstp));					\
-	    dstp += 2;							\
-	    srcp += 2;							\
-	    i--;							\
-	}								\
-	emms();								\
-    } while(0)
-
-#endif
-
 /*
  * Special case: 50% alpha (alpha=128)
  * This is treated specially because it can be optimized very well, and
@@ -617,94 +310,6 @@ do {							\
 #define ALPHA_BLIT16_555_50(to, from, length, bpp, alpha)	\
     ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xfbde)
 
-#ifdef MMX_ASMBLIT
-
-#define CHOOSE_BLIT(blitter, alpha, fmt)				\
-    do {								\
-        if(alpha == 255) {						\
-	    switch(fmt->BytesPerPixel) {				\
-	    case 1: blitter(1, Uint8, OPAQUE_BLIT); break;		\
-	    case 2: blitter(2, Uint8, OPAQUE_BLIT); break;		\
-	    case 3: blitter(3, Uint8, OPAQUE_BLIT); break;		\
-	    case 4: blitter(4, Uint16, OPAQUE_BLIT); break;		\
-	    }								\
-	} else {							\
-	    switch(fmt->BytesPerPixel) {				\
-	    case 1:							\
-		/* No 8bpp alpha blitting */				\
-		break;							\
-									\
-	    case 2:							\
-		switch(fmt->Rmask | fmt->Gmask | fmt->Bmask) {		\
-		case 0xffff:						\
-		    if(fmt->Gmask == 0x07e0				\
-		       || fmt->Rmask == 0x07e0				\
-		       || fmt->Bmask == 0x07e0) {			\
-			if(alpha == 128)				\
-			    blitter(2, Uint8, ALPHA_BLIT16_565_50);	\
-			else {						\
-			    if(SDL_HasMMX())				\
-				blitter(2, Uint8, ALPHA_BLIT16_565MMX);	\
-			    else					\
-				blitter(2, Uint8, ALPHA_BLIT16_565);	\
-			}						\
-		    } else						\
-			goto general16;					\
-		    break;						\
-									\
-		case 0x7fff:						\
-		    if(fmt->Gmask == 0x03e0				\
-		       || fmt->Rmask == 0x03e0				\
-		       || fmt->Bmask == 0x03e0) {			\
-			if(alpha == 128)				\
-			    blitter(2, Uint8, ALPHA_BLIT16_555_50);	\
-			else {						\
-			    if(SDL_HasMMX())				\
-				blitter(2, Uint8, ALPHA_BLIT16_555MMX);	\
-			    else					\
-				blitter(2, Uint8, ALPHA_BLIT16_555);	\
-			}						\
-			break;						\
-		    }							\
-		    /* fallthrough */					\
-									\
-		default:						\
-		general16:						\
-		    blitter(2, Uint8, ALPHA_BLIT_ANY);			\
-		}							\
-		break;							\
-									\
-	    case 3:							\
-		blitter(3, Uint8, ALPHA_BLIT_ANY);			\
-		break;							\
-									\
-	    case 4:							\
-		if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff	\
-		   && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00	\
-		       || fmt->Bmask == 0xff00)) {			\
-		    if(alpha == 128)					\
-		    {							\
-			if(SDL_HasMMX())				\
-				blitter(4, Uint16, ALPHA_BLIT32_888_50MMX);\
-			else						\
-				blitter(4, Uint16, ALPHA_BLIT32_888_50);\
-		    }							\
-		    else						\
-		    {							\
-			if(SDL_HasMMX())				\
-				blitter(4, Uint16, ALPHA_BLIT32_888MMX);\
-			else						\
-				blitter(4, Uint16, ALPHA_BLIT32_888);	\
-		    }							\
-		} else							\
-		    blitter(4, Uint16, ALPHA_BLIT_ANY);			\
-		break;							\
-	    }								\
-	}								\
-    } while(0)
-
-#else
-
 #define CHOOSE_BLIT(blitter, alpha, fmt)				\
     do {								\
         if(alpha == 255) {						\
@@ -773,8 +378,6 @@ do {							\
 	}								\
     } while(0)
 
-#endif
-
 /*
  * This takes care of the case when the surface is clipped on the left and/or
  * right. Top clipping has already been taken care of.
diff --git a/src/video/SDL_blit.h b/src/video/SDL_blit.h
index 46e416403..bd62ceb68 100644
--- a/src/video/SDL_blit.h
+++ b/src/video/SDL_blit.h
@@ -476,48 +476,7 @@ do {						\
 	case 3:		pixel_copy_increment;				\
 	case 2:		pixel_copy_increment;				\
 	case 1:		pixel_copy_increment;				\
-		} while ( --n > 0 );					\
-	}								\
-}
-
-/* 2 - times unrolled loop */
-#define DUFFS_LOOP_DOUBLE2(pixel_copy_increment,			\
-				double_pixel_copy_increment, width)	\
-{ int n, w = width;							\
-	if( w & 1 ) {							\
-	    pixel_copy_increment;					\
-	    w--;							\
-	}								\
-	if ( w > 0 )	{						\
-	    n = ( w + 2) / 4;						\
-	    switch( w & 2 ) {						\
-	    case 0: do {	double_pixel_copy_increment;		\
-	    case 2:		double_pixel_copy_increment;		\
-		    } while ( --n > 0 );					\
-	    }								\
-	}								\
-}
-
-/* 2 - times unrolled loop 4 pixels */
-#define DUFFS_LOOP_QUATRO2(pixel_copy_increment,			\
-				double_pixel_copy_increment,		\
-				quatro_pixel_copy_increment, width)	\
-{ int n, w = width;								\
-        if(w & 1) {							\
-	  pixel_copy_increment;						\
-	  w--;								\
-	}								\
-	if(w & 2) {							\
-	  double_pixel_copy_increment;					\
-	  w -= 2;							\
-	}								\
-	if ( w > 0 ) {							\
-	    n = ( w + 7 ) / 8;						\
-	    switch( w & 4 ) {						\
-	    case 0: do {	quatro_pixel_copy_increment;		\
-	    case 4:		quatro_pixel_copy_increment;		\
-		    } while ( --n > 0 );					\
-	    }								\
+		} while (--n > 0);					\
 	}								\
 }
 
@@ -525,41 +484,29 @@ do {						\
 #define DUFFS_LOOP(pixel_copy_increment, width)				\
 	DUFFS_LOOP8(pixel_copy_increment, width)
 
-#else
-
-/* Don't use Duff's device to unroll loops */
-#define DUFFS_LOOP_DOUBLE2(pixel_copy_increment,			\
-			 double_pixel_copy_increment, width)		\
-{ int n = width;								\
-    if( n & 1 ) {							\
-	pixel_copy_increment;						\
-	n--;								\
-    }									\
-    n=n>>1;								\
-    for(; n > 0; --n) {   						\
-	double_pixel_copy_increment;					\
-    }									\
-}
-
-/* Don't use Duff's device to unroll loops */
-#define DUFFS_LOOP_QUATRO2(pixel_copy_increment,			\
-				double_pixel_copy_increment,		\
-				quatro_pixel_copy_increment, width)	\
-{ int n = width;								\
-        if(n & 1) {							\
-	  pixel_copy_increment;						\
-	  n--;								\
+/* Special version of Duff's device for even more optimization */
+#define DUFFS_LOOP_124(pixel_copy_increment1,				\
+                       pixel_copy_increment2,				\
+                       pixel_copy_increment4, width)			\
+{ int n = width;							\
+	if (n & 1) {							\
+		pixel_copy_increment1; n -= 1;				\
+	}								\
+	if (n & 2) {							\
+		pixel_copy_increment2; n -= 2;				\
 	}								\
-	if(n & 2) {							\
-	  double_pixel_copy_increment;					\
-	  n -= 2;							\
+	/* n is now a multiple of 4: peel one 4-group, then run 8 per pass */	\
+	if (n & 4) {							\
+		pixel_copy_increment4; n -= 4;				\
+	}								\
+	for (n /= 8; n > 0; --n) {					\
+		pixel_copy_increment4;					\
+		pixel_copy_increment4;					\
 	}								\
-	n=n>>2;								\
-	for(; n > 0; --n) {   						\
-	  quatro_pixel_copy_increment;					\
-        }								\
 }
 
+#else
+
 /* Don't use Duff's device to unroll loops */
 #define DUFFS_LOOP(pixel_copy_increment, width)				\
 { int n;								\
@@ -571,6 +518,10 @@ do {						\
 	DUFFS_LOOP(pixel_copy_increment, width)
 #define DUFFS_LOOP4(pixel_copy_increment, width)			\
 	DUFFS_LOOP(pixel_copy_increment, width)
+#define DUFFS_LOOP_124(pixel_copy_increment1,				\
+                       pixel_copy_increment2,				\
+                       pixel_copy_increment4, width)			\
+	DUFFS_LOOP(pixel_copy_increment1, width)
 
 #endif /* USE_DUFFS_LOOP */
 
diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c
index a1a69c3b3..272546a2d 100644
--- a/src/video/SDL_blit_A.c
+++ b/src/video/SDL_blit_A.c
@@ -1266,8 +1266,7 @@ BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
 
         while (height--) {
 			/* *INDENT-OFF* */
-			DUFFS_LOOP_DOUBLE2({
-				/* One Pixel Blend */
+			DUFFS_LOOP4({
 				s = *srcp;
 				d = *dstp;
 				s1 = s & 0xff00ff;
@@ -1280,35 +1279,6 @@ BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
 				*dstp = d1 | d | 0xff000000;
 				++srcp;
 				++dstp;
-			},{
-			        /* Two Pixels Blend */
-				s = *srcp;
-				d = *dstp;
-				s1 = s & 0xff00ff;
-				d1 = d & 0xff00ff;
-				d1 += (s1 - d1) * alpha >> 8;
-				d1 &= 0xff00ff;
-				     
-				s = ((s & 0xff00) >> 8) | 
-					((srcp[1] & 0xff00) << 8);
-				d = ((d & 0xff00) >> 8) |
-					((dstp[1] & 0xff00) << 8);
-				d += (s - d) * alpha >> 8;
-				d &= 0x00ff00ff;
-				
-				*dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
-				++srcp;
-				
-			        s1 = *srcp;
-				d1 = *dstp;
-				s1 &= 0xff00ff;
-				d1 &= 0xff00ff;
-				d1 += (s1 - d1) * alpha >> 8;
-				d1 &= 0xff00ff;
-				
-				*dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
-				++srcp;
-				++dstp;
 			}, width);
 			/* *INDENT-ON* */
             srcp += srcskip;
@@ -1588,7 +1558,7 @@ Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
 
         while (height--) {
 			/* *INDENT-OFF* */
-			DUFFS_LOOP_QUATRO2(
+			DUFFS_LOOP_124(
 			{
 				s = *srcp++;
 				d = *dstp;
@@ -1726,7 +1696,7 @@ Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
 
         while (height--) {
 			/* *INDENT-OFF* */
-			DUFFS_LOOP_QUATRO2(
+			DUFFS_LOOP_124(
 			{
 				s = *srcp++;
 				d = *dstp;