GCC inline asm for MMX YUV processing no longer has textrels and now works when ... (branch: SDL-1.2)
author Ryan C. Gordon <icculus@icculus.org>
Wed, 11 Jul 2007 06:26:22 +0000
branch SDL-1.2
changeset 4045 f420bba13676
parent 4044 009d85e98922
child 4046 3a9e60224efe
GCC inline asm for MMX YUV processing no longer has textrels and now works when
gcc wants to hog %%ebx for the PIC register.

Fixes Bugzilla #418.
src/video/SDL_yuv_mmx.c
src/video/SDL_yuv_sw.c
     1.1 --- a/src/video/SDL_yuv_mmx.c	Wed Jul 11 05:55:10 2007 +0000
     1.2 +++ b/src/video/SDL_yuv_mmx.c	Wed Jul 11 06:26:22 2007 +0000
     1.3 @@ -21,36 +21,35 @@
     1.4  */
     1.5  #include "SDL_config.h"
     1.6  
     1.7 -#if 0 /* FIXME: This code needs to be rewritten to reference the static data using relocatable addresses (e.g. http://www.gentoo.org/proj/en/hardened/pic-fix-guide.xml or http://nasm.sourceforge.net/doc/html/nasmdoc8.html#section-8.2) This code currently breaks on systems with readonly text segments (hardened Linux / Intel Mac) */
     1.8 -#if defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES
     1.9 +#if (__GNUC__ > 2) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES
    1.10  
    1.11  #include "SDL_stdinc.h"
    1.12  
    1.13 -#define ASM_ARRAY(x) x[] __asm__("_" #x) __attribute__((used))
    1.14 - 
    1.15 -static unsigned int  ASM_ARRAY(MMX_0080w)    = {0x00800080, 0x00800080};
    1.16 -static unsigned int  ASM_ARRAY(MMX_00FFw)    = {0x00ff00ff, 0x00ff00ff};
    1.17 -static unsigned int  ASM_ARRAY(MMX_FF00w)    = {0xff00ff00, 0xff00ff00};
    1.18 +#include "mmx.h"
    1.19 +
    1.20 +static mmx_t MMX_0080w    = { .ud = {0x00800080, 0x00800080} };
    1.21 +static mmx_t MMX_00FFw    = { .ud = {0x00ff00ff, 0x00ff00ff} };
    1.22 +static mmx_t MMX_FF00w    = { .ud = {0xff00ff00, 0xff00ff00} };
    1.23  
    1.24 -static unsigned short ASM_ARRAY(MMX_Ycoeff)  = {0x004a, 0x004a, 0x004a, 0x004a};
    1.25 +static mmx_t MMX_Ycoeff   = { .uw = {0x004a, 0x004a, 0x004a, 0x004a} };
    1.26  
    1.27 -static unsigned short ASM_ARRAY(MMX_UbluRGB) = {0x0072, 0x0072, 0x0072, 0x0072};
    1.28 -static unsigned short ASM_ARRAY(MMX_VredRGB) = {0x0059, 0x0059, 0x0059, 0x0059};
    1.29 -static unsigned short ASM_ARRAY(MMX_UgrnRGB) = {0xffea, 0xffea, 0xffea, 0xffea};
    1.30 -static unsigned short ASM_ARRAY(MMX_VgrnRGB) = {0xffd2, 0xffd2, 0xffd2, 0xffd2};
    1.31 +static mmx_t MMX_UbluRGB  = { .uw = {0x0072, 0x0072, 0x0072, 0x0072} };
    1.32 +static mmx_t MMX_VredRGB  = { .uw = {0x0059, 0x0059, 0x0059, 0x0059} };
    1.33 +static mmx_t MMX_UgrnRGB  = { .uw = {0xffea, 0xffea, 0xffea, 0xffea} };
    1.34 +static mmx_t MMX_VgrnRGB  = { .uw = {0xffd2, 0xffd2, 0xffd2, 0xffd2} };
    1.35  
    1.36 -static unsigned short ASM_ARRAY(MMX_Ublu5x5) = {0x0081, 0x0081, 0x0081, 0x0081};
    1.37 -static unsigned short ASM_ARRAY(MMX_Vred5x5) = {0x0066, 0x0066, 0x0066, 0x0066};
    1.38 -static unsigned short ASM_ARRAY(MMX_Ugrn555) = {0xffe7, 0xffe7, 0xffe7, 0xffe7};
    1.39 -static unsigned short ASM_ARRAY(MMX_Vgrn555) = {0xffcc, 0xffcc, 0xffcc, 0xffcc};
    1.40 -static unsigned short ASM_ARRAY(MMX_Ugrn565) = {0xffe8, 0xffe8, 0xffe8, 0xffe8};
    1.41 -static unsigned short ASM_ARRAY(MMX_Vgrn565) = {0xffcd, 0xffcd, 0xffcd, 0xffcd};
    1.42 +static mmx_t MMX_Ublu5x5  = { .uw = {0x0081, 0x0081, 0x0081, 0x0081} };
    1.43 +static mmx_t MMX_Vred5x5  = { .uw = {0x0066, 0x0066, 0x0066, 0x0066} };
    1.44 +static mmx_t MMX_Ugrn555  = { .uw = {0xffe7, 0xffe7, 0xffe7, 0xffe7} };
    1.45 +static mmx_t MMX_Vgrn555  = { .uw = {0xffcc, 0xffcc, 0xffcc, 0xffcc} };
    1.46 +static mmx_t MMX_Ugrn565  = { .uw = {0xffe8, 0xffe8, 0xffe8, 0xffe8} };
    1.47 +static mmx_t MMX_Vgrn565  = { .uw = {0xffcd, 0xffcd, 0xffcd, 0xffcd} };
    1.48  
    1.49 -static unsigned short ASM_ARRAY(MMX_red555)  = {0x7c00, 0x7c00, 0x7c00, 0x7c00};
    1.50 -static unsigned short ASM_ARRAY(MMX_red565)  = {0xf800, 0xf800, 0xf800, 0xf800};
    1.51 -static unsigned short ASM_ARRAY(MMX_grn555)  = {0x03e0, 0x03e0, 0x03e0, 0x03e0};
    1.52 -static unsigned short ASM_ARRAY(MMX_grn565)  = {0x07e0, 0x07e0, 0x07e0, 0x07e0};
    1.53 -static unsigned short ASM_ARRAY(MMX_blu5x5)  = {0x001f, 0x001f, 0x001f, 0x001f};
    1.54 +static mmx_t MMX_red555   = { .uw = {0x7c00, 0x7c00, 0x7c00, 0x7c00} };
    1.55 +static mmx_t MMX_red565   = { .uw = {0xf800, 0xf800, 0xf800, 0xf800} };
    1.56 +static mmx_t MMX_grn555   = { .uw = {0x03e0, 0x03e0, 0x03e0, 0x03e0} };
    1.57 +static mmx_t MMX_grn565   = { .uw = {0x07e0, 0x07e0, 0x07e0, 0x07e0} };
    1.58 +static mmx_t MMX_blu5x5   = { .uw = {0x001f, 0x001f, 0x001f, 0x001f} };
    1.59  
    1.60  /**
    1.61     This MMX assembler is my first assembler/MMX program ever.
    1.62 @@ -86,38 +85,42 @@
    1.63                                unsigned char *cb, unsigned char *out,
    1.64                                int rows, int cols, int mod )
    1.65  {
    1.66 -    Uint32 *row1;
    1.67 -    Uint32 *row2;
    1.68 +	Uint32 *row1;
    1.69 +	Uint32 *row2;
    1.70  
    1.71 -    unsigned char* y = lum +cols*rows;    // Pointer to the end
    1.72 -    int x=0;
    1.73 -    row1 = (Uint32 *)out;                 // 32 bit target
    1.74 -    row2 = (Uint32 *)out+cols+mod;        // start of second row 
    1.75 -    mod = (mod+cols+mod)*4;               // increment for row1 in byte
    1.76 +	unsigned char* y = lum +cols*rows;    // Pointer to the end
    1.77 +	int x = 0;
    1.78 +	row1 = (Uint32 *)out;                 // 32 bit target
    1.79 +	row2 = (Uint32 *)out+cols+mod;        // start of second row
    1.80 +	mod = (mod+cols+mod)*4;               // increment for row1 in byte
    1.81  
    1.82 -    __asm__ __volatile__ (
    1.83 -/* We don't really care about PIC - the code should be rewritten to use
    1.84 -   relative addressing for the static tables, so right now we take the
    1.85 -   COW hit on the pages this code resides. Big deal.
    1.86 -   This spill is just to reduce register pressure in the PIC case. */
    1.87 -		 "pushl %%ebx\n"
    1.88 -		 "movl %0, %%ebx\n"
    1.89 +	__asm__ __volatile__ (
    1.90 +		 // tap dance to workaround the inability to use %%ebx at will...
    1.91 +		 //  move one thing to the stack...
    1.92 +		 "pushl $0\n"  // save a slot on the stack.
    1.93 +		 "pushl %%ebx\n"  // save %%ebx.
    1.94 +		 "movl %0, %%ebx\n"  // put the thing in ebx.
    1.95 +		 "movl %%ebx, 4(%%esp)\n"  // put the thing in the stack slot.
    1.96 +		 "popl %%ebx\n"  // get back %%ebx (the PIC register).
    1.97  
    1.98 -	         ".align 8\n"
    1.99 +		 ".align 8\n"
   1.100  		 "1:\n"
   1.101  		
   1.102  		 // create Cr (result in mm1)
   1.103 +		 "pushl %%ebx\n"
   1.104 +		 "movl 4(%%esp), %%ebx\n"
   1.105  		 "movd (%%ebx), %%mm1\n"   //         0  0  0  0  v3 v2 v1 v0
   1.106 +		 "popl %%ebx\n"
   1.107  		 "pxor %%mm7,%%mm7\n"      //         00 00 00 00 00 00 00 00
   1.108  		 "movd (%2), %%mm2\n"           //    0  0  0  0 l3 l2 l1 l0
   1.109  		 "punpcklbw %%mm7,%%mm1\n" //         0  v3 0  v2 00 v1 00 v0
   1.110  		 "punpckldq %%mm1,%%mm1\n" //         00 v1 00 v0 00 v1 00 v0
   1.111 -		 "psubw _MMX_0080w,%%mm1\n"  // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 
   1.112 +		 "psubw %9,%%mm1\n"        // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 
   1.113  
   1.114  		 // create Cr_g (result in mm0)
   1.115  		 "movq %%mm1,%%mm0\n"           // r1 r1 r0 r0 r1 r1 r0 r0
   1.116 -		 "pmullw _MMX_VgrnRGB,%%mm0\n"// red*-46dec=0.7136*64
   1.117 -		 "pmullw _MMX_VredRGB,%%mm1\n"// red*89dec=1.4013*64
   1.118 +		 "pmullw %10,%%mm0\n"           // red*-46dec=0.7136*64
   1.119 +		 "pmullw %11,%%mm1\n"           // red*89dec=1.4013*64
   1.120  		 "psraw  $6, %%mm0\n"           // red=red/64
   1.121  		 "psraw  $6, %%mm1\n"           // red=red/64
   1.122  		 
   1.123 @@ -126,8 +129,8 @@
   1.124  		 "movq (%2,%4),%%mm3\n"         //    0  0  0  0 L3 L2 L1 L0
   1.125  		 "punpckldq %%mm3,%%mm2\n"      //   L3 L2 L1 L0 l3 l2 l1 l0
   1.126  		 "movq %%mm2,%%mm4\n"           //   L3 L2 L1 L0 l3 l2 l1 l0
   1.127 -		 "pand _MMX_FF00w,%%mm2\n"      //   L3 0  L1  0 l3  0 l1  0
   1.128 -		 "pand _MMX_00FFw,%%mm4\n"      //   0  L2  0 L0  0 l2  0 l0
   1.129 +		 "pand %12,%%mm2\n"             //   L3 0  L1  0 l3  0 l1  0
   1.130 +		 "pand %13,%%mm4\n"             //   0  L2  0 L0  0 l2  0 l0
   1.131  		 "psrlw $8,%%mm2\n"             //   0  L3  0 L1  0 l3  0 l1
   1.132  
   1.133  		 // create R (result in mm6)
   1.134 @@ -144,11 +147,11 @@
   1.135  		 "movd (%1), %%mm1\n"      //         0  0  0  0  u3 u2 u1 u0
   1.136  		 "punpcklbw %%mm7,%%mm1\n" //         0  u3 0  u2 00 u1 00 u0
   1.137  		 "punpckldq %%mm1,%%mm1\n" //         00 u1 00 u0 00 u1 00 u0
   1.138 -		 "psubw _MMX_0080w,%%mm1\n"  // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 
   1.139 +		 "psubw %9,%%mm1\n"        // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 
   1.140  		 // create Cb_g (result in mm5)
   1.141  		 "movq %%mm1,%%mm5\n"            // u1 u1 u0 u0 u1 u1 u0 u0
   1.142 -		 "pmullw _MMX_UgrnRGB,%%mm5\n"    // blue*-109dec=1.7129*64
   1.143 -		 "pmullw _MMX_UbluRGB,%%mm1\n"    // blue*114dec=1.78125*64
   1.144 +		 "pmullw %14,%%mm5\n"            // blue*-109dec=1.7129*64
   1.145 +		 "pmullw %15,%%mm1\n"            // blue*114dec=1.78125*64
   1.146  		 "psraw  $6, %%mm5\n"            // blue=red/64
   1.147  		 "psraw  $6, %%mm1\n"            // blue=blue/64
   1.148  
   1.149 @@ -213,7 +216,7 @@
   1.150  		 "addl  $4,%2\n"            // lum+4
   1.151  		 "leal  16(%3),%3\n"        // row1+16
   1.152  		 "leal  16(%5),%5\n"        // row2+16
   1.153 -		 "addl  $2, %%ebx\n"        // cr+2
   1.154 +		 "addl  $2, (%%esp)\n"        // cr+2
   1.155  		 "addl  $2, %1\n"           // cb+2
   1.156  
   1.157  		 "addl  $4,%6\n"            // x+4
   1.158 @@ -226,11 +229,16 @@
   1.159  		 "movl           $0,     %6\n" // x=0
   1.160  		 "cmpl           %7,     %2\n"
   1.161  		 "jl             1b\n"
   1.162 -		 "emms\n"
   1.163 -		 "popl %%ebx\n"
   1.164 +
   1.165 +		 "addl $4, %%esp\n"  // get rid of the stack slot we reserved.
   1.166 +		 "emms\n"  // reset MMX registers.
   1.167  		 :
   1.168  		 : "m" (cr), "r"(cb),"r"(lum),
   1.169 -		 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod));
   1.170 +		   "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod),
   1.171 +		   "m"(MMX_0080w),"m"(MMX_VgrnRGB),"m"(MMX_VredRGB),
   1.172 +		   "m"(MMX_FF00w),"m"(MMX_00FFw),"m"(MMX_UgrnRGB),
   1.173 +		   "m"(MMX_UbluRGB)
   1.174 +	);
   1.175  }
   1.176  
   1.177  void Color565DitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix,
   1.178 @@ -249,31 +257,40 @@
   1.179  
   1.180  
   1.181        __asm__ __volatile__(
   1.182 -         "pushl %%ebx\n"
   1.183 -	 "movl %0, %%ebx\n"
   1.184 +         // tap dance to workaround the inability to use %%ebx at will...
   1.185 +         //  move one thing to the stack...
   1.186 +         "pushl $0\n"  // save a slot on the stack.
   1.187 +         "pushl %%ebx\n"  // save %%ebx.
   1.188 +         "movl %0, %%ebx\n"  // put the thing in ebx.
   1.189 +         "movl %%ebx, 4(%%esp)\n"  // put the thing in the stack slot.
   1.190 +         "popl %%ebx\n"  // get back %%ebx (the PIC register).
   1.191  
   1.192           ".align 8\n"
   1.193           "1:\n"
   1.194           "movd           (%1),                   %%mm0\n" // 4 Cb         0  0  0  0 u3 u2 u1 u0
   1.195           "pxor           %%mm7,                  %%mm7\n"
   1.196 -         "movd           (%%ebx),                %%mm1\n" // 4 Cr                0  0  0  0 v3 v2 v1 v0
   1.197 +         "pushl %%ebx\n"
   1.198 +         "movl 4(%%esp), %%ebx\n"
   1.199 +         "movd (%%ebx), %%mm1\n"   // 4 Cr                0  0  0  0 v3 v2 v1 v0
   1.200 +         "popl %%ebx\n"
   1.201 +
   1.202           "punpcklbw      %%mm7,                  %%mm0\n" // 4 W cb   0 u3  0 u2  0 u1  0 u0
   1.203           "punpcklbw      %%mm7,                  %%mm1\n" // 4 W cr   0 v3  0 v2  0 v1  0 v0
   1.204 -         "psubw          _MMX_0080w,             %%mm0\n"
   1.205 -         "psubw          _MMX_0080w,             %%mm1\n"
   1.206 +         "psubw          %9,                     %%mm0\n"
   1.207 +         "psubw          %9,                     %%mm1\n"
   1.208           "movq           %%mm0,                  %%mm2\n" // Cb                   0 u3  0 u2  0 u1  0 u0
   1.209           "movq           %%mm1,                  %%mm3\n" // Cr
   1.210 -         "pmullw         _MMX_Ugrn565,           %%mm2\n" // Cb2green 0 R3  0 R2  0 R1  0 R0
   1.211 +         "pmullw         %10,                    %%mm2\n" // Cb2green 0 R3  0 R2  0 R1  0 R0
   1.212           "movq           (%2),                   %%mm6\n" // L1      l7 L6 L5 L4 L3 L2 L1 L0
   1.213 -         "pmullw         _MMX_Ublu5x5,           %%mm0\n" // Cb2blue
   1.214 -         "pand           _MMX_00FFw,             %%mm6\n" // L1      00 L6 00 L4 00 L2 00 L0
   1.215 -         "pmullw         _MMX_Vgrn565,           %%mm3\n" // Cr2green
   1.216 +         "pmullw         %11,                    %%mm0\n" // Cb2blue
   1.217 +         "pand           %12,                    %%mm6\n" // L1      00 L6 00 L4 00 L2 00 L0
   1.218 +         "pmullw         %13,                    %%mm3\n" // Cr2green
   1.219           "movq           (%2),                   %%mm7\n" // L2
   1.220 -         "pmullw         _MMX_Vred5x5,           %%mm1\n" // Cr2red
   1.221 +         "pmullw         %14,                    %%mm1\n" // Cr2red
   1.222           "psrlw          $8,                     %%mm7\n"        // L2           00 L7 00 L5 00 L3 00 L1
   1.223 -         "pmullw         _MMX_Ycoeff,            %%mm6\n" // lum1
   1.224 +         "pmullw         %15,                    %%mm6\n" // lum1
   1.225           "paddw          %%mm3,                  %%mm2\n" // Cb2green + Cr2green == green
   1.226 -         "pmullw         _MMX_Ycoeff,            %%mm7\n" // lum2
   1.227 +         "pmullw         %15,                    %%mm7\n" // lum2
   1.228  
   1.229           "movq           %%mm6,                  %%mm4\n" // lum1
   1.230           "paddw          %%mm0,                  %%mm6\n" // lum1 +blue 00 B6 00 B4 00 B2 00 B0
   1.231 @@ -291,11 +308,11 @@
   1.232           "punpcklbw      %%mm4,                  %%mm4\n"
   1.233           "punpcklbw      %%mm5,                  %%mm5\n"
   1.234  
   1.235 -         "pand           _MMX_red565,            %%mm4\n"
   1.236 +         "pand           %16,                    %%mm4\n"
   1.237           "psllw          $3,                     %%mm5\n" // GREEN       1
   1.238           "punpcklbw      %%mm6,                  %%mm6\n"
   1.239 -         "pand           _MMX_grn565,            %%mm5\n"
   1.240 -         "pand           _MMX_red565,            %%mm6\n"
   1.241 +         "pand           %17,                    %%mm5\n"
   1.242 +         "pand           %16,                    %%mm6\n"
   1.243           "por            %%mm5,                  %%mm4\n" //
   1.244           "psrlw          $11,                    %%mm6\n" // BLUE        1
   1.245           "movq           %%mm3,                  %%mm5\n" // lum2
   1.246 @@ -309,23 +326,23 @@
   1.247           "packuswb       %%mm3,                  %%mm3\n"
   1.248           "packuswb       %%mm5,                  %%mm5\n"
   1.249           "packuswb       %%mm7,                  %%mm7\n"
   1.250 -         "pand           _MMX_00FFw,             %%mm6\n" // L3
   1.251 +         "pand           %12,                    %%mm6\n" // L3
   1.252           "punpcklbw      %%mm3,                  %%mm3\n"
   1.253           "punpcklbw      %%mm5,                  %%mm5\n"
   1.254 -         "pmullw         _MMX_Ycoeff,            %%mm6\n" // lum3
   1.255 +         "pmullw         %15,                    %%mm6\n" // lum3
   1.256           "punpcklbw      %%mm7,                  %%mm7\n"
   1.257           "psllw          $3,                     %%mm5\n" // GREEN 2
   1.258 -         "pand           _MMX_red565,            %%mm7\n"
   1.259 -         "pand           _MMX_red565,            %%mm3\n"
   1.260 +         "pand           %16,                    %%mm7\n"
   1.261 +         "pand           %16,                    %%mm3\n"
   1.262           "psrlw          $11,                    %%mm7\n" // BLUE  2
   1.263 -         "pand           _MMX_grn565,            %%mm5\n"
   1.264 +         "pand           %17,                    %%mm5\n"
   1.265           "por            %%mm7,                  %%mm3\n"
   1.266           "movq           (%2,%4),                %%mm7\n" // L4 load lum2
   1.267           "por            %%mm5,                  %%mm3\n" //
   1.268           "psrlw          $8,                     %%mm7\n" // L4
   1.269           "movq           %%mm4,                  %%mm5\n"
   1.270           "punpcklwd      %%mm3,                  %%mm4\n"
   1.271 -         "pmullw         _MMX_Ycoeff,            %%mm7\n" // lum4
   1.272 +         "pmullw         %15,                    %%mm7\n" // lum4
   1.273           "punpckhwd      %%mm3,                  %%mm5\n"
   1.274  
   1.275           "movq           %%mm4,                  (%3)\n"  // write row1
   1.276 @@ -352,11 +369,11 @@
   1.277           "punpcklbw      %%mm5,                  %%mm5\n"
   1.278           "punpcklbw      %%mm6,                  %%mm6\n"
   1.279           "psllw          $3,                     %%mm5\n" // GREEN 3
   1.280 -         "pand           _MMX_red565,            %%mm4\n"
   1.281 +         "pand           %16,                    %%mm4\n"
   1.282           "psraw          $6,                     %%mm3\n" // psr 6
   1.283           "psraw          $6,                     %%mm0\n"
   1.284 -         "pand           _MMX_red565,            %%mm6\n" // BLUE
   1.285 -         "pand           _MMX_grn565,            %%mm5\n"
   1.286 +         "pand           %16,                    %%mm6\n" // BLUE
   1.287 +         "pand           %17,                    %%mm5\n"
   1.288           "psrlw          $11,                    %%mm6\n" // BLUE  3
   1.289           "por            %%mm5,                  %%mm4\n"
   1.290           "psraw          $6,                     %%mm7\n"
   1.291 @@ -367,11 +384,11 @@
   1.292           "punpcklbw      %%mm3,                  %%mm3\n"
   1.293           "punpcklbw      %%mm0,                  %%mm0\n"
   1.294           "punpcklbw      %%mm7,                  %%mm7\n"
   1.295 -         "pand           _MMX_red565,            %%mm3\n"
   1.296 -         "pand           _MMX_red565,            %%mm7\n" // BLUE
   1.297 +         "pand           %16,                    %%mm3\n"
   1.298 +         "pand           %16,                    %%mm7\n" // BLUE
   1.299           "psllw          $3,                     %%mm0\n" // GREEN 4
   1.300           "psrlw          $11,                    %%mm7\n"
   1.301 -         "pand           _MMX_grn565,            %%mm0\n"
   1.302 +         "pand           %17,                    %%mm0\n"
   1.303           "por            %%mm7,                  %%mm3\n"
   1.304           "por            %%mm0,                  %%mm3\n"
   1.305  
   1.306 @@ -381,30 +398,33 @@
   1.307           "punpckhwd      %%mm3,                  %%mm5\n"
   1.308  
   1.309           "movq           %%mm4,                  (%5)\n"
   1.310 -	 "movq           %%mm5,                  8(%5)\n"
   1.311 +         "movq           %%mm5,                  8(%5)\n"
   1.312  
   1.313           "addl           $8,                     %6\n"
   1.314           "addl           $8,                     %2\n"
   1.315 -         "addl           $4,                     %%ebx\n"
   1.316 +         "addl           $4,                     (%%esp)\n"
   1.317           "addl           $4,                     %1\n"
   1.318           "cmpl           %4,                     %6\n"
   1.319           "leal           16(%3),                 %3\n"
   1.320 -	 "leal           16(%5),%5\n" // row2+16
   1.321 +         "leal           16(%5),%5\n" // row2+16
   1.322  
   1.323  
   1.324           "jl             1b\n"
   1.325 -	 "addl           %4,     %2\n" // lum += cols 
   1.326 -	 "addl           %8,     %3\n" // row1+= mod
   1.327 -	 "addl           %8,     %5\n" // row2+= mod
   1.328 -	 "movl           $0,     %6\n" // x=0
   1.329 -	 "cmpl           %7,     %2\n"
   1.330 -	 "jl             1b\n"
   1.331 +         "addl           %4,     %2\n" // lum += cols
   1.332 +         "addl           %8,     %3\n" // row1+= mod
   1.333 +         "addl           %8,     %5\n" // row2+= mod
   1.334 +         "movl           $0,     %6\n" // x=0
   1.335 +         "cmpl           %7,     %2\n"
   1.336 +         "jl             1b\n"
   1.337 +         "addl $4, %%esp\n"  // get rid of the stack slot we reserved.
   1.338           "emms\n"
   1.339 -	 "popl %%ebx\n"
   1.340           :
   1.341 -         :"m" (cr), "r"(cb),"r"(lum),
   1.342 -	 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod));
   1.343 +         : "m" (cr), "r"(cb),"r"(lum),
   1.344 +           "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod),
   1.345 +           "m"(MMX_0080w),"m"(MMX_Ugrn565),"m"(MMX_Ublu5x5),
   1.346 +           "m"(MMX_00FFw),"m"(MMX_Vgrn565),"m"(MMX_Vred5x5),
   1.347 +           "m"(MMX_Ycoeff),"m"(MMX_red565),"m"(MMX_grn565));
   1.348  }
   1.349  
   1.350 -#endif /* GCC i386 inline assembly */
   1.351 -#endif /* 0 */
   1.352 +#endif /* GCC3 i386 inline assembly */
   1.353 +
     2.1 --- a/src/video/SDL_yuv_sw.c	Wed Jul 11 05:55:10 2007 +0000
     2.2 +++ b/src/video/SDL_yuv_sw.c	Wed Jul 11 06:26:22 2007 +0000
     2.3 @@ -121,7 +121,7 @@
     2.4  
     2.5  /* The colorspace conversion functions */
     2.6  
     2.7 -#if 0 /*defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES*/
     2.8 +#if (__GNUC__ > 2) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES
     2.9  extern void Color565DitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix,
    2.10                                       unsigned char *lum, unsigned char *cr,
    2.11                                       unsigned char *cb, unsigned char *out,
    2.12 @@ -1061,7 +1061,7 @@
    2.13  	    case SDL_YV12_OVERLAY:
    2.14  	    case SDL_IYUV_OVERLAY:
    2.15  		if ( display->format->BytesPerPixel == 2 ) {
    2.16 -#if 0 /*defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES*/
    2.17 +#if (__GNUC__ > 2) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES
    2.18  			/* inline assembly functions */
    2.19  			if ( SDL_HasMMX() && (Rmask == 0xF800) &&
    2.20  			                     (Gmask == 0x07E0) &&
    2.21 @@ -1083,7 +1083,7 @@
    2.22  			swdata->Display2X = Color24DitherYV12Mod2X;
    2.23  		}
    2.24  		if ( display->format->BytesPerPixel == 4 ) {
    2.25 -#if 0 /*defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES*/
    2.26 +#if (__GNUC__ > 2) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES
    2.27  			/* inline assembly functions */
    2.28  			if ( SDL_HasMMX() && (Rmask == 0x00FF0000) &&
    2.29  			                     (Gmask == 0x0000FF00) &&