src/render/SDL_yuv_mmx.c
changeset 7191 75360622e65f
parent 6885 700f1b25f77f
child 8093 b43765095a6f
     1.1 --- a/src/render/SDL_yuv_mmx.c	Sat May 18 12:48:50 2013 -0700
     1.2 +++ b/src/render/SDL_yuv_mmx.c	Sat May 18 14:17:52 2013 -0700
     1.3 @@ -63,10 +63,10 @@
     1.4     The MMX routine calculates 256bit=8RGB values in each cycle
     1.5     (4 for row1 & 4 for row2)
     1.6  
     1.7 -   The red/green/blue.. coefficents are taken from the mpeg_play 
     1.8 +   The red/green/blue.. coefficients are taken from the mpeg_play
     1.9     player. They look nice, but I dont know if you can have
    1.10     better values, to avoid integer rounding errors.
    1.11 -   
    1.12 +
    1.13  
    1.14     IMPORTANT:
    1.15     ==========
    1.16 @@ -84,152 +84,152 @@
    1.17      Uint32 *row1;
    1.18      Uint32 *row2;
    1.19  
    1.20 -    unsigned char* y = lum +cols*rows;    // Pointer to the end
    1.21 +    unsigned char* y = lum +cols*rows;    /* Pointer to the end */
    1.22      int x = 0;
    1.23 -    row1 = (Uint32 *)out;                 // 32 bit target
    1.24 -    row2 = (Uint32 *)out+cols+mod;        // start of second row
    1.25 -    mod = (mod+cols+mod)*4;               // increment for row1 in byte
    1.26 +    row1 = (Uint32 *)out;                 /* 32 bit target */
    1.27 +    row2 = (Uint32 *)out+cols+mod;        /* start of second row */
    1.28 +    mod = (mod+cols+mod)*4;               /* increment for row1 in byte */
    1.29  
    1.30      __asm__ __volatile__ (
    1.31 -        // tap dance to workaround the inability to use %%ebx at will...
    1.32 -        //  move one thing to the stack...
    1.33 -        "pushl $0\n"  // save a slot on the stack.
    1.34 -        "pushl %%ebx\n"  // save %%ebx.
    1.35 -        "movl %0, %%ebx\n"  // put the thing in ebx.
    1.36 -        "movl %%ebx,4(%%esp)\n"  // put the thing in the stack slot.
    1.37 -        "popl %%ebx\n"  // get back %%ebx (the PIC register).
    1.38 +        /* tap dance to workaround the inability to use %%ebx at will... */
    1.39 +        /*  move one thing to the stack... */
    1.40 +        "pushl $0\n"  /* save a slot on the stack. */
    1.41 +        "pushl %%ebx\n"  /* save %%ebx. */
    1.42 +        "movl %0, %%ebx\n"  /* put the thing in ebx. */
    1.43 +        "movl %%ebx,4(%%esp)\n"  /* put the thing in the stack slot. */
    1.44 +        "popl %%ebx\n"  /* get back %%ebx (the PIC register). */
    1.45  
    1.46          ".align 8\n"
    1.47          "1:\n"
    1.48  
    1.49 -        // create Cr (result in mm1)
    1.50 +        /* create Cr (result in mm1) */
    1.51          "pushl %%ebx\n"
    1.52          "movl 4(%%esp),%%ebx\n"
    1.53 -        "movd (%%ebx),%%mm1\n"   //         0  0  0  0  v3 v2 v1 v0
    1.54 +        "movd (%%ebx),%%mm1\n"   /*         0  0  0  0  v3 v2 v1 v0 */
    1.55          "popl %%ebx\n"
    1.56 -        "pxor %%mm7,%%mm7\n"      //         00 00 00 00 00 00 00 00
    1.57 -        "movd (%2), %%mm2\n"           //    0  0  0  0 l3 l2 l1 l0
    1.58 -        "punpcklbw %%mm7,%%mm1\n" //         0  v3 0  v2 00 v1 00 v0
    1.59 -        "punpckldq %%mm1,%%mm1\n" //         00 v1 00 v0 00 v1 00 v0
    1.60 -        "psubw %9,%%mm1\n"        // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0
    1.61 +        "pxor %%mm7,%%mm7\n"      /*         00 00 00 00 00 00 00 00 */
    1.62 +        "movd (%2), %%mm2\n"           /*    0  0  0  0 l3 l2 l1 l0 */
    1.63 +        "punpcklbw %%mm7,%%mm1\n" /*         0  v3 0  v2 00 v1 00 v0 */
    1.64 +        "punpckldq %%mm1,%%mm1\n" /*         00 v1 00 v0 00 v1 00 v0 */
    1.65 +        "psubw %9,%%mm1\n"        /* mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 */
    1.66  
    1.67 -        // create Cr_g (result in mm0)
    1.68 -        "movq %%mm1,%%mm0\n"           // r1 r1 r0 r0 r1 r1 r0 r0
    1.69 -        "pmullw %10,%%mm0\n"           // red*-46dec=0.7136*64
    1.70 -        "pmullw %11,%%mm1\n"           // red*89dec=1.4013*64
    1.71 -        "psraw  $6, %%mm0\n"           // red=red/64
    1.72 -        "psraw  $6, %%mm1\n"           // red=red/64
    1.73 +        /* create Cr_g (result in mm0) */
    1.74 +        "movq %%mm1,%%mm0\n"           /* r1 r1 r0 r0 r1 r1 r0 r0 */
    1.75 +        "pmullw %10,%%mm0\n"           /* red*-46dec=0.7136*64 */
    1.76 +        "pmullw %11,%%mm1\n"           /* red*89dec=1.4013*64 */
    1.77 +        "psraw  $6, %%mm0\n"           /* red=red/64 */
    1.78 +        "psraw  $6, %%mm1\n"           /* red=red/64 */
    1.79  
    1.80 -        // create L1 L2 (result in mm2,mm4)
    1.81 -        // L2=lum+cols
    1.82 -        "movq (%2,%4),%%mm3\n"         //    0  0  0  0 L3 L2 L1 L0
    1.83 -        "punpckldq %%mm3,%%mm2\n"      //   L3 L2 L1 L0 l3 l2 l1 l0
    1.84 -        "movq %%mm2,%%mm4\n"           //   L3 L2 L1 L0 l3 l2 l1 l0
    1.85 -        "pand %12,%%mm2\n"             //   L3 0  L1  0 l3  0 l1  0
    1.86 -        "pand %13,%%mm4\n"             //   0  L2  0 L0  0 l2  0 l0
    1.87 -        "psrlw $8,%%mm2\n"             //   0  L3  0 L1  0 l3  0 l1
    1.88 +        /* create L1 L2 (result in mm2,mm4) */
    1.89 +        /* L2=lum+cols */
    1.90 +        "movq (%2,%4),%%mm3\n"         /*    0  0  0  0 L3 L2 L1 L0 */
    1.91 +        "punpckldq %%mm3,%%mm2\n"      /*   L3 L2 L1 L0 l3 l2 l1 l0 */
    1.92 +        "movq %%mm2,%%mm4\n"           /*   L3 L2 L1 L0 l3 l2 l1 l0 */
    1.93 +        "pand %12,%%mm2\n"             /*   L3 0  L1  0 l3  0 l1  0 */
    1.94 +        "pand %13,%%mm4\n"             /*   0  L2  0 L0  0 l2  0 l0 */
    1.95 +        "psrlw $8,%%mm2\n"             /*   0  L3  0 L1  0 l3  0 l1 */
    1.96  
    1.97 -        // create R (result in mm6)
    1.98 -        "movq %%mm2,%%mm5\n"           //   0 L3  0 L1  0 l3  0 l1
    1.99 -        "movq %%mm4,%%mm6\n"           //   0 L2  0 L0  0 l2  0 l0
   1.100 -        "paddsw  %%mm1, %%mm5\n"       // lum1+red:x R3 x R1 x r3 x r1
   1.101 -        "paddsw  %%mm1, %%mm6\n"       // lum1+red:x R2 x R0 x r2 x r0
   1.102 -        "packuswb %%mm5,%%mm5\n"       //  R3 R1 r3 r1 R3 R1 r3 r1
   1.103 -        "packuswb %%mm6,%%mm6\n"       //  R2 R0 r2 r0 R2 R0 r2 r0
   1.104 -        "pxor %%mm7,%%mm7\n"      //         00 00 00 00 00 00 00 00
   1.105 -        "punpcklbw %%mm5,%%mm6\n"      //  R3 R2 R1 R0 r3 r2 r1 r0
   1.106 +        /* create R (result in mm6) */
   1.107 +        "movq %%mm2,%%mm5\n"           /*   0 L3  0 L1  0 l3  0 l1 */
   1.108 +        "movq %%mm4,%%mm6\n"           /*   0 L2  0 L0  0 l2  0 l0 */
   1.109 +        "paddsw  %%mm1, %%mm5\n"       /* lum1+red:x R3 x R1 x r3 x r1 */
   1.110 +        "paddsw  %%mm1, %%mm6\n"       /* lum1+red:x R2 x R0 x r2 x r0 */
   1.111 +        "packuswb %%mm5,%%mm5\n"       /*  R3 R1 r3 r1 R3 R1 r3 r1 */
   1.112 +        "packuswb %%mm6,%%mm6\n"       /*  R2 R0 r2 r0 R2 R0 r2 r0 */
   1.113 +        "pxor %%mm7,%%mm7\n"      /*         00 00 00 00 00 00 00 00 */
   1.114 +        "punpcklbw %%mm5,%%mm6\n"      /*  R3 R2 R1 R0 r3 r2 r1 r0 */
   1.115  
   1.116 -        // create Cb (result in mm1)
   1.117 -        "movd (%1), %%mm1\n"      //         0  0  0  0  u3 u2 u1 u0
   1.118 -        "punpcklbw %%mm7,%%mm1\n" //         0  u3 0  u2 00 u1 00 u0
   1.119 -        "punpckldq %%mm1,%%mm1\n" //         00 u1 00 u0 00 u1 00 u0
   1.120 -        "psubw %9,%%mm1\n"        // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0
   1.121 +        /* create Cb (result in mm1) */
   1.122 +        "movd (%1), %%mm1\n"      /*         0  0  0  0  u3 u2 u1 u0 */
   1.123 +        "punpcklbw %%mm7,%%mm1\n" /*         0  u3 0  u2 00 u1 00 u0 */
   1.124 +        "punpckldq %%mm1,%%mm1\n" /*         00 u1 00 u0 00 u1 00 u0 */
   1.125 +        "psubw %9,%%mm1\n"        /* mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 */
   1.126  
   1.127 -        // create Cb_g (result in mm5)
   1.128 -        "movq %%mm1,%%mm5\n"            // u1 u1 u0 u0 u1 u1 u0 u0
   1.129 -        "pmullw %14,%%mm5\n"            // blue*-109dec=1.7129*64
   1.130 -        "pmullw %15,%%mm1\n"            // blue*114dec=1.78125*64
   1.131 -        "psraw  $6, %%mm5\n"            // blue=red/64
   1.132 -        "psraw  $6, %%mm1\n"            // blue=blue/64
   1.133 +        /* create Cb_g (result in mm5) */
   1.134 +        "movq %%mm1,%%mm5\n"            /* u1 u1 u0 u0 u1 u1 u0 u0 */
   1.135 +        "pmullw %14,%%mm5\n"            /* blue*-109dec=1.7129*64 */
   1.136 +        "pmullw %15,%%mm1\n"            /* blue*114dec=1.78125*64 */
   1.137 +        "psraw  $6, %%mm5\n"            /* Cb_g=Cb_g/64 */
   1.138 +        "psraw  $6, %%mm1\n"            /* blue=blue/64 */
   1.139  
   1.140 -        // create G (result in mm7)
   1.141 -        "movq %%mm2,%%mm3\n"      //   0  L3  0 L1  0 l3  0 l1
   1.142 -        "movq %%mm4,%%mm7\n"      //   0  L2  0 L0  0 l2  0 l1
   1.143 -        "paddsw  %%mm5, %%mm3\n"  // lum1+Cb_g:x G3t x G1t x g3t x g1t
   1.144 -        "paddsw  %%mm5, %%mm7\n"  // lum1+Cb_g:x G2t x G0t x g2t x g0t
   1.145 -        "paddsw  %%mm0, %%mm3\n"  // lum1+Cr_g:x G3  x G1  x g3  x g1
   1.146 -        "paddsw  %%mm0, %%mm7\n"  // lum1+blue:x G2  x G0  x g2  x g0
   1.147 -        "packuswb %%mm3,%%mm3\n"  // G3 G1 g3 g1 G3 G1 g3 g1
   1.148 -        "packuswb %%mm7,%%mm7\n"  // G2 G0 g2 g0 G2 G0 g2 g0
   1.149 -        "punpcklbw %%mm3,%%mm7\n" // G3 G2 G1 G0 g3 g2 g1 g0
   1.150 +        /* create G (result in mm7) */
   1.151 +        "movq %%mm2,%%mm3\n"      /*   0  L3  0 L1  0 l3  0 l1 */
   1.152 +        "movq %%mm4,%%mm7\n"      /*   0  L2  0 L0  0 l2  0 l0 */
   1.153 +        "paddsw  %%mm5, %%mm3\n"  /* lum1+Cb_g:x G3t x G1t x g3t x g1t */
   1.154 +        "paddsw  %%mm5, %%mm7\n"  /* lum1+Cb_g:x G2t x G0t x g2t x g0t */
   1.155 +        "paddsw  %%mm0, %%mm3\n"  /* lum1+Cr_g:x G3  x G1  x g3  x g1 */
   1.156 +        "paddsw  %%mm0, %%mm7\n"  /* lum1+Cr_g:x G2  x G0  x g2  x g0 */
   1.157 +        "packuswb %%mm3,%%mm3\n"  /* G3 G1 g3 g1 G3 G1 g3 g1 */
   1.158 +        "packuswb %%mm7,%%mm7\n"  /* G2 G0 g2 g0 G2 G0 g2 g0 */
   1.159 +        "punpcklbw %%mm3,%%mm7\n" /* G3 G2 G1 G0 g3 g2 g1 g0 */
   1.160  
   1.161 -        // create B (result in mm5)
   1.162 -        "movq %%mm2,%%mm3\n"         //   0  L3  0 L1  0 l3  0 l1
   1.163 -        "movq %%mm4,%%mm5\n"         //   0  L2  0 L0  0 l2  0 l1
   1.164 -        "paddsw  %%mm1, %%mm3\n"     // lum1+blue:x B3 x B1 x b3 x b1
   1.165 -        "paddsw  %%mm1, %%mm5\n"     // lum1+blue:x B2 x B0 x b2 x b0
   1.166 -        "packuswb %%mm3,%%mm3\n"     // B3 B1 b3 b1 B3 B1 b3 b1
   1.167 -        "packuswb %%mm5,%%mm5\n"     // B2 B0 b2 b0 B2 B0 b2 b0
   1.168 -        "punpcklbw %%mm3,%%mm5\n"    // B3 B2 B1 B0 b3 b2 b1 b0
   1.169 +        /* create B (result in mm5) */
   1.170 +        "movq %%mm2,%%mm3\n"         /*   0  L3  0 L1  0 l3  0 l1 */
   1.171 +        "movq %%mm4,%%mm5\n"         /*   0  L2  0 L0  0 l2  0 l0 */
   1.172 +        "paddsw  %%mm1, %%mm3\n"     /* lum1+blue:x B3 x B1 x b3 x b1 */
   1.173 +        "paddsw  %%mm1, %%mm5\n"     /* lum1+blue:x B2 x B0 x b2 x b0 */
   1.174 +        "packuswb %%mm3,%%mm3\n"     /* B3 B1 b3 b1 B3 B1 b3 b1 */
   1.175 +        "packuswb %%mm5,%%mm5\n"     /* B2 B0 b2 b0 B2 B0 b2 b0 */
   1.176 +        "punpcklbw %%mm3,%%mm5\n"    /* B3 B2 B1 B0 b3 b2 b1 b0 */
   1.177  
   1.178 -        // fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
   1.179 +        /* fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb) */
   1.180  
   1.181 -        "pxor %%mm2,%%mm2\n"           //  0  0  0  0  0  0  0  0
   1.182 -        "pxor %%mm4,%%mm4\n"           //  0  0  0  0  0  0  0  0
   1.183 -        "movq %%mm6,%%mm1\n"           // R3 R2 R1 R0 r3 r2 r1 r0
   1.184 -        "movq %%mm5,%%mm3\n"           // B3 B2 B1 B0 b3 b2 b1 b0
   1.185 +        "pxor %%mm2,%%mm2\n"           /*  0  0  0  0  0  0  0  0 */
   1.186 +        "pxor %%mm4,%%mm4\n"           /*  0  0  0  0  0  0  0  0 */
   1.187 +        "movq %%mm6,%%mm1\n"           /* R3 R2 R1 R0 r3 r2 r1 r0 */
   1.188 +        "movq %%mm5,%%mm3\n"           /* B3 B2 B1 B0 b3 b2 b1 b0 */
   1.189  
   1.190 -        // process lower lum
   1.191 -        "punpcklbw %%mm4,%%mm1\n"      //  0 r3  0 r2  0 r1  0 r0
   1.192 -        "punpcklbw %%mm4,%%mm3\n"      //  0 b3  0 b2  0 b1  0 b0
   1.193 -        "movq %%mm1,%%mm2\n"           //  0 r3  0 r2  0 r1  0 r0
   1.194 -        "movq %%mm3,%%mm0\n"           //  0 b3  0 b2  0 b1  0 b0
   1.195 -        "punpcklwd %%mm1,%%mm3\n"      //  0 r1  0 b1  0 r0  0 b0
   1.196 -        "punpckhwd %%mm2,%%mm0\n"      //  0 r3  0 b3  0 r2  0 b2
   1.197 +        /* process lower lum */
   1.198 +        "punpcklbw %%mm4,%%mm1\n"      /*  0 r3  0 r2  0 r1  0 r0 */
   1.199 +        "punpcklbw %%mm4,%%mm3\n"      /*  0 b3  0 b2  0 b1  0 b0 */
   1.200 +        "movq %%mm1,%%mm2\n"           /*  0 r3  0 r2  0 r1  0 r0 */
   1.201 +        "movq %%mm3,%%mm0\n"           /*  0 b3  0 b2  0 b1  0 b0 */
   1.202 +        "punpcklwd %%mm1,%%mm3\n"      /*  0 r1  0 b1  0 r0  0 b0 */
   1.203 +        "punpckhwd %%mm2,%%mm0\n"      /*  0 r3  0 b3  0 r2  0 b2 */
   1.204  
   1.205 -        "pxor %%mm2,%%mm2\n"           //  0  0  0  0  0  0  0  0
   1.206 -        "movq %%mm7,%%mm1\n"           // G3 G2 G1 G0 g3 g2 g1 g0
   1.207 -        "punpcklbw %%mm1,%%mm2\n"      // g3  0 g2  0 g1  0 g0  0
   1.208 -        "punpcklwd %%mm4,%%mm2\n"      //  0  0 g1  0  0  0 g0  0
   1.209 -        "por %%mm3, %%mm2\n"          //  0 r1 g1 b1  0 r0 g0 b0
   1.210 -        "movq %%mm2,(%3)\n"          // wrote out ! row1
   1.211 +        "pxor %%mm2,%%mm2\n"           /*  0  0  0  0  0  0  0  0 */
   1.212 +        "movq %%mm7,%%mm1\n"           /* G3 G2 G1 G0 g3 g2 g1 g0 */
   1.213 +        "punpcklbw %%mm1,%%mm2\n"      /* g3  0 g2  0 g1  0 g0  0 */
   1.214 +        "punpcklwd %%mm4,%%mm2\n"      /*  0  0 g1  0  0  0 g0  0 */
   1.215 +        "por %%mm3, %%mm2\n"          /*  0 r1 g1 b1  0 r0 g0 b0 */
   1.216 +        "movq %%mm2,(%3)\n"          /* wrote out ! row1 */
   1.217  
   1.218 -        "pxor %%mm2,%%mm2\n"           //  0  0  0  0  0  0  0  0
   1.219 -        "punpcklbw %%mm1,%%mm4\n"      // g3  0 g2  0 g1  0 g0  0
   1.220 -        "punpckhwd %%mm2,%%mm4\n"      //  0  0 g3  0  0  0 g2  0
   1.221 -        "por %%mm0, %%mm4\n"          //  0 r3 g3 b3  0 r2 g2 b2
   1.222 -        "movq %%mm4,8(%3)\n"         // wrote out ! row1
   1.223 +        "pxor %%mm2,%%mm2\n"           /*  0  0  0  0  0  0  0  0 */
   1.224 +        "punpcklbw %%mm1,%%mm4\n"      /* g3  0 g2  0 g1  0 g0  0 */
   1.225 +        "punpckhwd %%mm2,%%mm4\n"      /*  0  0 g3  0  0  0 g2  0 */
   1.226 +        "por %%mm0, %%mm4\n"          /*  0 r3 g3 b3  0 r2 g2 b2 */
   1.227 +        "movq %%mm4,8(%3)\n"         /* wrote out ! row1 */
   1.228  
   1.229 -        // fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
   1.230 -        // this can be done "destructive"
   1.231 -        "pxor %%mm2,%%mm2\n"           //  0  0  0  0  0  0  0  0
   1.232 -        "punpckhbw %%mm2,%%mm6\n"      //  0 R3  0 R2  0 R1  0 R0
   1.233 -        "punpckhbw %%mm1,%%mm5\n"      // G3 B3 G2 B2 G1 B1 G0 B0
   1.234 -        "movq %%mm5,%%mm1\n"           // G3 B3 G2 B2 G1 B1 G0 B0
   1.235 -        "punpcklwd %%mm6,%%mm1\n"      //  0 R1 G1 B1  0 R0 G0 B0
   1.236 -        "movq %%mm1,(%5)\n"          // wrote out ! row2
   1.237 -        "punpckhwd %%mm6,%%mm5\n"      //  0 R3 G3 B3  0 R2 G2 B2
   1.238 -        "movq %%mm5,8(%5)\n"         // wrote out ! row2
   1.239 +        /* fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb) */
   1.240 +        /* this can be done "destructive" */
   1.241 +        "pxor %%mm2,%%mm2\n"           /*  0  0  0  0  0  0  0  0 */
   1.242 +        "punpckhbw %%mm2,%%mm6\n"      /*  0 R3  0 R2  0 R1  0 R0 */
   1.243 +        "punpckhbw %%mm1,%%mm5\n"      /* G3 B3 G2 B2 G1 B1 G0 B0 */
   1.244 +        "movq %%mm5,%%mm1\n"           /* G3 B3 G2 B2 G1 B1 G0 B0 */
   1.245 +        "punpcklwd %%mm6,%%mm1\n"      /*  0 R1 G1 B1  0 R0 G0 B0 */
   1.246 +        "movq %%mm1,(%5)\n"          /* wrote out ! row2 */
   1.247 +        "punpckhwd %%mm6,%%mm5\n"      /*  0 R3 G3 B3  0 R2 G2 B2 */
   1.248 +        "movq %%mm5,8(%5)\n"         /* wrote out ! row2 */
   1.249  
   1.250 -        "addl $4,%2\n"            // lum+4
   1.251 -        "leal 16(%3),%3\n"        // row1+16
   1.252 -        "leal 16(%5),%5\n"        // row2+16
   1.253 -        "addl $2,(%%esp)\n"        // cr+2
   1.254 -        "addl $2,%1\n"           // cb+2
   1.255 +        "addl $4,%2\n"            /* lum+4 */
   1.256 +        "leal 16(%3),%3\n"        /* row1+16 */
   1.257 +        "leal 16(%5),%5\n"        /* row2+16 */
   1.258 +        "addl $2,(%%esp)\n"        /* cr+2 */
   1.259 +        "addl $2,%1\n"           /* cb+2 */
   1.260  
   1.261 -        "addl $4,%6\n"            // x+4
   1.262 +        "addl $4,%6\n"            /* x+4 */
   1.263          "cmpl %4,%6\n"
   1.264  
   1.265          "jl 1b\n"
   1.266 -        "addl %4,%2\n" // lum += cols
   1.267 -        "addl %8,%3\n" // row1+= mod
   1.268 -        "addl %8,%5\n" // row2+= mod
   1.269 -        "movl $0,%6\n" // x=0
   1.270 +        "addl %4,%2\n" /* lum += cols */
   1.271 +        "addl %8,%3\n" /* row1+= mod */
   1.272 +        "addl %8,%5\n" /* row2+= mod */
   1.273 +        "movl $0,%6\n" /* x=0 */
   1.274          "cmpl %7,%2\n"
   1.275          "jl 1b\n"
   1.276  
   1.277 -        "addl $4,%%esp\n"  // get rid of the stack slot we reserved.
   1.278 -        "emms\n"  // reset MMX registers.
   1.279 +        "addl $4,%%esp\n"  /* get rid of the stack slot we reserved. */
   1.280 +        "emms\n"  /* reset MMX registers. */
   1.281          :
   1.282          : "m" (cr), "r"(cb),"r"(lum),
   1.283            "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod),
   1.284 @@ -254,125 +254,125 @@
   1.285      mod = (mod+cols+mod)*2;               /* increment for row1 in byte */
   1.286  
   1.287      __asm__ __volatile__(
   1.288 -        // tap dance to workaround the inability to use %%ebx at will...
   1.289 -        //  move one thing to the stack...
   1.290 -        "pushl $0\n"  // save a slot on the stack.
   1.291 -        "pushl %%ebx\n"  // save %%ebx.
   1.292 -        "movl %0, %%ebx\n"  // put the thing in ebx.
   1.293 -        "movl %%ebx, 4(%%esp)\n"  // put the thing in the stack slot.
   1.294 -        "popl %%ebx\n"  // get back %%ebx (the PIC register).
   1.295 +        /* tap dance to workaround the inability to use %%ebx at will... */
   1.296 +        /*  move one thing to the stack... */
   1.297 +        "pushl $0\n"  /* save a slot on the stack. */
   1.298 +        "pushl %%ebx\n"  /* save %%ebx. */
   1.299 +        "movl %0, %%ebx\n"  /* put the thing in ebx. */
   1.300 +        "movl %%ebx, 4(%%esp)\n"  /* put the thing in the stack slot. */
   1.301 +        "popl %%ebx\n"  /* get back %%ebx (the PIC register). */
   1.302  
   1.303          ".align 8\n"
   1.304          "1:\n"
   1.305  
   1.306 -        "movd           (%1),                   %%mm0\n" // 4 Cb         0  0  0  0 u3 u2 u1 u0
   1.307 +        "movd           (%1),                   %%mm0\n" /* 4 Cb         0  0  0  0 u3 u2 u1 u0 */
   1.308          "pxor           %%mm7,                  %%mm7\n"
   1.309          "pushl %%ebx\n"
   1.310          "movl 4(%%esp), %%ebx\n"
   1.311 -        "movd (%%ebx), %%mm1\n"   // 4 Cr                0  0  0  0 v3 v2 v1 v0
   1.312 +        "movd (%%ebx), %%mm1\n"   /* 4 Cr                0  0  0  0 v3 v2 v1 v0 */
   1.313          "popl %%ebx\n"
   1.314  
   1.315 -        "punpcklbw      %%mm7,                  %%mm0\n" // 4 W cb   0 u3  0 u2  0 u1  0 u0
   1.316 -        "punpcklbw      %%mm7,                  %%mm1\n" // 4 W cr   0 v3  0 v2  0 v1  0 v0
   1.317 +        "punpcklbw      %%mm7,                  %%mm0\n" /* 4 W cb   0 u3  0 u2  0 u1  0 u0 */
   1.318 +        "punpcklbw      %%mm7,                  %%mm1\n" /* 4 W cr   0 v3  0 v2  0 v1  0 v0 */
   1.319          "psubw          %9,                     %%mm0\n"
   1.320          "psubw          %9,                     %%mm1\n"
   1.321 -        "movq           %%mm0,                  %%mm2\n" // Cb                   0 u3  0 u2  0 u1  0 u0
   1.322 -        "movq           %%mm1,                  %%mm3\n" // Cr
   1.323 -        "pmullw         %10,                    %%mm2\n" // Cb2green 0 R3  0 R2  0 R1  0 R0
   1.324 -        "movq           (%2),                   %%mm6\n" // L1      l7 L6 L5 L4 L3 L2 L1 L0
   1.325 -        "pmullw         %11,                    %%mm0\n" // Cb2blue
   1.326 -        "pand           %12,                    %%mm6\n" // L1      00 L6 00 L4 00 L2 00 L0
   1.327 -        "pmullw         %13,                    %%mm3\n" // Cr2green
   1.328 -        "movq           (%2),                   %%mm7\n" // L2
   1.329 -        "pmullw         %14,                    %%mm1\n" // Cr2red
   1.330 -        "psrlw          $8,                     %%mm7\n"        // L2           00 L7 00 L5 00 L3 00 L1
   1.331 -        "pmullw         %15,                    %%mm6\n" // lum1
   1.332 -        "paddw          %%mm3,                  %%mm2\n" // Cb2green + Cr2green == green
   1.333 -        "pmullw         %15,                    %%mm7\n" // lum2
   1.334 +        "movq           %%mm0,                  %%mm2\n" /* Cb                   0 u3  0 u2  0 u1  0 u0 */
   1.335 +        "movq           %%mm1,                  %%mm3\n" /* Cr */
   1.336 +        "pmullw         %10,                    %%mm2\n" /* Cb2green 0 R3  0 R2  0 R1  0 R0 */
   1.337 +        "movq           (%2),                   %%mm6\n" /* L1      l7 L6 L5 L4 L3 L2 L1 L0 */
   1.338 +        "pmullw         %11,                    %%mm0\n" /* Cb2blue */
   1.339 +        "pand           %12,                    %%mm6\n" /* L1      00 L6 00 L4 00 L2 00 L0 */
   1.340 +        "pmullw         %13,                    %%mm3\n" /* Cr2green */
   1.341 +        "movq           (%2),                   %%mm7\n" /* L2 */
   1.342 +        "pmullw         %14,                    %%mm1\n" /* Cr2red */
   1.343 +        "psrlw          $8,                     %%mm7\n"        /* L2           00 L7 00 L5 00 L3 00 L1 */
   1.344 +        "pmullw         %15,                    %%mm6\n" /* lum1 */
   1.345 +        "paddw          %%mm3,                  %%mm2\n" /* Cb2green + Cr2green == green */
   1.346 +        "pmullw         %15,                    %%mm7\n" /* lum2 */
   1.347  
   1.348 -        "movq           %%mm6,                  %%mm4\n" // lum1
   1.349 -        "paddw          %%mm0,                  %%mm6\n" // lum1 +blue 00 B6 00 B4 00 B2 00 B0
   1.350 -        "movq           %%mm4,                  %%mm5\n" // lum1
   1.351 -        "paddw          %%mm1,                  %%mm4\n" // lum1 +red  00 R6 00 R4 00 R2 00 R0
   1.352 -        "paddw          %%mm2,                  %%mm5\n" // lum1 +green 00 G6 00 G4 00 G2 00 G0
   1.353 -        "psraw          $6,                     %%mm4\n" // R1 0 .. 64
   1.354 -        "movq           %%mm7,                  %%mm3\n" // lum2                       00 L7 00 L5 00 L3 00 L1
   1.355 -        "psraw          $6,                     %%mm5\n" // G1  - .. +
   1.356 -        "paddw          %%mm0,                  %%mm7\n" // Lum2 +blue 00 B7 00 B5 00 B3 00 B1
   1.357 -        "psraw          $6,                     %%mm6\n" // B1         0 .. 64
   1.358 -        "packuswb       %%mm4,                  %%mm4\n" // R1 R1
   1.359 -        "packuswb       %%mm5,                  %%mm5\n" // G1 G1
   1.360 -        "packuswb       %%mm6,                  %%mm6\n" // B1 B1
   1.361 +        "movq           %%mm6,                  %%mm4\n" /* lum1 */
   1.362 +        "paddw          %%mm0,                  %%mm6\n" /* lum1 +blue 00 B6 00 B4 00 B2 00 B0 */
   1.363 +        "movq           %%mm4,                  %%mm5\n" /* lum1 */
   1.364 +        "paddw          %%mm1,                  %%mm4\n" /* lum1 +red  00 R6 00 R4 00 R2 00 R0 */
   1.365 +        "paddw          %%mm2,                  %%mm5\n" /* lum1 +green 00 G6 00 G4 00 G2 00 G0 */
   1.366 +        "psraw          $6,                     %%mm4\n" /* R1 0 .. 64 */
   1.367 +        "movq           %%mm7,                  %%mm3\n" /* lum2                       00 L7 00 L5 00 L3 00 L1 */
   1.368 +        "psraw          $6,                     %%mm5\n" /* G1  - .. + */
   1.369 +        "paddw          %%mm0,                  %%mm7\n" /* Lum2 +blue 00 B7 00 B5 00 B3 00 B1 */
   1.370 +        "psraw          $6,                     %%mm6\n" /* B1         0 .. 64 */
   1.371 +        "packuswb       %%mm4,                  %%mm4\n" /* R1 R1 */
   1.372 +        "packuswb       %%mm5,                  %%mm5\n" /* G1 G1 */
   1.373 +        "packuswb       %%mm6,                  %%mm6\n" /* B1 B1 */
   1.374          "punpcklbw      %%mm4,                  %%mm4\n"
   1.375          "punpcklbw      %%mm5,                  %%mm5\n"
   1.376  
   1.377          "pand           %16,                    %%mm4\n"
   1.378 -        "psllw          $3,                     %%mm5\n" // GREEN       1
   1.379 +        "psllw          $3,                     %%mm5\n" /* GREEN       1 */
   1.380          "punpcklbw      %%mm6,                  %%mm6\n"
   1.381          "pand           %17,                    %%mm5\n"
   1.382          "pand           %16,                    %%mm6\n"
   1.383 -        "por            %%mm5,                  %%mm4\n" //
   1.384 -        "psrlw          $11,                    %%mm6\n" // BLUE        1
   1.385 -        "movq           %%mm3,                  %%mm5\n" // lum2
   1.386 -        "paddw          %%mm1,                  %%mm3\n" // lum2 +red      00 R7 00 R5 00 R3 00 R1
   1.387 -        "paddw          %%mm2,                  %%mm5\n" // lum2 +green 00 G7 00 G5 00 G3 00 G1
   1.388 -        "psraw          $6,                     %%mm3\n" // R2
   1.389 -        "por            %%mm6,                  %%mm4\n" // MM4
   1.390 -        "psraw          $6,                     %%mm5\n" // G2
   1.391 -        "movq           (%2, %4),               %%mm6\n" // L3 load lum2
   1.392 +        "por            %%mm5,                  %%mm4\n" /* */
   1.393 +        "psrlw          $11,                    %%mm6\n" /* BLUE        1 */
   1.394 +        "movq           %%mm3,                  %%mm5\n" /* lum2 */
   1.395 +        "paddw          %%mm1,                  %%mm3\n" /* lum2 +red      00 R7 00 R5 00 R3 00 R1 */
   1.396 +        "paddw          %%mm2,                  %%mm5\n" /* lum2 +green 00 G7 00 G5 00 G3 00 G1 */
   1.397 +        "psraw          $6,                     %%mm3\n" /* R2 */
   1.398 +        "por            %%mm6,                  %%mm4\n" /* MM4 */
   1.399 +        "psraw          $6,                     %%mm5\n" /* G2 */
   1.400 +        "movq           (%2, %4),               %%mm6\n" /* L3 load lum2 */
   1.401          "psraw          $6,                     %%mm7\n"
   1.402          "packuswb       %%mm3,                  %%mm3\n"
   1.403          "packuswb       %%mm5,                  %%mm5\n"
   1.404          "packuswb       %%mm7,                  %%mm7\n"
   1.405 -        "pand           %12,                    %%mm6\n" // L3
   1.406 +        "pand           %12,                    %%mm6\n" /* L3 */
   1.407          "punpcklbw      %%mm3,                  %%mm3\n"
   1.408          "punpcklbw      %%mm5,                  %%mm5\n"
   1.409 -        "pmullw         %15,                    %%mm6\n" // lum3
   1.410 +        "pmullw         %15,                    %%mm6\n" /* lum3 */
   1.411          "punpcklbw      %%mm7,                  %%mm7\n"
   1.412 -        "psllw          $3,                     %%mm5\n" // GREEN 2
   1.413 +        "psllw          $3,                     %%mm5\n" /* GREEN 2 */
   1.414          "pand           %16,                    %%mm7\n"
   1.415          "pand           %16,                    %%mm3\n"
   1.416 -        "psrlw          $11,                    %%mm7\n" // BLUE  2
   1.417 +        "psrlw          $11,                    %%mm7\n" /* BLUE  2 */
   1.418          "pand           %17,                    %%mm5\n"
   1.419          "por            %%mm7,                  %%mm3\n"
   1.420 -        "movq           (%2,%4),                %%mm7\n" // L4 load lum2
   1.421 -        "por            %%mm5,                  %%mm3\n" //
   1.422 -        "psrlw          $8,                     %%mm7\n" // L4
   1.423 +        "movq           (%2,%4),                %%mm7\n" /* L4 load lum2 */
   1.424 +        "por            %%mm5,                  %%mm3\n"
   1.425 +        "psrlw          $8,                     %%mm7\n" /* L4 */
   1.426          "movq           %%mm4,                  %%mm5\n"
   1.427          "punpcklwd      %%mm3,                  %%mm4\n"
   1.428 -        "pmullw         %15,                    %%mm7\n" // lum4
   1.429 +        "pmullw         %15,                    %%mm7\n" /* lum4 */
   1.430          "punpckhwd      %%mm3,                  %%mm5\n"
   1.431  
   1.432 -        "movq           %%mm4,                  (%3)\n"  // write row1
   1.433 -        "movq           %%mm5,                  8(%3)\n" // write row1
   1.434 +        "movq           %%mm4,                  (%3)\n"  /* write row1 */
   1.435 +        "movq           %%mm5,                  8(%3)\n" /* write row1 */
   1.436  
   1.437 -        "movq           %%mm6,                  %%mm4\n" // Lum3
   1.438 -        "paddw          %%mm0,                  %%mm6\n" // Lum3 +blue
   1.439 +        "movq           %%mm6,                  %%mm4\n" /* Lum3 */
   1.440 +        "paddw          %%mm0,                  %%mm6\n" /* Lum3 +blue */
   1.441  
   1.442 -        "movq           %%mm4,                  %%mm5\n" // Lum3
   1.443 -        "paddw          %%mm1,                  %%mm4\n" // Lum3 +red
   1.444 -        "paddw          %%mm2,                  %%mm5\n" // Lum3 +green
   1.445 +        "movq           %%mm4,                  %%mm5\n" /* Lum3 */
   1.446 +        "paddw          %%mm1,                  %%mm4\n" /* Lum3 +red */
   1.447 +        "paddw          %%mm2,                  %%mm5\n" /* Lum3 +green */
   1.448          "psraw          $6,                     %%mm4\n"
   1.449 -        "movq           %%mm7,                  %%mm3\n" // Lum4
   1.450 +        "movq           %%mm7,                  %%mm3\n" /* Lum4 */
   1.451          "psraw          $6,                     %%mm5\n"
   1.452 -        "paddw          %%mm0,                  %%mm7\n" // Lum4 +blue
   1.453 -        "psraw          $6,                     %%mm6\n" // Lum3 +blue
   1.454 -        "movq           %%mm3,                  %%mm0\n" // Lum4
   1.455 +        "paddw          %%mm0,                  %%mm7\n" /* Lum4 +blue */
   1.456 +        "psraw          $6,                     %%mm6\n" /* Lum3 +blue */
   1.457 +        "movq           %%mm3,                  %%mm0\n" /* Lum4 */
   1.458          "packuswb       %%mm4,                  %%mm4\n"
   1.459 -        "paddw          %%mm1,                  %%mm3\n" // Lum4 +red
   1.460 +        "paddw          %%mm1,                  %%mm3\n" /* Lum4 +red */
   1.461          "packuswb       %%mm5,                  %%mm5\n"
   1.462 -        "paddw          %%mm2,                  %%mm0\n" // Lum4 +green
   1.463 +        "paddw          %%mm2,                  %%mm0\n" /* Lum4 +green */
   1.464          "packuswb       %%mm6,                  %%mm6\n"
   1.465          "punpcklbw      %%mm4,                  %%mm4\n"
   1.466          "punpcklbw      %%mm5,                  %%mm5\n"
   1.467          "punpcklbw      %%mm6,                  %%mm6\n"
   1.468 -        "psllw          $3,                     %%mm5\n" // GREEN 3
   1.469 +        "psllw          $3,                     %%mm5\n" /* GREEN 3 */
   1.470          "pand           %16,                    %%mm4\n"
   1.471 -        "psraw          $6,                     %%mm3\n" // psr 6
   1.472 +        "psraw          $6,                     %%mm3\n" /* psr 6 */
   1.473          "psraw          $6,                     %%mm0\n"
   1.474 -        "pand           %16,                    %%mm6\n" // BLUE
   1.475 +        "pand           %16,                    %%mm6\n" /* BLUE */
   1.476          "pand           %17,                    %%mm5\n"
   1.477 -        "psrlw          $11,                    %%mm6\n" // BLUE  3
   1.478 +        "psrlw          $11,                    %%mm6\n" /* BLUE  3 */
   1.479          "por            %%mm5,                  %%mm4\n"
   1.480          "psraw          $6,                     %%mm7\n"
   1.481          "por            %%mm6,                  %%mm4\n"
   1.482 @@ -383,8 +383,8 @@
   1.483          "punpcklbw      %%mm0,                  %%mm0\n"
   1.484          "punpcklbw      %%mm7,                  %%mm7\n"
   1.485          "pand           %16,                    %%mm3\n"
   1.486 -        "pand           %16,                    %%mm7\n" // BLUE
   1.487 -        "psllw          $3,                     %%mm0\n" // GREEN 4
   1.488 +        "pand           %16,                    %%mm7\n" /* BLUE */
   1.489 +        "psllw          $3,                     %%mm0\n" /* GREEN 4 */
   1.490          "psrlw          $11,                    %%mm7\n"
   1.491          "pand           %17,                    %%mm0\n"
   1.492          "por            %%mm7,                  %%mm3\n"
   1.493 @@ -404,16 +404,16 @@
   1.494          "addl           $4,                     %1\n"
   1.495          "cmpl           %4,                     %6\n"
   1.496          "leal           16(%3),                 %3\n"
   1.497 -        "leal           16(%5),%5\n" // row2+16
   1.498 +        "leal           16(%5),%5\n" /* row2+16 */
   1.499  
   1.500          "jl             1b\n"
   1.501 -        "addl           %4,     %2\n" // lum += cols
   1.502 -        "addl           %8,     %3\n" // row1+= mod
   1.503 -        "addl           %8,     %5\n" // row2+= mod
   1.504 -        "movl           $0,     %6\n" // x=0
   1.505 +        "addl           %4,     %2\n" /* lum += cols */
   1.506 +        "addl           %8,     %3\n" /* row1+= mod */
   1.507 +        "addl           %8,     %5\n" /* row2+= mod */
   1.508 +        "movl           $0,     %6\n" /* x=0 */
   1.509          "cmpl           %7,     %2\n"
   1.510          "jl             1b\n"
   1.511 -        "addl $4, %%esp\n"  // get rid of the stack slot we reserved.
   1.512 +        "addl $4, %%esp\n"  /* get rid of the stack slot we reserved. */
   1.513          "emms\n"
   1.514          :
   1.515          : "m" (cr), "r"(cb),"r"(lum),