src/video/SDL_yuv_mmx.c
author Sam Lantinga <slouken@libsdl.org>
Mon, 10 Jul 2006 21:04:37 +0000
changeset 1895 c121d94672cb
parent 1413 40edc79b0926
child 2167 8f2174e22cd5
permissions -rw-r--r--
SDL 1.2 is moving to a branch, and SDL 1.3 is becoming the head.
slouken@0
     1
/*
slouken@0
     2
    SDL - Simple DirectMedia Layer
slouken@1312
     3
    Copyright (C) 1997-2006 Sam Lantinga
slouken@0
     4
slouken@0
     5
    This library is free software; you can redistribute it and/or
slouken@1312
     6
    modify it under the terms of the GNU Lesser General Public
slouken@0
     7
    License as published by the Free Software Foundation; either
slouken@1312
     8
    version 2.1 of the License, or (at your option) any later version.
slouken@0
     9
slouken@0
    10
    This library is distributed in the hope that it will be useful,
slouken@0
    11
    but WITHOUT ANY WARRANTY; without even the implied warranty of
slouken@0
    12
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
slouken@1312
    13
    Lesser General Public License for more details.
slouken@0
    14
slouken@1312
    15
    You should have received a copy of the GNU Lesser General Public
slouken@1312
    16
    License along with this library; if not, write to the Free Software
slouken@1312
    17
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
slouken@0
    18
slouken@0
    19
    Sam Lantinga
slouken@252
    20
    slouken@libsdl.org
slouken@0
    21
*/
slouken@1402
    22
#include "SDL_config.h"
slouken@0
    23
slouken@1895
    24
#if 0                           /* FIXME: This code needs to be rewritten to reference the static data using relocatable addresses (e.g. http://www.gentoo.org/proj/en/hardened/pic-fix-guide.xml or http://nasm.sourceforge.net/doc/html/nasmdoc8.html#section-8.2) This code currently breaks on systems with readonly text segments (hardened Linux / Intel Mac) */
slouken@1402
    25
#if defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES
slouken@0
    26
slouken@1407
    27
#include "SDL_stdinc.h"
slouken@1407
    28
icculus@1148
    29
/* Declares the array `x` under the assembler-level symbol name "_x" (the
   inline assembly in this file refers to the tables by those "_"-prefixed
   names) and marks it `used` so it is emitted even though no C code
   references it. */
#define ASM_ARRAY(x) x[] __asm__("_" #x) __attribute__((used))
slouken@0
    30
slouken@1895
    31
/*
 * Constant operands for the MMX routines in this file.  Every table holds
 * the same 16-bit value in each word of one quadword, so it can be used
 * directly as the memory operand of a packed-word instruction.
 *
 * MMX_0080w: 0x0080 (128) per word; subtracted from the widened Cb/Cr
 *            samples with psubw.
 * MMX_00FFw / MMX_FF00w: masks keeping the low / high byte of each word.
 */
static unsigned int ASM_ARRAY(MMX_0080w) = {
slouken@1895
    32
0x00800080, 0x00800080};
slouken@1895
    33
static unsigned int ASM_ARRAY(MMX_00FFw) = {
slouken@1895
    34
0x00ff00ff, 0x00ff00ff};
slouken@1895
    35
static unsigned int ASM_ARRAY(MMX_FF00w) = {
slouken@1895
    36
0xff00ff00, 0xff00ff00};
slouken@0
    37
slouken@1895
    38
/* Luma multiplier, 0x004a = 74.  Among the routines visible here it is
   referenced only by Color565DitherYV12MMX1X. */
static unsigned short ASM_ARRAY(MMX_Ycoeff) = {
slouken@1895
    39
0x004a, 0x004a, 0x004a, 0x004a};
slouken@0
    40
slouken@1895
    41
/* Chroma coefficients used by ColorRGBDitherYV12MMX1X; the products are
   shifted right by 6 afterwards.  Read as signed 16-bit words they are:
   UbluRGB = 114, VredRGB = 89, UgrnRGB = -22, VgrnRGB = -46. */
static unsigned short ASM_ARRAY(MMX_UbluRGB) = {
slouken@1895
    42
0x0072, 0x0072, 0x0072, 0x0072};
slouken@1895
    43
static unsigned short ASM_ARRAY(MMX_VredRGB) = {
slouken@1895
    44
0x0059, 0x0059, 0x0059, 0x0059};
slouken@1895
    45
static unsigned short ASM_ARRAY(MMX_UgrnRGB) = {
slouken@1895
    46
0xffea, 0xffea, 0xffea, 0xffea};
slouken@1895
    47
static unsigned short ASM_ARRAY(MMX_VgrnRGB) = {
slouken@1895
    48
0xffd2, 0xffd2, 0xffd2, 0xffd2};
slouken@0
    49
slouken@1895
    50
/* Chroma coefficients for the 16-bit targets.  Read as signed 16-bit words:
   Ublu5x5 = 129, Vred5x5 = 102, Ugrn555 = -25, Vgrn555 = -52,
   Ugrn565 = -24, Vgrn565 = -51.  The two *555 tables are not referenced by
   the routines visible in this file. */
static unsigned short ASM_ARRAY(MMX_Ublu5x5) = {
slouken@1895
    51
0x0081, 0x0081, 0x0081, 0x0081};
slouken@1895
    52
static unsigned short ASM_ARRAY(MMX_Vred5x5) = {
slouken@1895
    53
0x0066, 0x0066, 0x0066, 0x0066};
slouken@1895
    54
static unsigned short ASM_ARRAY(MMX_Ugrn555) = {
slouken@1895
    55
0xffe7, 0xffe7, 0xffe7, 0xffe7};
slouken@1895
    56
static unsigned short ASM_ARRAY(MMX_Vgrn555) = {
slouken@1895
    57
0xffcc, 0xffcc, 0xffcc, 0xffcc};
slouken@1895
    58
static unsigned short ASM_ARRAY(MMX_Ugrn565) = {
slouken@1895
    59
0xffe8, 0xffe8, 0xffe8, 0xffe8};
slouken@1895
    60
static unsigned short ASM_ARRAY(MMX_Vgrn565) = {
slouken@1895
    61
0xffcd, 0xffcd, 0xffcd, 0xffcd};
slouken@1895
    62
slouken@1895
    63
/* Per-word bit masks for the colour fields of 16-bit pixels.  Only
   MMX_red565 and MMX_grn565 are referenced by the routines visible in this
   file. */
static unsigned short ASM_ARRAY(MMX_red555) = {
slouken@1895
    64
0x7c00, 0x7c00, 0x7c00, 0x7c00};
slouken@1895
    65
static unsigned short ASM_ARRAY(MMX_red565) = {
slouken@1895
    66
0xf800, 0xf800, 0xf800, 0xf800};
slouken@1895
    67
static unsigned short ASM_ARRAY(MMX_grn555) = {
slouken@1895
    68
0x03e0, 0x03e0, 0x03e0, 0x03e0};
slouken@1895
    69
static unsigned short ASM_ARRAY(MMX_grn565) = {
slouken@1895
    70
0x07e0, 0x07e0, 0x07e0, 0x07e0};
slouken@1895
    71
static unsigned short ASM_ARRAY(MMX_blu5x5) = {
slouken@1895
    72
0x001f, 0x001f, 0x001f, 0x001f};
slouken@0
    73
slouken@0
    74
/**
slouken@0
    75
   This MMX assembler is my first assembler/MMX program ever.
slouken@0
    76
   Thus it may be buggy.
slouken@0
    77
   Send patches to:
slouken@0
    78
   mvogt@rhrk.uni-kl.de
slouken@0
    79
slouken@0
    80
   After it worked fine I have "obfuscated" the code a bit to have
slouken@0
    81
   more parallelism in the MMX units. This means I moved
slouken@0
    82
   initialisation around and delayed other instructions.
slouken@0
    83
   Performance measurement did not show that this brought any advantage
slouken@0
    84
   but in theory it _should_ be faster this way.
slouken@0
    85
slouken@0
    86
   The overall performance gain over the C based dither was 30%-40%.
slouken@0
    87
   The MMX routine calculates 256bit=8RGB values in each cycle
slouken@0
    88
   (4 for row1 & 4 for row2)
slouken@0
    89
slouken@0
    90
   The red/green/blue coefficients are taken from the mpeg_play
slouken@0
    91
   player. They look nice, but I don't know if you can have
slouken@0
    92
   better values, to avoid integer rounding errors.
slouken@0
    93
   
slouken@0
    94
slouken@0
    95
   IMPORTANT:
slouken@0
    96
   ==========
slouken@0
    97
slouken@0
    98
   It is a requirement that the cr/cb/lum are 8 byte aligned and
slouken@0
    99
   the out is 16 byte aligned or you will/may get segfaults
slouken@0
   100
slouken@0
   101
*/
slouken@0
   102
slouken@1895
   103
/**
 * Convert planar YV12 data to 32-bit RGB pixels with MMX, producing two
 * output rows per pass.
 *
 * colortab, rgb_2_pix: not referenced by this routine.
 * lum:    luma plane; each pass reads one row at lum and one at lum + cols.
 * cr, cb: chroma planes; two samples of each are consumed per four luma
 *         pixels and applied to both rows of the pass, i.e. one chroma
 *         sample per 2x2 block of pixels.
 * out:    destination; row1 starts at out, row2 at out + cols + mod pixels.
 * rows, cols: luma dimensions.  The loop ends once lum has reached
 *         lum + cols * rows.  The inner loop steps four pixels at a time
 *         and each pass covers two rows, so cols is presumably a multiple
 *         of 4 and rows even -- TODO confirm against the callers.
 * mod:    extra pixels between the end of one output row and the start of
 *         the next; rescaled below into a byte step.
 *
 * Going by the register-layout comments in the template, each pixel is
 * stored as 0x00RRGGBB.  The chroma terms use the MMX_*RGB coefficients and
 * are shifted right by 6; luma is added without scaling.
 *
 * NOTE(review): the template writes to operands %1, %2, %3, %5 and %6
 * although they are declared as inputs, and the statement has no clobber
 * list (MMX registers, memory, flags).
 * NOTE(review): "pushl %ebx" changes %esp before the "m" operands are read;
 * if the compiler addresses them relative to %esp they would be off by 4 --
 * verify.
 */
void
slouken@1895
   104
ColorRGBDitherYV12MMX1X(int *colortab, Uint32 * rgb_2_pix,
slouken@1895
   105
                        unsigned char *lum, unsigned char *cr,
slouken@1895
   106
                        unsigned char *cb, unsigned char *out,
slouken@1895
   107
                        int rows, int cols, int mod)
slouken@0
   108
{
slouken@0
   109
    Uint32 *row1;
slouken@0
   110
    Uint32 *row2;
slouken@0
   111
slouken@1895
   112
    unsigned char *y = lum + cols * rows;       // Pointer to the end
slouken@1895
   113
    int x = 0;
slouken@1895
   114
    row1 = (Uint32 *) out;      // 32 bit target
slouken@1895
   115
    row2 = (Uint32 *) out + cols + mod; // start of second row
slouken@1895
   116
    mod = (mod + cols + mod) * 4;       // byte step added to row1 and row2 after each pass (skips one full row)
slouken@0
   117
slouken@1895
   118
    /* Operand map: %0 = cr (memory, copied into %ebx), %1 = cb, %2 = lum,
       %3 = row1, %4 = cols, %5 = row2, %6 = x (memory), %7 = y (memory),
       %8 = mod (memory). */
    __asm__ __volatile__(
slouken@0
   119
/* We don't really care about PIC - the code should be rewritten to use
slouken@0
   120
   relative addressing for the static tables, so right now we take the
slouken@0
   121
   COW hit on the pages this code resides. Big deal.
slouken@0
   122
   This spill is just to reduce register pressure in the PIC case. */
slouken@1895
   123
                            "pushl %%ebx\n"
slouken@1895
   124
                            "movl %0, %%ebx\n" ".align 8\n" "1:\n"
slouken@1895
   125
                            // create Cr (result in mm1)
slouken@1895
   126
                            "movd (%%ebx), %%mm1\n"     //         0  0  0  0  v3 v2 v1 v0
slouken@1895
   127
                            "pxor %%mm7,%%mm7\n"        //         00 00 00 00 00 00 00 00
slouken@1895
   128
                            "movd (%2), %%mm2\n"        //    0  0  0  0 l3 l2 l1 l0
slouken@1895
   129
                            "punpcklbw %%mm7,%%mm1\n"   //         0  v3 0  v2 00 v1 00 v0
slouken@1895
   130
                            "punpckldq %%mm1,%%mm1\n"   //         00 v1 00 v0 00 v1 00 v0
slouken@1895
   131
                            "psubw _MMX_0080w,%%mm1\n"  // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0
slouken@1895
   132
                            // create Cr_g (result in mm0)
slouken@1895
   133
                            "movq %%mm1,%%mm0\n"        // r1 r1 r0 r0 r1 r1 r0 r0
slouken@1895
   134
                            "pmullw _MMX_VgrnRGB,%%mm0\n"       // Cr*-46dec (MMX_VgrnRGB is 0xffd2), about -0.72*64
slouken@1895
   135
                            "pmullw _MMX_VredRGB,%%mm1\n"       // Cr*89dec (MMX_VredRGB is 0x0059), about 1.39*64
slouken@1895
   136
                            "psraw  $6, %%mm0\n"        // Cr_g=Cr_g/64
slouken@1895
   137
                            "psraw  $6, %%mm1\n"        // red=red/64
slouken@1895
   138
                            // create L1 L2 (result in mm2,mm4)
slouken@1895
   139
                            // L2=lum+cols
slouken@1895
   140
                            "movq (%2,%4),%%mm3\n"      // loads 8 bytes; only the low dword (L3 L2 L1 L0) is used below
slouken@1895
   141
                            "punpckldq %%mm3,%%mm2\n"   //   L3 L2 L1 L0 l3 l2 l1 l0
slouken@1895
   142
                            "movq %%mm2,%%mm4\n"        //   L3 L2 L1 L0 l3 l2 l1 l0
slouken@1895
   143
                            "pand _MMX_FF00w,%%mm2\n"   //   L3 0  L1  0 l3  0 l1  0
slouken@1895
   144
                            "pand _MMX_00FFw,%%mm4\n"   //   0  L2  0 L0  0 l2  0 l0
slouken@1895
   145
                            "psrlw $8,%%mm2\n"  //   0  L3  0 L1  0 l3  0 l1
slouken@1895
   146
                            // create R (result in mm6)
slouken@1895
   147
                            "movq %%mm2,%%mm5\n"        //   0 L3  0 L1  0 l3  0 l1
slouken@1895
   148
                            "movq %%mm4,%%mm6\n"        //   0 L2  0 L0  0 l2  0 l0
slouken@1895
   149
                            "paddsw  %%mm1, %%mm5\n"    // lum1+red:x R3 x R1 x r3 x r1
slouken@1895
   150
                            "paddsw  %%mm1, %%mm6\n"    // lum1+red:x R2 x R0 x r2 x r0
slouken@1895
   151
                            "packuswb %%mm5,%%mm5\n"    //  R3 R1 r3 r1 R3 R1 r3 r1
slouken@1895
   152
                            "packuswb %%mm6,%%mm6\n"    //  R2 R0 r2 r0 R2 R0 r2 r0
slouken@1895
   153
                            "pxor %%mm7,%%mm7\n"        //         00 00 00 00 00 00 00 00
slouken@1895
   154
                            "punpcklbw %%mm5,%%mm6\n"   //  R3 R2 R1 R0 r3 r2 r1 r0
slouken@1895
   155
                            // create Cb (result in mm1)
slouken@1895
   156
                            "movd (%1), %%mm1\n"        //         0  0  0  0  u3 u2 u1 u0
slouken@1895
   157
                            "punpcklbw %%mm7,%%mm1\n"   //         0  u3 0  u2 00 u1 00 u0
slouken@1895
   158
                            "punpckldq %%mm1,%%mm1\n"   //         00 u1 00 u0 00 u1 00 u0
slouken@1895
   159
                            "psubw _MMX_0080w,%%mm1\n"  // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0
slouken@1895
   160
                            // create Cb_g (result in mm5)
slouken@1895
   161
                            "movq %%mm1,%%mm5\n"        // u1 u1 u0 u0 u1 u1 u0 u0
slouken@1895
   162
                            "pmullw _MMX_UgrnRGB,%%mm5\n"       // Cb*-22dec (MMX_UgrnRGB is 0xffea), about -0.34*64
slouken@1895
   163
                            "pmullw _MMX_UbluRGB,%%mm1\n"       // blue*114dec=1.78125*64
slouken@1895
   164
                            "psraw  $6, %%mm5\n"        // Cb_g=Cb_g/64
slouken@1895
   165
                            "psraw  $6, %%mm1\n"        // blue=blue/64
slouken@1895
   166
                            // create G (result in mm7)
slouken@1895
   167
                            "movq %%mm2,%%mm3\n"        //   0  L3  0 L1  0 l3  0 l1
slouken@1895
   168
                            "movq %%mm4,%%mm7\n"        //   0  L2  0 L0  0 l2  0 l0
slouken@1895
   169
                            "paddsw  %%mm5, %%mm3\n"    // lum1+Cb_g:x G3t x G1t x g3t x g1t
slouken@1895
   170
                            "paddsw  %%mm5, %%mm7\n"    // lum1+Cb_g:x G2t x G0t x g2t x g0t
slouken@1895
   171
                            "paddsw  %%mm0, %%mm3\n"    // lum1+Cr_g:x G3  x G1  x g3  x g1
slouken@1895
   172
                            "paddsw  %%mm0, %%mm7\n"    // lum1+Cr_g:x G2  x G0  x g2  x g0
slouken@1895
   173
                            "packuswb %%mm3,%%mm3\n"    // G3 G1 g3 g1 G3 G1 g3 g1
slouken@1895
   174
                            "packuswb %%mm7,%%mm7\n"    // G2 G0 g2 g0 G2 G0 g2 g0
slouken@1895
   175
                            "punpcklbw %%mm3,%%mm7\n"   // G3 G2 G1 G0 g3 g2 g1 g0
slouken@1895
   176
                            // create B (result in mm5)
slouken@1895
   177
                            "movq %%mm2,%%mm3\n"        //   0  L3  0 L1  0 l3  0 l1
slouken@1895
   178
                            "movq %%mm4,%%mm5\n"        //   0  L2  0 L0  0 l2  0 l0
slouken@1895
   179
                            "paddsw  %%mm1, %%mm3\n"    // lum1+blue:x B3 x B1 x b3 x b1
slouken@1895
   180
                            "paddsw  %%mm1, %%mm5\n"    // lum1+blue:x B2 x B0 x b2 x b0
slouken@1895
   181
                            "packuswb %%mm3,%%mm3\n"    // B3 B1 b3 b1 B3 B1 b3 b1
slouken@1895
   182
                            "packuswb %%mm5,%%mm5\n"    // B2 B0 b2 b0 B2 B0 b2 b0
slouken@1895
   183
                            "punpcklbw %%mm3,%%mm5\n"   // B3 B2 B1 B0 b3 b2 b1 b0
slouken@1895
   184
                            // fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
slouken@1895
   185
                            "pxor %%mm2,%%mm2\n"        //  0  0  0  0  0  0  0  0
slouken@1895
   186
                            "pxor %%mm4,%%mm4\n"        //  0  0  0  0  0  0  0  0
slouken@1895
   187
                            "movq %%mm6,%%mm1\n"        // R3 R2 R1 R0 r3 r2 r1 r0
slouken@1895
   188
                            "movq %%mm5,%%mm3\n"        // B3 B2 B1 B0 b3 b2 b1 b0
slouken@1895
   189
                            // process lower lum
slouken@1895
   190
                            "punpcklbw %%mm4,%%mm1\n"   //  0 r3  0 r2  0 r1  0 r0
slouken@1895
   191
                            "punpcklbw %%mm4,%%mm3\n"   //  0 b3  0 b2  0 b1  0 b0
slouken@1895
   192
                            "movq %%mm1,%%mm2\n"        //  0 r3  0 r2  0 r1  0 r0
slouken@1895
   193
                            "movq %%mm3,%%mm0\n"        //  0 b3  0 b2  0 b1  0 b0
slouken@1895
   194
                            "punpcklwd %%mm1,%%mm3\n"   //  0 r1  0 b1  0 r0  0 b0
slouken@1895
   195
                            "punpckhwd %%mm2,%%mm0\n"   //  0 r3  0 b3  0 r2  0 b2
slouken@1895
   196
                            "pxor %%mm2,%%mm2\n"        //  0  0  0  0  0  0  0  0
slouken@1895
   197
                            "movq %%mm7,%%mm1\n"        // G3 G2 G1 G0 g3 g2 g1 g0
slouken@1895
   198
                            "punpcklbw %%mm1,%%mm2\n"   // g3  0 g2  0 g1  0 g0  0
slouken@1895
   199
                            "punpcklwd %%mm4,%%mm2\n"   //  0  0 g1  0  0  0 g0  0
slouken@1895
   200
                            "por  %%mm3, %%mm2\n"       //  0 r1 g1 b1  0 r0 g0 b0
slouken@1895
   201
                            "movq   %%mm2,(%3)\n"       // wrote out ! row1
slouken@1895
   202
                            "pxor %%mm2,%%mm2\n"        //  0  0  0  0  0  0  0  0
slouken@1895
   203
                            "punpcklbw %%mm1,%%mm4\n"   // g3  0 g2  0 g1  0 g0  0
slouken@1895
   204
                            "punpckhwd %%mm2,%%mm4\n"   //  0  0 g3  0  0  0 g2  0
slouken@1895
   205
                            "por  %%mm0, %%mm4\n"       //  0 r3 g3 b3  0 r2 g2 b2
slouken@1895
   206
                            "movq   %%mm4,8(%3)\n"      // wrote out ! row1
slouken@1895
   207
                            // fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
slouken@1895
   208
                            // this can be done "destructive"
slouken@1895
   209
                            "pxor %%mm2,%%mm2\n"        //  0  0  0  0  0  0  0  0
slouken@1895
   210
                            "punpckhbw %%mm2,%%mm6\n"   //  0 R3  0 R2  0 R1  0 R0
slouken@1895
   211
                            "punpckhbw %%mm1,%%mm5\n"   // G3 B3 G2 B2 G1 B1 G0 B0
slouken@1895
   212
                            "movq %%mm5,%%mm1\n"        // G3 B3 G2 B2 G1 B1 G0 B0
slouken@1895
   213
                            "punpcklwd %%mm6,%%mm1\n"   //  0 R1 G1 B1  0 R0 G0 B0
slouken@1895
   214
                            "movq   %%mm1,(%5)\n"       // wrote out ! row2
slouken@1895
   215
                            "punpckhwd %%mm6,%%mm5\n"   //  0 R3 G3 B3  0 R2 G2 B2
slouken@1895
   216
                            "movq   %%mm5,8(%5)\n"      // wrote out ! row2
slouken@1895
   217
                            "addl  $4,%2\n"     // lum+4
slouken@1895
   218
                            "leal  16(%3),%3\n" // row1+16
slouken@1895
   219
                            "leal  16(%5),%5\n" // row2+16
slouken@1895
   220
                            "addl  $2, %%ebx\n" // cr+2
slouken@1895
   221
                            "addl  $2, %1\n"    // cb+2
slouken@1895
   222
                            "addl  $4,%6\n"     // x+4
slouken@1895
   223
                            "cmpl  %4,%6\n" "jl    1b\n" "addl           %4,     %2\n"  // loop while x < cols, then lum += cols (skip the row already read as lum+cols)
slouken@1895
   224
                            "addl           %8,     %3\n"       // row1+= mod
slouken@1895
   225
                            "addl           %8,     %5\n"       // row2+= mod
slouken@1895
   226
                            "movl           $0,     %6\n"       // x=0
slouken@1895
   227
                            "cmpl           %7,     %2\n"       // next pair of rows while lum < y
slouken@1895
   228
                            "jl             1b\n"
slouken@1895
   229
                            "emms\n"
slouken@1895
   230
                            "popl %%ebx\n"::"m"(cr), "r"(cb), "r"(lum),
slouken@1895
   231
                            "r"(row1), "r"(cols), "r"(row2), "m"(x),
slouken@1895
   232
                            "m"(y), "m"(mod));
slouken@0
   233
}
slouken@0
   234
slouken@1895
   235
/**
 * Convert planar YV12 data to 16-bit pixels with MMX, producing two output
 * rows per pass and eight pixels per row per inner-loop iteration.
 *
 * colortab, rgb_2_pix: not referenced by this routine.
 * lum:    luma plane; each pass reads one row at lum and one at lum + cols.
 * cr, cb: chroma planes; four samples of each are consumed per eight luma
 *         pixels and applied to both rows of the pass, i.e. one chroma
 *         sample per 2x2 block of pixels.
 * out:    destination; row1 starts at out, row2 at out + cols + mod pixels.
 * rows, cols: luma dimensions.  The loop ends once lum has reached
 *         lum + cols * rows.  The inner loop steps eight pixels at a time
 *         and each pass covers two rows, so cols is presumably a multiple
 *         of 8 and rows even -- TODO confirm against the callers.
 * mod:    extra pixels between the end of one output row and the start of
 *         the next; rescaled below into a byte step.
 *
 * Luma is multiplied by MMX_Ycoeff and chroma by the MMX_Ublu5x5,
 * MMX_Vred5x5, MMX_Ugrn565 and MMX_Vgrn565 coefficients; the sums are
 * shifted right by 6 and saturated to bytes.  Each pixel is then assembled
 * as red masked with MMX_red565, green shifted left by 3 and masked with
 * MMX_grn565, and blue masked with MMX_red565 and shifted right by 11.
 *
 * NOTE(review): the template writes to operands %1, %2, %3, %5 and %6
 * although they are declared as inputs, and the statement has no clobber
 * list (MMX registers, memory, flags).
 * NOTE(review): "pushl %ebx" changes %esp before the "m" operands are read;
 * if the compiler addresses them relative to %esp they would be off by 4 --
 * verify.
 */
void
slouken@1895
   236
Color565DitherYV12MMX1X(int *colortab, Uint32 * rgb_2_pix,
slouken@1895
   237
                        unsigned char *lum, unsigned char *cr,
slouken@1895
   238
                        unsigned char *cb, unsigned char *out,
slouken@1895
   239
                        int rows, int cols, int mod)
slouken@0
   240
{
slouken@0
   241
    Uint16 *row1;
slouken@0
   242
    Uint16 *row2;
slouken@0
   243
slouken@1895
   244
    unsigned char *y = lum + cols * rows;       /* Pointer to the end */
slouken@1895
   245
    int x = 0;
slouken@1895
   246
    row1 = (Uint16 *) out;      /* 16 bit target */
slouken@1895
   247
    row2 = (Uint16 *) out + cols + mod; /* start of second row  */
slouken@1895
   248
    mod = (mod + cols + mod) * 2;       /* byte step added to row1 and row2 after each pass (skips one full row) */
slouken@0
   249
slouken@0
   250
slouken@1895
   251
    /* Operand map: %0 = cr (memory, copied into %ebx), %1 = cb, %2 = lum,
       %3 = row1, %4 = cols, %5 = row2, %6 = x (memory), %7 = y (memory),
       %8 = mod (memory). */
    __asm__ __volatile__("pushl %%ebx\n" "movl %0, %%ebx\n" ".align 8\n" "1:\n" "movd           (%1),                   %%mm0\n"        // 4 Cb         0  0  0  0 u3 u2 u1 u0
slouken@1895
   252
                         "pxor           %%mm7,                  %%mm7\n" "movd           (%%ebx),                %%mm1\n"      // 4 Cr                0  0  0  0 v3 v2 v1 v0
slouken@1895
   253
                         "punpcklbw      %%mm7,                  %%mm0\n"       // 4 W cb   0 u3  0 u2  0 u1  0 u0
slouken@1895
   254
                         "punpcklbw      %%mm7,                  %%mm1\n"       // 4 W cr   0 v3  0 v2  0 v1  0 v0
slouken@1895
   255
                         "psubw          _MMX_0080w,             %%mm0\n" "psubw          _MMX_0080w,             %%mm1\n" "movq           %%mm0,                  %%mm2\n"     // Cb                   0 u3  0 u2  0 u1  0 u0
slouken@1895
   256
                         "movq           %%mm1,                  %%mm3\n"       // Cr
slouken@1895
   257
                         "pmullw         _MMX_Ugrn565,           %%mm2\n"       // Cb2green 0 R3  0 R2  0 R1  0 R0
slouken@1895
   258
                         "movq           (%2),                   %%mm6\n"       // L1      l7 L6 L5 L4 L3 L2 L1 L0
slouken@1895
   259
                         "pmullw         _MMX_Ublu5x5,           %%mm0\n"       // Cb2blue
slouken@1895
   260
                         "pand           _MMX_00FFw,             %%mm6\n"       // L1      00 L6 00 L4 00 L2 00 L0
slouken@1895
   261
                         "pmullw         _MMX_Vgrn565,           %%mm3\n"       // Cr2green
slouken@1895
   262
                         "movq           (%2),                   %%mm7\n"       // L2
slouken@1895
   263
                         "pmullw         _MMX_Vred5x5,           %%mm1\n"       // Cr2red
slouken@1895
   264
                         "psrlw          $8,                     %%mm7\n"       // L2           00 L7 00 L5 00 L3 00 L1
slouken@1895
   265
                         "pmullw         _MMX_Ycoeff,            %%mm6\n"       // lum1
slouken@1895
   266
                         "paddw          %%mm3,                  %%mm2\n"       // Cb2green + Cr2green == green
slouken@1895
   267
                         "pmullw         _MMX_Ycoeff,            %%mm7\n"       // lum2
slouken@1895
   268
                         "movq           %%mm6,                  %%mm4\n"       // lum1
slouken@1895
   269
                         "paddw          %%mm0,                  %%mm6\n"       // lum1 +blue 00 B6 00 B4 00 B2 00 B0
slouken@1895
   270
                         "movq           %%mm4,                  %%mm5\n"       // lum1
slouken@1895
   271
                         "paddw          %%mm1,                  %%mm4\n"       // lum1 +red  00 R6 00 R4 00 R2 00 R0
slouken@1895
   272
                         "paddw          %%mm2,                  %%mm5\n"       // lum1 +green 00 G6 00 G4 00 G2 00 G0
slouken@1895
   273
                         "psraw          $6,                     %%mm4\n"       // R1 0 .. 64
slouken@1895
   274
                         "movq           %%mm7,                  %%mm3\n"       // lum2                       00 L7 00 L5 00 L3 00 L1
slouken@1895
   275
                         "psraw          $6,                     %%mm5\n"       // G1  - .. +
slouken@1895
   276
                         "paddw          %%mm0,                  %%mm7\n"       // Lum2 +blue 00 B7 00 B5 00 B3 00 B1
slouken@1895
   277
                         "psraw          $6,                     %%mm6\n"       // B1         0 .. 64
slouken@1895
   278
                         "packuswb       %%mm4,                  %%mm4\n"       // R1 R1
slouken@1895
   279
                         "packuswb       %%mm5,                  %%mm5\n"       // G1 G1
slouken@1895
   280
                         "packuswb       %%mm6,                  %%mm6\n"       // B1 B1
slouken@1895
   281
                         "punpcklbw      %%mm4,                  %%mm4\n" "punpcklbw      %%mm5,                  %%mm5\n" "pand           _MMX_red565,            %%mm4\n" "psllw          $3,                     %%mm5\n"    // GREEN       1
slouken@1895
   282
                         "punpcklbw      %%mm6,                  %%mm6\n" "pand           _MMX_grn565,            %%mm5\n" "pand           _MMX_red565,            %%mm6\n" "por            %%mm5,                  %%mm4\n"    //
slouken@1895
   283
                         "psrlw          $11,                    %%mm6\n"       // BLUE        1
slouken@1895
   284
                         "movq           %%mm3,                  %%mm5\n"       // lum2
slouken@1895
   285
                         "paddw          %%mm1,                  %%mm3\n"       // lum2 +red      00 R7 00 R5 00 R3 00 R1
slouken@1895
   286
                         "paddw          %%mm2,                  %%mm5\n"       // lum2 +green 00 G7 00 G5 00 G3 00 G1
slouken@1895
   287
                         "psraw          $6,                     %%mm3\n"       // R2
slouken@1895
   288
                         "por            %%mm6,                  %%mm4\n"       // MM4
slouken@1895
   289
                         "psraw          $6,                     %%mm5\n"       // G2
slouken@1895
   290
                         "movq           (%2, %4),               %%mm6\n"       // L3 load lum2
slouken@1895
   291
                         "psraw          $6,                     %%mm7\n" "packuswb       %%mm3,                  %%mm3\n" "packuswb       %%mm5,                  %%mm5\n" "packuswb       %%mm7,                  %%mm7\n" "pand           _MMX_00FFw,             %%mm6\n"   // L3
slouken@1895
   292
                         "punpcklbw      %%mm3,                  %%mm3\n" "punpcklbw      %%mm5,                  %%mm5\n" "pmullw         _MMX_Ycoeff,            %%mm6\n"     // lum3
slouken@1895
   293
                         "punpcklbw      %%mm7,                  %%mm7\n" "psllw          $3,                     %%mm5\n"      // GREEN 2
slouken@1895
   294
                         "pand           _MMX_red565,            %%mm7\n" "pand           _MMX_red565,            %%mm3\n" "psrlw          $11,                    %%mm7\n"     // BLUE  2
slouken@1895
   295
                         "pand           _MMX_grn565,            %%mm5\n" "por            %%mm7,                  %%mm3\n" "movq           (%2,%4),                %%mm7\n"     // L4 load lum2
slouken@1895
   296
                         "por            %%mm5,                  %%mm3\n"       //
slouken@1895
   297
                         "psrlw          $8,                     %%mm7\n"       // L4
slouken@1895
   298
                         "movq           %%mm4,                  %%mm5\n" "punpcklwd      %%mm3,                  %%mm4\n" "pmullw         _MMX_Ycoeff,            %%mm7\n"     // lum4
slouken@1895
   299
                         "punpckhwd      %%mm3,                  %%mm5\n" "movq           %%mm4,                  (%3)\n"       // write row1
slouken@1895
   300
                         "movq           %%mm5,                  8(%3)\n"       // write row1
slouken@1895
   301
                         "movq           %%mm6,                  %%mm4\n"       // Lum3
slouken@1895
   302
                         "paddw          %%mm0,                  %%mm6\n"       // Lum3 +blue
slouken@1895
   303
                         "movq           %%mm4,                  %%mm5\n"       // Lum3
slouken@1895
   304
                         "paddw          %%mm1,                  %%mm4\n"       // Lum3 +red
slouken@1895
   305
                         "paddw          %%mm2,                  %%mm5\n"       // Lum3 +green
slouken@1895
   306
                         "psraw          $6,                     %%mm4\n" "movq           %%mm7,                  %%mm3\n"      // Lum4
slouken@1895
   307
                         "psraw          $6,                     %%mm5\n" "paddw          %%mm0,                  %%mm7\n"      // Lum4 +blue
slouken@1895
   308
                         "psraw          $6,                     %%mm6\n"       // Lum3 +blue
slouken@1895
   309
                         "movq           %%mm3,                  %%mm0\n"       // Lum4
slouken@1895
   310
                         "packuswb       %%mm4,                  %%mm4\n" "paddw          %%mm1,                  %%mm3\n"      // Lum4 +red
slouken@1895
   311
                         "packuswb       %%mm5,                  %%mm5\n" "paddw          %%mm2,                  %%mm0\n"      // Lum4 +green
slouken@1895
   312
                         "packuswb       %%mm6,                  %%mm6\n" "punpcklbw      %%mm4,                  %%mm4\n" "punpcklbw      %%mm5,                  %%mm5\n" "punpcklbw      %%mm6,                  %%mm6\n" "psllw          $3,                     %%mm5\n"   // GREEN 3
slouken@1895
   313
                         "pand           _MMX_red565,            %%mm4\n" "psraw          $6,                     %%mm3\n"      // psr 6
slouken@1895
   314
                         "psraw          $6,                     %%mm0\n" "pand           _MMX_red565,            %%mm6\n"      // BLUE
slouken@1895
   315
                         "pand           _MMX_grn565,            %%mm5\n" "psrlw          $11,                    %%mm6\n"      // BLUE  3
slouken@1895
   316
                         "por            %%mm5,                  %%mm4\n" "psraw          $6,                     %%mm7\n" "por            %%mm6,                  %%mm4\n" "packuswb       %%mm3,                  %%mm3\n" "packuswb       %%mm0,                  %%mm0\n" "packuswb       %%mm7,                  %%mm7\n" "punpcklbw      %%mm3,                  %%mm3\n" "punpcklbw      %%mm0,                  %%mm0\n" "punpcklbw      %%mm7,                  %%mm7\n" "pand           _MMX_red565,            %%mm3\n" "pand           _MMX_red565,            %%mm7\n"     // BLUE
slouken@1895
   317
                         "psllw          $3,                     %%mm0\n"       // GREEN 4
slouken@1895
   318
                         "psrlw          $11,                    %%mm7\n" "pand           _MMX_grn565,            %%mm0\n" "por            %%mm7,                  %%mm3\n" "por            %%mm0,                  %%mm3\n" "movq           %%mm4,                  %%mm5\n" "punpcklwd      %%mm3,                  %%mm4\n" "punpckhwd      %%mm3,                  %%mm5\n" "movq           %%mm4,                  (%5)\n" "movq           %%mm5,                  8(%5)\n" "addl           $8,                     %6\n" "addl           $8,                     %2\n" "addl           $4,                     %%ebx\n" "addl           $4,                     %1\n" "cmpl           %4,                     %6\n" "leal           16(%3),                 %3\n" "leal           16(%5),%5\n"    // row2+16
slouken@1895
   319
                         "jl             1b\n" "addl           %4,     %2\n"    // loop while x < cols (leal leaves the flags from cmpl intact), then lum += cols
slouken@1895
   320
                         "addl           %8,     %3\n"  // row1+= mod
slouken@1895
   321
                         "addl           %8,     %5\n"  // row2+= mod
slouken@1895
   322
                         "movl           $0,     %6\n"  // x=0
slouken@1895
   323
                         "cmpl           %7,     %2\n"  // next pair of rows while lum < y
slouken@1895
   324
                         "jl             1b\n"
slouken@1895
   325
                         "emms\n"
slouken@1895
   326
                         "popl %%ebx\n"::"m"(cr), "r"(cb), "r"(lum),
slouken@1895
   327
                         "r"(row1), "r"(cols), "r"(row2), "m"(x),
slouken@1895
   328
                         "m"(y), "m"(mod));
slouken@0
   329
}
slouken@0
   330
slouken@0
   331
#endif /* GCC i386 inline assembly */
slouken@1413
   332
#endif /* 0 */
slouken@1895
   333
/* vi: set ts=4 sw=4 expandtab: */