/*
    SDL - Simple DirectMedia Layer
    Copyright (C) 1997-2006 Sam Lantinga

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

    Sam Lantinga
    slouken@libsdl.org
*/
#include "SDL_config.h"

/* This is the software implementation of the YUV video overlay support */

/* This code was derived from code carrying the following copyright notices:

 * Copyright (c) 1995 The Regents of the University of California.
 * All rights reserved.
 * 
 * Permission to use, copy, modify, and distribute this software and its
 * documentation for any purpose, without fee, and without written agreement is
 * hereby granted, provided that the above copyright notice and the following
 * two paragraphs appear in all copies of this software.
 * 
 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
 * OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF
 * CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * 
 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
 * ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO
 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.

 * Copyright (c) 1995 Erik Corry
 * All rights reserved.
 * 
 * Permission to use, copy, modify, and distribute this software and its
 * documentation for any purpose, without fee, and without written agreement is
 * hereby granted, provided that the above copyright notice and the following
 * two paragraphs appear in all copies of this software.
 * 
 * IN NO EVENT SHALL ERIK CORRY BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
 * SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF
 * THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF ERIK CORRY HAS BEEN ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * 
 * ERIK CORRY SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS"
 * BASIS, AND ERIK CORRY HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT,
 * UPDATES, ENHANCEMENTS, OR MODIFICATIONS.

 * Portions of this software Copyright (c) 1995 Brown University.
 * All rights reserved.
 * 
 * Permission to use, copy, modify, and distribute this software and its
 * documentation for any purpose, without fee, and without written agreement
 * is hereby granted, provided that the above copyright notice and the
 * following two paragraphs appear in all copies of this software.
 * 
 * IN NO EVENT SHALL BROWN UNIVERSITY BE LIABLE TO ANY PARTY FOR
 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
 * OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF BROWN
 * UNIVERSITY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * 
 * BROWN UNIVERSITY SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS"
 * BASIS, AND BROWN UNIVERSITY HAS NO OBLIGATION TO PROVIDE MAINTENANCE,
 * SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
 */

#include "SDL_video.h"
#include "SDL_cpuinfo.h"
#include "SDL_stretch_c.h"
#include "SDL_yuvfuncs.h"
#include "SDL_yuv_sw_c.h"

/* The functions used to manipulate software video overlays */
static struct private_yuvhwfuncs sw_yuvfuncs = {
    SDL_LockYUV_SW,
    SDL_UnlockYUV_SW,
    SDL_DisplayYUV_SW,
    SDL_FreeYUV_SW
};

/* RGB conversion lookup tables */
struct private_yuvhwdata
{
    SDL_Surface *stretch;
    SDL_Surface *display;
    Uint8 *pixels;
    int *colortab;
    Uint32 *rgb_2_pix;
    void (*Display1X) (int *colortab, Uint32 * rgb_2_pix,
                       unsigned char *lum, unsigned char *cr,
                       unsigned char *cb, unsigned char *out,
                       int rows, int cols, int mod);
    void (*Display2X) (int *colortab, Uint32 * rgb_2_pix,
                       unsigned char *lum, unsigned char *cr,
                       unsigned char *cb, unsigned char *out,
                       int rows, int cols, int mod);

    /* These are just so we don't have to allocate them separately */
    Uint16 pitches[3];
    Uint8 *planes[3];
};


/* The colorspace conversion functions */

#if 0                           /*defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES */
extern void Color565DitherYV12MMX1X (int *colortab, Uint32 * rgb_2_pix,
                                     unsigned char *lum, unsigned char *cr,
                                     unsigned char *cb, unsigned char *out,
                                     int rows, int cols, int mod);
extern void ColorRGBDitherYV12MMX1X (int *colortab, Uint32 * rgb_2_pix,
                                     unsigned char *lum, unsigned char *cr,
                                     unsigned char *cb, unsigned char *out,
                                     int rows, int cols, int mod);
#endif

static void
Color16DitherYV12Mod1X (int *colortab, Uint32 * rgb_2_pix,
                        unsigned char *lum, unsigned char *cr,
                        unsigned char *cb, unsigned char *out,
                        int rows, int cols, int mod)
{
    unsigned short *row1;
    unsigned short *row2;
    unsigned char *lum2;
    int x, y;
    int cr_r;
    int crb_g;
    int cb_b;
    int cols_2 = cols / 2;

    row1 = (unsigned short *) out;
    row2 = row1 + cols + mod;
    lum2 = lum + cols;

    mod += cols + mod;

    y = rows / 2;
    while (y--) {
        x = cols_2;
        while (x--) {
            register int L;

            cr_r = 0 * 768 + 256 + colortab[*cr + 0 * 256];
            crb_g = 1 * 768 + 256 + colortab[*cr + 1 * 256]
                + colortab[*cb + 2 * 256];
            cb_b = 2 * 768 + 256 + colortab[*cb + 3 * 256];
            ++cr;
            ++cb;

            L = *lum++;
            *row1++ = (unsigned short) (rgb_2_pix[L + cr_r] |
                                        rgb_2_pix[L + crb_g] |
                                        rgb_2_pix[L + cb_b]);

            L = *lum++;
            *row1++ = (unsigned short) (rgb_2_pix[L + cr_r] |
                                        rgb_2_pix[L + crb_g] |
                                        rgb_2_pix[L + cb_b]);


            /* Now, do second row.  */

            L = *lum2++;
            *row2++ = (unsigned short) (rgb_2_pix[L + cr_r] |
                                        rgb_2_pix[L + crb_g] |
                                        rgb_2_pix[L + cb_b]);

            L = *lum2++;
            *row2++ = (unsigned short) (rgb_2_pix[L + cr_r] |
                                        rgb_2_pix[L + crb_g] |
                                        rgb_2_pix[L + cb_b]);
        }

        /*
         * These values are at the start of the next line, (due
         * to the ++'s above),but they need to be at the start
         * of the line after that.
         */
        lum += cols;
        lum2 += cols;
        row1 += mod;
        row2 += mod;
    }
}

static void
Color24DitherYV12Mod1X (int *colortab, Uint32 * rgb_2_pix,
                        unsigned char *lum, unsigned char *cr,
                        unsigned char *cb, unsigned char *out,
                        int rows, int cols, int mod)
{
    unsigned int value;
    unsigned char *row1;
    unsigned char *row2;
    unsigned char *lum2;
    int x, y;
    int cr_r;
    int crb_g;
    int cb_b;
    int cols_2 = cols / 2;

    row1 = out;
    row2 = row1 + cols * 3 + mod * 3;
    lum2 = lum + cols;

    mod += cols + mod;
    mod *= 3;

    y = rows / 2;
    while (y--) {
        x = cols_2;
        while (x--) {
            register int L;

            cr_r = 0 * 768 + 256 + colortab[*cr + 0 * 256];
            crb_g = 1 * 768 + 256 + colortab[*cr + 1 * 256]
                + colortab[*cb + 2 * 256];
            cb_b = 2 * 768 + 256 + colortab[*cb + 3 * 256];
            ++cr;
            ++cb;

            L = *lum++;
            value = (rgb_2_pix[L + cr_r] |
                     rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            *row1++ = (value) & 0xFF;
            *row1++ = (value >> 8) & 0xFF;
            *row1++ = (value >> 16) & 0xFF;

            L = *lum++;
            value = (rgb_2_pix[L + cr_r] |
                     rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            *row1++ = (value) & 0xFF;
            *row1++ = (value >> 8) & 0xFF;
            *row1++ = (value >> 16) & 0xFF;


            /* Now, do second row.  */

            L = *lum2++;
            value = (rgb_2_pix[L + cr_r] |
                     rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            *row2++ = (value) & 0xFF;
            *row2++ = (value >> 8) & 0xFF;
            *row2++ = (value >> 16) & 0xFF;

            L = *lum2++;
            value = (rgb_2_pix[L + cr_r] |
                     rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            *row2++ = (value) & 0xFF;
            *row2++ = (value >> 8) & 0xFF;
            *row2++ = (value >> 16) & 0xFF;
        }

        /*
         * These values are at the start of the next line, (due
         * to the ++'s above),but they need to be at the start
         * of the line after that.
         */
        lum += cols;
        lum2 += cols;
        row1 += mod;
        row2 += mod;
    }
}

static void
Color32DitherYV12Mod1X (int *colortab, Uint32 * rgb_2_pix,
                        unsigned char *lum, unsigned char *cr,
                        unsigned char *cb, unsigned char *out,
                        int rows, int cols, int mod)
{
    unsigned int *row1;
    unsigned int *row2;
    unsigned char *lum2;
    int x, y;
    int cr_r;
    int crb_g;
    int cb_b;
    int cols_2 = cols / 2;

    row1 = (unsigned int *) out;
    row2 = row1 + cols + mod;
    lum2 = lum + cols;

    mod += cols + mod;

    y = rows / 2;
    while (y--) {
        x = cols_2;
        while (x--) {
            register int L;

            cr_r = 0 * 768 + 256 + colortab[*cr + 0 * 256];
            crb_g = 1 * 768 + 256 + colortab[*cr + 1 * 256]
                + colortab[*cb + 2 * 256];
            cb_b = 2 * 768 + 256 + colortab[*cb + 3 * 256];
            ++cr;
            ++cb;

            L = *lum++;
            *row1++ = (rgb_2_pix[L + cr_r] |
                       rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);

            L = *lum++;
            *row1++ = (rgb_2_pix[L + cr_r] |
                       rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);


            /* Now, do second row.  */

            L = *lum2++;
            *row2++ = (rgb_2_pix[L + cr_r] |
                       rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);

            L = *lum2++;
            *row2++ = (rgb_2_pix[L + cr_r] |
                       rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
        }

        /*
         * These values are at the start of the next line, (due
         * to the ++'s above),but they need to be at the start
         * of the line after that.
         */
        lum += cols;
        lum2 += cols;
        row1 += mod;
        row2 += mod;
    }
}

/*
 * In this function I make use of a nasty trick. The tables have the lower
 * 16 bits replicated in the upper 16. This means I can write ints and get
 * the horisontal doubling for free (almost).
 */
static void
Color16DitherYV12Mod2X (int *colortab, Uint32 * rgb_2_pix,
                        unsigned char *lum, unsigned char *cr,
                        unsigned char *cb, unsigned char *out,
                        int rows, int cols, int mod)
{
    unsigned int *row1 = (unsigned int *) out;
    const int next_row = cols + (mod / 2);
    unsigned int *row2 = row1 + 2 * next_row;
    unsigned char *lum2;
    int x, y;
    int cr_r;
    int crb_g;
    int cb_b;
    int cols_2 = cols / 2;

    lum2 = lum + cols;

    mod = (next_row * 3) + (mod / 2);

    y = rows / 2;
    while (y--) {
        x = cols_2;
        while (x--) {
            register int L;

            cr_r = 0 * 768 + 256 + colortab[*cr + 0 * 256];
            crb_g = 1 * 768 + 256 + colortab[*cr + 1 * 256]
                + colortab[*cb + 2 * 256];
            cb_b = 2 * 768 + 256 + colortab[*cb + 3 * 256];
            ++cr;
            ++cb;

            L = *lum++;
            row1[0] = row1[next_row] = (rgb_2_pix[L + cr_r] |
                                        rgb_2_pix[L + crb_g] |
                                        rgb_2_pix[L + cb_b]);
            row1++;

            L = *lum++;
            row1[0] = row1[next_row] = (rgb_2_pix[L + cr_r] |
                                        rgb_2_pix[L + crb_g] |
                                        rgb_2_pix[L + cb_b]);
            row1++;


            /* Now, do second row. */

            L = *lum2++;
            row2[0] = row2[next_row] = (rgb_2_pix[L + cr_r] |
                                        rgb_2_pix[L + crb_g] |
                                        rgb_2_pix[L + cb_b]);
            row2++;

            L = *lum2++;
            row2[0] = row2[next_row] = (rgb_2_pix[L + cr_r] |
                                        rgb_2_pix[L + crb_g] |
                                        rgb_2_pix[L + cb_b]);
            row2++;
        }

        /*
         * These values are at the start of the next line, (due
         * to the ++'s above),but they need to be at the start
         * of the line after that.
         */
        lum += cols;
        lum2 += cols;
        row1 += mod;
        row2 += mod;
    }
}

static void
Color24DitherYV12Mod2X (int *colortab, Uint32 * rgb_2_pix,
                        unsigned char *lum, unsigned char *cr,
                        unsigned char *cb, unsigned char *out,
                        int rows, int cols, int mod)
{
    unsigned int value;
    unsigned char *row1 = out;
    const int next_row = (cols * 2 + mod) * 3;
    unsigned char *row2 = row1 + 2 * next_row;
    unsigned char *lum2;
    int x, y;
    int cr_r;
    int crb_g;
    int cb_b;
    int cols_2 = cols / 2;

    lum2 = lum + cols;

    mod = next_row * 3 + mod * 3;

    y = rows / 2;
    while (y--) {
        x = cols_2;
        while (x--) {
            register int L;

            cr_r = 0 * 768 + 256 + colortab[*cr + 0 * 256];
            crb_g = 1 * 768 + 256 + colortab[*cr + 1 * 256]
                + colortab[*cb + 2 * 256];
            cb_b = 2 * 768 + 256 + colortab[*cb + 3 * 256];
            ++cr;
            ++cb;

            L = *lum++;
            value = (rgb_2_pix[L + cr_r] |
                     rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            row1[0 + 0] = row1[3 + 0] = row1[next_row + 0] =
                row1[next_row + 3 + 0] = (value) & 0xFF;
            row1[0 + 1] = row1[3 + 1] = row1[next_row + 1] =
                row1[next_row + 3 + 1] = (value >> 8) & 0xFF;
            row1[0 + 2] = row1[3 + 2] = row1[next_row + 2] =
                row1[next_row + 3 + 2] = (value >> 16) & 0xFF;
            row1 += 2 * 3;

            L = *lum++;
            value = (rgb_2_pix[L + cr_r] |
                     rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            row1[0 + 0] = row1[3 + 0] = row1[next_row + 0] =
                row1[next_row + 3 + 0] = (value) & 0xFF;
            row1[0 + 1] = row1[3 + 1] = row1[next_row + 1] =
                row1[next_row + 3 + 1] = (value >> 8) & 0xFF;
            row1[0 + 2] = row1[3 + 2] = row1[next_row + 2] =
                row1[next_row + 3 + 2] = (value >> 16) & 0xFF;
            row1 += 2 * 3;


            /* Now, do second row. */

            L = *lum2++;
            value = (rgb_2_pix[L + cr_r] |
                     rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            row2[0 + 0] = row2[3 + 0] = row2[next_row + 0] =
                row2[next_row + 3 + 0] = (value) & 0xFF;
            row2[0 + 1] = row2[3 + 1] = row2[next_row + 1] =
                row2[next_row + 3 + 1] = (value >> 8) & 0xFF;
            row2[0 + 2] = row2[3 + 2] = row2[next_row + 2] =
                row2[next_row + 3 + 2] = (value >> 16) & 0xFF;
            row2 += 2 * 3;

            L = *lum2++;
            value = (rgb_2_pix[L + cr_r] |
                     rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            row2[0 + 0] = row2[3 + 0] = row2[next_row + 0] =
                row2[next_row + 3 + 0] = (value) & 0xFF;
            row2[0 + 1] = row2[3 + 1] = row2[next_row + 1] =
                row2[next_row + 3 + 1] = (value >> 8) & 0xFF;
            row2[0 + 2] = row2[3 + 2] = row2[next_row + 2] =
                row2[next_row + 3 + 2] = (value >> 16) & 0xFF;
            row2 += 2 * 3;
        }

        /*
         * These values are at the start of the next line, (due
         * to the ++'s above),but they need to be at the start
         * of the line after that.
         */
        lum += cols;
        lum2 += cols;
        row1 += mod;
        row2 += mod;
    }
}

static void
Color32DitherYV12Mod2X (int *colortab, Uint32 * rgb_2_pix,
                        unsigned char *lum, unsigned char *cr,
                        unsigned char *cb, unsigned char *out,
                        int rows, int cols, int mod)
{
    unsigned int *row1 = (unsigned int *) out;
    const int next_row = cols * 2 + mod;
    unsigned int *row2 = row1 + 2 * next_row;
    unsigned char *lum2;
    int x, y;
    int cr_r;
    int crb_g;
    int cb_b;
    int cols_2 = cols / 2;

    lum2 = lum + cols;

    mod = (next_row * 3) + mod;

    y = rows / 2;
    while (y--) {
        x = cols_2;
        while (x--) {
            register int L;

            cr_r = 0 * 768 + 256 + colortab[*cr + 0 * 256];
            crb_g = 1 * 768 + 256 + colortab[*cr + 1 * 256]
                + colortab[*cb + 2 * 256];
            cb_b = 2 * 768 + 256 + colortab[*cb + 3 * 256];
            ++cr;
            ++cb;

            L = *lum++;
            row1[0] = row1[1] = row1[next_row] = row1[next_row + 1] =
                (rgb_2_pix[L + cr_r] |
                 rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            row1 += 2;

            L = *lum++;
            row1[0] = row1[1] = row1[next_row] = row1[next_row + 1] =
                (rgb_2_pix[L + cr_r] |
                 rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            row1 += 2;


            /* Now, do second row. */

            L = *lum2++;
            row2[0] = row2[1] = row2[next_row] = row2[next_row + 1] =
                (rgb_2_pix[L + cr_r] |
                 rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            row2 += 2;

            L = *lum2++;
            row2[0] = row2[1] = row2[next_row] = row2[next_row + 1] =
                (rgb_2_pix[L + cr_r] |
                 rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            row2 += 2;
        }

        /*
         * These values are at the start of the next line, (due
         * to the ++'s above),but they need to be at the start
         * of the line after that.
         */
        lum += cols;
        lum2 += cols;
        row1 += mod;
        row2 += mod;
    }
}

static void
Color16DitherYUY2Mod1X (int *colortab, Uint32 * rgb_2_pix,
                        unsigned char *lum, unsigned char *cr,
                        unsigned char *cb, unsigned char *out,
                        int rows, int cols, int mod)
{
    unsigned short *row;
    int x, y;
    int cr_r;
    int crb_g;
    int cb_b;
    int cols_2 = cols / 2;

    row = (unsigned short *) out;

    y = rows;
    while (y--) {
        x = cols_2;
        while (x--) {
            register int L;

            cr_r = 0 * 768 + 256 + colortab[*cr + 0 * 256];
            crb_g = 1 * 768 + 256 + colortab[*cr + 1 * 256]
                + colortab[*cb + 2 * 256];
            cb_b = 2 * 768 + 256 + colortab[*cb + 3 * 256];
            cr += 4;
            cb += 4;

            L = *lum;
            lum += 2;
            *row++ = (unsigned short) (rgb_2_pix[L + cr_r] |
                                       rgb_2_pix[L + crb_g] |
                                       rgb_2_pix[L + cb_b]);

            L = *lum;
            lum += 2;
            *row++ = (unsigned short) (rgb_2_pix[L + cr_r] |
                                       rgb_2_pix[L + crb_g] |
                                       rgb_2_pix[L + cb_b]);

        }

        row += mod;
    }
}

static void
Color24DitherYUY2Mod1X (int *colortab, Uint32 * rgb_2_pix,
                        unsigned char *lum, unsigned char *cr,
                        unsigned char *cb, unsigned char *out,
                        int rows, int cols, int mod)
{
    unsigned int value;
    unsigned char *row;
    int x, y;
    int cr_r;
    int crb_g;
    int cb_b;
    int cols_2 = cols / 2;

    row = (unsigned char *) out;
    mod *= 3;
    y = rows;
    while (y--) {
        x = cols_2;
        while (x--) {
            register int L;

            cr_r = 0 * 768 + 256 + colortab[*cr + 0 * 256];
            crb_g = 1 * 768 + 256 + colortab[*cr + 1 * 256]
                + colortab[*cb + 2 * 256];
            cb_b = 2 * 768 + 256 + colortab[*cb + 3 * 256];
            cr += 4;
            cb += 4;

            L = *lum;
            lum += 2;
            value = (rgb_2_pix[L + cr_r] |
                     rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            *row++ = (value) & 0xFF;
            *row++ = (value >> 8) & 0xFF;
            *row++ = (value >> 16) & 0xFF;

            L = *lum;
            lum += 2;
            value = (rgb_2_pix[L + cr_r] |
                     rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            *row++ = (value) & 0xFF;
            *row++ = (value >> 8) & 0xFF;
            *row++ = (value >> 16) & 0xFF;

        }
        row += mod;
    }
}

static void
Color32DitherYUY2Mod1X (int *colortab, Uint32 * rgb_2_pix,
                        unsigned char *lum, unsigned char *cr,
                        unsigned char *cb, unsigned char *out,
                        int rows, int cols, int mod)
{
    unsigned int *row;
    int x, y;
    int cr_r;
    int crb_g;
    int cb_b;
    int cols_2 = cols / 2;

    row = (unsigned int *) out;
    y = rows;
    while (y--) {
        x = cols_2;
        while (x--) {
            register int L;

            cr_r = 0 * 768 + 256 + colortab[*cr + 0 * 256];
            crb_g = 1 * 768 + 256 + colortab[*cr + 1 * 256]
                + colortab[*cb + 2 * 256];
            cb_b = 2 * 768 + 256 + colortab[*cb + 3 * 256];
            cr += 4;
            cb += 4;

            L = *lum;
            lum += 2;
            *row++ = (rgb_2_pix[L + cr_r] |
                      rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);

            L = *lum;
            lum += 2;
            *row++ = (rgb_2_pix[L + cr_r] |
                      rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);


        }
        row += mod;
    }
}

/*
 * In this function I make use of a nasty trick. The tables have the lower
 * 16 bits replicated in the upper 16. This means I can write ints and get
 * the horisontal doubling for free (almost).
 */
static void
Color16DitherYUY2Mod2X (int *colortab, Uint32 * rgb_2_pix,
                        unsigned char *lum, unsigned char *cr,
                        unsigned char *cb, unsigned char *out,
                        int rows, int cols, int mod)
{
    unsigned int *row = (unsigned int *) out;
    const int next_row = cols + (mod / 2);
    int x, y;
    int cr_r;
    int crb_g;
    int cb_b;
    int cols_2 = cols / 2;

    y = rows;
    while (y--) {
        x = cols_2;
        while (x--) {
            register int L;

            cr_r = 0 * 768 + 256 + colortab[*cr + 0 * 256];
            crb_g = 1 * 768 + 256 + colortab[*cr + 1 * 256]
                + colortab[*cb + 2 * 256];
            cb_b = 2 * 768 + 256 + colortab[*cb + 3 * 256];
            cr += 4;
            cb += 4;

            L = *lum;
            lum += 2;
            row[0] = row[next_row] = (rgb_2_pix[L + cr_r] |
                                      rgb_2_pix[L + crb_g] |
                                      rgb_2_pix[L + cb_b]);
            row++;

            L = *lum;
            lum += 2;
            row[0] = row[next_row] = (rgb_2_pix[L + cr_r] |
                                      rgb_2_pix[L + crb_g] |
                                      rgb_2_pix[L + cb_b]);
            row++;

        }
        row += next_row;
    }
}

static void
Color24DitherYUY2Mod2X (int *colortab, Uint32 * rgb_2_pix,
                        unsigned char *lum, unsigned char *cr,
                        unsigned char *cb, unsigned char *out,
                        int rows, int cols, int mod)
{
    unsigned int value;
    unsigned char *row = out;
    const int next_row = (cols * 2 + mod) * 3;
    int x, y;
    int cr_r;
    int crb_g;
    int cb_b;
    int cols_2 = cols / 2;
    y = rows;
    while (y--) {
        x = cols_2;
        while (x--) {
            register int L;

            cr_r = 0 * 768 + 256 + colortab[*cr + 0 * 256];
            crb_g = 1 * 768 + 256 + colortab[*cr + 1 * 256]
                + colortab[*cb + 2 * 256];
            cb_b = 2 * 768 + 256 + colortab[*cb + 3 * 256];
            cr += 4;
            cb += 4;

            L = *lum;
            lum += 2;
            value = (rgb_2_pix[L + cr_r] |
                     rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            row[0 + 0] = row[3 + 0] = row[next_row + 0] =
                row[next_row + 3 + 0] = (value) & 0xFF;
            row[0 + 1] = row[3 + 1] = row[next_row + 1] =
                row[next_row + 3 + 1] = (value >> 8) & 0xFF;
            row[0 + 2] = row[3 + 2] = row[next_row + 2] =
                row[next_row + 3 + 2] = (value >> 16) & 0xFF;
            row += 2 * 3;

            L = *lum;
            lum += 2;
            value = (rgb_2_pix[L + cr_r] |
                     rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            row[0 + 0] = row[3 + 0] = row[next_row + 0] =
                row[next_row + 3 + 0] = (value) & 0xFF;
            row[0 + 1] = row[3 + 1] = row[next_row + 1] =
                row[next_row + 3 + 1] = (value >> 8) & 0xFF;
            row[0 + 2] = row[3 + 2] = row[next_row + 2] =
                row[next_row + 3 + 2] = (value >> 16) & 0xFF;
            row += 2 * 3;

        }
        row += next_row;
    }
}

static void
Color32DitherYUY2Mod2X (int *colortab, Uint32 * rgb_2_pix,
                        unsigned char *lum, unsigned char *cr,
                        unsigned char *cb, unsigned char *out,
                        int rows, int cols, int mod)
{
    unsigned int *row = (unsigned int *) out;
    const int next_row = cols * 2 + mod;
    int x, y;
    int cr_r;
    int crb_g;
    int cb_b;
    int cols_2 = cols / 2;
    mod += mod;
    y = rows;
    while (y--) {
        x = cols_2;
        while (x--) {
            register int L;

            cr_r = 0 * 768 + 256 + colortab[*cr + 0 * 256];
            crb_g = 1 * 768 + 256 + colortab[*cr + 1 * 256]
                + colortab[*cb + 2 * 256];
            cb_b = 2 * 768 + 256 + colortab[*cb + 3 * 256];
            cr += 4;
            cb += 4;

            L = *lum;
            lum += 2;
            row[0] = row[1] = row[next_row] = row[next_row + 1] =
                (rgb_2_pix[L + cr_r] |
                 rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            row += 2;

            L = *lum;
            lum += 2;
            row[0] = row[1] = row[next_row] = row[next_row + 1] =
                (rgb_2_pix[L + cr_r] |
                 rgb_2_pix[L + crb_g] | rgb_2_pix[L + cb_b]);
            row += 2;


        }

        row += next_row;
    }
}

/*
 * How many 1 bits are there in the Uint32.
 * Low performance, do not call often.
 */
static int
number_of_bits_set (Uint32 a)
{
    if (!a)
        return 0;
    if (a & 1)
        return 1 + number_of_bits_set (a >> 1);
    return (number_of_bits_set (a >> 1));
}

/*
 * How many 0 bits are there at least significant end of Uint32.
 * Low performance, do not call often.
 */
static int
free_bits_at_bottom (Uint32 a)
{
    /* assume char is 8 bits */
    if (!a)
        return sizeof (Uint32) * 8;
    if (((Sint32) a) & 1l)
        return 0;
    return 1 + free_bits_at_bottom (a >> 1);
}


SDL_Overlay *
SDL_CreateYUV_SW (_THIS, int width, int height, Uint32 format,
                  SDL_Surface * display)
{
    SDL_Overlay *overlay;
    struct private_yuvhwdata *swdata;
    int *Cr_r_tab;
    int *Cr_g_tab;
    int *Cb_g_tab;
    int *Cb_b_tab;
    Uint32 *r_2_pix_alloc;
    Uint32 *g_2_pix_alloc;
    Uint32 *b_2_pix_alloc;
    int i;
    int CR, CB;
    Uint32 Rmask, Gmask, Bmask;

    /* Only RGB packed pixel conversion supported */
    if ((display->format->BytesPerPixel != 2) &&
        (display->format->BytesPerPixel != 3) &&
        (display->format->BytesPerPixel != 4)) {
        SDL_SetError ("Can't use YUV data on non 16/24/32 bit surfaces");
        return (NULL);
    }

    /* Verify that we support the format */
    switch (format) {
    case SDL_YV12_OVERLAY:
    case SDL_IYUV_OVERLAY:
    case SDL_YUY2_OVERLAY:
    case SDL_UYVY_OVERLAY:
    case SDL_YVYU_OVERLAY:
        break;
    default:
        SDL_SetError ("Unsupported YUV format");
        return (NULL);
    }

    /* Create the overlay structure */
    overlay = (SDL_Overlay *) SDL_malloc (sizeof *overlay);
    if (overlay == NULL) {
        SDL_OutOfMemory ();
        return (NULL);
    }
    SDL_memset (overlay, 0, (sizeof *overlay));

    /* Fill in the basic members */
    overlay->format = format;
    overlay->w = width;
    overlay->h = height;

    /* Set up the YUV surface function structure */
    overlay->hwfuncs = &sw_yuvfuncs;

    /* Create the pixel data and lookup tables */
    swdata = (struct private_yuvhwdata *) SDL_malloc (sizeof *swdata);
    overlay->hwdata = swdata;
    if (swdata == NULL) {
        SDL_OutOfMemory ();
        SDL_FreeYUVOverlay (overlay);
        return (NULL);
    }
    swdata->stretch = NULL;
    swdata->display = display;
    swdata->pixels = (Uint8 *) SDL_malloc (width * height * 2);
    swdata->colortab = (int *) SDL_malloc (4 * 256 * sizeof (int));
    Cr_r_tab = &swdata->colortab[0 * 256];
    Cr_g_tab = &swdata->colortab[1 * 256];
    Cb_g_tab = &swdata->colortab[2 * 256];
    Cb_b_tab = &swdata->colortab[3 * 256];
    swdata->rgb_2_pix = (Uint32 *) SDL_malloc (3 * 768 * sizeof (Uint32));
    r_2_pix_alloc = &swdata->rgb_2_pix[0 * 768];
    g_2_pix_alloc = &swdata->rgb_2_pix[1 * 768];
    b_2_pix_alloc = &swdata->rgb_2_pix[2 * 768];
    if (!swdata->pixels || !swdata->colortab || !swdata->rgb_2_pix) {
        SDL_OutOfMemory ();
        SDL_FreeYUVOverlay (overlay);
        return (NULL);
    }

    /* Generate the tables for the display surface */
    for (i = 0; i < 256; i++) {
        /* Gamma correction (luminescence table) and chroma correction
           would be done here.  See the Berkeley mpeg_play sources.
         */
        CB = CR = (i - 128);
        Cr_r_tab[i] = (int) ((0.419 / 0.299) * CR);
        Cr_g_tab[i] = (int) (-(0.299 / 0.419) * CR);
        Cb_g_tab[i] = (int) (-(0.114 / 0.331) * CB);
        Cb_b_tab[i] = (int) ((0.587 / 0.331) * CB);
    }

    /* 
     * Set up entries 0-255 in rgb-to-pixel value tables.
     */
    Rmask = display->format->Rmask;
    Gmask = display->format->Gmask;
    Bmask = display->format->Bmask;
    for (i = 0; i < 256; ++i) {
        r_2_pix_alloc[i + 256] = i >> (8 - number_of_bits_set (Rmask));
        r_2_pix_alloc[i + 256] <<= free_bits_at_bottom (Rmask);
        g_2_pix_alloc[i + 256] = i >> (8 - number_of_bits_set (Gmask));
        g_2_pix_alloc[i + 256] <<= free_bits_at_bottom (Gmask);
        b_2_pix_alloc[i + 256] = i >> (8 - number_of_bits_set (Bmask));
        b_2_pix_alloc[i + 256] <<= free_bits_at_bottom (Bmask);
    }

    /*
     * If we have 16-bit output depth, then we double the value
     * in the top word. This means that we can write out both
     * pixels in the pixel doubling mode with one op. It is 
     * harmless in the normal case as storing a 32-bit value
     * through a short pointer will lose the top bits anyway.
     */
    if (display->format->BytesPerPixel == 2) {
        for (i = 0; i < 256; ++i) {
            r_2_pix_alloc[i + 256] |= (r_2_pix_alloc[i + 256]) << 16;
            g_2_pix_alloc[i + 256] |= (g_2_pix_alloc[i + 256]) << 16;
            b_2_pix_alloc[i + 256] |= (b_2_pix_alloc[i + 256]) << 16;
        }
    }

    /*
     * Spread out the values we have to the rest of the array so that
     * we do not need to check for overflow.
     */
    for (i = 0; i < 256; ++i) {
        r_2_pix_alloc[i] = r_2_pix_alloc[256];
        r_2_pix_alloc[i + 512] = r_2_pix_alloc[511];
        g_2_pix_alloc[i] = g_2_pix_alloc[256];
        g_2_pix_alloc[i + 512] = g_2_pix_alloc[511];
        b_2_pix_alloc[i] = b_2_pix_alloc[256];
        b_2_pix_alloc[i + 512] = b_2_pix_alloc[511];
    }

    /* You have chosen wisely... */
    switch (format) {
    case SDL_YV12_OVERLAY:
    case SDL_IYUV_OVERLAY:
        if (display->format->BytesPerPixel == 2) {
#if 0                           /*defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES */
            /* inline assembly functions */
            if (SDL_HasMMX () && (Rmask == 0xF800) &&
                (Gmask == 0x07E0) && (Bmask == 0x001F) && (width & 15) == 0) {
/*printf("Using MMX 16-bit 565 dither\n");*/
                swdata->Display1X = Color565DitherYV12MMX1X;
            } else {
/*printf("Using C 16-bit dither\n");*/
                swdata->Display1X = Color16DitherYV12Mod1X;
            }
#else
            swdata->Display1X = Color16DitherYV12Mod1X;
#endif
            swdata->Display2X = Color16DitherYV12Mod2X;
        }
        if (display->format->BytesPerPixel == 3) {
            swdata->Display1X = Color24DitherYV12Mod1X;
            swdata->Display2X = Color24DitherYV12Mod2X;
        }
        if (display->format->BytesPerPixel == 4) {
#if 0                           /*defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES */
            /* inline assembly functions */
            if (SDL_HasMMX () && (Rmask == 0x00FF0000) &&
                (Gmask == 0x0000FF00) &&
                (Bmask == 0x000000FF) && (width & 15) == 0) {
/*printf("Using MMX 32-bit dither\n");*/
                swdata->Display1X = ColorRGBDitherYV12MMX1X;
            } else {
/*printf("Using C 32-bit dither\n");*/
                swdata->Display1X = Color32DitherYV12Mod1X;
            }
#else
            swdata->Display1X = Color32DitherYV12Mod1X;
#endif
            swdata->Display2X = Color32DitherYV12Mod2X;
        }
        break;
    case SDL_YUY2_OVERLAY:
    case SDL_UYVY_OVERLAY:
    case SDL_YVYU_OVERLAY:
        if (display->format->BytesPerPixel == 2) {
            swdata->Display1X = Color16DitherYUY2Mod1X;
            swdata->Display2X = Color16DitherYUY2Mod2X;
        }
        if (display->format->BytesPerPixel == 3) {
            swdata->Display1X = Color24DitherYUY2Mod1X;
            swdata->Display2X = Color24DitherYUY2Mod2X;
        }
        if (display->format->BytesPerPixel == 4) {
            swdata->Display1X = Color32DitherYUY2Mod1X;
            swdata->Display2X = Color32DitherYUY2Mod2X;
        }
        break;
    default:
        /* We should never get here (caught above) */
        break;
    }

    /* Find the pitch and offset values for the overlay */
    overlay->pitches = swdata->pitches;
    overlay->pixels = swdata->planes;
    switch (format) {
    case SDL_YV12_OVERLAY:
    case SDL_IYUV_OVERLAY:
        overlay->pitches[0] = overlay->w;
        overlay->pitches[1] = overlay->pitches[0] / 2;
        overlay->pitches[2] = overlay->pitches[0] / 2;
        overlay->pixels[0] = swdata->pixels;
        overlay->pixels[1] = overlay->pixels[0] +
            overlay->pitches[0] * overlay->h;
        overlay->pixels[2] = overlay->pixels[1] +
            overlay->pitches[1] * overlay->h / 2;
        overlay->planes = 3;
        break;
    case SDL_YUY2_OVERLAY:
    case SDL_UYVY_OVERLAY:
    case SDL_YVYU_OVERLAY:
        overlay->pitches[0] = overlay->w * 2;
        overlay->pixels[0] = swdata->pixels;
        overlay->planes = 1;
        break;
    default:
        /* We should never get here (caught above) */
        break;
    }

    /* We're all done.. */
    return (overlay);
}

int
SDL_LockYUV_SW (_THIS, SDL_Overlay * overlay)
{
    return (0);
}

void
SDL_UnlockYUV_SW (_THIS, SDL_Overlay * overlay)
{
    return;
}

int
SDL_DisplayYUV_SW (_THIS, SDL_Overlay * overlay, SDL_Rect * src,
                   SDL_Rect * dst)
{
    struct private_yuvhwdata *swdata;
    int stretch;
    int scale_2x;
    SDL_Surface *display;
    Uint8 *lum, *Cr, *Cb;
    Uint8 *dstp;
    int mod;

    swdata = overlay->hwdata;
    stretch = 0;
    scale_2x = 0;
    if (src->x || src->y || src->w < overlay->w || src->h < overlay->h) {
        /* The source rectangle has been clipped.
           Using a scratch surface is easier than adding clipped
           source support to all the blitters, plus that would
           slow them down in the general unclipped case.
         */
        stretch = 1;
    } else if ((src->w != dst->w) || (src->h != dst->h)) {
        if ((dst->w == 2 * src->w) && (dst->h == 2 * src->h)) {
            scale_2x = 1;
        } else {
            stretch = 1;
        }
    }
    if (stretch) {
        if (!swdata->stretch) {
            display = swdata->display;
            swdata->stretch = SDL_CreateRGBSurface (SDL_SWSURFACE,
                                                    overlay->w,
                                                    overlay->h,
                                                    display->format->
                                                    BitsPerPixel,
                                                    display->format->
                                                    Rmask,
                                                    display->format->
                                                    Gmask,
                                                    display->format->
                                                    Bmask, 0);
            if (!swdata->stretch) {
                return (-1);
            }
        }
        display = swdata->stretch;
    } else {
        display = swdata->display;
    }
    switch (overlay->format) {
    case SDL_YV12_OVERLAY:
        lum = overlay->pixels[0];
        Cr = overlay->pixels[1];
        Cb = overlay->pixels[2];
        break;
    case SDL_IYUV_OVERLAY:
        lum = overlay->pixels[0];
        Cr = overlay->pixels[2];
        Cb = overlay->pixels[1];
        break;
    case SDL_YUY2_OVERLAY:
        lum = overlay->pixels[0];
        Cr = lum + 3;
        Cb = lum + 1;
        break;
    case SDL_UYVY_OVERLAY:
        lum = overlay->pixels[0] + 1;
        Cr = lum + 1;
        Cb = lum - 1;
        break;
    case SDL_YVYU_OVERLAY:
        lum = overlay->pixels[0];
        Cr = lum + 1;
        Cb = lum + 3;
        break;
    default:
        SDL_SetError ("Unsupported YUV format in blit");
        return (-1);
    }
    if (SDL_MUSTLOCK (display)) {
        if (SDL_LockSurface (display) < 0) {
            return (-1);
        }
    }
    if (stretch) {
        dstp = (Uint8 *) swdata->stretch->pixels;
    } else {
        dstp = (Uint8 *) display->pixels
            + dst->x * display->format->BytesPerPixel
            + dst->y * display->pitch;
    }
    mod = (display->pitch / display->format->BytesPerPixel);

    if (scale_2x) {
        mod -= (overlay->w * 2);
        swdata->Display2X (swdata->colortab, swdata->rgb_2_pix,
                           lum, Cr, Cb, dstp, overlay->h, overlay->w, mod);
    } else {
        mod -= overlay->w;
        swdata->Display1X (swdata->colortab, swdata->rgb_2_pix,
                           lum, Cr, Cb, dstp, overlay->h, overlay->w, mod);
    }
    if (SDL_MUSTLOCK (display)) {
        SDL_UnlockSurface (display);
    }
    if (stretch) {
        display = swdata->display;
        SDL_SoftStretch (swdata->stretch, src, display, dst);
    }
    SDL_UpdateRects (display, 1, dst);

    return (0);
}

void
SDL_FreeYUV_SW (_THIS, SDL_Overlay * overlay)
{
    struct private_yuvhwdata *swdata;

    swdata = overlay->hwdata;
    if (swdata) {
        if (swdata->stretch) {
            SDL_FreeSurface (swdata->stretch);
        }
        if (swdata->pixels) {
            SDL_free (swdata->pixels);
        }
        if (swdata->colortab) {
            SDL_free (swdata->colortab);
        }
        if (swdata->rgb_2_pix) {
            SDL_free (swdata->rgb_2_pix);
        }
        SDL_free (swdata);
    }
}

/* vi: set ts=4 sw=4 expandtab: */