Merged Martin's code changes from Google Summer of Code 2009
authorSam Lantinga <slouken@libsdl.org>
Mon, 07 Sep 2009 04:51:29 +0000
changeset 325794fb40a4a9a7
parent 3256 83c87f2b2aab
child 3258 e786366ea23b
Merged Martin's code changes from Google Summer of Code 2009
Makefile.in
README.PS3
configure.in
include/SDL_config.h.in
src/video/SDL_sysvideo.h
src/video/SDL_video.c
src/video/SDL_yuv_sw.c
src/video/SDL_yuv_sw_c.h
src/video/ps3/SDL_ps3events.c
src/video/ps3/SDL_ps3events_c.h
src/video/ps3/SDL_ps3modes.c
src/video/ps3/SDL_ps3modes_c.h
src/video/ps3/SDL_ps3render.c
src/video/ps3/SDL_ps3render_c.h
src/video/ps3/SDL_ps3spe.c
src/video/ps3/SDL_ps3spe_c.h
src/video/ps3/SDL_ps3video.c
src/video/ps3/SDL_ps3video.h
src/video/ps3/spulibs/Makefile
src/video/ps3/spulibs/bilin_scaler.c
src/video/ps3/spulibs/fb_writer.c
src/video/ps3/spulibs/spu_common.h
src/video/ps3/spulibs/yuv2rgb.c
     1.1 --- a/Makefile.in	Sun Sep 06 15:04:38 2009 +0000
     1.2 +++ b/Makefile.in	Mon Sep 07 04:51:29 2009 +0000
     1.3 @@ -40,6 +40,11 @@
     1.4  SDLMAIN_SOURCES = @SDLMAIN_SOURCES@
     1.5  SDLMAIN_OBJECTS = @SDLMAIN_OBJECTS@
     1.6  
     1.7 +# PS3 SPU programs
     1.8 +SPU_GCC = @SPU_GCC@
     1.9 +EMBEDSPU = @EMBEDSPU@
    1.10 +include $(srcdir)/src/video/ps3/spulibs/Makefile
    1.11 +
    1.12  DIST = acinclude.m4 autogen.sh Borland.html Borland.zip BUGS build-scripts configure configure.in COPYING CREDITS docs docs.html include INSTALL Makefile.dc Makefile.minimal Makefile.in README* sdl-config.in sdl.m4 sdl.pc.in SDL.qpg.in SDL.spec SDL.spec.in src test TODO VisualC.html VisualC VisualCE Watcom-OS2.zip Watcom-Win32.zip WhatsNew Xcode
    1.13  
    1.14  HDRS = SDL.h SDL_atomic.h SDL_audio.h SDL_cdrom.h SDL_compat.h SDL_cpuinfo.h SDL_endian.h SDL_error.h SDL_events.h SDL_haptic.h SDL_joystick.h SDL_keyboard.h SDL_keysym.h SDL_loadso.h SDL_main.h SDL_mouse.h SDL_mutex.h SDL_name.h SDL_opengl.h SDL_opengles.h SDL_pixels.h SDL_platform.h SDL_power.h SDL_quit.h SDL_rect.h SDL_revision.h SDL_rwops.h SDL_scancode.h SDL_stdinc.h SDL_surface.h SDL_syswm.h SDL_thread.h SDL_timer.h SDL_types.h SDL_version.h SDL_video.h begin_code.h close_code.h
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/README.PS3	Mon Sep 07 04:51:29 2009 +0000
     2.3 @@ -0,0 +1,35 @@
     2.4 +
     2.5 +SDL on Sony Playstation3
     2.6 +------------------------
     2.7 +
     2.8 +Installation:
     2.9 +  First, you have to install the Cell SDK
    2.10 +  - Download the Cell SDK installer RPM and ISO images to
    2.11 +    a temporary directory such as /tmp/cellsdk.
    2.12 +  - Mount the image: mount -o loop CellSDK-Devel-Fedora_3.1.0.0.0.iso /tmp/cellsdk
    2.13 +  - Install the SDK installer: rpm -ivh cell-install-3.1.0-0.0.noarch.rpm
    2.14 +  - Install the SDK: cd /opt/cell && ./cellsdk --iso /tmp/cellsdkiso install
    2.15 +
    2.16 +  You'll then need to install the SPU-libs
    2.17 +  - Run make ps3-libs && make ps3libs-install
    2.18 +
    2.19 +  Finally, install SDL
    2.20 +  - Go to SDL-1.2/ and build SDL like any other GNU style package.
    2.21 +  e.g.
    2.22 +    - Build the configure-script with ./autogen.sh
    2.23 +    - Configure SDL for your needs: ./configure --enable-video-ps3 ...
    2.24 +    - Build and install it: make && make install
    2.25 +
    2.26 +
    2.27 +Todo:
    2.28 +  - Mouse & Keyboard support
    2.29 +  - On SPU-side the current scaler and converter restrictions are:
    2.30 +    - resolution has to be a multiple of 8 (will work on that)
    2.31 +    - scaler/converter only supports the YV12 and IYUV format
    2.32 +    - the scaler only does bilinear scaling (Lanczos would be nice)
    2.33 +  - Optimize the SPU-program handling on the PPE side
    2.34 +  - Integrate spumedia in SDL
    2.35 +
    2.36 +Have fun!
    2.37 +  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
    2.38 +
     3.1 --- a/configure.in	Sun Sep 06 15:04:38 2009 +0000
     3.2 +++ b/configure.in	Mon Sep 07 04:51:29 2009 +0000
     3.3 @@ -1509,6 +1509,46 @@
     3.4      fi
     3.5  }
     3.6  
    3.7 +dnl See if the PlayStation 3 Cell headers, libraries and SPU tools are available
     3.8 +CheckPS3()
     3.9 +{
    3.10 +  AC_ARG_ENABLE(video-ps3,
    3.11 +                AC_HELP_STRING([--enable-video-ps3], [use PlayStation 3 Cell driver [[default=yes]]]),
    3.12 +                , enable_video_ps3=yes)
    3.13 +  if test x$enable_video = xyes -a x$enable_video_ps3 = xyes; then 
    3.14 +    video_ps3=no
    3.15 +    AC_CHECK_HEADER([linux/fb.h])
    3.16 +    AC_CHECK_HEADER([asm/ps3fb.h], [have_ps3fb_hdr=yes], [],
    3.17 +			[#ifndef _LINUX_TYPES_H
    3.18 +				#include <linux/types.h>
    3.19 +			#endif])
    3.20 +    AC_CHECK_HEADER([libspe2.h], have_libspe2_hdr=yes)
    3.21 +    AC_CHECK_LIB([spe2], spe_context_create, have_spe2_lib=yes)
    3.22 +
    3.23 +    AC_CHECK_PROGS(SPU_GCC, [spu-gcc])
    3.24 +    AC_CHECK_PROGS(EMBEDSPU, [embedspu])
    3.25 +
    3.26 +    have_spu_libs=yes
    3.27 +    AC_CHECK_LIB([fb_writer_spu], [main], [], [have_spu_libs=no])
    3.28 +    AC_CHECK_LIB([yuv2rgb_spu], [main], [], [have_spu_libs=no])
    3.29 +    AC_CHECK_LIB([bilin_scaler_spu], [main], [], [have_spu_libs=no])
    3.30 +    if test x$have_ps3fb_hdr = xyes -a x$have_libspe2_hdr = xyes -a x$have_spe2_lib = xyes -a "$SPU_GCC" -a "$EMBEDSPU"; then
    3.31 +        AC_DEFINE(SDL_VIDEO_DRIVER_PS3)
    3.32 +        video_ps3=yes
    3.33 +        have_video=yes
    3.34 +        SOURCES="$SOURCES $srcdir/src/video/ps3/*.c"
    3.35 +        EXTRA_CFLAGS="$EXTRA_CFLAGS -I/opt/cell/sdk/usr/include"
    3.36 +        EXTRA_LDFLAGS="$EXTRA_LDFLAGS -L/opt/cell/sdk/usr/lib -lspe2 -lfb_writer_spu -lyuv2rgb_spu -lbilin_scaler_spu"
    3.37 +
    3.38 +        if test x$have_spu_libs = xno; then 
    3.39 +              AC_MSG_WARN([ps3libs missing, please run make ps3libs])
    3.40 +        fi
    3.41 +    fi
    3.42 +    AC_MSG_CHECKING([for PlayStation 3 Cell support])
    3.43 +    AC_MSG_RESULT([$video_ps3])
    3.44 +  fi
    3.45 +}
    3.46 +
    3.47  dnl Find the SVGAlib includes and libraries
    3.48  CheckSVGA()
    3.49  {
    3.50 @@ -2401,6 +2441,7 @@
    3.51          CheckDirectFB
    3.52          CheckFusionSound
    3.53          CheckPS2GS
    3.54 +        CheckPS3
    3.55          CheckSVGA
    3.56          CheckVGL
    3.57          CheckWscons
     4.1 --- a/include/SDL_config.h.in	Sun Sep 06 15:04:38 2009 +0000
     4.2 +++ b/include/SDL_config.h.in	Mon Sep 07 04:51:29 2009 +0000
     4.3 @@ -273,6 +273,7 @@
     4.4  #undef SDL_VIDEO_DRIVER_PHOTON
     4.5  #undef SDL_VIDEO_DRIVER_QNXGF
     4.6  #undef SDL_VIDEO_DRIVER_PS2GS
     4.7 +#undef SDL_VIDEO_DRIVER_PS3
     4.8  #undef SDL_VIDEO_DRIVER_RISCOS
     4.9  #undef SDL_VIDEO_DRIVER_SVGALIB
    4.10  #undef SDL_VIDEO_DRIVER_VGL
     5.1 --- a/src/video/SDL_sysvideo.h	Sun Sep 06 15:04:38 2009 +0000
     5.2 +++ b/src/video/SDL_sysvideo.h	Mon Sep 07 04:51:29 2009 +0000
     5.3 @@ -359,6 +359,9 @@
     5.4  #if SDL_VIDEO_DRIVER_PS2GS
     5.5  extern VideoBootStrap PS2GS_bootstrap;
     5.6  #endif
     5.7 +#if SDL_VIDEO_DRIVER_PS3
     5.8 +extern VideoBootStrap PS3_bootstrap;
     5.9 +#endif
    5.10  #if SDL_VIDEO_DRIVER_VGL
    5.11  extern VideoBootStrap VGL_bootstrap;
    5.12  #endif
     6.1 --- a/src/video/SDL_video.c	Sun Sep 06 15:04:38 2009 +0000
     6.2 +++ b/src/video/SDL_video.c	Mon Sep 07 04:51:29 2009 +0000
     6.3 @@ -73,6 +73,9 @@
     6.4  #if SDL_VIDEO_DRIVER_PS2GS
     6.5      &PS2GS_bootstrap,
     6.6  #endif
     6.7 +#if SDL_VIDEO_DRIVER_PS3
     6.8 +    &PS3_bootstrap,
     6.9 +#endif
    6.10  #if SDL_VIDEO_DRIVER_VGL
    6.11      &VGL_bootstrap,
    6.12  #endif
     7.1 --- a/src/video/SDL_yuv_sw.c	Sun Sep 06 15:04:38 2009 +0000
     7.2 +++ b/src/video/SDL_yuv_sw.c	Mon Sep 07 04:51:29 2009 +0000
     7.3 @@ -88,32 +88,6 @@
     7.4  #include "SDL_yuv_sw_c.h"
     7.5  
     7.6  
     7.7 -struct SDL_SW_YUVTexture
     7.8 -{
     7.9 -    Uint32 format;
    7.10 -    Uint32 target_format;
    7.11 -    int w, h;
    7.12 -    Uint8 *pixels;
    7.13 -    int *colortab;
    7.14 -    Uint32 *rgb_2_pix;
    7.15 -    void (*Display1X) (int *colortab, Uint32 * rgb_2_pix,
    7.16 -                       unsigned char *lum, unsigned char *cr,
    7.17 -                       unsigned char *cb, unsigned char *out,
    7.18 -                       int rows, int cols, int mod);
    7.19 -    void (*Display2X) (int *colortab, Uint32 * rgb_2_pix,
    7.20 -                       unsigned char *lum, unsigned char *cr,
    7.21 -                       unsigned char *cb, unsigned char *out,
    7.22 -                       int rows, int cols, int mod);
    7.23 -
    7.24 -    /* These are just so we don't have to allocate them separately */
    7.25 -    Uint16 pitches[3];
    7.26 -    Uint8 *planes[3];
    7.27 -
    7.28 -    /* This is a temporary surface in case we have to stretch copy */
    7.29 -    SDL_Surface *stretch;
    7.30 -    SDL_Surface *display;
    7.31 -};
    7.32 -
    7.33  /* The colorspace conversion functions */
    7.34  
    7.35  #if (__GNUC__ > 2) && defined(__i386__) && __OPTIMIZE__ && SDL_ASSEMBLY_ROUTINES
     8.1 --- a/src/video/SDL_yuv_sw_c.h	Sun Sep 06 15:04:38 2009 +0000
     8.2 +++ b/src/video/SDL_yuv_sw_c.h	Mon Sep 07 04:51:29 2009 +0000
     8.3 @@ -26,6 +26,32 @@
     8.4  
     8.5  /* This is the software implementation of the YUV texture support */
     8.6  
     8.7 +struct SDL_SW_YUVTexture
     8.8 +{
     8.9 +    Uint32 format;
    8.10 +    Uint32 target_format;
    8.11 +    int w, h;
    8.12 +    Uint8 *pixels;
    8.13 +    int *colortab;
    8.14 +    Uint32 *rgb_2_pix;
    8.15 +    void (*Display1X) (int *colortab, Uint32 * rgb_2_pix,
    8.16 +                       unsigned char *lum, unsigned char *cr,
    8.17 +                       unsigned char *cb, unsigned char *out,
    8.18 +                       int rows, int cols, int mod);
    8.19 +    void (*Display2X) (int *colortab, Uint32 * rgb_2_pix,
    8.20 +                       unsigned char *lum, unsigned char *cr,
    8.21 +                       unsigned char *cb, unsigned char *out,
    8.22 +                       int rows, int cols, int mod);
    8.23 +
    8.24 +    /* These are just so we don't have to allocate them separately */
    8.25 +    Uint16 pitches[3];
    8.26 +    Uint8 *planes[3];
    8.27 +
    8.28 +    /* This is a temporary surface in case we have to stretch copy */
    8.29 +    SDL_Surface *stretch;
    8.30 +    SDL_Surface *display;
    8.31 +};
    8.32 +
    8.33  typedef struct SDL_SW_YUVTexture SDL_SW_YUVTexture;
    8.34  
    8.35  SDL_SW_YUVTexture *SDL_SW_CreateYUVTexture(Uint32 format, int w, int h);
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/src/video/ps3/SDL_ps3events.c	Mon Sep 07 04:51:29 2009 +0000
     9.3 @@ -0,0 +1,36 @@
     9.4 +/*
     9.5 +    SDL - Simple DirectMedia Layer
     9.6 +    Copyright (C) 1997-2009 Sam Lantinga
     9.7 +
     9.8 +    This library is free software; you can redistribute it and/or
     9.9 +    modify it under the terms of the GNU Lesser General Public
    9.10 +    License as published by the Free Software Foundation; either
    9.11 +    version 2.1 of the License, or (at your option) any later version.
    9.12 +
    9.13 +    This library is distributed in the hope that it will be useful,
    9.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
    9.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    9.16 +    Lesser General Public License for more details.
    9.17 +
    9.18 +    You should have received a copy of the GNU Lesser General Public
    9.19 +    License along with this library; if not, write to the Free Software
    9.20 +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    9.21 +
    9.22 +    Sam Lantinga
    9.23 +    slouken@libsdl.org
    9.24 +*/
    9.25 +#include "SDL_config.h"
    9.26 +
    9.27 +#include "../../events/SDL_sysevents.h"
    9.28 +#include "../../events/SDL_events_c.h"
    9.29 +
    9.30 +#include "SDL_ps3video.h"
    9.31 +#include "SDL_ps3events_c.h"
    9.32 +
    9.33 +void
    9.34 +PS3_PumpEvents(_THIS)
    9.35 +{
    9.36 +    /* Intentionally empty: this driver does not generate any events. */
    9.37 +}
    9.38 +
    9.39 +/* vi: set ts=4 sw=4 expandtab: */
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/src/video/ps3/SDL_ps3events_c.h	Mon Sep 07 04:51:29 2009 +0000
    10.3 @@ -0,0 +1,28 @@
    10.4 +/*
    10.5 +    SDL - Simple DirectMedia Layer
    10.6 +    Copyright (C) 1997-2009 Sam Lantinga
    10.7 +
    10.8 +    This library is free software; you can redistribute it and/or
    10.9 +    modify it under the terms of the GNU Lesser General Public
   10.10 +    License as published by the Free Software Foundation; either
   10.11 +    version 2.1 of the License, or (at your option) any later version.
   10.12 +
   10.13 +    This library is distributed in the hope that it will be useful,
   10.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   10.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   10.16 +    Lesser General Public License for more details.
   10.17 +
   10.18 +    You should have received a copy of the GNU Lesser General Public
   10.19 +    License along with this library; if not, write to the Free Software
   10.20 +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   10.21 +
   10.22 +    Sam Lantinga
   10.23 +    slouken@libsdl.org
   10.24 +*/
   10.25 +#include "SDL_config.h"
   10.26 +
   10.27 +#include "SDL_ps3video.h"
   10.28 +
   10.29 +extern void PS3_PumpEvents(_THIS);
   10.30 +
   10.31 +/* vi: set ts=4 sw=4 expandtab: */
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/src/video/ps3/SDL_ps3modes.c	Mon Sep 07 04:51:29 2009 +0000
    11.3 @@ -0,0 +1,141 @@
    11.4 +/*
    11.5 +    SDL - Simple DirectMedia Layer
    11.6 +    Copyright (C) 1997-2009 Sam Lantinga
    11.7 +
    11.8 +    This library is free software; you can redistribute it and/or
    11.9 +    modify it under the terms of the GNU Lesser General Public
   11.10 +    License as published by the Free Software Foundation; either
   11.11 +    version 2.1 of the License, or (at your option) any later version.
   11.12 +
   11.13 +    This library is distributed in the hope that it will be useful,
   11.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   11.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   11.16 +    Lesser General Public License for more details.
   11.17 +
   11.18 +    You should have received a copy of the GNU Lesser General Public
   11.19 +    License along with this library; if not, write to the Free Software
   11.20 +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   11.21 +
   11.22 +    Sam Lantinga
   11.23 +    slouken@libsdl.org
   11.24 +*/
   11.25 +#include "SDL_config.h"
   11.26 +
   11.27 +#include "SDL_ps3video.h"
   11.28 +
   11.29 +void
   11.30 +PS3_InitModes(_THIS)
   11.31 +{
   11.32 +    deprintf(1, "+PS3_InitModes()\n");
   11.33 +    SDL_VideoDisplay display;
   11.34 +    SDL_VideoData *data = (SDL_VideoData *) _this->driverdata;
   11.35 +    SDL_DisplayMode mode;
   11.36 +    PS3_DisplayModeData *modedata;
   11.37 +    unsigned long vid = 0;
   11.38 +
   11.39 +    modedata = (PS3_DisplayModeData *) SDL_malloc(sizeof(*modedata));
   11.40 +    if (!modedata) {
   11.41 +        return;
   11.42 +    }
   11.43 +
   11.44 +    /* Setting up the DisplayMode based on current settings */
   11.45 +    struct ps3fb_ioctl_res res;
   11.46 +    if (ioctl(data->fbdev, PS3FB_IOCTL_SCREENINFO, &res)) {
   11.47 +        SDL_SetError("Can't get PS3FB_IOCTL_SCREENINFO");
   11.48 +    }
   11.49 +    mode.format = SDL_PIXELFORMAT_RGB888;
   11.50 +    mode.refresh_rate = 0;
   11.51 +    mode.w = res.xres;
   11.52 +    mode.h = res.yres;
   11.53 +
   11.54 +    /* Setting up driver specific mode data,
   11.55 +     * Get the current ps3 specific videmode number */
   11.56 +    if (ioctl(data->fbdev, PS3FB_IOCTL_GETMODE, (unsigned long)&vid)) {
   11.57 +        SDL_SetError("Can't get PS3FB_IOCTL_GETMODE");
   11.58 +    }
   11.59 +    deprintf(2, "PS3FB_IOCTL_GETMODE = %u\n", vid);
   11.60 +    modedata->mode = vid;
   11.61 +    mode.driverdata = modedata;
   11.62 +
   11.63 +    /* Set display's videomode and add it */
   11.64 +    SDL_zero(display);
   11.65 +    display.desktop_mode = mode;
   11.66 +    display.current_mode = mode;
   11.67 +
   11.68 +    SDL_AddVideoDisplay(&display);
   11.69 +    deprintf(1, "-PS3_InitModes()\n");
   11.70 +}
   11.71 +
    11.72 +/* DisplayModes available on the PS3; driverdata is set in PS3_GetDisplayModes() */
    11.73 +static SDL_DisplayMode ps3fb_modedb[] = {
    11.74 +    /* VESA */
    11.75 +    {SDL_PIXELFORMAT_RGB888, 1280, 768, 0, NULL}, // WXGA
    11.76 +    {SDL_PIXELFORMAT_RGB888, 1280, 1024, 0, NULL}, // SXGA
    11.77 +    {SDL_PIXELFORMAT_RGB888, 1920, 1200, 0, NULL}, // WUXGA
    11.78 +    /* Native resolutions (progressive, "fullscreen") */
    11.79 +    {SDL_PIXELFORMAT_RGB888, 720, 480, 0, NULL}, // 480p
    11.80 +    {SDL_PIXELFORMAT_RGB888, 1280, 720, 0, NULL}, // 720p
    11.81 +    {SDL_PIXELFORMAT_RGB888, 1920, 1080, 0, NULL} // 1080p
    11.82 +};
    11.83 +
    11.84 +/* PS3 videomode numbers; entry n belongs to ps3fb_modedb[n] */
    11.85 +static PS3_DisplayModeData ps3fb_data[] = {
    11.86 +    {11}, {12}, {13}, {130}, {131}, {133}, 
    11.87 +};
   11.88 +
   11.89 +void
   11.90 +PS3_GetDisplayModes(_THIS) {
   11.91 +    deprintf(1, "+PS3_GetDisplayModes()\n");
   11.92 +    SDL_DisplayMode mode;
   11.93 +    unsigned int nummodes;
   11.94 +
   11.95 +    nummodes = sizeof(ps3fb_modedb) / sizeof(SDL_DisplayMode);
   11.96 +
   11.97 +    int n;
   11.98 +    for (n=0; n<nummodes; ++n) {
   11.99 +        /* Get driver specific mode data */
  11.100 +        ps3fb_modedb[n].driverdata = &ps3fb_data[n];
  11.101 +
  11.102 +        /* Add DisplayMode to list */
  11.103 +        deprintf(2, "Adding resolution %u x %u\n", ps3fb_modedb[n].w, ps3fb_modedb[n].h);
  11.104 +        SDL_AddDisplayMode(_this->current_display, &ps3fb_modedb[n]);
  11.105 +    }
  11.106 +    deprintf(1, "-PS3_GetDisplayModes()\n");
  11.107 +}
  11.108 +
  11.109 +int
  11.110 +PS3_SetDisplayMode(_THIS, SDL_DisplayMode * mode)
  11.111 +{
  11.112 +    deprintf(1, "+PS3_SetDisplayMode()\n");
  11.113 +    SDL_VideoData *data = (SDL_VideoData *) _this->driverdata;
  11.114 +    PS3_DisplayModeData *dispdata = (PS3_DisplayModeData *) mode->driverdata;
  11.115 +
  11.116 +    /* Set the new DisplayMode */
  11.117 +    deprintf(2, "Setting PS3FB_MODE to %u\n", dispdata->mode);
  11.118 +    if (ioctl(data->fbdev, PS3FB_IOCTL_SETMODE, (unsigned long)&dispdata->mode)) {
  11.119 +        deprintf(2, "Could not set PS3FB_MODE\n");
  11.120 +        SDL_SetError("Could not set PS3FB_MODE\n");
  11.121 +        return -1;
  11.122 +    }
  11.123 +
  11.124 +    deprintf(1, "-PS3_SetDisplayMode()\n");
  11.125 +    return 0;
  11.126 +}
  11.127 +
   11.128 +void
   11.129 +PS3_QuitModes(_THIS) {
   11.130 +    deprintf(1, "+PS3_QuitModes()\n");
   11.131 +
   11.132 +    /* driverdata of the listed modes was not allocated (PS3_GetDisplayModes points it at static data), so only clear it */
   11.133 +    int i, j;
   11.134 +    for (i = _this->num_displays; i--;) {
   11.135 +        SDL_VideoDisplay *display = &_this->displays[i];
   11.136 +        for (j = display->num_display_modes; j--;) {
   11.137 +            display->display_modes[j].driverdata = NULL;
   11.138 +        }
   11.139 +    }
   11.140 +
   11.141 +    deprintf(1, "-PS3_QuitModes()\n");
   11.142 +}
  11.143 +
  11.144 +/* vi: set ts=4 sw=4 expandtab: */
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/src/video/ps3/SDL_ps3modes_c.h	Mon Sep 07 04:51:29 2009 +0000
    12.3 @@ -0,0 +1,34 @@
    12.4 +/*
    12.5 +    SDL - Simple DirectMedia Layer
    12.6 +    Copyright (C) 1997-2009 Sam Lantinga
    12.7 +
    12.8 +    This library is free software; you can redistribute it and/or
    12.9 +    modify it under the terms of the GNU Lesser General Public
   12.10 +    License as published by the Free Software Foundation; either
   12.11 +    version 2.1 of the License, or (at your option) any later version.
   12.12 +
   12.13 +    This library is distributed in the hope that it will be useful,
   12.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   12.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   12.16 +    Lesser General Public License for more details.
   12.17 +
   12.18 +    You should have received a copy of the GNU Lesser General Public
   12.19 +    License along with this library; if not, write to the Free Software
   12.20 +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   12.21 +
   12.22 +    Sam Lantinga
   12.23 +    slouken@libsdl.org
   12.24 +*/
   12.25 +#include "SDL_config.h"
   12.26 +
   12.27 +#ifndef _SDL_ps3modes_h
   12.28 +#define _SDL_ps3modes_h
   12.29 +
   12.30 +extern void PS3_InitModes(_THIS);
   12.31 +extern void PS3_GetDisplayModes(_THIS);
   12.32 +extern int PS3_SetDisplayMode(_THIS, SDL_DisplayMode * mode);
   12.33 +extern void PS3_QuitModes(_THIS);
   12.34 +
   12.35 +#endif /* SDL_ps3modes_h */
   12.36 +
   12.37 +/* vi: set ts=4 sw=4 expandtab: */
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/src/video/ps3/SDL_ps3render.c	Mon Sep 07 04:51:29 2009 +0000
    13.3 @@ -0,0 +1,746 @@
    13.4 +/*
    13.5 +    SDL - Simple DirectMedia Layer
    13.6 +    Copyright (C) 1997-2009 Sam Lantinga
    13.7 +
    13.8 +    This library is free software; you can redistribute it and/or
    13.9 +    modify it under the terms of the GNU Lesser General Public
   13.10 +    License as published by the Free Software Foundation; either
   13.11 +    version 2.1 of the License, or (at your option) any later version.
   13.12 +
   13.13 +    This library is distributed in the hope that it will be useful,
   13.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   13.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13.16 +    Lesser General Public License for more details.
   13.17 +
   13.18 +    You should have received a copy of the GNU Lesser General Public
   13.19 +    License along with this library; if not, write to the Free Software
   13.20 +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   13.21 +
   13.22 +    Sam Lantinga
   13.23 +    slouken@libsdl.org
   13.24 +*/
   13.25 +#include "SDL_config.h"
   13.26 +
   13.27 +#include "SDL_video.h"
   13.28 +#include "../SDL_sysvideo.h"
   13.29 +#include "../SDL_yuv_sw_c.h"
   13.30 +#include "../SDL_renderer_sw.h"
   13.31 +
   13.32 +#include "SDL_ps3video.h"
   13.33 +#include "SDL_ps3spe_c.h"
   13.34 +
   13.35 +#include <fcntl.h>
   13.36 +#include <stdlib.h>
   13.37 +#include <sys/ioctl.h>
   13.38 +#include <linux/kd.h>
   13.39 +#include <linux/fb.h>
   13.40 +#include <sys/mman.h>
   13.41 +#include <asm/ps3fb.h>
   13.42 +
   13.43 +
    13.44 +/* Handles of the SPU programs (their names are set in SDL_PS3_CreateRenderer) */
   13.45 +extern spe_program_handle_t yuv2rgb_spu;
   13.46 +extern spe_program_handle_t bilin_scaler_spu;
   13.47 +
   13.48 +/* SDL surface based renderer implementation */
   13.49 +static SDL_Renderer *SDL_PS3_CreateRenderer(SDL_Window * window,
   13.50 +                                              Uint32 flags);
   13.51 +static int SDL_PS3_DisplayModeChanged(SDL_Renderer * renderer);
   13.52 +static int SDL_PS3_ActivateRenderer(SDL_Renderer * renderer);
   13.53 +static int SDL_PS3_RenderPoint(SDL_Renderer * renderer, int x, int y);
   13.54 +static int SDL_PS3_RenderLine(SDL_Renderer * renderer, int x1, int y1,
   13.55 +                                int x2, int y2);
   13.56 +static int SDL_PS3_RenderFill(SDL_Renderer * renderer,
   13.57 +                                const SDL_Rect * rect);
   13.58 +static int SDL_PS3_RenderCopy(SDL_Renderer * renderer,
   13.59 +                                SDL_Texture * texture,
   13.60 +                                const SDL_Rect * srcrect,
   13.61 +                                const SDL_Rect * dstrect);
   13.62 +static void SDL_PS3_RenderPresent(SDL_Renderer * renderer);
   13.63 +static void SDL_PS3_DestroyRenderer(SDL_Renderer * renderer);
   13.64 +
   13.65 +/* Texture */
   13.66 +static int PS3_CreateTexture(SDL_Renderer * renderer, SDL_Texture * texture);
   13.67 +static int PS3_QueryTexturePixels(SDL_Renderer * renderer, SDL_Texture * texture, void **pixels, int *pitch);
   13.68 +static int PS3_UpdateTexture(SDL_Renderer * renderer, SDL_Texture * texture, const SDL_Rect * rect, const void *pixels, int pitch);
   13.69 +static int PS3_LockTexture(SDL_Renderer * renderer, SDL_Texture * texture, const SDL_Rect * rect, int markDirty, void **pixels, int *pitch);
   13.70 +static void PS3_UnlockTexture(SDL_Renderer * renderer, SDL_Texture * texture);
   13.71 +static void PS3_DestroyTexture(SDL_Renderer * renderer, SDL_Texture * texture);
   13.72 +
   13.73 +
    13.74 +SDL_RenderDriver SDL_PS3_RenderDriver = {
    13.75 +    SDL_PS3_CreateRenderer,
    13.76 +    {
    13.77 +     "ps3",
    13.78 +     (SDL_RENDERER_SINGLEBUFFER | SDL_RENDERER_PRESENTVSYNC |
    13.79 +      SDL_RENDERER_PRESENTFLIP2 | SDL_RENDERER_PRESENTDISCARD |
    13.80 +      SDL_RENDERER_ACCELERATED),
    13.81 +     (SDL_TEXTUREMODULATE_NONE),
    13.82 +     (SDL_BLENDMODE_NONE),
    13.83 +     /* We use bilinear scaling on the SPE for YV12 & IYUV
    13.84 +      * (width and height have to be multiples of 8) */
    13.85 +     (SDL_TEXTURESCALEMODE_SLOW)
    13.86 +     }
    13.87 +};
   13.88 +
    13.89 +typedef struct
    13.90 +{
    13.91 +    int current_screen;
    13.92 +    SDL_Surface *screen;
    13.93 +    SDL_VideoDisplay *display;
    13.94 +    /* address of the centered image in the framebuffer (double buffered) */
    13.95 +    uint8_t *center[2];
    13.96 +
    13.97 +    /* width of input (bounded by writeable width) */
    13.98 +    unsigned int bounded_width;
    13.99 +    /* height of input (bounded by writeable height) */
   13.100 +    unsigned int bounded_height;
   13.101 +    /* offset from the left side (used for centering) */
   13.102 +    unsigned int offset_left;
   13.103 +    /* offset from the upper side (used for centering) */
   13.104 +    unsigned int offset_top;
   13.105 +    /* width of screen which is writeable */
   13.106 +    unsigned int wr_width;
   13.107 +    /* height of screen which is writeable */
   13.108 +    unsigned int wr_height;
   13.109 +    /* size of a screen line: width * bpp/8 */
   13.110 +    unsigned int line_length;
   13.111 +
   13.112 +    /* Non-zero if the framebuffer holds more than one frame; the kernel's
   13.113 +     * fb size has to be bigger than ~12MB for this to work in 1080p */
   13.114 +    unsigned int double_buffering;
   13.115 +
   13.116 +    /* SPE threading stuff */
   13.117 +    spu_data_t *converter_thread_data;
   13.118 +    spu_data_t *scaler_thread_data;
   13.119 +
   13.120 +    /* YUV converting transfer data */
   13.121 +    volatile struct yuv2rgb_parms_t * converter_parms __attribute__((aligned(128)));
   13.122 +    /* Scaler transfer data */
   13.123 +    volatile struct scale_parms_t * scaler_parms __attribute__((aligned(128)));
   13.124 +} SDL_PS3_RenderData;
  13.125 +
   13.126 +typedef struct
   13.127 +{
   13.128 +    int pitch;
   13.129 +    /* Image data */
   13.130 +    volatile void *pixels;
   13.131 +    /* Use software renderer for unsupported formats */
   13.132 +    SDL_SW_YUVTexture *yuv;
   13.133 +} PS3_TextureData;
  13.134 +
  13.135 +SDL_Renderer *
  13.136 +SDL_PS3_CreateRenderer(SDL_Window * window, Uint32 flags)
  13.137 +{
  13.138 +    deprintf(1, "+SDL_PS3_CreateRenderer()\n");
  13.139 +    SDL_VideoDisplay *display = SDL_GetDisplayFromWindow(window);
  13.140 +    SDL_DisplayMode *displayMode = &display->current_mode;
  13.141 +    SDL_VideoData *devdata = display->device->driverdata;
  13.142 +    SDL_Renderer *renderer;
  13.143 +    SDL_PS3_RenderData *data;
  13.144 +    struct ps3fb_ioctl_res res;
  13.145 +    int i, n;
  13.146 +    int bpp;
  13.147 +    Uint32 Rmask, Gmask, Bmask, Amask;
  13.148 +
  13.149 +    if (!SDL_PixelFormatEnumToMasks
  13.150 +        (displayMode->format, &bpp, &Rmask, &Gmask, &Bmask, &Amask)) {
  13.151 +        SDL_SetError("Unknown display format");
  13.152 +        return NULL;
  13.153 +    }
  13.154 +
  13.155 +    renderer = (SDL_Renderer *) SDL_calloc(1, sizeof(*renderer));
  13.156 +    if (!renderer) {
  13.157 +        SDL_OutOfMemory();
  13.158 +        return NULL;
  13.159 +    }
  13.160 +
  13.161 +    data = (SDL_PS3_RenderData *) SDL_malloc(sizeof(*data));
  13.162 +    if (!data) {
  13.163 +        SDL_PS3_DestroyRenderer(renderer);
  13.164 +        SDL_OutOfMemory();
  13.165 +        return NULL;
  13.166 +    }
  13.167 +    SDL_zerop(data);
  13.168 +
  13.169 +    renderer->CreateTexture = PS3_CreateTexture;
  13.170 +    renderer->DestroyTexture = PS3_DestroyTexture;
  13.171 +    renderer->QueryTexturePixels = PS3_QueryTexturePixels;
  13.172 +    renderer->UpdateTexture = PS3_UpdateTexture;
  13.173 +    renderer->LockTexture = PS3_LockTexture;
  13.174 +    renderer->UnlockTexture = PS3_UnlockTexture;
  13.175 +    renderer->ActivateRenderer = SDL_PS3_ActivateRenderer;
  13.176 +    renderer->DisplayModeChanged = SDL_PS3_DisplayModeChanged;
  13.177 +    renderer->RenderPoint = SDL_PS3_RenderPoint;
  13.178 +    renderer->RenderLine = SDL_PS3_RenderLine;
  13.179 +    renderer->RenderFill = SDL_PS3_RenderFill;
  13.180 +    renderer->RenderCopy = SDL_PS3_RenderCopy;
  13.181 +    renderer->RenderPresent = SDL_PS3_RenderPresent;
  13.182 +    renderer->DestroyRenderer = SDL_PS3_DestroyRenderer;
  13.183 +    renderer->info.name = SDL_PS3_RenderDriver.info.name;
  13.184 +    renderer->info.flags = 0;
  13.185 +    renderer->window = window->id;
  13.186 +    renderer->driverdata = data;
  13.187 +
  13.188 +    deprintf(1, "window->w = %u\n", window->w);
  13.189 +    deprintf(1, "window->h = %u\n", window->h);
  13.190 +
  13.191 +    data->double_buffering = 0;
  13.192 +
  13.193 +    /* Get ps3 screeninfo */
  13.194 +    if (ioctl(devdata->fbdev, PS3FB_IOCTL_SCREENINFO, (unsigned long)&res) < 0) {
  13.195 +        SDL_SetError("[PS3] PS3FB_IOCTL_SCREENINFO failed");
  13.196 +    }
  13.197 +    deprintf(2, "res.num_frames = %d\n", res.num_frames);
  13.198 +
  13.199 +    /* Only use double buffering if enough fb memory is available */
  13.200 +    if (res.num_frames > 1) {
  13.201 +        renderer->info.flags |= SDL_RENDERER_PRESENTFLIP2;
  13.202 +        n = 2;
  13.203 +        data->double_buffering = 1;
  13.204 +    } else {
  13.205 +        renderer->info.flags |= SDL_RENDERER_PRESENTCOPY;
  13.206 +        n = 1;
  13.207 +    }
  13.208 +
  13.209 +    data->screen =
  13.210 +        SDL_CreateRGBSurface(0, window->w, window->h, bpp, Rmask, Gmask,
  13.211 +                             Bmask, Amask);
  13.212 +    if (!data->screen) {
  13.213 +        SDL_PS3_DestroyRenderer(renderer);
  13.214 +        return NULL;
  13.215 +    }
  13.216 +    /* Allocate aligned memory for pixels */
  13.217 +    SDL_free(data->screen->pixels);
  13.218 +    data->screen->pixels = (void *)memalign(16, data->screen->h * data->screen->pitch);
  13.219 +    if (!data->screen->pixels) {
  13.220 +        SDL_FreeSurface(data->screen);
  13.221 +        SDL_OutOfMemory();
  13.222 +        return NULL;
  13.223 +    }
  13.224 +    SDL_memset(data->screen->pixels, 0, data->screen->h * data->screen->pitch);
  13.225 +    SDL_SetSurfacePalette(data->screen, display->palette);
  13.226 +
  13.227 +    data->current_screen = 0;
  13.228 +
  13.229 +    /* Create SPU parms structure */
  13.230 +    data->converter_parms = (struct yuv2rgb_parms_t *) memalign(16, sizeof(struct yuv2rgb_parms_t));
  13.231 +    data->scaler_parms = (struct scale_parms_t *) memalign(16, sizeof(struct scale_parms_t));
  13.232 +    if (data->converter_parms == NULL || data->scaler_parms == NULL) {
  13.233 +        SDL_PS3_DestroyRenderer(renderer);
  13.234 +        SDL_OutOfMemory();
  13.235 +        return NULL;
  13.236 +    }
  13.237 +
  13.238 +    /* Set up the SPE threading data */
  13.239 +    data->converter_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t));
  13.240 +    data->scaler_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t));
  13.241 +    if (data->converter_thread_data == NULL || data->scaler_thread_data == NULL) {
  13.242 +        SDL_PS3_DestroyRenderer(renderer);
  13.243 +        SDL_OutOfMemory();
  13.244 +        return NULL;
  13.245 +    }
  13.246 +
  13.247 +    /* Set up the SPE scaler (booted) */
  13.248 +    data->scaler_thread_data->program = bilin_scaler_spu;
  13.249 +    data->scaler_thread_data->program_name = "bilin_scaler_spu";
  13.250 +    data->scaler_thread_data->keepalive = 0;
  13.251 +    data->scaler_thread_data->booted = 0;
  13.252 +
  13.253 +    /* Set up the SPE converter (always running) */
  13.254 +    data->converter_thread_data->program = yuv2rgb_spu;
  13.255 +    data->converter_thread_data->program_name = "yuv2rgb_spu";
  13.256 +    data->converter_thread_data->keepalive = 1;
  13.257 +    data->converter_thread_data->booted = 0;
  13.258 +
  13.259 +    SPE_Start(data->converter_thread_data);
  13.260 +
  13.261 +    deprintf(1, "-SDL_PS3_CreateRenderer()\n");
  13.262 +    return renderer;
  13.263 +}
  13.264 +
  13.265 +static int
  13.266 +SDL_PS3_ActivateRenderer(SDL_Renderer * renderer)
  13.267 +{
  13.268 +    deprintf(1, "+PS3_ActivateRenderer()\n");
  13.269 +    SDL_PS3_RenderData *data = (SDL_PS3_RenderData *) renderer->driverdata;
  13.270 +    /* No activation work is done: data is fetched but never used, and 0 is always returned. */
  13.271 +    deprintf(1, "-PS3_ActivateRenderer()\n");
  13.272 +    return 0;
  13.273 +}
  13.274 +
  13.275 +static int SDL_PS3_DisplayModeChanged(SDL_Renderer * renderer) {
  13.276 +    deprintf(1, "+PS3_DisplayModeChanged()\n");
  13.277 +    SDL_PS3_RenderData *data = (SDL_PS3_RenderData *) renderer->driverdata;
  13.278 +    /* Placeholder: nothing is reconfigured here; data is unused and 0 is always returned. */
  13.279 +    deprintf(1, "-PS3_DisplayModeChanged()\n");
  13.280 +    return 0;
  13.281 +}
  13.282 +
  13.283 +/* Allocate the PS3 driver data for a texture; returns 0, or -1 after releasing whatever was allocated. */
  13.284 +static int PS3_CreateTexture(SDL_Renderer * renderer, SDL_Texture * texture) {
  13.285 +    deprintf(1, "+PS3_CreateTexture()\n");
  13.286 +    PS3_TextureData *data;
  13.287 +    data = (PS3_TextureData *) SDL_calloc(1, sizeof(*data));
  13.288 +    if (!data) {
  13.289 +        SDL_OutOfMemory();
  13.290 +        return -1;
  13.291 +    }
  13.292 +    data->pitch = (texture->w * SDL_BYTESPERPIXEL(texture->format));
  13.293 +
  13.294 +    if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) {
  13.295 +        /* Use SDLs SW_YUVTexture */
  13.296 +        data->yuv =
  13.297 +            SDL_SW_CreateYUVTexture(texture->format, texture->w, texture->h);
  13.298 +        if (!data->yuv) {
  13.299 +            SDL_free(data);     /* not stored in texture->driverdata yet, so free it here */
  13.300 +            SDL_OutOfMemory();
  13.301 +            return -1;
  13.302 +        }
  13.303 +        /* but replace its pixel buffer with a 16-byte aligned one */
  13.304 +        SDL_free(data->yuv->pixels);
  13.305 +        data->yuv->pixels = (Uint8 *)memalign(16, texture->w * texture->h * 2);
  13.306 +        if (!data->yuv->pixels) {
  13.307 +            SDL_SW_DestroyYUVTexture(data->yuv);
  13.308 +            SDL_free(data);
  13.309 +            SDL_OutOfMemory();
  13.310 +            return -1;
  13.311 +        }
  13.312 +        /* Redo: Find the pitch and offset values for the overlay */
  13.313 +        SDL_SW_YUVTexture *swdata = (SDL_SW_YUVTexture *) data->yuv;
  13.314 +        switch (texture->format) {
  13.315 +            case SDL_PIXELFORMAT_YV12:
  13.316 +            case SDL_PIXELFORMAT_IYUV:
  13.317 +                swdata->pitches[0] = texture->w;
  13.318 +                swdata->pitches[1] = swdata->pitches[0] / 2;
  13.319 +                swdata->pitches[2] = swdata->pitches[0] / 2;
  13.320 +                swdata->planes[0] = swdata->pixels;
  13.321 +                swdata->planes[1] = swdata->planes[0] + swdata->pitches[0] * texture->h;
  13.322 +                swdata->planes[2] = swdata->planes[1] + swdata->pitches[1] * texture->h / 2;
  13.323 +                break;
  13.324 +            case SDL_PIXELFORMAT_YUY2:
  13.325 +            case SDL_PIXELFORMAT_UYVY:
  13.326 +            case SDL_PIXELFORMAT_YVYU:
  13.327 +                swdata->pitches[0] = texture->w * 2;
  13.328 +                swdata->planes[0] = swdata->pixels;
  13.329 +                break;
  13.330 +            default:    /* We should never get here (caught above) */
  13.331 +                break;
  13.332 +        }
  13.333 +    } else {
  13.334 +        data->pixels = SDL_malloc(texture->h * data->pitch);
  13.335 +        if (!data->pixels) {
  13.336 +            SDL_free(data);     /* PS3_DestroyTexture() would see no driverdata and free nothing */
  13.337 +            SDL_OutOfMemory();
  13.338 +            return -1;
  13.339 +        }
  13.340 +    }
  13.341 +    texture->driverdata = data;
  13.342 +    deprintf(1, "-PS3_CreateTexture()\n");
  13.343 +    return 0;
  13.344 +}
  13.345 +
  13.346 +static int
  13.347 +PS3_QueryTexturePixels(SDL_Renderer * renderer, SDL_Texture * texture,
  13.348 +                      void **pixels, int *pitch)
  13.349 +{
  13.350 +    deprintf(1, "+PS3_QueryTexturePixels()\n");
  13.351 +    PS3_TextureData *txdata = (PS3_TextureData *) texture->driverdata;
  13.352 +
  13.353 +    /* FOURCC (YUV) textures are answered by the software YUV code */
  13.354 +    if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) {
  13.355 +        return SDL_SW_QueryYUVTexturePixels(txdata->yuv, pixels, pitch);
  13.356 +    }
  13.357 +    /* Everything else exposes the driver's own buffer and pitch */
  13.358 +    *pixels = (void *) txdata->pixels;
  13.359 +    *pitch = txdata->pitch;
  13.360 +    deprintf(1, "-PS3_QueryTexturePixels()\n");
  13.361 +    return 0;
  13.362 +}
  13.363 +
  13.364 +/* Copy pixels (rows pitch bytes apart) into the rect area of the texture; returns 0, or the YUV helper's result. */
  13.365 +static int PS3_UpdateTexture(SDL_Renderer * renderer, SDL_Texture * texture,
  13.366 +                       const SDL_Rect * rect, const void *pixels, int pitch)
  13.367 +{
  13.368 +    deprintf(1, "+PS3_UpdateTexture()\n");
  13.369 +    PS3_TextureData *data = (PS3_TextureData *) texture->driverdata;
  13.370 +
  13.371 +    if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) {
  13.372 +        return SDL_SW_UpdateYUVTexture(data->yuv, rect, pixels, pitch);
  13.373 +    } else {
  13.374 +        Uint8 *src, *dst;
  13.375 +        int row;
  13.376 +        size_t length;
  13.377 +        /* The destination is the texture's own buffer (this pointer used to be left uninitialized) */
  13.378 +        Uint8 *dstpixels = (Uint8 *) data->pixels;
  13.379 +        src = (Uint8 *) pixels;
  13.380 +        dst = (Uint8 *) dstpixels + rect->y * data->pitch + rect->x
  13.381 +                        * SDL_BYTESPERPIXEL(texture->format);
  13.382 +        length = rect->w * SDL_BYTESPERPIXEL(texture->format);
  13.383 +        /* Update the texture one row at a time; source and texture pitches may differ */
  13.384 +        for (row = 0; row < rect->h; ++row) {
  13.385 +            SDL_memcpy(dst, src, length);
  13.386 +            src += pitch;
  13.387 +            dst += data->pitch;
  13.388 +        }
  13.389 +    }
  13.390 +    deprintf(1, "-PS3_UpdateTexture()\n");
  13.391 +    return 0;
  13.392 +}
  13.393 +
  13.394 +static int
  13.395 +PS3_LockTexture(SDL_Renderer * renderer, SDL_Texture * texture,
  13.396 +               const SDL_Rect * rect, int markDirty, void **pixels,
  13.397 +               int *pitch)
  13.398 +{
  13.399 +    deprintf(1, "+PS3_LockTexture()\n");
  13.400 +    PS3_TextureData *txdata = (PS3_TextureData *) texture->driverdata;
  13.401 +    /* YUV textures are locked by the software YUV code */
  13.402 +    if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) {
  13.403 +        deprintf(1, "-PS3_LockTexture()\n");
  13.404 +        return SDL_SW_LockYUVTexture(txdata->yuv, rect, markDirty, pixels, pitch);
  13.405 +    }
  13.406 +    /* Otherwise hand out the address of the rectangle's top-left pixel */
  13.407 +    Uint8 *first = (Uint8 *) txdata->pixels + rect->y * txdata->pitch;
  13.408 +    first += rect->x * SDL_BYTESPERPIXEL(texture->format);
  13.409 +    *pixels = (void *) first;
  13.410 +    *pitch = txdata->pitch;
  13.411 +    deprintf(1, "-PS3_LockTexture()\n");
  13.412 +    return 0;
  13.413 +}
  13.414 +
  13.415 +static void
  13.416 +PS3_UnlockTexture(SDL_Renderer * renderer, SDL_Texture * texture)
  13.417 +{
  13.418 +    deprintf(1, "+PS3_UnlockTexture()\n");
  13.419 +    PS3_TextureData *data = (PS3_TextureData *) texture->driverdata;
  13.420 +    /* Only FOURCC (YUV) textures are unlocked explicitly; nothing is done for other formats. */
  13.421 +    if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) {
  13.422 +        SDL_SW_UnlockYUVTexture(data->yuv);
  13.423 +    }
  13.424 +    deprintf(1, "-PS3_UnlockTexture()\n");
  13.425 +}
  13.426 +
  13.427 +/* Free everything PS3_CreateTexture() attached to the texture, the driver data itself included. */
  13.428 +static void PS3_DestroyTexture(SDL_Renderer * renderer, SDL_Texture * texture)
  13.429 +{
  13.430 +    deprintf(1, "+PS3_DestroyTexture()\n");
  13.431 +    PS3_TextureData *data = (PS3_TextureData *) texture->driverdata;
  13.432 +    if (data) {
  13.433 +        if (data->yuv) {
  13.434 +            SDL_SW_DestroyYUVTexture(data->yuv);
  13.435 +        }
  13.436 +        if (data->pixels) {
  13.437 +            SDL_free((void *)data->pixels);
  13.438 +        }
  13.439 +        SDL_free(data);                 /* allocated with SDL_calloc() in PS3_CreateTexture() */
  13.440 +        texture->driverdata = NULL;     /* a repeated destroy now finds nothing to free */
  13.441 +    }
  13.442 +    deprintf(1, "-PS3_DestroyTexture()\n");
  13.443 +}
  13.444 +
  13.445 +/* Plot a single point on the screen surface with the renderer's current
  13.446 +   colour; NONE and MASK write the colour directly, other modes blend. */
  13.447 +static int
  13.448 +SDL_PS3_RenderPoint(SDL_Renderer * renderer, int x, int y)
  13.449 +{
  13.450 +    SDL_PS3_RenderData *rdata =
  13.451 +        (SDL_PS3_RenderData *) renderer->driverdata;
  13.452 +    SDL_Surface *screen = rdata->screen;
  13.453 +    Uint32 pixel;
  13.454 +
  13.455 +    if (renderer->blendMode != SDL_BLENDMODE_NONE &&
  13.456 +        renderer->blendMode != SDL_BLENDMODE_MASK) {
  13.457 +        /* A real blend mode: the blending routine combines the colours */
  13.458 +        return SDL_BlendPoint(screen, x, y, renderer->blendMode, renderer->r,
  13.459 +                              renderer->g, renderer->b, renderer->a);
  13.460 +    }
  13.461 +
  13.462 +    /* Direct write: map the draw colour to the surface format first */
  13.463 +    pixel = SDL_MapRGBA(screen->format, renderer->r, renderer->g, renderer->b,
  13.464 +                        renderer->a);
  13.465 +    return SDL_DrawPoint(screen, x, y, pixel);
  13.466 +}
  13.467 +
  13.468 +/* Draw a line from (x1,y1) to (x2,y2) on the screen surface; NONE and MASK
  13.469 +   write the colour directly, every other blend mode is blended. */
  13.470 +static int
  13.471 +SDL_PS3_RenderLine(SDL_Renderer * renderer, int x1, int y1, int x2, int y2)
  13.472 +{
  13.473 +    SDL_PS3_RenderData *rdata =
  13.474 +        (SDL_PS3_RenderData *) renderer->driverdata;
  13.475 +    SDL_Surface *screen = rdata->screen;
  13.476 +    Uint32 pixel;
  13.477 +
  13.478 +    if (renderer->blendMode != SDL_BLENDMODE_NONE &&
  13.479 +        renderer->blendMode != SDL_BLENDMODE_MASK) {
  13.480 +        /* A real blend mode: the blending routine combines the colours */
  13.481 +        return SDL_BlendLine(screen, x1, y1, x2, y2, renderer->blendMode,
  13.482 +                             renderer->r, renderer->g, renderer->b, renderer->a);
  13.483 +    }
  13.484 +
  13.485 +    /* Direct write: map the draw colour to the surface format first */
  13.486 +    pixel = SDL_MapRGBA(screen->format, renderer->r, renderer->g, renderer->b,
  13.487 +                        renderer->a);
  13.488 +    return SDL_DrawLine(screen, x1, y1, x2, y2, pixel);
  13.489 +}
  13.490 +
  13.491 +static int
  13.492 +SDL_PS3_RenderFill(SDL_Renderer * renderer, const SDL_Rect * rect)
  13.493 +{
  13.494 +    deprintf(1, "SDL_PS3_RenderFill()\n");
  13.495 +    SDL_PS3_RenderData *data =
  13.496 +        (SDL_PS3_RenderData *) renderer->driverdata;
  13.497 +    SDL_Surface *target = data->screen;
  13.498 +    SDL_Rect real_rect = *rect;
  13.499 +    int status;
  13.500 +    /* Only SDL_BLENDMODE_NONE takes the plain fill; every other mode is blended. Both get the local copy real_rect. */
  13.501 +    if (renderer->blendMode == SDL_BLENDMODE_NONE) {
  13.502 +        Uint32 color =
  13.503 +            SDL_MapRGBA(target->format, renderer->r, renderer->g, renderer->b,
  13.504 +                        renderer->a);
  13.505 +        /* Unblended: fill with the draw colour mapped to the surface format */
  13.506 +        status = SDL_FillRect(target, &real_rect, color);
  13.507 +    } else {
  13.508 +        status =
  13.509 +            SDL_BlendRect(target, &real_rect, renderer->blendMode,
  13.510 +                          renderer->r, renderer->g, renderer->b, renderer->a);
  13.511 +    }
  13.512 +    return status;
  13.513 +}
  13.514 +
  13.515 +static int
  13.516 +SDL_PS3_RenderCopy(SDL_Renderer * renderer, SDL_Texture * texture,
  13.517 +                     const SDL_Rect * srcrect, const SDL_Rect * dstrect)
  13.518 +{
  13.519 +    deprintf(1, "+SDL_PS3_RenderCopy()\n");
  13.520 +    SDL_PS3_RenderData *data =
  13.521 +        (SDL_PS3_RenderData *) renderer->driverdata;
  13.522 +    SDL_Window *window = SDL_GetWindowFromID(renderer->window);
  13.523 +    SDL_VideoDisplay *display = SDL_GetDisplayFromWindow(window);
  13.524 +    PS3_TextureData *txdata = (PS3_TextureData *) texture->driverdata;
  13.525 +    SDL_VideoData *devdata = display->device->driverdata;
  13.526 +
  13.527 +    if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) {
  13.528 +        deprintf(1, "Texture is in a FOURCC format\n");
  13.529 +        if ((texture->format == SDL_PIXELFORMAT_YV12 || texture->format == SDL_PIXELFORMAT_IYUV)
  13.530 +                && texture->w % 8 == 0 && texture->h % 8 == 0
  13.531 +                && dstrect->w % 8 == 0 && dstrect->h % 8 == 0) {
  13.532 +            deprintf(1, "Use SPE for scaling/converting\n");
  13.533 +
  13.534 +            SDL_SW_YUVTexture *swdata = (SDL_SW_YUVTexture *) txdata->yuv;
  13.535 +            Uint8 *lum, *Cr, *Cb;
  13.536 +            Uint8 *scaler_out = NULL;
  13.537 +            Uint8 *dstpixels;
  13.538 +            switch (texture->format) {
  13.539 +                case SDL_PIXELFORMAT_YV12:
  13.540 +                    lum = swdata->planes[0];
  13.541 +                    Cr = swdata->planes[1];
  13.542 +                    Cb = swdata->planes[2];
  13.543 +                    break;
  13.544 +                case SDL_PIXELFORMAT_IYUV:
  13.545 +                    lum = swdata->planes[0];
  13.546 +                    Cr = swdata->planes[2];
  13.547 +                    Cb = swdata->planes[1];
  13.548 +                    break;
  13.549 +                default:
  13.550 +                    /* We should never get here (caught above) */
  13.551 +                    return -1;
  13.552 +            }
  13.553 +
  13.554 +            if (srcrect->w != dstrect->w || srcrect->h != dstrect->h) {
  13.555 +                deprintf(1, "We need to scale the texture from %u x %u to %u x %u\n",
  13.556 +                        srcrect->w, srcrect->h, dstrect->w, dstrect->h);
  13.557 +                /* Alloc mem for scaled YUV picture */
  13.558 +                scaler_out = (Uint8 *) memalign(16, dstrect->w * dstrect->h + ((dstrect->w * dstrect->h) >> 1));
  13.559 +                if (scaler_out == NULL) {
  13.560 +                    SDL_OutOfMemory();
  13.561 +                    return -1;
  13.562 +                }
  13.563 +
  13.564 +                /* Set parms for scaling */
  13.565 +                data->scaler_parms->src_pixel_width = srcrect->w;
  13.566 +                data->scaler_parms->src_pixel_height = srcrect->h;
  13.567 +                data->scaler_parms->dst_pixel_width = dstrect->w;
  13.568 +                data->scaler_parms->dst_pixel_height = dstrect->h;
  13.569 +                data->scaler_parms->y_plane = lum;
  13.570 +                data->scaler_parms->v_plane = Cr;
  13.571 +                data->scaler_parms->u_plane = Cb;
  13.572 +                data->scaler_parms->dstBuffer = scaler_out;
  13.573 +                data->scaler_thread_data->argp = (void *)data->scaler_parms;
  13.574 +
  13.575 +                /* Scale the YUV overlay to given size */
  13.576 +                SPE_Start(data->scaler_thread_data);
  13.577 +                SPE_Stop(data->scaler_thread_data);
  13.578 +
  13.579 +                /* Set parms for converting after scaling */
  13.580 +                data->converter_parms->y_plane = scaler_out;
  13.581 +                data->converter_parms->v_plane = scaler_out + dstrect->w * dstrect->h;
  13.582 +                data->converter_parms->u_plane = scaler_out + dstrect->w * dstrect->h + ((dstrect->w * dstrect->h) >> 2);
  13.583 +            } else {
  13.584 +                data->converter_parms->y_plane = lum;
  13.585 +                data->converter_parms->v_plane = Cr;
  13.586 +                data->converter_parms->u_plane = Cb;
  13.587 +            }
  13.588 +
  13.589 +            dstpixels = (Uint8 *) data->screen->pixels + dstrect->y * data->screen->pitch + dstrect->x
  13.590 +                            * SDL_BYTESPERPIXEL(texture->format);
  13.591 +            data->converter_parms->src_pixel_width = dstrect->w;
  13.592 +            data->converter_parms->src_pixel_height = dstrect->h;
  13.593 +            data->converter_parms->dstBuffer = dstpixels/*(Uint8 *)data->screen->pixels*/;
  13.594 +            data->converter_thread_data->argp = (void *)data->converter_parms;
  13.595 +
  13.596 +            /* Convert YUV texture to RGB */
  13.597 +            SPE_SendMsg(data->converter_thread_data, SPU_START);
  13.598 +            SPE_SendMsg(data->converter_thread_data, (unsigned int)data->converter_thread_data->argp);
  13.599 +
  13.600 +            /* We can probably move that to RenderPresent() */
  13.601 +            SPE_WaitForMsg(data->converter_thread_data, SPU_FIN);
  13.602 +            if (scaler_out) {
  13.603 +                free(scaler_out);
  13.604 +            }
  13.605 +        } else {
  13.606 +            deprintf(1, "Use software for scaling/converting\n");
  13.607 +            Uint8 *dst;
  13.608 +            /* FIXME: Not good */
  13.609 +            dst = (Uint8 *) data->screen->pixels + dstrect->y * data->screen->pitch + dstrect->x
  13.610 +                            * SDL_BYTESPERPIXEL(texture->format);
  13.611 +            return SDL_SW_CopyYUVToRGB(txdata->yuv, srcrect, display->current_mode.format,
  13.612 +                                   dstrect->w, dstrect->h, dst/*data->screen->pixels*/,
  13.613 +                                   data->screen->pitch);
  13.614 +        }
  13.615 +    } else {
  13.616 +        deprintf(1, "SDL_ISPIXELFORMAT_FOURCC = false\n");
  13.617 +
  13.618 +        Uint8 *src, *dst;
  13.619 +        int row;
  13.620 +        size_t length;
  13.621 +        Uint8 *dstpixels;
  13.622 +
  13.623 +        src = (Uint8 *) txdata->pixels;
  13.624 +        dst = (Uint8 *) data->screen->pixels + dstrect->y * data->screen->pitch + dstrect->x
  13.625 +                        * SDL_BYTESPERPIXEL(texture->format);
  13.626 +        length = dstrect->w * SDL_BYTESPERPIXEL(texture->format);
  13.627 +        for (row = 0; row < dstrect->h; ++row) {
  13.628 +            SDL_memcpy(dst, src, length);
  13.629 +            src += txdata->pitch;
  13.630 +            dst += data->screen->pitch;
  13.631 +        }
  13.632 +    }
  13.633 +
  13.634 +    deprintf(1, "-SDL_PS3_RenderCopy()\n");
  13.635 +    return 0;
  13.636 +}
  13.637 +
  13.638 +/* Present the frame: the framebuffer SPE thread copies the screen surface, centered, into the
  13.639 +   current framebuffer page; then optionally wait for vsync and flip. Gives up if screeninfo is unavailable. */
  13.640 +static void
  13.641 +SDL_PS3_RenderPresent(SDL_Renderer * renderer)
  13.642 +{
  13.643 +    deprintf(1, "+SDL_PS3_RenderPresent()\n");
  13.644 +    SDL_PS3_RenderData *data =
  13.645 +        (SDL_PS3_RenderData *) renderer->driverdata;
  13.646 +    SDL_Window *window = SDL_GetWindowFromID(renderer->window);
  13.647 +    SDL_VideoDisplay *display = SDL_GetDisplayFromWindow(window);
  13.648 +    SDL_VideoData *devdata = display->device->driverdata;
  13.649 +
  13.650 +    /* Send the data to the screen */
  13.651 +    /* Get screeninfo */
  13.652 +    struct fb_fix_screeninfo fb_finfo;
  13.653 +    if (ioctl(devdata->fbdev, FBIOGET_FSCREENINFO, &fb_finfo)) {
  13.654 +        SDL_SetError("[PS3] Can't get fixed screeninfo");
  13.655 +        return;     /* fb_finfo is uninitialized; the line stride below would be garbage */
  13.656 +    }
  13.657 +    struct fb_var_screeninfo fb_vinfo;
  13.658 +    if (ioctl(devdata->fbdev, FBIOGET_VSCREENINFO, &fb_vinfo)) {
  13.659 +        SDL_SetError("[PS3] Can't get VSCREENINFO");
  13.660 +        return;     /* fb_vinfo is uninitialized; the resolution below would be garbage */
  13.661 +    }
  13.662 +
  13.663 +    /* The framebuffer is treated as 4 bytes per pixel throughout (see the literal 4s below) */
  13.664 +    /* Adjust centering */
  13.665 +    data->bounded_width = window->w < fb_vinfo.xres ? window->w : fb_vinfo.xres;
  13.666 +    data->bounded_height = window->h < fb_vinfo.yres ? window->h : fb_vinfo.yres;
  13.667 +    /* We could use SDL's CENTERED flag for centering */
  13.668 +    data->offset_left = (fb_vinfo.xres - data->bounded_width) >> 1;
  13.669 +    data->offset_top = (fb_vinfo.yres - data->bounded_height) >> 1;
  13.670 +    data->center[0] = devdata->frame_buffer + data->offset_left * /*txdata->bpp/8*/ 4 +
  13.671 +                data->offset_top * fb_finfo.line_length;
  13.672 +    data->center[1] = data->center[0] + fb_vinfo.yres * fb_finfo.line_length;
  13.673 +
  13.674 +    deprintf(1, "offset_left = %u\n", data->offset_left);
  13.675 +    deprintf(1, "offset_top = %u\n", data->offset_top);
  13.676 +
  13.677 +    /* Set SPU parms for copying the surface to framebuffer */
  13.678 +    devdata->fb_parms->data = (unsigned char *)data->screen->pixels;
  13.679 +    devdata->fb_parms->center = data->center[data->current_screen];
  13.680 +    devdata->fb_parms->out_line_stride = fb_finfo.line_length;
  13.681 +    devdata->fb_parms->in_line_stride = window->w * /*txdata->bpp / 8*/4;
  13.682 +    devdata->fb_parms->bounded_input_height = data->bounded_height;
  13.683 +    devdata->fb_parms->bounded_input_width = data->bounded_width;
  13.684 +    //devdata->fb_parms->fb_pixel_size = txdata->bpp / 8;
  13.685 +    devdata->fb_parms->fb_pixel_size = 4;//SDL_BYTESPERPIXEL(window->format);
  13.686 +
  13.687 +    deprintf(3, "[PS3->SPU] fb_thread_data->argp = 0x%x\n", devdata->fb_thread_data->argp);
  13.688 +
  13.689 +    /* Copying.. */
  13.690 +    SPE_SendMsg(devdata->fb_thread_data, SPU_START);
  13.691 +    SPE_SendMsg(devdata->fb_thread_data, (unsigned int)devdata->fb_thread_data->argp);
  13.692 +
  13.693 +    SPE_WaitForMsg(devdata->fb_thread_data, SPU_FIN);
  13.694 +
  13.695 +    /* Wait for vsync */
  13.696 +    if (renderer->info.flags & SDL_RENDERER_PRESENTVSYNC) {
  13.697 +        unsigned long crt = 0;
  13.698 +        deprintf(1, "[PS3] Wait for vsync\n");
  13.699 +        ioctl(devdata->fbdev, FBIO_WAITFORVSYNC, &crt);
  13.700 +    }
  13.701 +
  13.702 +    /* Page flip */
  13.703 +    deprintf(1, "[PS3] Page flip to buffer #%u 0x%x\n", data->current_screen, data->center[data->current_screen]);
  13.704 +    ioctl(devdata->fbdev, PS3FB_IOCTL_FSEL, (unsigned long)&data->current_screen);
  13.705 +
  13.706 +    /* Update the flipping chain, if any */
  13.707 +    if (data->double_buffering) {
  13.708 +        data->current_screen = (data->current_screen + 1) % 2;
  13.709 +    }
  13.710 +    deprintf(1, "-SDL_PS3_RenderPresent()\n");
  13.711 +}
  13.712 +
  13.713 +/* Tear down a PS3 renderer: free the screen surface, shut down the converter SPE,
  13.714 +   release the SPE parameter/thread blocks, then the driver data and the renderer itself. */
  13.715 +static void
  13.716 +SDL_PS3_DestroyRenderer(SDL_Renderer * renderer)
  13.717 +{
  13.718 +    deprintf(1, "+SDL_PS3_DestroyRenderer()\n");
  13.719 +    SDL_PS3_RenderData *data =
  13.720 +        (SDL_PS3_RenderData *) renderer->driverdata;
  13.721 +
  13.722 +    if (data) {
  13.723 +        /* screen is one surface pointer; the old SDL_arraysize() loop over it ran zero times and leaked it */
  13.724 +        if (data->screen) {
  13.725 +            SDL_FreeSurface(data->screen);
  13.726 +        }
  13.727 +
  13.728 +        /* Shutdown SPE and release related resources */
  13.729 +        if (data->scaler_thread_data) {
  13.730 +            free((void *)data->scaler_thread_data);
  13.731 +        }
  13.732 +        if (data->scaler_parms) {
  13.733 +            free((void *)data->scaler_parms);
  13.734 +        }
  13.735 +        if (data->converter_thread_data) {
  13.736 +            SPE_Shutdown(data->converter_thread_data);
  13.737 +            free((void *)data->converter_thread_data);
  13.738 +        }
  13.739 +        if (data->converter_parms) {
  13.740 +            free((void *)data->converter_parms);
  13.741 +        }
  13.742 +
  13.743 +        SDL_free(data);
  13.744 +    }
  13.745 +    SDL_free(renderer);
  13.746 +    deprintf(1, "-SDL_PS3_DestroyRenderer()\n");
  13.747 +}
  13.748 +
  13.749 +/* vi: set ts=4 sw=4 expandtab: */
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/src/video/ps3/SDL_ps3render_c.h	Mon Sep 07 04:51:29 2009 +0000
    14.3 @@ -0,0 +1,29 @@
    14.4 +/*
    14.5 +    SDL - Simple DirectMedia Layer
    14.6 +    Copyright (C) 1997-2009 Sam Lantinga
    14.7 +
    14.8 +    This library is free software; you can redistribute it and/or
    14.9 +    modify it under the terms of the GNU Lesser General Public
   14.10 +    License as published by the Free Software Foundation; either
   14.11 +    version 2.1 of the License, or (at your option) any later version.
   14.12 +
   14.13 +    This library is distributed in the hope that it will be useful,
   14.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   14.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   14.16 +    Lesser General Public License for more details.
   14.17 +
   14.18 +    You should have received a copy of the GNU Lesser General Public
   14.19 +    License along with this library; if not, write to the Free Software
   14.20 +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   14.21 +
   14.22 +    Sam Lantinga
   14.23 +    slouken@libsdl.org
   14.24 +*/
   14.25 +#include "SDL_config.h"
   14.26 +
   14.27 +/* Default framebuffer device on PS3 */
   14.28 +/* SDL surface based renderer implementation */
   14.29 +
   14.30 +extern SDL_RenderDriver SDL_PS3_RenderDriver;
   14.31 +
   14.32 +/* vi: set ts=4 sw=4 expandtab: */
    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/src/video/ps3/SDL_ps3spe.c	Mon Sep 07 04:51:29 2009 +0000
    15.3 @@ -0,0 +1,166 @@
    15.4 +/*
    15.5 +    SDL - Simple DirectMedia Layer
    15.6 +    Copyright (C) 1997-2009 Sam Lantinga
    15.7 +
    15.8 +    This library is free software; you can redistribute it and/or
    15.9 +    modify it under the terms of the GNU Lesser General Public
   15.10 +    License as published by the Free Software Foundation; either
   15.11 +    version 2.1 of the License, or (at your option) any later version.
   15.12 +
   15.13 +    This library is distributed in the hope that it will be useful,
   15.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   15.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   15.16 +    Lesser General Public License for more details.
   15.17 +
   15.18 +    You should have received a copy of the GNU Lesser General Public
   15.19 +    License along with this library; if not, write to the Free Software
   15.20 +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   15.21 +
   15.22 +    Sam Lantinga
   15.23 +    slouken@libsdl.org
   15.24 +*/
   15.25 +#include "SDL_config.h"
   15.26 +
   15.27 +#include "SDL_video.h"
   15.28 +#include "SDL_ps3spe_c.h"
   15.29 +
   15.30 +#include "SDL_ps3video.h"
   15.31 +#include "SDL_ps3render_c.h"
   15.32 +
   15.33 +/* Start the SPE thread, booting the SPE first if needed; returns 0 on success, -1 on failure */
   15.34 +int SPE_Start(spu_data_t * spe_data)
   15.35 +{
   15.36 +  deprintf(2, "[PS3->SPU] Start SPE: %s\n", spe_data->program_name);
   15.37 +  if (!(spe_data->booted) && SPE_Boot(spe_data) < 0)
   15.38 +    return -1;  /* SPE_Boot() has already set the error; there is no context to run */
   15.39 +  /* To allow re-running of context, spe_ctx_entry has to be set before each call */
   15.40 +  spe_data->entry = SPE_DEFAULT_ENTRY;
   15.41 +  spe_data->error_code = 0;
   15.42 +
   15.43 +  /* Create SPE thread and run */
   15.44 +  deprintf(2, "[PS3->SPU] Create Thread: %s\n", spe_data->program_name);
   15.45 +  if (pthread_create
   15.46 +      (&spe_data->thread, NULL, (void *)&SPE_RunContext, (void *)spe_data)) {
   15.47 +    deprintf(2, "[PS3->SPU] Could not create pthread for spe: %s\n", spe_data->program_name);
   15.48 +    SDL_SetError("[PS3->SPU] Could not create pthread for spe");
   15.49 +    return -1;
   15.50 +  }
   15.51 +  /* A keepalive program is waited on for SPU_READY; pass on whether that was the message received */
   15.52 +  if (spe_data->keepalive)
   15.53 +    return SPE_WaitForMsg(spe_data, SPU_READY);
   15.54 +  return 0;     /* previously control fell off the end of this non-void function */
   15.55 +}
   15.56 +
   15.57 +/* Join the SPE's pthread; returns 0 once it has finished, -1 if the join fails */
   15.58 +int SPE_Stop(spu_data_t * spe_data)
   15.59 +{
   15.60 +  int join_status;
   15.61 +  deprintf(2, "[PS3->SPU] Stop SPE: %s\n", spe_data->program_name);
   15.62 +  deprintf(2, "[PS3->SPU] Wait for SPE thread to complete: %s\n", spe_data->program_name);
   15.63 +  join_status = pthread_join(spe_data->thread, NULL);
   15.64 +  if (join_status == 0)
   15.65 +    return 0;
   15.66 +
   15.67 +  deprintf(2, "[PS3->SPU] Failed joining the thread: %s\n", spe_data->program_name);
   15.68 +  SDL_SetError("[PS3->SPU] Failed joining the thread");
   15.69 +  return -1;
   15.70 +}
   15.71 +
   15.72 +/* Create the SPE context and load the program into it; returns 0 on success, -1 on failure */
   15.73 +int SPE_Boot(spu_data_t * spe_data)
   15.74 +{
   15.75 +  /* Create SPE context */
   15.76 +  deprintf(2, "[PS3->SPU] Create SPE Context: %s\n", spe_data->program_name);
   15.77 +  spe_data->ctx = spe_context_create(0, NULL);
   15.78 +  if (spe_data->ctx == NULL) {
   15.79 +    deprintf(2, "[PS3->SPU] Failed creating SPE context: %s\n", spe_data->program_name);
   15.80 +    SDL_SetError("[PS3->SPU] Failed creating SPE context");
   15.81 +    return -1;
   15.82 +  }
   15.83 +
   15.84 +  /* Load SPE object into SPE local store */
   15.85 +  deprintf(2, "[PS3->SPU] Load Program into SPE: %s\n", spe_data->program_name);
   15.86 +  if (spe_program_load(spe_data->ctx, &spe_data->program)) {
   15.87 +    deprintf(2, "[PS3->SPU] Failed loading program into SPE context: %s\n", spe_data->program_name);
   15.88 +    SDL_SetError("[PS3->SPU] Failed loading program into SPE context");
   15.89 +    spe_context_destroy(spe_data->ctx);     /* do not leak the context created above */
   15.90 +    spe_data->ctx = NULL;
   15.91 +    return -1;
   15.92 +  }
   15.93 +  spe_data->booted = 1;
   15.94 +  deprintf(2, "[PS3->SPU] SPE boot successful\n");
   15.95 +  return 0;
   15.96 +}
   15.97 +
   15.98 +/* (Stop and) shut down the SPE, then destroy its context; returns 0, or -1 if destroying the context fails */
   15.99 +int SPE_Shutdown(spu_data_t * spe_data)
  15.100 +{
  15.101 +  if (spe_data->keepalive && spe_data->booted) {
  15.102 +    SPE_SendMsg(spe_data, SPU_EXIT);
  15.103 +    SPE_Stop(spe_data);
  15.104 +  }
  15.105 +  /* NOTE(review): the context is destroyed even when booted is 0 -- confirm ctx is valid in that case */
  15.106 +  /* Destroy SPE context */
  15.107 +  deprintf(2, "[PS3->SPU] Destroy SPE context: %s\n", spe_data->program_name);
  15.108 +  if (spe_context_destroy(spe_data->ctx)) {
  15.109 +    deprintf(2, "[PS3->SPU] Failed destroying context: %s\n", spe_data->program_name);
  15.110 +    SDL_SetError("[PS3->SPU] Failed destroying context");
  15.111 +    return -1;
  15.112 +  }
  15.113 +  deprintf(2, "[PS3->SPU] SPE shutdown successful: %s\n", spe_data->program_name);
  15.114 +  return 0;
  15.115 +}
  15.116 +
  15.117 +/* Send one message to the SPE through its inbound mailbox; returns 0 on success, -1 on failure */
  15.118 +int SPE_SendMsg(spu_data_t * spe_data, unsigned int msg)
  15.119 +{
  15.120 +  deprintf(2, "[PS3->SPU] Sending message %u to %s\n", msg, spe_data->program_name);
  15.121 +  /* Blocking write of a single mailbox word */
  15.122 +  unsigned int outgoing = msg;
  15.123 +  int written = spe_in_mbox_write(spe_data->ctx, &outgoing, 1, SPE_MBOX_ALL_BLOCKING);
  15.124 +
  15.125 +  if (written >= 1)
  15.126 +    return 0;
  15.127 +
  15.128 +  /* Nothing was written */
  15.129 +  deprintf(2, "[PS3->SPU] No message could be written to %s\n", spe_data->program_name);
  15.130 +  SDL_SetError("[PS3->SPU] No message could be written");
  15.131 +  return -1;
  15.132 +}
  15.133 +
  15.134 +
  15.135 +/* Busy-wait for one outbound mailbox message from the SPE; returns 0 if it equals msg, -1 otherwise */
  15.136 +int SPE_WaitForMsg(spu_data_t * spe_data, unsigned int msg)
  15.137 +{
  15.138 +  deprintf(2, "[PS3->SPU] Waiting for message from %s\n", spe_data->program_name);
  15.139 +  unsigned int received;
  15.140 +
  15.141 +  /* Spin until the outbound mailbox reports at least one entry */
  15.142 +  while (spe_out_mbox_status(spe_data->ctx) == 0) {
  15.143 +  }
  15.144 +  spe_out_mbox_read(spe_data->ctx, &received, 1);
  15.145 +  deprintf(2, "[PS3->SPU] Got message from %s, message was %u\n", spe_data->program_name, received);
  15.146 +  return (received == msg) ? 0 : -1;
  15.147 +}
  15.148 +
  15.149 +/* Re-runnable invocation of the spe_context_run call; never returns to its caller */
  15.150 +void SPE_RunContext(void *thread_argp)
  15.151 +{
  15.152 +  /* thread_argp is the spu_data_t of the program; its argp member is passed on to the SPE program */
  15.153 +  spu_data_t *args = (spu_data_t *) thread_argp;
  15.154 +  deprintf(3, "[PS3->SPU] void* argp=0x%x\n", (unsigned int)args->argp);
  15.155 +
  15.156 +  /* Run it.. (a failing run ends the whole process through exit(1)) */
  15.157 +  deprintf(2, "[PS3->SPU] Run SPE program: %s\n", args->program_name);
  15.158 +  if (spe_context_run
  15.159 +      (args->ctx, &args->entry, 0, (void *)args->argp, NULL,
  15.160 +       NULL) < 0) {
  15.161 +    deprintf(2, "[PS3->SPU] Failed running SPE context: %s\n", args->program_name);
  15.162 +    SDL_SetError("[PS3->SPU] Failed running SPE context: %s", args->program_name);
  15.163 +    exit(1);
  15.164 +  }
  15.165 +
  15.166 +  pthread_exit(NULL);
  15.167 +}
  15.168 +
  15.169 +/* vi: set ts=4 sw=4 expandtab: */
    16.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.2 +++ b/src/video/ps3/SDL_ps3spe_c.h	Mon Sep 07 04:51:29 2009 +0000
    16.3 @@ -0,0 +1,87 @@
    16.4 +/*
    16.5 +    SDL - Simple DirectMedia Layer
    16.6 +    Copyright (C) 1997-2009 Sam Lantinga
    16.7 +
    16.8 +    This library is free software; you can redistribute it and/or
    16.9 +    modify it under the terms of the GNU Lesser General Public
   16.10 +    License as published by the Free Software Foundation; either
   16.11 +    version 2.1 of the License, or (at your option) any later version.
   16.12 +
   16.13 +    This library is distributed in the hope that it will be useful,
   16.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   16.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   16.16 +    Lesser General Public License for more details.
   16.17 +
   16.18 +    You should have received a copy of the GNU Lesser General Public
   16.19 +    License along with this library; if not, write to the Free Software
   16.20 +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   16.21 +
   16.22 +    Sam Lantinga
   16.23 +    slouken@libsdl.org
   16.24 +*/
   16.25 +
   16.26 +/* This SPE API basically provides 3 ways to run and control a program
   16.27 + * on the SPE:
   16.28 + * - Start and stop the program (keepalive=0).
   16.29 + *   SPE_Start() will implicitly boot up the program, create a thread and run
   16.30 + *   the context.
   16.31 + *   SPE_Stop() will join the (terminated) thread (may block) and return.
   16.32 + * - Boot the program and run it (keepalive=0).
   16.33 + *   SPE_Boot() will create a context and load the program and finally start
   16.34 + *   the context with SPE_Start().
   16.35 + *   SPE_Stop() will safely end the program.
   16.36 + * - Boot, Run and send messages to the program (keepalive=1).
   16.37 + *   Start the program by using one of the methods described above. Once
   16.38 + *   the READY-message is received, the program is in its infinite loop
   16.39 + *   waiting for new messages.
   16.40 + *   Every time you run the program, send SPU_START and the address of the
   16.41 + *   corresponding struct using SPE_SendMsg().
   16.42 + *   SPE_WaitForMsg() will then wait for SPU_FIN and is blocking.
   16.43 + *   SPE_Shutdown() sends SPU_EXIT and finally stops the program.
   16.44 + *
   16.45 + * Therefore the SPE program
   16.46 + * - either runs once and returns
   16.47 + * - or runs in an infinite loop and is controlled by messages.
   16.48 + */
   16.49 +
   16.50 +#include "SDL_config.h"
   16.51 +
   16.52 +#include "spulibs/spu_common.h"
   16.53 +
   16.54 +#include <libspe2.h>
   16.55 +
   16.56 +#ifndef _SDL_ps3spe_h
   16.57 +#define _SDL_ps3spe_h
   16.58 +
   16.59 +/* SPU handling data */
   16.60 +typedef struct spu_data {
   16.61 +    /* Context to be executed */
   16.62 +    spe_context_ptr_t ctx;
   16.63 +    spe_program_handle_t program;
   16.64 +    /* Thread running the context */
   16.65 +    pthread_t thread;
   16.66 +    /* For debugging */
   16.67 +    char * program_name;
   16.68 +    /* SPE_Start() or SPE_Boot() called */
   16.69 +    unsigned int booted;
   16.70 +    /* Runs the program in an infinite loop? */
   16.71 +    unsigned int keepalive;
   16.72 +    unsigned int entry;         /* passed by address to spe_context_run() */
   16.73 +    /* Exit code of the program */
   16.74 +    int error_code;
   16.75 +    /* Arguments passed to the program */
   16.76 +    void * argp;
   16.77 +} spu_data_t;
   16.78 +
   16.79 +/* SPU specific API functions */
   16.80 +int SPE_Start(spu_data_t * spe_data);
   16.81 +int SPE_Stop(spu_data_t * spe_data);
   16.82 +int SPE_Boot(spu_data_t * spe_data);
   16.83 +int SPE_Shutdown(spu_data_t * spe_data);
   16.84 +int SPE_SendMsg(spu_data_t * spe_data, unsigned int msg);
   16.85 +int SPE_WaitForMsg(spu_data_t * spe_data, unsigned int msg);
   16.86 +void SPE_RunContext(void *thread_argp);
   16.87 +
   16.88 +#endif /* _SDL_ps3spe_h */
   16.89 +
   16.90 +/* vi: set ts=4 sw=4 expandtab: */
    17.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.2 +++ b/src/video/ps3/SDL_ps3video.c	Mon Sep 07 04:51:29 2009 +0000
    17.3 @@ -0,0 +1,224 @@
    17.4 +/*
    17.5 +    SDL - Simple DirectMedia Layer
    17.6 +    Copyright (C) 1997-2009 Sam Lantinga
    17.7 +
    17.8 +    This library is free software; you can redistribute it and/or
    17.9 +    modify it under the terms of the GNU Lesser General Public
   17.10 +    License as published by the Free Software Foundation; either
   17.11 +    version 2.1 of the License, or (at your option) any later version.
   17.12 +
   17.13 +    This library is distributed in the hope that it will be useful,
   17.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   17.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   17.16 +    Lesser General Public License for more details.
   17.17 +
   17.18 +    You should have received a copy of the GNU Lesser General Public
   17.19 +    License along with this library; if not, write to the Free Software
   17.20 +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   17.21 +
   17.22 +    Sam Lantinga
   17.23 +    slouken@libsdl.org
   17.24 +*/
   17.25 +#include "SDL_config.h"
   17.26 +
   17.27 +/* SDL PS3 video driver implementation based on dummy video driver
   17.28 + *
   17.29 + * Initial work by Ryan C. Gordon (icculus@icculus.org). A good portion
   17.30 + *  of this was cut-and-pasted from Stephane Peter's work in the AAlib
   17.31 + *  SDL video driver.  Renamed to "DUMMY" by Sam Lantinga.
   17.32 + */
   17.33 +
   17.34 +#include "SDL_video.h"
   17.35 +#include "SDL_mouse.h"
   17.36 +#include "../SDL_sysvideo.h"
   17.37 +#include "../SDL_pixels_c.h"
   17.38 +#include "../../events/SDL_events_c.h"
   17.39 +
   17.40 +#include "SDL_ps3video.h"
   17.41 +#include "SDL_ps3spe_c.h"
   17.42 +#include "SDL_ps3events_c.h"
   17.43 +#include "SDL_ps3render_c.h"
   17.44 +#include "SDL_ps3modes_c.h"
   17.45 +
   17.46 +#include <fcntl.h>
   17.47 +#include <linux/fb.h>
   17.48 +#include <asm/ps3fb.h>
   17.49 +#include <sys/mman.h>
   17.50 +
   17.51 +#define PS3VID_DRIVER_NAME "ps3"
   17.52 +
   17.53 +/* Initialization/Query functions */
   17.54 +static int PS3_VideoInit(_THIS);
   17.55 +static void PS3_VideoQuit(_THIS);
   17.56 +
   17.57 +/* Stores the SPE executable name of fb_writer_spu */
   17.58 +extern spe_program_handle_t fb_writer_spu;
   17.59 +
   17.60 +/* PS3 driver bootstrap functions */
   17.61 +
   17.62 +static int
   17.63 +PS3_Available(void)
   17.64 +{
   17.65 +    const char *requested;
   17.66 +    deprintf(1, "+PS3_Available()\n");
   17.67 +    /* The driver is only offered when it is selected through the environment */
   17.68 +    requested = SDL_getenv("SDL_VIDEODRIVER");
   17.69 +    if (requested != NULL && SDL_strcmp(requested, PS3VID_DRIVER_NAME) == 0)
   17.70 +        return 1;
   17.71 +    deprintf(1, "-PS3_Available()\n");
   17.72 +    return 0;
   17.73 +}
   17.74 +
   17.75 +static void
   17.76 +PS3_DeleteDevice(SDL_VideoDevice * device)
   17.77 +{
   17.78 +    deprintf(1, "+PS3_DeleteDevice()\n");
   17.79 +    SDL_free(device->driverdata);       /* the SDL_VideoData set up by PS3_CreateDevice() */
   17.80 +    SDL_free(device);
   17.81 +    deprintf(1, "-PS3_DeleteDevice()\n");
   17.82 +}
   17.83 +
   17.84 +static SDL_VideoDevice *
   17.85 +PS3_CreateDevice(int devindex)
   17.86 +{
   17.87 +    deprintf(1, "+PS3_CreateDevice()\n");
   17.88 +    SDL_VideoDevice *device;
   17.89 +    SDL_VideoData *data;
   17.90 +
   17.91 +    /* Allocate the device zero-initialized, so that every field we clean
   17.92 +     * up on shutdown starts out as NULL/0 (devindex is not used here) */
   17.93 +    device = (SDL_VideoDevice *) SDL_calloc(1, sizeof(SDL_VideoDevice));
   17.94 +    if (!device) {
   17.95 +        SDL_OutOfMemory();
   17.96 +        return NULL;
   17.97 +    }
   17.98 +
   17.99 +    /* Driver private data, released again in PS3_DeleteDevice() */
  17.100 +    data = (struct SDL_VideoData *) SDL_calloc(1, sizeof(SDL_VideoData));
  17.101 +    if (!data) {
  17.102 +        SDL_OutOfMemory();
  17.103 +        SDL_free(device);
  17.104 +        return NULL;
  17.105 +    }
  17.106 +    device->driverdata = data;
  17.107 +
  17.108 +    /* Set the function pointers */
  17.109 +    device->VideoInit = PS3_VideoInit;
  17.110 +    device->VideoQuit = PS3_VideoQuit;
  17.111 +    device->SetDisplayMode = PS3_SetDisplayMode;
  17.112 +    device->GetDisplayModes = PS3_GetDisplayModes;
  17.113 +    device->PumpEvents = PS3_PumpEvents;
  17.114 +
  17.115 +    device->free = PS3_DeleteDevice;
  17.116 +
  17.117 +    deprintf(1, "-PS3_CreateDevice()\n");
  17.118 +    return device;
  17.119 +}
  17.120 +
  17.121 +VideoBootStrap PS3_bootstrap = {
  17.122 +    PS3VID_DRIVER_NAME, "SDL PS3 Cell video driver",   /* driver name and description */
  17.123 +    PS3_Available, PS3_CreateDevice                     /* availability check and device factory */
  17.124 +};
  17.125 +
  17.126 +
  17.127 +int
  17.128 +PS3_VideoInit(_THIS)
  17.129 +{
  17.130 +    deprintf(1, "PS3_VideoInit()\n");
  17.131 +
  17.132 +    SDL_VideoData *data = (SDL_VideoData *) _this->driverdata;
  17.133 +
  17.134 +    /* Create SPU fb_parms and thread structure. The thread structure is
  17.135 +     * zero-initialized so that PS3_VideoQuit() never sees garbage in it */
  17.136 +    data->fb_parms = (struct fb_writer_parms_t *)
  17.137 +        memalign(16, sizeof(struct fb_writer_parms_t));
  17.138 +    data->fb_thread_data = (spu_data_t *) calloc(1, sizeof(spu_data_t));
  17.139 +    if (data->fb_parms == NULL || data->fb_thread_data == NULL) {
  17.140 +        SDL_OutOfMemory();
  17.141 +        return -1;
  17.142 +    }
  17.143 +    data->fb_thread_data->program = fb_writer_spu;
  17.144 +    data->fb_thread_data->program_name = "fb_writer_spu";
  17.145 +    data->fb_thread_data->argp = (void *)data->fb_parms;
  17.146 +    data->fb_thread_data->keepalive = 1;
  17.147 +    data->fb_thread_data->booted = 0;
  17.148 +
  17.149 +    SPE_Start(data->fb_thread_data);    /* NOTE(review): result ignored - confirm SPE_Start()'s return convention */
  17.150 +
  17.151 +    /* Open the device */
  17.152 +    data->fbdev = open(PS3DEV, O_RDWR);
  17.153 +    if (data->fbdev < 0) {
  17.154 +        SDL_SetError("[PS3] Unable to open device %s", PS3DEV);
  17.155 +        return -1;
  17.156 +    }
  17.157 +
  17.158 +    /* Take control of frame buffer from kernel, for details see
  17.159 +     * http://felter.org/wesley/files/ps3/linux-20061110-docs/ApplicationProgrammingEnvironment.html
  17.160 +     * kernel will no longer flip the screen itself
  17.161 +     */
  17.162 +    ioctl(data->fbdev, PS3FB_IOCTL_ON, 0);
  17.163 +
  17.164 +    /* Unblank screen */
  17.165 +    ioctl(data->fbdev, FBIOBLANK, 0);
  17.166 +
  17.167 +    struct fb_fix_screeninfo fb_finfo;
  17.168 +    if (ioctl(data->fbdev, FBIOGET_FSCREENINFO, &fb_finfo)) {
  17.169 +        SDL_SetError("[PS3] Can't get fixed screeninfo");
  17.170 +        return -1;
  17.171 +    }
  17.172 +
  17.173 +    /* Note: on PS3, fb_finfo.smem_len is enough for double buffering */
  17.174 +    if ((data->frame_buffer = (uint8_t *)mmap(0, fb_finfo.smem_len,
  17.175 +        PROT_READ | PROT_WRITE, MAP_SHARED,
  17.176 +        data->fbdev, 0)) == (uint8_t *) MAP_FAILED) {
  17.177 +        /* Do not leave MAP_FAILED behind for PS3_VideoQuit() to munmap */
  17.178 +        data->frame_buffer = NULL;
  17.179 +        SDL_SetError("[PS3] Can't mmap for %s", PS3DEV);
  17.180 +        return -1;
  17.181 +    }
  17.182 +
  17.183 +    /* Blank screen */
  17.184 +    memset(data->frame_buffer, 0x00, fb_finfo.smem_len);
  17.185 +
  17.186 +    PS3_InitModes(_this);
  17.187 +    SDL_AddRenderDriver(0, &SDL_PS3_RenderDriver);
  17.188 +
  17.189 +    /* We're done! */
  17.190 +    return 0;
  17.191 +}
  17.192 +
  17.193 +void
  17.194 +PS3_VideoQuit(_THIS)
  17.195 +{
  17.196 +    deprintf(1, "PS3_VideoQuit()\n");
  17.197 +    SDL_VideoData *data = (SDL_VideoData *) _this->driverdata;
  17.198 +
  17.199 +    PS3_QuitModes(_this);
  17.200 +
  17.201 +    /* Unmap framebuffer */
  17.202 +    if (data->frame_buffer) {
  17.203 +        struct fb_fix_screeninfo fb_finfo;
  17.204 +        if (ioctl(data->fbdev, FBIOGET_FSCREENINFO, &fb_finfo) != -1) {
  17.205 +            munmap(data->frame_buffer, fb_finfo.smem_len);
  17.206 +            data->frame_buffer = 0;
  17.207 +        }
  17.208 +    }
  17.209 +
  17.210 +    /* Stop the SPE before releasing fb_parms, which was passed to it as argp */
  17.211 +    if (data->fb_thread_data) {
  17.212 +        SPE_Shutdown(data->fb_thread_data);
  17.213 +        free((void *)data->fb_thread_data);
  17.214 +        data->fb_thread_data = NULL;    /* guards against a second free */
  17.215 +    }
  17.216 +    free((void *)data->fb_parms);       /* free(NULL) is a no-op */
  17.217 +    data->fb_parms = NULL;
  17.218 +
  17.219 +    /* Close device (0: never opened, -1: open() failed or already closed) */
  17.220 +    if (data->fbdev > 0) {
  17.221 +        ioctl(data->fbdev, PS3FB_IOCTL_OFF, 0); /* give the frame buffer back to the kernel */
  17.222 +        close(data->fbdev);
  17.223 +        data->fbdev = -1;
  17.224 +    }
  17.225 +}
  17.226 +
  17.227 +/* vi: set ts=4 sw=4 expandtab: */
    18.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.2 +++ b/src/video/ps3/SDL_ps3video.h	Mon Sep 07 04:51:29 2009 +0000
    18.3 @@ -0,0 +1,79 @@
    18.4 +/*
    18.5 +    SDL - Simple DirectMedia Layer
    18.6 +    Copyright (C) 1997-2009 Sam Lantinga
    18.7 +
    18.8 +    This library is free software; you can redistribute it and/or
    18.9 +    modify it under the terms of the GNU Lesser General Public
   18.10 +    License as published by the Free Software Foundation; either
   18.11 +    version 2.1 of the License, or (at your option) any later version.
   18.12 +
   18.13 +    This library is distributed in the hope that it will be useful,
   18.14 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   18.15 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   18.16 +    Lesser General Public License for more details.
   18.17 +
   18.18 +    You should have received a copy of the GNU Lesser General Public
   18.19 +    License along with this library; if not, write to the Free Software
   18.20 +    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
   18.21 +
   18.22 +    Sam Lantinga
   18.23 +    slouken@libsdl.org
   18.24 +*/
   18.25 +#include "SDL_config.h"
   18.26 +
   18.27 +#ifndef _SDL_ps3video_h
   18.28 +#define _SDL_ps3video_h
   18.29 +
   18.30 +#include "../SDL_sysvideo.h"
   18.31 +#include "SDL_ps3spe_c.h"
   18.32 +
   18.33 +#include <linux/fb.h>
   18.34 +#include <asm/ps3fb.h>
   18.35 +
   18.36 +/* Debugging: deprintf() prints messages whose level is <= DEBUG_LEVEL
   18.37 + * 0: No debug messages
   18.38 + * 1: Video debug messages
   18.39 + * 2: SPE debug messages
   18.40 + * 3: Memory addresses
   18.41 + */
   18.42 +#define DEBUG_LEVEL 0
   18.43 +
   18.44 +#ifdef DEBUG_LEVEL
   18.45 +#define deprintf( level, fmt, args... ) \
   18.46 +    do \
   18.47 +{ \
   18.48 +    if ( (unsigned)(level) <= DEBUG_LEVEL ) \
   18.49 +    { \
   18.50 +        fprintf( stdout, fmt, ##args ); \
   18.51 +        fflush( stdout ); \
   18.52 +    } \
   18.53 +} while ( 0 )
   18.54 +#else
   18.55 +#define deprintf( level, fmt, args... )
   18.56 +#endif
   18.57 +
   18.58 +/* Default framebuffer device on PS3 */
   18.59 +#define PS3DEV "/dev/fb0"
   18.60 +
   18.61 +/* Private display data */
   18.62 +typedef struct SDL_VideoData
   18.63 +{
   18.64 +    /* Framebuffer device descriptor */
   18.65 +    int fbdev;
   18.66 +    /* mmap'd access to fbdev */
   18.67 +    uint8_t * frame_buffer;
   18.68 +    /* SPE threading stuff of the framebuffer */
   18.69 +    spu_data_t * fb_thread_data;
   18.70 +    /* Framebuffer transfer data; NOTE(review): the aligned attribute applies to this pointer member, not to the pointee */
   18.71 +    volatile struct fb_writer_parms_t * fb_parms __attribute__((aligned(128)));
   18.72 +} SDL_VideoData;
   18.73 +
   18.74 +typedef struct SDL_DisplayModeData
   18.75 +{
   18.76 +    unsigned long mode;         /* presumably the ps3fb video mode ID - TODO confirm */
   18.77 +    //struct ps3fb_ioctl_res res;
   18.78 +} PS3_DisplayModeData;
   18.79 +
   18.80 +#endif /* _SDL_ps3video_h */
   18.81 +
   18.82 +/* vi: set ts=4 sw=4 expandtab: */
    19.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    19.2 +++ b/src/video/ps3/spulibs/Makefile	Mon Sep 07 04:51:29 2009 +0000
    19.3 @@ -0,0 +1,47 @@
    19.4 +# This Makefile is for building the CELL BE SPU libs
    19.5 +# libfb_writer_spu.so, libyuv2rgb_spu.so, libbilin_scaler_spu.so
    19.6 +
    19.7 +# Toolchain
    19.8 +PPU_LD=/usr/bin/ld
    19.9 +SPU_SRCDIR=$(srcdir)/src/video/ps3/spulibs
   19.10 +SPU_LIBDIR=$(srcdir)/src/video/ps3/spulibs/libs
   19.11 +SPU_CFLAGS=-g -W -Wall -Winline -Wno-main -I. -I /usr/spu/include -I /opt/cell/sdk/usr/spu/include -finline-limit=10000 -Winline -ftree-vectorize -funroll-loops -fmodulo-sched -ffast-math -fPIC -O2
   19.12 +
   19.13 +DEPS = $(SPU_SRCDIR)/spu_common.h
   19.14 +LIBS= fb_writer yuv2rgb bilin_scaler
   19.15 +
   19.16 +OBJLIBS = $(foreach lib,$(LIBS),lib$(lib)_spu.a)
   19.17 +SHALIBS = $(foreach lib,$(LIBS),lib$(lib)_spu.so)
   19.18 +
   19.19 +
   19.20 +ps3libs: $(foreach lib,$(OBJLIBS),$(SPU_LIBDIR)/$(lib)) $(foreach lib,$(SHALIBS),$(SPU_LIBDIR)/$(lib))
   19.21 +
   19.22 +
   19.23 +$(SPU_LIBDIR)/lib%_spu.a: $(SPU_LIBDIR)/%-embed.o
   19.24 +	$(AR) -qcs $@ $<
   19.25 +
   19.26 +$(SPU_LIBDIR)/lib%_spu.so: $(SPU_LIBDIR)/%-embed.o
   19.27 +	$(PPU_LD) -o $@ -shared -soname=$(notdir $@) $<
   19.28 +
   19.29 +$(SPU_LIBDIR)/%-embed.o: $(SPU_LIBDIR)/%.o
   19.30 +	$(EMBEDSPU) -m32 $(subst -embed.o,,$(notdir $@))_spu $< $@
   19.31 +
   19.32 +$(SPU_LIBDIR)/%.o: $(SPU_SRCDIR)/%.c $(DEPS)
   19.33 +	$(SPU_GCC) $(SPU_CFLAGS) -o $@ $< -lm
   19.34 +
   19.35 +
   19.36 +ps3libs-install: $(foreach obj,$(OBJLIBS),$(SPU_LIBDIR)/$(obj)) $(foreach obj,$(SHALIBS),$(SPU_LIBDIR)/$(obj)) # static libs 0644, shared libs 0755
   19.37 +	for file in $(OBJLIBS); do \
   19.38 +		$(INSTALL) -c -m 0644 $(SPU_LIBDIR)/$$file $(DESTDIR)$(libdir)/$$file; \
   19.39 +	done
   19.40 +	for file in $(SHALIBS); do \
   19.41 +		$(INSTALL) -c -m 0755 $(SPU_LIBDIR)/$$file $(DESTDIR)$(libdir)/$$file; \
   19.42 +	done
   19.43 +
   19.44 +ps3libs-uninstall:
   19.45 +	for file in $(OBJLIBS) $(SHALIBS); do \
   19.46 +		rm -f $(DESTDIR)$(libdir)/$$file; \
   19.47 +	done
   19.48 +
   19.49 +ps3libs-clean:
   19.50 +	rm -f $(SPU_LIBDIR)/*
    20.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    20.2 +++ b/src/video/ps3/spulibs/bilin_scaler.c	Mon Sep 07 04:51:29 2009 +0000
    20.3 @@ -0,0 +1,2050 @@
    20.4 +/*
    20.5 + * SDL - Simple DirectMedia Layer
    20.6 + * CELL BE Support for PS3 Framebuffer
    20.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
    20.8 + *
    20.9 + * This library is free software; you can redistribute it and/or modify it
   20.10 + * under the terms of the GNU Lesser General Public License as published
   20.11 + * by the Free Software Foundation; either version 2.1 of the License, or
   20.12 + * (at your option) any later version.
   20.13 + *
   20.14 + * This library is distributed in the hope that it will be useful, but
   20.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
   20.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   20.17 + * Lesser General Public License for more details.
   20.18 + *
   20.19 + * You should have received a copy of the GNU Lesser General Public
   20.20 + * License along with this library; if not, write to the Free Software
   20.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
   20.22 + * USA
   20.23 + *
   20.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [ibm] com>
   20.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
   20.26 + *  SPE code based on research by:
   20.27 + *  Rene Becker
   20.28 + *  Thimo Emmerich
   20.29 + */
   20.30 +
   20.31 +#include "spu_common.h"
   20.32 +
   20.33 +#include <spu_intrinsics.h>
   20.34 +#include <spu_mfcio.h>
   20.35 +
   20.36 +// Debugging
   20.37 +//#define DEBUG
   20.38 +
   20.39 +#ifdef DEBUG
   20.40 +#define deprintf(fmt, args... ) \
   20.41 +	fprintf( stdout, fmt, ##args ); \
   20.42 +	fflush( stdout );
   20.43 +#else
   20.44 +#define deprintf( fmt, args... )
   20.45 +#endif
   20.46 +
   20.47 +struct scale_parms_t parms __attribute__((aligned(128)));
   20.48 +
   20.49 +/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored.
   20.50 + * There might be the need to retrieve misaligned data, so the buffers
   20.51 + * for the incoming planes are enlarged to be able to handle this (add 128)
   20.52 + */
   20.53 +unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128)));
   20.54 +unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
   20.55 +unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
   20.56 +
   20.57 +/* temp-buffer for scaling: 4 lines Y, therefore 2 lines V, 2 lines U */
   20.58 +unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
   20.59 +unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
   20.60 +unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
   20.61 +
   20.62 +/* some vectors needed by the float to int conversion */
   20.63 +static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
   20.64 +static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
   20.65 +
   20.66 +void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
   20.67 +void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
   20.68 +
   20.69 +void scale_srcw16_dstw16();
   20.70 +void scale_srcw16_dstw32();
   20.71 +void scale_srcw32_dstw16();
   20.72 +void scale_srcw32_dstw32();
   20.73 +
   20.74 +int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
   20.75 +{
   20.76 +	deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);
   20.77 +	/* Pull the scaling parameters in from main memory and wait for them */
   20.78 +	spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
   20.79 +	DMA_WAIT_TAG(TAG_INIT);
   20.80 +
   20.81 +	deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
   20.82 +			parms.dst_pixel_width, parms.dst_pixel_height);
   20.83 +	/* bit 1 set: source width is a multiple of 32; bit 0 set: destination width is */
   20.84 +	switch( ((parms.src_pixel_width & 0x1f) ? 0 : 2) | ((parms.dst_pixel_width & 0x1F) ? 0 : 1) ) {
   20.85 +	case 0:
   20.86 +		deprintf("[SPU] Using scale_srcw16_dstw16\n");
   20.87 +		scale_srcw16_dstw16();
   20.88 +		break;
   20.89 +	case 1:
   20.90 +		deprintf("[SPU] Using scale_srcw16_dstw32\n");
   20.91 +		scale_srcw16_dstw32();
   20.92 +		break;
   20.93 +	case 2:
   20.94 +		deprintf("[SPU] Using scale_srcw32_dstw16\n");
   20.95 +		scale_srcw32_dstw16();
   20.96 +		break;
   20.97 +	default:
   20.98 +		deprintf("[SPU] Using scale_srcw32_dstw32\n");
   20.99 +		scale_srcw32_dstw32();
  20.100 +		break;
  20.101 +	}
  20.102 +	deprintf("[SPU] bilin_scaler_spu... done!\n");
  20.103 +	return 0;
  20.104 +}
  20.105 +
  20.106 +
  20.107 +/*
  20.108 + * vfloat_to_vuint()
  20.109 + *
  20.110 + * clamps every lane of a float vector to the range [0.1, 255] and
  20.111 + * converts the result to an unsigned int vector
  20.112 + *
  20.113 + * @param vec_s float vector for conversion
  20.114 + * @returns converted unsigned int vector
  20.115 + */
  20.116 +inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
  20.117 +	/* clamp from below: lanes smaller than 0.1 are replaced by 0.1 */
  20.118 +	vector float clamped = spu_sel(vec_s, vec_0_1, spu_cmpgt(vec_0_1, vec_s));
  20.119 +
  20.120 +	/* clamp from above: lanes greater than 255 are replaced by 255 */
  20.121 +	clamped = spu_sel(clamped, vec_255, spu_cmpgt(clamped, vec_255));
  20.122 +	return spu_convtu(clamped,0);
  20.123 +}
  20.124 +
  20.125 +
  20.126 +/*
  20.127 + * scale_srcw16_dstw16()
  20.128 + *
  20.129 + * processes an input image of width 16
  20.130 + * scaling is done to a width 16
  20.131 + * result stored in RAM
  20.132 + */
  20.133 +void scale_srcw16_dstw16() {
  20.134 +	// extract parameters
  20.135 +	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
  20.136 +
  20.137 +	unsigned int src_width = parms.src_pixel_width;
  20.138 +	unsigned int src_height = parms.src_pixel_height;
  20.139 +	unsigned int dst_width = parms.dst_pixel_width;
  20.140 +	unsigned int dst_height = parms.dst_pixel_height;
  20.141 +
  20.142 +	// YVU
  20.143 +	unsigned int src_linestride_y = src_width;
  20.144 +	unsigned int src_dbl_linestride_y = src_width<<1;
  20.145 +	unsigned int src_linestride_vu = src_width>>1;
  20.146 +	unsigned int src_dbl_linestride_vu = src_width;
  20.147 +
  20.148 +	// scaled YVU
  20.149 +	unsigned int scaled_src_linestride_y = dst_width;
  20.150 +
  20.151 +	// ram addresses
  20.152 +	unsigned char* src_addr_y = parms.y_plane;
  20.153 +	unsigned char* src_addr_v = parms.v_plane;
  20.154 +	unsigned char* src_addr_u = parms.u_plane;
  20.155 +
  20.156 +	// for handling misalignment, addresses are precalculated
  20.157 +	unsigned char* precalc_src_addr_v = src_addr_v;
  20.158 +	unsigned char* precalc_src_addr_u = src_addr_u;
  20.159 +
  20.160 +	unsigned int dst_picture_size = dst_width*dst_height;
  20.161 +
  20.162 +	// Sizes for destination
  20.163 +	unsigned int dst_dbl_linestride_y = dst_width<<1;
  20.164 +	unsigned int dst_dbl_linestride_vu = dst_width>>1;
  20.165 +
  20.166 +	// Perform address calculation for Y, V and U in main memory with dst_addr as base
  20.167 +	unsigned char* dst_addr_main_memory_y = dst_addr;
  20.168 +	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
  20.169 +	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
  20.170 +
  20.171 +	// calculate scale factors
  20.172 +	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
  20.173 +	float y_scale = (float)src_height/(float)dst_height;
  20.174 +
  20.175 +	// double buffered processing
  20.176 +	// buffer switching
  20.177 +	unsigned int curr_src_idx = 0;
  20.178 +	unsigned int curr_dst_idx = 0;
  20.179 +	unsigned int next_src_idx, next_dst_idx;
  20.180 +
  20.181 +	// 2 lines y as output, upper and lowerline
  20.182 +	unsigned int curr_interpl_y_upper = 0;
  20.183 +	unsigned int next_interpl_y_upper;
  20.184 +	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
  20.185 +	// only 1 line v/u output, both planes have the same dimension
  20.186 +	unsigned int curr_interpl_vu = 0;
  20.187 +	unsigned int next_interpl_vu;
  20.188 +
  20.189 +	// weights, calculated in every loop iteration
  20.190 +	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
  20.191 +	vector float vf_next_NSweight_y_upper;
  20.192 +	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
  20.193 +	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
  20.194 +	vector float vf_next_NSweight_vu;
  20.195 +
  20.196 +	// line indices for the src picture
  20.197 +	float curr_src_y_upper = 0.0f, next_src_y_upper;
  20.198 +	float curr_src_y_lower, next_src_y_lower;
  20.199 +	float curr_src_vu = 0.0f, next_src_vu;
  20.200 +
  20.201 +	// line indices for the dst picture
  20.202 +	unsigned int dst_y=0, dst_vu=0;
  20.203 +
  20.204 +	// offset for the v and u plane to handle misalignement
  20.205 +	unsigned int curr_lsoff_v = 0, next_lsoff_v;
  20.206 +	unsigned int curr_lsoff_u = 0, next_lsoff_u;
  20.207 +
  20.208 +	// calculate lower line indices
  20.209 +	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
  20.210 +	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
  20.211 +	// lower line weight
  20.212 +	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
  20.213 +
  20.214 +
  20.215 +	// start partially double buffered processing
  20.216 +	// get initial data, 2 sets of y, 1 set v, 1 set u
  20.217 +	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
  20.218 +	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
  20.219 +			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
  20.220 +			src_dbl_linestride_y,
  20.221 +			RETR_BUF,
  20.222 +			0, 0 );
  20.223 +	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
  20.224 +	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
  20.225 +
  20.226 +	/* iteration loop
  20.227 +	 * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
  20.228 +	 * the scaled output is 2 lines y, 1 line v, 1 line u
  20.229 +	 * the yuv2rgb-converted output is stored to RAM
  20.230 +	 */
  20.231 +	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
  20.232 +		dst_y = dst_vu<<1;
  20.233 +
  20.234 +		// calculate next indices
  20.235 +		next_src_vu = ((float)dst_vu+1)*y_scale;
  20.236 +		next_src_y_upper = ((float)dst_y+2)*y_scale;
  20.237 +		next_src_y_lower = ((float)dst_y+3)*y_scale;
  20.238 +
  20.239 +		next_interpl_vu = (unsigned int) next_src_vu;
  20.240 +		next_interpl_y_upper = (unsigned int) next_src_y_upper;
  20.241 +		next_interpl_y_lower = (unsigned int) next_src_y_lower;
  20.242 +
  20.243 +		// calculate weight NORTH-SOUTH
  20.244 +		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
  20.245 +		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
  20.246 +		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
  20.247 +
  20.248 +		// get next lines
  20.249 +		next_src_idx = curr_src_idx^1;
  20.250 +		next_dst_idx = curr_dst_idx^1;
  20.251 +
  20.252 +		// 4 lines y
  20.253 +		mfc_get( y_plane[next_src_idx],
  20.254 +				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
  20.255 +				src_dbl_linestride_y,
  20.256 +				RETR_BUF+next_src_idx,
  20.257 +				0, 0 );
  20.258 +		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
  20.259 +				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
  20.260 +				src_dbl_linestride_y,
  20.261 +				RETR_BUF+next_src_idx,
  20.262 +				0, 0 );
  20.263 +
  20.264 +		// 2 lines v
  20.265 +		precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
  20.266 +		next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
  20.267 +		mfc_get( v_plane[next_src_idx],
  20.268 +				((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
  20.269 +				src_dbl_linestride_vu+(next_lsoff_v<<1),
  20.270 +				RETR_BUF+next_src_idx,
  20.271 +				0, 0 );
  20.272 +		// 2 lines u
  20.273 +		precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
  20.274 +		next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
  20.275 +		mfc_get( u_plane[next_src_idx],
  20.276 +				((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
  20.277 +				src_dbl_linestride_vu+(next_lsoff_v<<1),
  20.278 +				RETR_BUF+next_src_idx,
  20.279 +				0, 0 );
  20.280 +
  20.281 +		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
  20.282 +
  20.283 +		// scaling
  20.284 +		// work line y_upper
  20.285 +		bilinear_scale_line_w16( y_plane[curr_src_idx],
  20.286 +				scaled_y_plane[curr_src_idx],
  20.287 +				dst_width,
  20.288 +				vf_x_scale,
  20.289 +				vf_curr_NSweight_y_upper,
  20.290 +				src_linestride_y );
  20.291 +		// work line y_lower
  20.292 +		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
  20.293 +				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
  20.294 +				dst_width,
  20.295 +				vf_x_scale,
  20.296 +				vf_curr_NSweight_y_lower,
  20.297 +				src_linestride_y );
  20.298 +		// work line v
  20.299 +		bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
  20.300 +				scaled_v_plane[curr_src_idx],
  20.301 +				dst_width>>1,
  20.302 +				vf_x_scale,
  20.303 +				vf_curr_NSweight_vu,
  20.304 +				src_linestride_vu );
  20.305 +		// work line u
  20.306 +		bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
  20.307 +				scaled_u_plane[curr_src_idx],
  20.308 +				dst_width>>1,
  20.309 +				vf_x_scale,
  20.310 +				vf_curr_NSweight_vu,
  20.311 +				src_linestride_vu );
  20.312 +
  20.313 +
  20.314 +		// Store the result back to main memory into a destination buffer in YUV format
  20.315 +		//---------------------------------------------------------------------------------------------
  20.316 +		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  20.317 +
  20.318 +		// Perform three DMA transfers to 3 different locations in the main memory!
  20.319 +		// dst_width:	Pixel width of destination image
  20.320 +		// dst_addr:	Destination address in main memory
  20.321 +		// dst_vu:	Counter which is incremented one by one
  20.322 +		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
  20.323 +		mfc_put(	scaled_y_plane[curr_src_idx],					// What from local store (addr)
  20.324 +				(unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
  20.325 +				dst_dbl_linestride_y,						// Two Y lines (depending on the widht of the destination resolution)
  20.326 +				STR_BUF+curr_dst_idx,						// Tag
  20.327 +				0, 0 );
  20.328 +
  20.329 +		mfc_put(	scaled_v_plane[curr_src_idx],					// What from local store (addr)
  20.330 +				(unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  20.331 +				dst_dbl_linestride_vu,						// Two V lines (depending on the widht of the destination resolution)
  20.332 +				STR_BUF+curr_dst_idx,						// Tag
  20.333 +				0, 0 );
  20.334 +
  20.335 +		mfc_put(	scaled_u_plane[curr_src_idx],					// What from local store (addr)
  20.336 +				(unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  20.337 +				dst_dbl_linestride_vu,						// Two U lines (depending on the widht of the destination resolution)
  20.338 +				STR_BUF+curr_dst_idx,						// Tag
  20.339 +				0, 0 );
  20.340 +		//---------------------------------------------------------------------------------------------
  20.341 +
  20.342 +
  20.343 +		// update for next cycle
  20.344 +		curr_src_idx = next_src_idx;
  20.345 +		curr_dst_idx = next_dst_idx;
  20.346 +
  20.347 +		curr_interpl_y_upper = next_interpl_y_upper;
  20.348 +		curr_interpl_y_lower = next_interpl_y_lower;
  20.349 +		curr_interpl_vu = next_interpl_vu;
  20.350 +
  20.351 +		vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
  20.352 +		vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
  20.353 +		vf_curr_NSweight_vu = vf_next_NSweight_vu;
  20.354 +
  20.355 +		curr_src_y_upper = next_src_y_upper;
  20.356 +		curr_src_y_lower = next_src_y_lower;
  20.357 +		curr_src_vu = next_src_vu;
  20.358 +
  20.359 +		curr_lsoff_v = next_lsoff_v;
  20.360 +		curr_lsoff_u = next_lsoff_u;
  20.361 +	}
  20.362 +
  20.363 +
  20.364 +
  20.365 +	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
  20.366 +
  20.367 +	// scaling
  20.368 +	// work line y_upper
  20.369 +	bilinear_scale_line_w16( y_plane[curr_src_idx],
  20.370 +			scaled_y_plane[curr_src_idx],
  20.371 +			dst_width,
  20.372 +			vf_x_scale,
  20.373 +			vf_curr_NSweight_y_upper,
  20.374 +			src_linestride_y );
  20.375 +	// work line y_lower
  20.376 +	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
  20.377 +			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
  20.378 +			dst_width,
  20.379 +			vf_x_scale,
  20.380 +			vf_curr_NSweight_y_lower,
  20.381 +			src_linestride_y );
  20.382 +	// work line v
  20.383 +	bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
  20.384 +			scaled_v_plane[curr_src_idx],
  20.385 +			dst_width>>1,
  20.386 +			vf_x_scale,
  20.387 +			vf_curr_NSweight_vu,
  20.388 +			src_linestride_vu );
  20.389 +	// work line u
  20.390 +	bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
  20.391 +			scaled_u_plane[curr_src_idx],
  20.392 +			dst_width>>1,
  20.393 +			vf_x_scale,
  20.394 +			vf_curr_NSweight_vu,
  20.395 +			src_linestride_vu );
  20.396 +
  20.397 +
  20.398 +	// Store the result back to main memory into a destination buffer in YUV format
  20.399 +	//---------------------------------------------------------------------------------------------
  20.400 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  20.401 +
  20.402 +	// Perform three DMA transfers to 3 different locations in the main memory!
  20.403 +	// dst_width:	Pixel width of destination image
  20.404 +	// dst_addr:	Destination address in main memory
  20.405 +	// dst_vu:	Counter which is incremented one by one
  20.406 +	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
  20.407 +	mfc_put(	scaled_y_plane[curr_src_idx],					// What from local store (addr)
  20.408 +			(unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
  20.409 +			dst_dbl_linestride_y,						// Two Y lines (depending on the widht of the destination resolution)
  20.410 +			STR_BUF+curr_dst_idx,						// Tag
  20.411 +			0, 0 );
  20.412 +
  20.413 +	mfc_put(	scaled_v_plane[curr_src_idx],					// What from local store (addr)
  20.414 +			(unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  20.415 +			dst_dbl_linestride_vu,						// Two V lines (depending on the widht of the destination resolution)
  20.416 +			STR_BUF+curr_dst_idx,						// Tag
  20.417 +			0, 0 );
  20.418 +
  20.419 +	mfc_put(	scaled_u_plane[curr_src_idx],					// What from local store (addr)
  20.420 +			(unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  20.421 +			dst_dbl_linestride_vu,						// Two U lines (depending on the widht of the destination resolution)
  20.422 +			STR_BUF+curr_dst_idx,						// Tag
  20.423 +			0, 0 );
  20.424 +
  20.425 +	// wait for completion
  20.426 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  20.427 +	//---------------------------------------------------------------------------------------------
  20.428 +}
  20.429 +
  20.430 +
  20.431 +/*
  20.432 + * scale_srcw16_dstw32()
  20.433 + *
  20.434 + * bilinearly scales a planar YVU image, intended for source width 16 and destination width 32
  20.435 + * (all dimensions, the plane addresses and the destination buffer are read from parms)
  20.436 + * no yuv2rgb conversion is done here: two y lines, one v line and one u line are produced per step
  20.437 + * result stored in RAM as y plane, then v plane, then u plane (each chroma plane a quarter of y)
  20.438 + */
  20.439 +void scale_srcw16_dstw32() {
  20.440 +	// extract parameters
  20.441 +	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
  20.442 +
  20.443 +	unsigned int src_width = parms.src_pixel_width;
  20.444 +	unsigned int src_height = parms.src_pixel_height;
  20.445 +	unsigned int dst_width = parms.dst_pixel_width;
  20.446 +	unsigned int dst_height = parms.dst_pixel_height;
  20.447 +
  20.448 +	// YVU: source line strides in bytes, the v/u lines are half as wide as the y lines
  20.449 +	unsigned int src_linestride_y = src_width;
  20.450 +	unsigned int src_dbl_linestride_y = src_width<<1;
  20.451 +	unsigned int src_linestride_vu = src_width>>1;
  20.452 +	unsigned int src_dbl_linestride_vu = src_width;
  20.453 +	// scaled YVU: offset of the second scaled y line inside scaled_y_plane[]
  20.454 +	unsigned int scaled_src_linestride_y = dst_width;
  20.455 +
  20.456 +	// ram addresses of the source planes
  20.457 +	unsigned char* src_addr_y = parms.y_plane;
  20.458 +	unsigned char* src_addr_v = parms.v_plane;
  20.459 +	unsigned char* src_addr_u = parms.u_plane;
  20.460 +
  20.461 +	unsigned int dst_picture_size = dst_width*dst_height;
  20.462 +
  20.463 +	// Sizes for destination: two y lines and one v/u line (a single line, despite the "dbl") per step
  20.464 +	unsigned int dst_dbl_linestride_y = dst_width<<1;
  20.465 +	unsigned int dst_dbl_linestride_vu = dst_width>>1;
  20.466 +
  20.467 +	// Perform address calculation for Y, V and U in main memory with dst_addr as base
  20.468 +	unsigned char* dst_addr_main_memory_y = dst_addr;
  20.469 +	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
  20.470 +	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
  20.471 +
  20.472 +
  20.473 +	// for handling misalignment, addresses are precalculated
  20.474 +	unsigned char* precalc_src_addr_v = src_addr_v;
  20.475 +	unsigned char* precalc_src_addr_u = src_addr_u;
  20.476 +
  20.477 +	// calculate scale factors (source pixels/lines per destination pixel/line)
  20.478 +	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
  20.479 +	float y_scale = (float)src_height/(float)dst_height;
  20.480 +
  20.481 +	// double buffered processing
  20.482 +	// buffer switching
  20.483 +	unsigned int curr_src_idx = 0;
  20.484 +	unsigned int curr_dst_idx = 0;
  20.485 +	unsigned int next_src_idx, next_dst_idx;
  20.486 +
  20.487 +	// 2 lines y as output, upper and lowerline
  20.488 +	unsigned int curr_interpl_y_upper = 0;
  20.489 +	unsigned int next_interpl_y_upper;
  20.490 +	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
  20.491 +	// only 1 line v/u output, both planes have the same dimension
  20.492 +	unsigned int curr_interpl_vu = 0;
  20.493 +	unsigned int next_interpl_vu;
  20.494 +
  20.495 +	// weights, calculated in every loop iteration
  20.496 +	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
  20.497 +	vector float vf_next_NSweight_y_upper;
  20.498 +	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
  20.499 +	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
  20.500 +	vector float vf_next_NSweight_vu;
  20.501 +
  20.502 +	// line indices for the src picture
  20.503 +	float curr_src_y_upper = 0.0f, next_src_y_upper;
  20.504 +	float curr_src_y_lower, next_src_y_lower;
  20.505 +	float curr_src_vu = 0.0f, next_src_vu;
  20.506 +
  20.507 +	// line indices for the dst picture
  20.508 +	unsigned int dst_y=0, dst_vu=0;
  20.509 +
  20.510 +	// offset for the v and u plane to handle misalignment
  20.511 +	unsigned int curr_lsoff_v = 0, next_lsoff_v;
  20.512 +	unsigned int curr_lsoff_u = 0, next_lsoff_u;
  20.513 +	if( dst_height < 2 ) return;	// no complete pair of y lines to produce; also keeps (dst_height>>1)-1 below from wrapping around
  20.514 +	// calculate lower line indices
  20.515 +	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
  20.516 +	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
  20.517 +	// lower line weight
  20.518 +	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
  20.519 +
  20.520 +	// NOTE(review): the initial v/u fetches below use the plane base addresses unmasked (curr_lsoff_* = 0) - presumably the planes start 16-byte aligned, confirm
  20.521 +	// start partially double buffered processing
  20.522 +	// get initial data, 2 sets of y, 1 set v, 1 set u
  20.523 +	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
  20.524 +	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
  20.525 +			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
  20.526 +			src_dbl_linestride_y,
  20.527 +			RETR_BUF,
  20.528 +			0, 0 );
  20.529 +	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
  20.530 +	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
  20.531 +
  20.532 +	// iteration loop
  20.533 +	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
  20.534 +	// the scaled output is 2 lines y, 1 line v, 1 line u
  20.535 +	// the scaled output is stored to RAM, still in YVU format; the last step is done after the loop
  20.536 +	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
  20.537 +		dst_y = dst_vu<<1;
  20.538 +
  20.539 +		// calculate next indices (fractional source line positions of the next output lines)
  20.540 +		next_src_vu = ((float)dst_vu+1)*y_scale;
  20.541 +		next_src_y_upper = ((float)dst_y+2)*y_scale;
  20.542 +		next_src_y_lower = ((float)dst_y+3)*y_scale;
  20.543 +
  20.544 +		next_interpl_vu = (unsigned int) next_src_vu;
  20.545 +		next_interpl_y_upper = (unsigned int) next_src_y_upper;
  20.546 +		next_interpl_y_lower = (unsigned int) next_src_y_lower;
  20.547 +
  20.548 +		// calculate weight NORTH-SOUTH (fractional part of the source line position)
  20.549 +		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
  20.550 +		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
  20.551 +		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
  20.552 +
  20.553 +		// get next lines into the buffer that is not being worked on
  20.554 +		next_src_idx = curr_src_idx^1;
  20.555 +		next_dst_idx = curr_dst_idx^1;
  20.556 +
  20.557 +		// 4 lines y (2 starting at the upper source line, 2 starting at the lower one)
  20.558 +		mfc_get( y_plane[next_src_idx],
  20.559 +				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
  20.560 +				src_dbl_linestride_y,
  20.561 +				RETR_BUF+next_src_idx,
  20.562 +				0, 0 );
  20.563 +		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
  20.564 +				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
  20.565 +				src_dbl_linestride_y,
  20.566 +				RETR_BUF+next_src_idx,
  20.567 +				0, 0 );
  20.568 +
  20.569 +		// 2 lines v, fetched from the address rounded down to 16 bytes; the dropped offset is kept in next_lsoff_v
  20.570 +		precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
  20.571 +		next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
  20.572 +		mfc_get( v_plane[next_src_idx],
  20.573 +				((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
  20.574 +				src_dbl_linestride_vu+(next_lsoff_v<<1),
  20.575 +				RETR_BUF+next_src_idx,
  20.576 +				0, 0 );
  20.577 +		// 2 lines u, same scheme; the length has to use the u offset, which can differ from the v offset
  20.578 +		precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
  20.579 +		next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
  20.580 +		mfc_get( u_plane[next_src_idx],
  20.581 +				((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
  20.582 +				src_dbl_linestride_vu+(next_lsoff_u<<1),
  20.583 +				RETR_BUF+next_src_idx,
  20.584 +				0, 0 );
  20.585 +		// wait for the current source lines and for the previous store out of the scaled buffers that get overwritten next
  20.586 +		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
  20.587 +		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  20.588 +		// scaling
  20.589 +		// work line y_upper
  20.590 +		bilinear_scale_line_w16( y_plane[curr_src_idx],
  20.591 +				scaled_y_plane[curr_src_idx],
  20.592 +				dst_width,
  20.593 +				vf_x_scale,
  20.594 +				vf_curr_NSweight_y_upper,
  20.595 +				src_linestride_y );
  20.596 +		// work line y_lower
  20.597 +		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
  20.598 +				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
  20.599 +				dst_width,
  20.600 +				vf_x_scale,
  20.601 +				vf_curr_NSweight_y_lower,
  20.602 +				src_linestride_y );
  20.603 +		// work line v (skipping the alignment offset inside the fetched data)
  20.604 +		bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
  20.605 +				scaled_v_plane[curr_src_idx],
  20.606 +				dst_width>>1,
  20.607 +				vf_x_scale,
  20.608 +				vf_curr_NSweight_vu,
  20.609 +				src_linestride_vu );
  20.610 +		// work line u (skipping the alignment offset inside the fetched data)
  20.611 +		bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
  20.612 +				scaled_u_plane[curr_src_idx],
  20.613 +				dst_width>>1,
  20.614 +				vf_x_scale,
  20.615 +				vf_curr_NSweight_vu,
  20.616 +				src_linestride_vu );
  20.617 +
  20.618 +		//---------------------------------------------------------------------------------------------
  20.619 +		// Store the result back to main memory into a destination buffer in YUV format
  20.620 +
  20.621 +		// Perform three DMA transfers to 3 different locations in the main memory!
  20.622 +		// dst_width:	Pixel width of destination image
  20.623 +		// dst_addr:	Destination address in main memory
  20.624 +		// dst_vu:	Counter which is incremented one by one
  20.625 +		// dst_y:	Counter which is twice as large as dst_vu (dst_y = 2*dst_vu)
  20.626 +
  20.627 +		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
  20.628 +				(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
  20.629 +				dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
  20.630 +				STR_BUF+curr_dst_idx,								// Tag
  20.631 +				0, 0 );
  20.632 +
  20.633 +		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
  20.634 +				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  20.635 +				dst_dbl_linestride_vu,								// One V line (half the width of the destination resolution)
  20.636 +				STR_BUF+curr_dst_idx,								// Tag
  20.637 +				0, 0 );
  20.638 +
  20.639 +		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
  20.640 +				(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  20.641 +				dst_dbl_linestride_vu,								// One U line (half the width of the destination resolution)
  20.642 +				STR_BUF+curr_dst_idx,								// Tag
  20.643 +				0, 0 );
  20.644 +		//---------------------------------------------------------------------------------------------
  20.645 +
  20.646 +
  20.647 +		// update for next cycle
  20.648 +		curr_src_idx = next_src_idx;
  20.649 +		curr_dst_idx = next_dst_idx;
  20.650 +
  20.651 +		curr_interpl_y_upper = next_interpl_y_upper;
  20.652 +		curr_interpl_y_lower = next_interpl_y_lower;
  20.653 +		curr_interpl_vu = next_interpl_vu;
  20.654 +
  20.655 +		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
  20.656 +		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
  20.657 +		vf_curr_NSweight_vu = vf_next_NSweight_vu;
  20.658 +
  20.659 +		curr_src_y_upper = next_src_y_upper;
  20.660 +		curr_src_y_lower = next_src_y_lower;
  20.661 +		curr_src_vu = next_src_vu;
  20.662 +
  20.663 +		curr_lsoff_v = next_lsoff_v;
  20.664 +		curr_lsoff_u = next_lsoff_u;
  20.665 +	}
  20.666 +
  20.667 +
  20.668 +	// last step: the lines were already requested, so only wait, scale and store (dst_vu is the last line pair here)
  20.669 +	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
  20.670 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  20.671 +	// scaling
  20.672 +	// work line y_upper
  20.673 +	bilinear_scale_line_w16( y_plane[curr_src_idx],
  20.674 +			scaled_y_plane[curr_src_idx],
  20.675 +			dst_width,
  20.676 +			vf_x_scale,
  20.677 +			vf_curr_NSweight_y_upper,
  20.678 +			src_linestride_y );
  20.679 +	// work line y_lower
  20.680 +	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
  20.681 +			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
  20.682 +			dst_width,
  20.683 +			vf_x_scale,
  20.684 +			vf_curr_NSweight_y_lower,
  20.685 +			src_linestride_y );
  20.686 +	// work line v
  20.687 +	bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
  20.688 +			scaled_v_plane[curr_src_idx],
  20.689 +			dst_width>>1,
  20.690 +			vf_x_scale,
  20.691 +			vf_curr_NSweight_vu,
  20.692 +			src_linestride_vu );
  20.693 +	// work line u
  20.694 +	bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
  20.695 +			scaled_u_plane[curr_src_idx],
  20.696 +			dst_width>>1,
  20.697 +			vf_x_scale,
  20.698 +			vf_curr_NSweight_vu,
  20.699 +			src_linestride_vu );
  20.700 +
  20.701 +	//---------------------------------------------------------------------------------------------
  20.702 +	// Store the result back to main memory into a destination buffer in YUV format
  20.703 +
  20.704 +	// Perform three DMA transfers to 3 different locations in the main memory!
  20.705 +	// dst_width:	Pixel width of destination image
  20.706 +	// dst_addr:	Destination address in main memory
  20.707 +	// dst_vu:	Counter which is incremented one by one
  20.708 +	// dst_y:	Counter which is twice as large as dst_vu (dst_y = 2*dst_vu)
  20.709 +
  20.710 +	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
  20.711 +			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
  20.712 +			dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
  20.713 +			STR_BUF+curr_dst_idx,								// Tag
  20.714 +			0, 0 );
  20.715 +
  20.716 +	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
  20.717 +			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  20.718 +			dst_dbl_linestride_vu,								// One V line (half the width of the destination resolution)
  20.719 +			STR_BUF+curr_dst_idx,								// Tag
  20.720 +			0, 0 );
  20.721 +
  20.722 +	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
  20.723 +			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  20.724 +			dst_dbl_linestride_vu,								// One U line (half the width of the destination resolution)
  20.725 +			STR_BUF+curr_dst_idx,								// Tag
  20.726 +			0, 0 );
  20.727 +
  20.728 +	// wait for completion of both store tags (the previous iteration stored with the other one)
  20.729 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  20.730 +	DMA_WAIT_TAG( (STR_BUF+(curr_dst_idx^1)) );
  20.731 +}
  20.732 +
  20.733 +
  20.734 +/*
  20.735 + * scale_srcw32_dstw16()
  20.736 + *
  20.737 + * processes an input image of width 32
  20.738 + * scaling is done to a width 16
  20.739 + * yuv2rgb conversion on a width of 16
  20.740 + * result stored in RAM
  20.741 + */
  20.742 +void scale_srcw32_dstw16() {
  20.743 +	// extract parameters
  20.744 +	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
  20.745 +
  20.746 +	unsigned int src_width = parms.src_pixel_width;
  20.747 +	unsigned int src_height = parms.src_pixel_height;
  20.748 +	unsigned int dst_width = parms.dst_pixel_width;
  20.749 +	unsigned int dst_height = parms.dst_pixel_height;
  20.750 +
  20.751 +	// YVU
  20.752 +	unsigned int src_linestride_y = src_width;
  20.753 +	unsigned int src_dbl_linestride_y = src_width<<1;
  20.754 +	unsigned int src_linestride_vu = src_width>>1;
  20.755 +	unsigned int src_dbl_linestride_vu = src_width;
  20.756 +	// scaled YVU
  20.757 +	unsigned int scaled_src_linestride_y = dst_width;
  20.758 +
  20.759 +	// ram addresses
  20.760 +	unsigned char* src_addr_y = parms.y_plane;
  20.761 +	unsigned char* src_addr_v = parms.v_plane;
  20.762 +	unsigned char* src_addr_u = parms.u_plane;
  20.763 +
  20.764 +	unsigned int dst_picture_size = dst_width*dst_height;
  20.765 +
  20.766 +	// Sizes for destination
  20.767 +	unsigned int dst_dbl_linestride_y = dst_width<<1;
  20.768 +	unsigned int dst_dbl_linestride_vu = dst_width>>1;
  20.769 +
  20.770 +	// Perform address calculation for Y, V and U in main memory with dst_addr as base
  20.771 +	unsigned char* dst_addr_main_memory_y = dst_addr;
  20.772 +	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
  20.773 +	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
  20.774 +
  20.775 +	// calculate scale factors
  20.776 +	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
  20.777 +	float y_scale = (float)src_height/(float)dst_height;
  20.778 +
  20.779 +	// double buffered processing
  20.780 +	// buffer switching
  20.781 +	unsigned int curr_src_idx = 0;
  20.782 +	unsigned int curr_dst_idx = 0;
  20.783 +	unsigned int next_src_idx, next_dst_idx;
  20.784 +
  20.785 +	// 2 lines y as output, upper and lowerline
  20.786 +	unsigned int curr_interpl_y_upper = 0;
  20.787 +	unsigned int next_interpl_y_upper;
  20.788 +	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
  20.789 +	// only 1 line v/u output, both planes have the same dimension
  20.790 +	unsigned int curr_interpl_vu = 0;
  20.791 +	unsigned int next_interpl_vu;
  20.792 +
  20.793 +	// weights, calculated in every loop iteration
  20.794 +	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
  20.795 +	vector float vf_next_NSweight_y_upper;
  20.796 +	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
  20.797 +	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
  20.798 +	vector float vf_next_NSweight_vu;
  20.799 +
  20.800 +	// line indices for the src picture
  20.801 +	float curr_src_y_upper = 0.0f, next_src_y_upper;
  20.802 +	float curr_src_y_lower, next_src_y_lower;
  20.803 +	float curr_src_vu = 0.0f, next_src_vu;
  20.804 +
  20.805 +	// line indices for the dst picture
  20.806 +	unsigned int dst_y=0, dst_vu=0;
  20.807 +
  20.808 +	// calculate lower line idices
  20.809 +	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
  20.810 +	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
  20.811 +	// lower line weight
  20.812 +	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
  20.813 +
  20.814 +
  20.815 +	// start partially double buffered processing
  20.816 +	// get initial data, 2 sets of y, 1 set v, 1 set u
  20.817 +	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
  20.818 +	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
  20.819 +			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
  20.820 +			src_dbl_linestride_y,
  20.821 +			RETR_BUF,
  20.822 +			0, 0 );
  20.823 +	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
  20.824 +	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
  20.825 +
  20.826 +	// iteration loop
  20.827 +	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
  20.828 +	// the scaled output is 2 lines y, 1 line v, 1 line u
  20.829 +	// the yuv2rgb-converted output is stored to RAM
  20.830 +	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
  20.831 +		dst_y = dst_vu<<1;
  20.832 +
  20.833 +		// calculate next indices
  20.834 +		next_src_vu = ((float)dst_vu+1)*y_scale;
  20.835 +		next_src_y_upper = ((float)dst_y+2)*y_scale;
  20.836 +		next_src_y_lower = ((float)dst_y+3)*y_scale;
  20.837 +
  20.838 +		next_interpl_vu = (unsigned int) next_src_vu;
  20.839 +		next_interpl_y_upper = (unsigned int) next_src_y_upper;
  20.840 +		next_interpl_y_lower = (unsigned int) next_src_y_lower;
  20.841 +
  20.842 +		// calculate weight NORTH-SOUTH
  20.843 +		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
  20.844 +		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
  20.845 +		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
  20.846 +
  20.847 +		// get next lines
  20.848 +		next_src_idx = curr_src_idx^1;
  20.849 +		next_dst_idx = curr_dst_idx^1;
  20.850 +
  20.851 +		// 4 lines y
  20.852 +		mfc_get( y_plane[next_src_idx],
  20.853 +				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
  20.854 +				src_dbl_linestride_y,
  20.855 +				RETR_BUF+next_src_idx,
  20.856 +				0, 0 );
  20.857 +		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
  20.858 +				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
  20.859 +				src_dbl_linestride_y,
  20.860 +				RETR_BUF+next_src_idx,
  20.861 +				0, 0 );
  20.862 +
  20.863 +		// 2 lines v
  20.864 +		mfc_get( v_plane[next_src_idx],
  20.865 +				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
  20.866 +				src_dbl_linestride_vu,
  20.867 +				RETR_BUF+next_src_idx,
  20.868 +				0, 0 );
  20.869 +		// 2 lines u
  20.870 +		mfc_get( u_plane[next_src_idx],
  20.871 +				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
  20.872 +				src_dbl_linestride_vu,
  20.873 +				RETR_BUF+next_src_idx,
  20.874 +				0, 0 );
  20.875 +
  20.876 +		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
  20.877 +
  20.878 +		// scaling
  20.879 +		// work line y_upper
  20.880 +		bilinear_scale_line_w16( y_plane[curr_src_idx],
  20.881 +				scaled_y_plane[curr_src_idx],
  20.882 +				dst_width,
  20.883 +				vf_x_scale,
  20.884 +				vf_curr_NSweight_y_upper,
  20.885 +				src_linestride_y );
  20.886 +		// work line y_lower
  20.887 +		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
  20.888 +				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
  20.889 +				dst_width,
  20.890 +				vf_x_scale,
  20.891 +				vf_curr_NSweight_y_lower,
  20.892 +				src_linestride_y );
  20.893 +		// work line v
  20.894 +		bilinear_scale_line_w16( v_plane[curr_src_idx],
  20.895 +				scaled_v_plane[curr_src_idx],
  20.896 +				dst_width>>1,
  20.897 +				vf_x_scale,
  20.898 +				vf_curr_NSweight_vu,
  20.899 +				src_linestride_vu );
  20.900 +		// work line u
  20.901 +		bilinear_scale_line_w16( u_plane[curr_src_idx],
  20.902 +				scaled_u_plane[curr_src_idx],
  20.903 +				dst_width>>1,
  20.904 +				vf_x_scale,
  20.905 +				vf_curr_NSweight_vu,
  20.906 +				src_linestride_vu );
  20.907 +
  20.908 +		//---------------------------------------------------------------------------------------------
  20.909 +		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  20.910 +
  20.911 +		// Perform three DMA transfers to 3 different locations in the main memory!
  20.912 +		// dst_width:	Pixel width of destination image
  20.913 +		// dst_addr:	Destination address in main memory
  20.914 +		// dst_vu:	Counter which is incremented one by one
  20.915 +		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
  20.916 +
  20.917 +		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
  20.918 +				(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
  20.919 +				dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
  20.920 +				STR_BUF+curr_dst_idx,								// Tag
  20.921 +				0, 0 );
  20.922 +
  20.923 +		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
  20.924 +				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  20.925 +				dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
  20.926 +				STR_BUF+curr_dst_idx,								// Tag
  20.927 +				0, 0 );
  20.928 +
  20.929 +		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
  20.930 +				(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
  20.931 +				dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
  20.932 +				STR_BUF+curr_dst_idx,								// Tag
  20.933 +				0, 0 );
  20.934 +		//---------------------------------------------------------------------------------------------
  20.935 +
  20.936 +
  20.937 +		// update for next cycle
  20.938 +		curr_src_idx = next_src_idx;
  20.939 +		curr_dst_idx = next_dst_idx;
  20.940 +
  20.941 +		curr_interpl_y_upper = next_interpl_y_upper;
  20.942 +		curr_interpl_y_lower = next_interpl_y_lower;
  20.943 +		curr_interpl_vu = next_interpl_vu;
  20.944 +
  20.945 +		vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
  20.946 +		vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
  20.947 +		vf_curr_NSweight_vu = vf_next_NSweight_vu;
  20.948 +
  20.949 +		curr_src_y_upper = next_src_y_upper;
  20.950 +		curr_src_y_lower = next_src_y_lower;
  20.951 +		curr_src_vu = next_src_vu;
  20.952 +	}
  20.953 +
  20.954 +
  20.955 +
  20.956 +	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
  20.957 +
  20.958 +	// scaling
  20.959 +	// work line y_upper
  20.960 +	bilinear_scale_line_w16( y_plane[curr_src_idx],
  20.961 +			scaled_y_plane[curr_src_idx],
  20.962 +			dst_width,
  20.963 +			vf_x_scale,
  20.964 +			vf_curr_NSweight_y_upper,
  20.965 +			src_linestride_y );
  20.966 +	// work line y_lower
  20.967 +	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
  20.968 +			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
  20.969 +			dst_width,
  20.970 +			vf_x_scale,
  20.971 +			vf_curr_NSweight_y_lower,
  20.972 +			src_linestride_y );
  20.973 +	// work line v
  20.974 +	bilinear_scale_line_w16( v_plane[curr_src_idx],
  20.975 +			scaled_v_plane[curr_src_idx],
  20.976 +			dst_width>>1,
  20.977 +			vf_x_scale,
  20.978 +			vf_curr_NSweight_vu,
  20.979 +			src_linestride_vu );
  20.980 +	// work line u
  20.981 +	bilinear_scale_line_w16( u_plane[curr_src_idx],
  20.982 +			scaled_u_plane[curr_src_idx],
  20.983 +			dst_width>>1,
  20.984 +			vf_x_scale,
  20.985 +			vf_curr_NSweight_vu,
  20.986 +			src_linestride_vu );
  20.987 +
  20.988 +
  20.989 +	//---------------------------------------------------------------------------------------------
  20.990 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  20.991 +
  20.992 +	// Perform three DMA transfers to 3 different locations in the main memory!
  20.993 +	// dst_width:	Pixel width of destination image
  20.994 +	// dst_addr:	Destination address in main memory
  20.995 +	// dst_vu:	Counter which is incremented one by one
  20.996 +	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
  20.997 +
  20.998 +	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
  20.999 +			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 20.1000 +			dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
 20.1001 +			STR_BUF+curr_dst_idx,								// Tag
 20.1002 +			0, 0 );
 20.1003 +
 20.1004 +	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
 20.1005 +			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 20.1006 +			dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
 20.1007 +			STR_BUF+curr_dst_idx,								// Tag
 20.1008 +			0, 0 );
 20.1009 +
 20.1010 +	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
 20.1011 +			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
 20.1012 +			dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
 20.1013 +			STR_BUF+curr_dst_idx,								// Tag
 20.1014 +			0, 0 );
 20.1015 +
 20.1016 +	// wait for completion
 20.1017 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 20.1018 +	//---------------------------------------------------------------------------------------------
 20.1019 +}
 20.1020 +
 20.1021 +
 20.1022 +/**
 20.1023 + * scale_srcw32_dstw32()
 20.1024 + *
 20.1025 + * processes an input image of width 32
 20.1026 + * scaling is done to a width 32
 20.1027 + * yuv2rgb conversion on a width of 32
 20.1028 + * result stored in RAM
 20.1029 + */
 20.1030 +void scale_srcw32_dstw32() {
 20.1031 +	// extract parameters
 20.1032 +	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
 20.1033 +
 20.1034 +	unsigned int src_width = parms.src_pixel_width;
 20.1035 +	unsigned int src_height = parms.src_pixel_height;
 20.1036 +	unsigned int dst_width = parms.dst_pixel_width;
 20.1037 +	unsigned int dst_height = parms.dst_pixel_height;
 20.1038 +
 20.1039 +	// YVU
 20.1040 +	unsigned int src_linestride_y = src_width;
 20.1041 +	unsigned int src_dbl_linestride_y = src_width<<1;
 20.1042 +	unsigned int src_linestride_vu = src_width>>1;
 20.1043 +	unsigned int src_dbl_linestride_vu = src_width;
 20.1044 +
 20.1045 +	// scaled YVU
 20.1046 +	unsigned int scaled_src_linestride_y = dst_width;
 20.1047 +
 20.1048 +	// ram addresses
 20.1049 +	unsigned char* src_addr_y = parms.y_plane;
 20.1050 +	unsigned char* src_addr_v = parms.v_plane;
 20.1051 +	unsigned char* src_addr_u = parms.u_plane;
 20.1052 +
 20.1053 +	unsigned int dst_picture_size = dst_width*dst_height;
 20.1054 +
 20.1055 +	// Sizes for destination
 20.1056 +	unsigned int dst_dbl_linestride_y = dst_width<<1;
 20.1057 +	unsigned int dst_dbl_linestride_vu = dst_width>>1;
 20.1058 +
 20.1059 +	// Perform address calculation for Y, V and U in main memory with dst_addr as base
 20.1060 +	unsigned char* dst_addr_main_memory_y = dst_addr;
 20.1061 +	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
 20.1062 +	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
 20.1063 +
 20.1064 +	// calculate scale factors
 20.1065 +	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
 20.1066 +	float y_scale = (float)src_height/(float)dst_height;
 20.1067 +
 20.1068 +	// double buffered processing
 20.1069 +	// buffer switching
 20.1070 +	unsigned int curr_src_idx = 0;
 20.1071 +	unsigned int curr_dst_idx = 0;
 20.1072 +	unsigned int next_src_idx, next_dst_idx;
 20.1073 +
 20.1074 +	// 2 lines y as output, upper and lowerline
 20.1075 +	unsigned int curr_interpl_y_upper = 0;
 20.1076 +	unsigned int next_interpl_y_upper;
 20.1077 +	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
 20.1078 +	// only 1 line v/u output, both planes have the same dimension
 20.1079 +	unsigned int curr_interpl_vu = 0;
 20.1080 +	unsigned int next_interpl_vu;
 20.1081 +
 20.1082 +	// weights, calculated in every loop iteration
 20.1083 +	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
 20.1084 +	vector float vf_next_NSweight_y_upper;
 20.1085 +	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
 20.1086 +	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
 20.1087 +	vector float vf_next_NSweight_vu;
 20.1088 +
 20.1089 +	// line indices for the src picture
 20.1090 +	float curr_src_y_upper = 0.0f, next_src_y_upper;
 20.1091 +	float curr_src_y_lower, next_src_y_lower;
 20.1092 +	float curr_src_vu = 0.0f, next_src_vu;
 20.1093 +
 20.1094 +	// line indices for the dst picture
 20.1095 +	unsigned int dst_y=0, dst_vu=0;
 20.1096 +
 20.1097 +	// calculate lower line idices
 20.1098 +	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
 20.1099 +	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
 20.1100 +	// lower line weight
 20.1101 +	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
 20.1102 +
 20.1103 +
 20.1104 +	// start partially double buffered processing
 20.1105 +	// get initial data, 2 sets of y, 1 set v, 1 set u
 20.1106 +	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
 20.1107 +	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
 20.1108 +			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
 20.1109 +			src_dbl_linestride_y,
 20.1110 +			RETR_BUF,
 20.1111 +			0, 0 );
 20.1112 +	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 20.1113 +	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 20.1114 +
 20.1115 +	// iteration loop
 20.1116 +	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
 20.1117 +	// the scaled output is 2 lines y, 1 line v, 1 line u
 20.1118 +	// the yuv2rgb-converted output is stored to RAM
 20.1119 +	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
 20.1120 +		dst_y = dst_vu<<1;
 20.1121 +
 20.1122 +		// calculate next indices
 20.1123 +		next_src_vu = ((float)dst_vu+1)*y_scale;
 20.1124 +		next_src_y_upper = ((float)dst_y+2)*y_scale;
 20.1125 +		next_src_y_lower = ((float)dst_y+3)*y_scale;
 20.1126 +
 20.1127 +		next_interpl_vu = (unsigned int) next_src_vu;
 20.1128 +		next_interpl_y_upper = (unsigned int) next_src_y_upper;
 20.1129 +		next_interpl_y_lower = (unsigned int) next_src_y_lower;
 20.1130 +
 20.1131 +		// calculate weight NORTH-SOUTH
 20.1132 +		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
 20.1133 +		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
 20.1134 +		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
 20.1135 +
 20.1136 +		// get next lines
 20.1137 +		next_src_idx = curr_src_idx^1;
 20.1138 +		next_dst_idx = curr_dst_idx^1;
 20.1139 +
 20.1140 +		// 4 lines y
 20.1141 +		mfc_get( y_plane[next_src_idx],
 20.1142 +				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
 20.1143 +				src_dbl_linestride_y,
 20.1144 +				RETR_BUF+next_src_idx,
 20.1145 +				0, 0 );
 20.1146 +		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
 20.1147 +				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
 20.1148 +				src_dbl_linestride_y,
 20.1149 +				RETR_BUF+next_src_idx,
 20.1150 +				0, 0 );
 20.1151 +
 20.1152 +		// 2 lines v
 20.1153 +		mfc_get( v_plane[next_src_idx],
 20.1154 +				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
 20.1155 +				src_dbl_linestride_vu,
 20.1156 +				RETR_BUF+next_src_idx,
 20.1157 +				0, 0 );
 20.1158 +		// 2 lines u
 20.1159 +		mfc_get( u_plane[next_src_idx],
 20.1160 +				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
 20.1161 +				src_dbl_linestride_vu,
 20.1162 +				RETR_BUF+next_src_idx,
 20.1163 +				0, 0 );
 20.1164 +
 20.1165 +		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
 20.1166 +
 20.1167 +		// scaling
 20.1168 +		// work line y_upper
 20.1169 +		bilinear_scale_line_w16( y_plane[curr_src_idx],
 20.1170 +				scaled_y_plane[curr_src_idx],
 20.1171 +				dst_width,
 20.1172 +				vf_x_scale,
 20.1173 +				vf_curr_NSweight_y_upper,
 20.1174 +				src_linestride_y );
 20.1175 +		// work line y_lower
 20.1176 +		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 20.1177 +				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 20.1178 +				dst_width,
 20.1179 +				vf_x_scale,
 20.1180 +				vf_curr_NSweight_y_lower,
 20.1181 +				src_linestride_y );
 20.1182 +		// work line v
 20.1183 +		bilinear_scale_line_w16( v_plane[curr_src_idx],
 20.1184 +				scaled_v_plane[curr_src_idx],
 20.1185 +				dst_width>>1,
 20.1186 +				vf_x_scale,
 20.1187 +				vf_curr_NSweight_vu,
 20.1188 +				src_linestride_vu );
 20.1189 +		// work line u
 20.1190 +		bilinear_scale_line_w16( u_plane[curr_src_idx],
 20.1191 +				scaled_u_plane[curr_src_idx],
 20.1192 +				dst_width>>1,
 20.1193 +				vf_x_scale,
 20.1194 +				vf_curr_NSweight_vu,
 20.1195 +				src_linestride_vu );
 20.1196 +
 20.1197 +
 20.1198 +
 20.1199 +		// Store the result back to main memory into a destination buffer in YUV format
 20.1200 +		//---------------------------------------------------------------------------------------------
 20.1201 +		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 20.1202 +
 20.1203 +		// Perform three DMA transfers to 3 different locations in the main memory!
 20.1204 +		// dst_width:	Pixel width of destination image
 20.1205 +		// dst_addr:	Destination address in main memory
 20.1206 +		// dst_vu:	Counter which is incremented one by one
 20.1207 +		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
 20.1208 +
 20.1209 +		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
 20.1210 +				(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 20.1211 +				dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
 20.1212 +				STR_BUF+curr_dst_idx,								// Tag
 20.1213 +				0, 0 );
 20.1214 +
 20.1215 +		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
 20.1216 +				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 20.1217 +				dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
 20.1218 +				STR_BUF+curr_dst_idx,								// Tag
 20.1219 +				0, 0 );
 20.1220 +
 20.1221 +		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
 20.1222 +				(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 20.1223 +				dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
 20.1224 +				STR_BUF+curr_dst_idx,								// Tag
 20.1225 +				0, 0 );
 20.1226 +		//---------------------------------------------------------------------------------------------
 20.1227 +
 20.1228 +
 20.1229 +		// update for next cycle
 20.1230 +		curr_src_idx = next_src_idx;
 20.1231 +		curr_dst_idx = next_dst_idx;
 20.1232 +
 20.1233 +		curr_interpl_y_upper = next_interpl_y_upper;
 20.1234 +		curr_interpl_y_lower = next_interpl_y_lower;
 20.1235 +		curr_interpl_vu = next_interpl_vu;
 20.1236 +
 20.1237 +		vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper;
 20.1238 +		vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower;
 20.1239 +		vf_curr_NSweight_vu = vf_next_NSweight_vu;
 20.1240 +
 20.1241 +		curr_src_y_upper = next_src_y_upper;
 20.1242 +		curr_src_y_lower = next_src_y_lower;
 20.1243 +		curr_src_vu = next_src_vu;
 20.1244 +	}
 20.1245 +
 20.1246 +
 20.1247 +
 20.1248 +	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
 20.1249 +
 20.1250 +	// scaling
 20.1251 +	// work line y_upper
 20.1252 +	bilinear_scale_line_w16( y_plane[curr_src_idx],
 20.1253 +			scaled_y_plane[curr_src_idx],
 20.1254 +			dst_width,
 20.1255 +			vf_x_scale,
 20.1256 +			vf_curr_NSweight_y_upper,
 20.1257 +			src_linestride_y );
 20.1258 +	// work line y_lower
 20.1259 +	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 20.1260 +			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 20.1261 +			dst_width,
 20.1262 +			vf_x_scale,
 20.1263 +			vf_curr_NSweight_y_lower,
 20.1264 +			src_linestride_y );
 20.1265 +	// work line v
 20.1266 +	bilinear_scale_line_w16( v_plane[curr_src_idx],
 20.1267 +			scaled_v_plane[curr_src_idx],
 20.1268 +			dst_width>>1,
 20.1269 +			vf_x_scale,
 20.1270 +			vf_curr_NSweight_vu,
 20.1271 +			src_linestride_vu );
 20.1272 +	// work line u
 20.1273 +	bilinear_scale_line_w16( u_plane[curr_src_idx],
 20.1274 +			scaled_u_plane[curr_src_idx],
 20.1275 +			dst_width>>1,
 20.1276 +			vf_x_scale,
 20.1277 +			vf_curr_NSweight_vu,
 20.1278 +			src_linestride_vu );
 20.1279 +
 20.1280 +
 20.1281 +	// Store the result back to main memory into a destination buffer in YUV format
 20.1282 +	//---------------------------------------------------------------------------------------------
 20.1283 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 20.1284 +
 20.1285 +	// Perform three DMA transfers to 3 different locations in the main memory!
 20.1286 +	// dst_width:	Pixel width of destination image
 20.1287 +	// dst_addr:	Destination address in main memory
 20.1288 +	// dst_vu:	Counter which is incremented one by one
 20.1289 +	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
 20.1290 +
 20.1291 +	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
 20.1292 +			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 20.1293 +			dst_dbl_linestride_y,								// Two Y lines (depending on the widht of the destination resolution)
 20.1294 +			STR_BUF+curr_dst_idx,								// Tag
 20.1295 +			0, 0 );
 20.1296 +
 20.1297 +	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
 20.1298 +			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 20.1299 +			dst_dbl_linestride_vu,								// Two V lines (depending on the widht of the destination resolution)
 20.1300 +			STR_BUF+curr_dst_idx,								// Tag
 20.1301 +			0, 0 );
 20.1302 +
 20.1303 +	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
 20.1304 +			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
 20.1305 +			dst_dbl_linestride_vu,								// Two U lines (depending on the widht of the destination resolution)
 20.1306 +			STR_BUF+curr_dst_idx,								// Tag
 20.1307 +			0, 0 );
 20.1308 +
 20.1309 +	// wait for completion
 20.1310 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 20.1311 +	//---------------------------------------------------------------------------------------------
 20.1312 +}
 20.1313 +
 20.1314 +
 20.1315 +/*
 20.1316 + * bilinear_scale_line_w8()
 20.1317 + *
 20.1318 + * processes a line of yuv-input, width has to be a multiple of 8
 20.1319 + * scaled yuv-output is written to local store buffer
 20.1320 + *
 20.1321 + * @param src buffer for 2 lines input
 20.1322 + * @param dst_ buffer for 1 line output
 20.1323 + * @param dst_width the width of the destination line
 20.1324 + * @param vf_x_scale a float vector, at each entry is the x_scale-factor
 20.1325 + * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
 20.1326 + * @param src_linestride the stride of the srcline
 20.1327 + */
 20.1328 +void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
 20.1329 +
 20.1330 +	unsigned char* dst = dst_;
 20.1331 +
 20.1332 +	unsigned int dst_x;
 20.1333 +	for( dst_x=0; dst_x<dst_width; dst_x+=8) {
 20.1334 +		// address calculation for loading the 4 surrounding pixel of each calculated
 20.1335 +		// destination pixel
 20.1336 +		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
 20.1337 +		// lower range->first 4 pixel
 20.1338 +		// upper range->next 4 pixel
 20.1339 +		vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 };
 20.1340 +		vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 };
 20.1341 +		vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range );
 20.1342 +		vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range );
 20.1343 +
 20.1344 +		// calculate weight EAST-WEST
 20.1345 +		vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 );
 20.1346 +		vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 );
 20.1347 +		vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale );
 20.1348 +		vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale );
 20.1349 +		vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 );
 20.1350 +		vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 );
 20.1351 +		vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 );
 20.1352 +		vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 );
 20.1353 +		vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range );
 20.1354 +		vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range );
 20.1355 +
 20.1356 +		// calculate address offset
 20.1357 +		//
 20.1358 +		// pixel NORTH WEST
 20.1359 +		vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range;
 20.1360 +		vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range;
 20.1361 +
 20.1362 +		// pixel NORTH EAST-->(offpixelNW+1)
 20.1363 +		vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
 20.1364 +		vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 );
 20.1365 +		vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 );
 20.1366 +
 20.1367 +		// SOUTH-WEST-->(offpixelNW+src_linestride)
 20.1368 +		vector unsigned int vui_srclinestride = spu_splats( src_linestride );
 20.1369 +		vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range );
 20.1370 +		vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range );
 20.1371 +
 20.1372 +		// SOUTH-EAST-->(offpixelNW+src_linestride+1)
 20.1373 +		vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range );
 20.1374 +		vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range );
 20.1375 +
 20.1376 +		// calculate each address
 20.1377 +		vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
 20.1378 +		vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range );
 20.1379 +		vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range );
 20.1380 +		vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range );
 20.1381 +		vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range );
 20.1382 +
 20.1383 +		vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range );
 20.1384 +		vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range );
 20.1385 +		vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range );
 20.1386 +		vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range );
 20.1387 +
 20.1388 +		// get each pixel
 20.1389 +		//
 20.1390 +		// scalar load, afterwards insertion into the right position
 20.1391 +		// NORTH WEST
 20.1392 +		vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
 20.1393 +		vector unsigned char vuc_pixel_NW_lower_range = spu_insert(
 20.1394 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 );
 20.1395 +		vuc_pixel_NW_lower_range = spu_insert(
 20.1396 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )),
 20.1397 +				vuc_pixel_NW_lower_range, 7 );
 20.1398 +		vuc_pixel_NW_lower_range = spu_insert(
 20.1399 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )),
 20.1400 +				vuc_pixel_NW_lower_range, 11 );
 20.1401 +		vuc_pixel_NW_lower_range = spu_insert(
 20.1402 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )),
 20.1403 +				vuc_pixel_NW_lower_range, 15 );
 20.1404 +
 20.1405 +		vector unsigned char vuc_pixel_NW_upper_range = spu_insert(
 20.1406 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 );
 20.1407 +		vuc_pixel_NW_upper_range = spu_insert(
 20.1408 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )),
 20.1409 +				vuc_pixel_NW_upper_range, 7 );
 20.1410 +		vuc_pixel_NW_upper_range = spu_insert(
 20.1411 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )),
 20.1412 +				vuc_pixel_NW_upper_range, 11 );
 20.1413 +		vuc_pixel_NW_upper_range = spu_insert(
 20.1414 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )),
 20.1415 +				vuc_pixel_NW_upper_range, 15 );
 20.1416 +
 20.1417 +		// NORTH EAST
 20.1418 +		vector unsigned char vuc_pixel_NE_lower_range = spu_insert(
 20.1419 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 );
 20.1420 +		vuc_pixel_NE_lower_range = spu_insert(
 20.1421 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )),
 20.1422 +				vuc_pixel_NE_lower_range, 7 );
 20.1423 +		vuc_pixel_NE_lower_range = spu_insert(
 20.1424 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )),
 20.1425 +				vuc_pixel_NE_lower_range, 11 );
 20.1426 +		vuc_pixel_NE_lower_range = spu_insert(
 20.1427 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )),
 20.1428 +				vuc_pixel_NE_lower_range, 15 );
 20.1429 +
 20.1430 +		vector unsigned char vuc_pixel_NE_upper_range = spu_insert(
 20.1431 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 );
 20.1432 +		vuc_pixel_NE_upper_range = spu_insert(
 20.1433 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )),
 20.1434 +				vuc_pixel_NE_upper_range, 7 );
 20.1435 +		vuc_pixel_NE_upper_range = spu_insert(
 20.1436 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )),
 20.1437 +				vuc_pixel_NE_upper_range, 11 );
 20.1438 +		vuc_pixel_NE_upper_range = spu_insert(
 20.1439 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )),
 20.1440 +				vuc_pixel_NE_upper_range, 15 );
 20.1441 +
 20.1442 +
 20.1443 +		// SOUTH WEST
 20.1444 +		vector unsigned char vuc_pixel_SW_lower_range = spu_insert(
 20.1445 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 );
 20.1446 +		vuc_pixel_SW_lower_range = spu_insert(
 20.1447 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )),
 20.1448 +				vuc_pixel_SW_lower_range, 7 );
 20.1449 +		vuc_pixel_SW_lower_range = spu_insert(
 20.1450 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )),
 20.1451 +				vuc_pixel_SW_lower_range, 11 );
 20.1452 +		vuc_pixel_SW_lower_range = spu_insert(
 20.1453 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )),
 20.1454 +				vuc_pixel_SW_lower_range, 15 );
 20.1455 +
 20.1456 +		vector unsigned char vuc_pixel_SW_upper_range = spu_insert(
 20.1457 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 );
 20.1458 +		vuc_pixel_SW_upper_range = spu_insert(
 20.1459 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )),
 20.1460 +				vuc_pixel_SW_upper_range, 7 );
 20.1461 +		vuc_pixel_SW_upper_range = spu_insert(
 20.1462 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )),
 20.1463 +				vuc_pixel_SW_upper_range, 11 );
 20.1464 +		vuc_pixel_SW_upper_range = spu_insert(
 20.1465 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )),
 20.1466 +				vuc_pixel_SW_upper_range, 15 );
 20.1467 +
 20.1468 +		// SOUTH EAST
 20.1469 +		vector unsigned char vuc_pixel_SE_lower_range = spu_insert(
 20.1470 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 );
 20.1471 +		vuc_pixel_SE_lower_range = spu_insert(
 20.1472 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )),
 20.1473 +				vuc_pixel_SE_lower_range, 7 );
 20.1474 +		vuc_pixel_SE_lower_range = spu_insert(
 20.1475 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )),
 20.1476 +				vuc_pixel_SE_lower_range, 11 );
 20.1477 +		vuc_pixel_SE_lower_range = spu_insert(
 20.1478 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )),
 20.1479 +				vuc_pixel_SE_lower_range, 15 );
 20.1480 +
 20.1481 +		vector unsigned char vuc_pixel_SE_upper_range = spu_insert(
 20.1482 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 );
 20.1483 +		vuc_pixel_SE_upper_range = spu_insert(
 20.1484 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )),
 20.1485 +				vuc_pixel_SE_upper_range, 7 );
 20.1486 +		vuc_pixel_SE_upper_range = spu_insert(
 20.1487 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )),
 20.1488 +				vuc_pixel_SE_upper_range, 11 );
 20.1489 +		vuc_pixel_SE_upper_range = spu_insert(
 20.1490 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )),
 20.1491 +				vuc_pixel_SE_upper_range, 15 );
 20.1492 +
 20.1493 +
 20.1494 +		// convert to float
 20.1495 +		vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 );
 20.1496 +		vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 );
 20.1497 +
 20.1498 +		vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 );
 20.1499 +		vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 );
 20.1500 +
 20.1501 +		vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 );
 20.1502 +		vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 );
 20.1503 +
 20.1504 +		vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 );
 20.1505 +		vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 );
 20.1506 +
 20.1507 +
 20.1508 +
 20.1509 +		// first linear interpolation: EWtop
 20.1510 +		// EWtop = NW + EWweight*(NE-NW)
 20.1511 +		//
 20.1512 +		// lower range
 20.1513 +		vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range );
 20.1514 +		vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range,
 20.1515 +								vf_EWtop_lower_range_tmp,
 20.1516 +								vf_pixel_NW_lower_range );
 20.1517 +
 20.1518 +		// upper range
 20.1519 +		vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range );
 20.1520 +		vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range,
 20.1521 +								vf_EWtop_upper_range_tmp,
 20.1522 +								vf_pixel_NW_upper_range );
 20.1523 +
 20.1524 +
 20.1525 +
 20.1526 +		// second linear interpolation: EWbottom
 20.1527 +		// EWbottom = SW + EWweight*(SE-SW)
 20.1528 +		//
 20.1529 +		// lower range
 20.1530 +		vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range );
 20.1531 +		vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range,
 20.1532 +								vf_EWbottom_lower_range_tmp,
 20.1533 +								vf_pixel_SW_lower_range );
 20.1534 +
 20.1535 +		// upper range
 20.1536 +		vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range );
 20.1537 +		vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range,
 20.1538 +								vf_EWbottom_upper_range_tmp,
 20.1539 +								vf_pixel_SW_upper_range );
 20.1540 +
 20.1541 +
 20.1542 +
 20.1543 +		// third linear interpolation: the bilinear interpolated value
 20.1544 +		// result = EWtop + NSweight*(EWbottom-EWtop);
 20.1545 +		//
 20.1546 +		// lower range
 20.1547 +		vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range );
 20.1548 +		vector float vf_result_lower_range = spu_madd( vf_NSweight,
 20.1549 +								vf_result_lower_range_tmp,
 20.1550 +								vf_EWtop_lower_range );
 20.1551 +
 20.1552 +		// upper range
 20.1553 +		vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range );
 20.1554 +		vector float vf_result_upper_range = spu_madd( vf_NSweight,
 20.1555 +								vf_result_upper_range_tmp,
 20.1556 +								vf_EWtop_upper_range );
 20.1557 +
 20.1558 +
 20.1559 +		// convert back: using saturated arithmetic
 20.1560 +		vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range );
 20.1561 +		vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range );
 20.1562 +
 20.1563 +		// merge results->lower,upper
 20.1564 +		vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F,
 20.1565 +							       0x13, 0x17, 0x1B, 0x1F,
 20.1566 +							       0x00, 0x00, 0x00, 0x00,
 20.1567 +							       0x00, 0x00, 0x00, 0x00 };
 20.1568 +
 20.1569 +		vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range,
 20.1570 +								(vector unsigned char) vui_result_upper_range,
 20.1571 +								vuc_mask_merge_result );
 20.1572 +
 20.1573 +		// partial storing
 20.1574 +		vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00,
 20.1575 +						      0x00, 0x00, 0x00, 0x00,
 20.1576 +						      0xFF, 0xFF, 0xFF, 0xFF,
 20.1577 +						      0xFF, 0xFF, 0xFF, 0xFF };
 20.1578 +
 20.1579 +
 20.1580 +		// get currently stored data
 20.1581 +		vector unsigned char vuc_orig = *((vector unsigned char*)dst);
 20.1582 +
 20.1583 +		// clear currently stored data
 20.1584 +		vuc_orig = spu_and( vuc_orig,
 20.1585 +				spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) );
 20.1586 +
 20.1587 +		// rotate result according to storing address
 20.1588 +		vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F );
 20.1589 +
 20.1590 +		// store result
 20.1591 +		*((vector unsigned char*)dst) = spu_or( vuc_result,
 20.1592 +							vuc_orig );
 20.1593 +		dst += 8;
 20.1594 +	}
 20.1595 +}
 20.1596 +
 20.1597 +
 20.1598 +/*
 20.1599 + * bilinear_scale_line_w16()
 20.1600 + *
 20.1601 + * processes one line of yuv-input, 16 destination pixels per loop iteration (width has to be a multiple of 16)
 20.1602 + * scaled yuv-output is written to local store buffer, one 16-byte vector per iteration
 20.1603 + *
 20.1604 + * @param src buffer for 2 lines input (the SOUTH line is read at src + src_linestride)
 20.1605 + * @param dst_ buffer for 1 line output
 20.1606 + * @param dst_width the width of the destination line
 20.1607 + * @param vf_x_scale a float vector, at each entry is the x_scale-factor (source x = destination x * x_scale)
 20.1608 + * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
 20.1609 + * @param src_linestride the stride of the srcline: offset added to a NORTH pixel offset to reach the SOUTH pixel
 20.1610 + */
 20.1611 +void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
 20.1612 +
 20.1613 +	unsigned char* dst = dst_;
 20.1614 +
 20.1615 +	unsigned int dst_x;
 20.1616 +	for( dst_x=0; dst_x<dst_width; dst_x+=16) {
 20.1617 +		// address calculation for loading the 4 surrounding pixel of each calculated
 20.1618 +		// destination pixel
 20.1619 +		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
 20.1620 +		// parallelised processing
 20.1621 +		// first range->pixel 1 2 3 4
 20.1622 +		// second range->pixel 5 6 7 8
 20.1623 +		// third range->pixel 9 10 11 12
 20.1624 +		// fourth range->pixel 13 14 15 16
 20.1625 +		vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
 20.1626 +		vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
 20.1627 +		vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
 20.1628 +		vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
 20.1629 +		vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range );
 20.1630 +		vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range );
 20.1631 +		vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range );
 20.1632 +		vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range );
 20.1633 +
 20.1634 +		// calculate weight EAST-WEST: src_x = dst_x * x_scale, weight = src_x - (src_x converted to unsigned int)
 20.1635 +		vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 );
 20.1636 +		vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 );
 20.1637 +		vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 );
 20.1638 +		vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 );
 20.1639 +		vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale );
 20.1640 +		vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale );
 20.1641 +		vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale );
 20.1642 +		vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale );
 20.1643 +		vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 );
 20.1644 +		vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 );
 20.1645 +		vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 );
 20.1646 +		vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 );
 20.1647 +		vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 );
 20.1648 +		vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 );
 20.1649 +		vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 );
 20.1650 +		vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 );
 20.1651 +		vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range );
 20.1652 +		vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range );
 20.1653 +		vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range );
 20.1654 +		vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range );
 20.1655 +
 20.1656 +		// calculate address offset
 20.1657 +		//
 20.1658 +		// pixel NORTH WEST
 20.1659 +		vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range;
 20.1660 +		vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range;
 20.1661 +		vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range;
 20.1662 +		vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range;
 20.1663 +
 20.1664 +		// pixel NORTH EAST-->(offpixelNW+1)
 20.1665 +		vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
 20.1666 +		vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 );
 20.1667 +		vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 );
 20.1668 +		vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 );
 20.1669 +		vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 );
 20.1670 +
 20.1671 +		// SOUTH-WEST-->(offpixelNW+src_linestride)
 20.1672 +		vector unsigned int vui_srclinestride = spu_splats( src_linestride );
 20.1673 +		vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range );
 20.1674 +		vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range );
 20.1675 +		vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range );
 20.1676 +		vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range );
 20.1677 +
 20.1678 +		// SOUTH-EAST-->(offpixelNW+src_linestride+1)
 20.1679 +		vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range );
 20.1680 +		vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range );
 20.1681 +		vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range );
 20.1682 +		vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range );
 20.1683 +
 20.1684 +		// calculate each address
 20.1685 +		vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
 20.1686 +		vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range );
 20.1687 +		vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range );
 20.1688 +		vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range );
 20.1689 +		vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range );
 20.1690 +
 20.1691 +		vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range );
 20.1692 +		vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range );
 20.1693 +		vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range );
 20.1694 +		vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range );
 20.1695 +
 20.1696 +		vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range );
 20.1697 +		vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range );
 20.1698 +		vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range );
 20.1699 +		vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range );
 20.1700 +
 20.1701 +		vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range );
 20.1702 +		vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range );
 20.1703 +		vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range );
 20.1704 +		vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range );
 20.1705 +
 20.1706 +
 20.1707 +		// get each pixel
 20.1708 +		//
 20.1709 +		// scalar load, afterwards insertion into the right position (byte slots 3, 7, 11, 15 of a zeroed vector)
 20.1710 +		// NORTH WEST
 20.1711 +		// first range
 20.1712 +		vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
 20.1713 +		vector unsigned char vuc_pixel_NW_first_range = spu_insert(
 20.1714 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 );
 20.1715 +		vuc_pixel_NW_first_range = spu_insert(
 20.1716 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )),
 20.1717 +				vuc_pixel_NW_first_range, 7 );
 20.1718 +		vuc_pixel_NW_first_range = spu_insert(
 20.1719 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )),
 20.1720 +				vuc_pixel_NW_first_range, 11 );
 20.1721 +		vuc_pixel_NW_first_range = spu_insert(
 20.1722 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )),
 20.1723 +				vuc_pixel_NW_first_range, 15 );
 20.1724 +		// second range
 20.1725 +		vector unsigned char vuc_pixel_NW_second_range = spu_insert(
 20.1726 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 );
 20.1727 +		vuc_pixel_NW_second_range = spu_insert(
 20.1728 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )),
 20.1729 +				vuc_pixel_NW_second_range, 7 );
 20.1730 +		vuc_pixel_NW_second_range = spu_insert(
 20.1731 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )),
 20.1732 +				vuc_pixel_NW_second_range, 11 );
 20.1733 +		vuc_pixel_NW_second_range = spu_insert(
 20.1734 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )),
 20.1735 +				vuc_pixel_NW_second_range, 15 );
 20.1736 +		// third range
 20.1737 +		vector unsigned char vuc_pixel_NW_third_range = spu_insert(
 20.1738 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 );
 20.1739 +		vuc_pixel_NW_third_range = spu_insert(
 20.1740 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )),
 20.1741 +				vuc_pixel_NW_third_range, 7 );
 20.1742 +		vuc_pixel_NW_third_range = spu_insert(
 20.1743 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )),
 20.1744 +				vuc_pixel_NW_third_range, 11 );
 20.1745 +		vuc_pixel_NW_third_range = spu_insert(
 20.1746 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )),
 20.1747 +				vuc_pixel_NW_third_range, 15 );
 20.1748 +		// fourth range
 20.1749 +		vector unsigned char vuc_pixel_NW_fourth_range = spu_insert(
 20.1750 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 );
 20.1751 +		vuc_pixel_NW_fourth_range = spu_insert(
 20.1752 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )),
 20.1753 +				vuc_pixel_NW_fourth_range, 7 );
 20.1754 +		vuc_pixel_NW_fourth_range = spu_insert(
 20.1755 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )),
 20.1756 +				vuc_pixel_NW_fourth_range, 11 );
 20.1757 +		vuc_pixel_NW_fourth_range = spu_insert(
 20.1758 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )),
 20.1759 +				vuc_pixel_NW_fourth_range, 15 );
 20.1760 +
 20.1761 +		// NORTH EAST
 20.1762 +		// first range
 20.1763 +		vector unsigned char vuc_pixel_NE_first_range = spu_insert(
 20.1764 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 );
 20.1765 +		vuc_pixel_NE_first_range = spu_insert(
 20.1766 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )),
 20.1767 +				vuc_pixel_NE_first_range, 7 );
 20.1768 +		vuc_pixel_NE_first_range = spu_insert(
 20.1769 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )),
 20.1770 +				vuc_pixel_NE_first_range, 11 );
 20.1771 +		vuc_pixel_NE_first_range = spu_insert(
 20.1772 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )),
 20.1773 +				vuc_pixel_NE_first_range, 15 );
 20.1774 +		// second range
 20.1775 +		vector unsigned char vuc_pixel_NE_second_range = spu_insert(
 20.1776 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 );
 20.1777 +		vuc_pixel_NE_second_range = spu_insert(
 20.1778 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )),
 20.1779 +				vuc_pixel_NE_second_range, 7 );
 20.1780 +		vuc_pixel_NE_second_range = spu_insert(
 20.1781 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )),
 20.1782 +				vuc_pixel_NE_second_range, 11 );
 20.1783 +		vuc_pixel_NE_second_range = spu_insert(
 20.1784 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )),
 20.1785 +				vuc_pixel_NE_second_range, 15 );
 20.1786 +		// third range
 20.1787 +		vector unsigned char vuc_pixel_NE_third_range = spu_insert(
 20.1788 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 );
 20.1789 +		vuc_pixel_NE_third_range = spu_insert(
 20.1790 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )),
 20.1791 +				vuc_pixel_NE_third_range, 7 );
 20.1792 +		vuc_pixel_NE_third_range = spu_insert(
 20.1793 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )),
 20.1794 +				vuc_pixel_NE_third_range, 11 );
 20.1795 +		vuc_pixel_NE_third_range = spu_insert(
 20.1796 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )),
 20.1797 +				vuc_pixel_NE_third_range, 15 );
 20.1798 +		// fourth range
 20.1799 +		vector unsigned char vuc_pixel_NE_fourth_range = spu_insert(
 20.1800 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 );
 20.1801 +		vuc_pixel_NE_fourth_range = spu_insert(
 20.1802 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )),
 20.1803 +				vuc_pixel_NE_fourth_range, 7 );
 20.1804 +		vuc_pixel_NE_fourth_range = spu_insert(
 20.1805 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )),
 20.1806 +				vuc_pixel_NE_fourth_range, 11 );
 20.1807 +		vuc_pixel_NE_fourth_range = spu_insert(
 20.1808 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )),
 20.1809 +				vuc_pixel_NE_fourth_range, 15 );
 20.1810 +
 20.1811 +		// SOUTH WEST
 20.1812 +		// first range
 20.1813 +		vector unsigned char vuc_pixel_SW_first_range = spu_insert(
 20.1814 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 );
 20.1815 +		vuc_pixel_SW_first_range = spu_insert(
 20.1816 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )),
 20.1817 +				vuc_pixel_SW_first_range, 7 );
 20.1818 +		vuc_pixel_SW_first_range = spu_insert(
 20.1819 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )),
 20.1820 +				vuc_pixel_SW_first_range, 11 );
 20.1821 +		vuc_pixel_SW_first_range = spu_insert(
 20.1822 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )),
 20.1823 +				vuc_pixel_SW_first_range, 15 );
 20.1824 +		// second range
 20.1825 +		vector unsigned char vuc_pixel_SW_second_range = spu_insert(
 20.1826 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 );
 20.1827 +		vuc_pixel_SW_second_range = spu_insert(
 20.1828 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )),
 20.1829 +				vuc_pixel_SW_second_range, 7 );
 20.1830 +		vuc_pixel_SW_second_range = spu_insert(
 20.1831 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )),
 20.1832 +				vuc_pixel_SW_second_range, 11 );
 20.1833 +		vuc_pixel_SW_second_range = spu_insert(
 20.1834 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )),
 20.1835 +				vuc_pixel_SW_second_range, 15 );
 20.1836 +		// third range
 20.1837 +		vector unsigned char vuc_pixel_SW_third_range = spu_insert(
 20.1838 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 );
 20.1839 +		vuc_pixel_SW_third_range = spu_insert(
 20.1840 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )),
 20.1841 +				vuc_pixel_SW_third_range, 7 );
 20.1842 +		vuc_pixel_SW_third_range = spu_insert(
 20.1843 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )),
 20.1844 +				vuc_pixel_SW_third_range, 11 );
 20.1845 +		vuc_pixel_SW_third_range = spu_insert(
 20.1846 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )),
 20.1847 +				vuc_pixel_SW_third_range, 15 );
 20.1848 +		// fourth range
 20.1849 +		vector unsigned char vuc_pixel_SW_fourth_range = spu_insert(
 20.1850 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 );
 20.1851 +		vuc_pixel_SW_fourth_range = spu_insert(
 20.1852 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )),
 20.1853 +				vuc_pixel_SW_fourth_range, 7 );
 20.1854 +		vuc_pixel_SW_fourth_range = spu_insert(
 20.1855 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )),
 20.1856 +				vuc_pixel_SW_fourth_range, 11 );
 20.1857 +		vuc_pixel_SW_fourth_range = spu_insert(
 20.1858 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )),
 20.1859 +				vuc_pixel_SW_fourth_range, 15 );
 20.1860 +
 20.1861 +		// SOUTH EAST
 20.1862 +		// first range
 20.1863 +		vector unsigned char vuc_pixel_SE_first_range = spu_insert(
 20.1864 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 );
 20.1865 +		vuc_pixel_SE_first_range = spu_insert(
 20.1866 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )),
 20.1867 +				vuc_pixel_SE_first_range, 7 );
 20.1868 +		vuc_pixel_SE_first_range = spu_insert(
 20.1869 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )),
 20.1870 +				vuc_pixel_SE_first_range, 11 );
 20.1871 +		vuc_pixel_SE_first_range = spu_insert(
 20.1872 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )),
 20.1873 +				vuc_pixel_SE_first_range, 15 );
 20.1874 +		// second range
 20.1875 +		vector unsigned char vuc_pixel_SE_second_range = spu_insert(
 20.1876 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 );
 20.1877 +		vuc_pixel_SE_second_range = spu_insert(
 20.1878 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )),
 20.1879 +				vuc_pixel_SE_second_range, 7 );
 20.1880 +		vuc_pixel_SE_second_range = spu_insert(
 20.1881 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )),
 20.1882 +				vuc_pixel_SE_second_range, 11 );
 20.1883 +		vuc_pixel_SE_second_range = spu_insert(
 20.1884 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )),
 20.1885 +				vuc_pixel_SE_second_range, 15 );
 20.1886 +		// third range
 20.1887 +		vector unsigned char vuc_pixel_SE_third_range = spu_insert(
 20.1888 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 );
 20.1889 +		vuc_pixel_SE_third_range = spu_insert(
 20.1890 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )),
 20.1891 +				vuc_pixel_SE_third_range, 7 );
 20.1892 +		vuc_pixel_SE_third_range = spu_insert(
 20.1893 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )),
 20.1894 +				vuc_pixel_SE_third_range, 11 );
 20.1895 +		vuc_pixel_SE_third_range = spu_insert(
 20.1896 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )),
 20.1897 +				vuc_pixel_SE_third_range, 15 );
 20.1898 +		// fourth range
 20.1899 +		vector unsigned char vuc_pixel_SE_fourth_range = spu_insert(
 20.1900 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 );
 20.1901 +		vuc_pixel_SE_fourth_range = spu_insert(
 20.1902 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )),
 20.1903 +				vuc_pixel_SE_fourth_range, 7 );
 20.1904 +		vuc_pixel_SE_fourth_range = spu_insert(
 20.1905 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )),
 20.1906 +				vuc_pixel_SE_fourth_range, 11 );
 20.1907 +		vuc_pixel_SE_fourth_range = spu_insert(
 20.1908 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )),
 20.1909 +				vuc_pixel_SE_fourth_range, 15 );
 20.1910 +
 20.1911 +
 20.1912 +
 20.1913 +		// convert to float (each byte vector is reinterpreted as 4 unsigned ints first)
 20.1914 +		vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 );
 20.1915 +		vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 );
 20.1916 +		vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 );
 20.1917 +		vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 );
 20.1918 +
 20.1919 +		vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 );
 20.1920 +		vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 );
 20.1921 +		vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 );
 20.1922 +		vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 );
 20.1923 +
 20.1924 +		vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 );
 20.1925 +		vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 );
 20.1926 +		vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 );
 20.1927 +		vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 );
 20.1928 +
 20.1929 +		vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 );
 20.1930 +		vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 );
 20.1931 +		vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 );
 20.1932 +		vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 );
 20.1933 +
 20.1934 +		// first linear interpolation: EWtop
 20.1935 +		// EWtop = NW + EWweight*(NE-NW)
 20.1936 +		//
 20.1937 +		// first range
 20.1938 +		vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range );
 20.1939 +		vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range,
 20.1940 +								vf_EWtop_first_range_tmp,
 20.1941 +								vf_pixel_NW_first_range );
 20.1942 +
 20.1943 +		// second range
 20.1944 +		vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range );
 20.1945 +		vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range,
 20.1946 +								vf_EWtop_second_range_tmp,
 20.1947 +								vf_pixel_NW_second_range );
 20.1948 +
 20.1949 +		// third range
 20.1950 +		vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range );
 20.1951 +		vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range,
 20.1952 +								vf_EWtop_third_range_tmp,
 20.1953 +								vf_pixel_NW_third_range );
 20.1954 +
 20.1955 +		// fourth range
 20.1956 +		vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range );
 20.1957 +		vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range,
 20.1958 +								vf_EWtop_fourth_range_tmp,
 20.1959 +								vf_pixel_NW_fourth_range );
 20.1960 +
 20.1961 +
 20.1962 +
 20.1963 +		// second linear interpolation: EWbottom
 20.1964 +		// EWbottom = SW + EWweight*(SE-SW)
 20.1965 +		//
 20.1966 +		// first range
 20.1967 +		vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range );
 20.1968 +		vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range,
 20.1969 +								vf_EWbottom_first_range_tmp,
 20.1970 +								vf_pixel_SW_first_range );
 20.1971 +
 20.1972 +		// second range
 20.1973 +		vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range );
 20.1974 +		vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range,
 20.1975 +								vf_EWbottom_second_range_tmp,
 20.1976 +								vf_pixel_SW_second_range );
 20.1977 +		// third range
 20.1978 +		vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range );
 20.1979 +		vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range,
 20.1980 +								vf_EWbottom_third_range_tmp,
 20.1981 +								vf_pixel_SW_third_range );
 20.1982 +
 20.1983 +		// fourth range
 20.1984 +		vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range );
 20.1985 +		vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range,
 20.1986 +								vf_EWbottom_fourth_range_tmp,
 20.1987 +								vf_pixel_SW_fourth_range );
 20.1988 +
 20.1989 +
 20.1990 +
 20.1991 +		// third linear interpolation: the bilinear interpolated value
 20.1992 +		// result = EWtop + NSweight*(EWbottom-EWtop);
 20.1993 +		//
 20.1994 +		// first range
 20.1995 +		vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range );
 20.1996 +		vector float vf_result_first_range = spu_madd( vf_NSweight,
 20.1997 +								vf_result_first_range_tmp,
 20.1998 +								vf_EWtop_first_range );
 20.1999 +
 20.2000 +		// second range
 20.2001 +		vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range );
 20.2002 +		vector float vf_result_second_range = spu_madd( vf_NSweight,
 20.2003 +								vf_result_second_range_tmp,
 20.2004 +								vf_EWtop_second_range );
 20.2005 +
 20.2006 +		// third range
 20.2007 +		vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range );
 20.2008 +		vector float vf_result_third_range = spu_madd( vf_NSweight,
 20.2009 +								vf_result_third_range_tmp,
 20.2010 +								vf_EWtop_third_range );
 20.2011 +
 20.2012 +		// fourth range
 20.2013 +		vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range );
 20.2014 +		vector float vf_result_fourth_range = spu_madd( vf_NSweight,
 20.2015 +								vf_result_fourth_range_tmp,
 20.2016 +								vf_EWtop_fourth_range );
 20.2017 +
 20.2018 +
 20.2019 +
 20.2020 +		// convert back: using saturated arithmetic
 20.2021 +		vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range );
 20.2022 +		vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range );
 20.2023 +		vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range );
 20.2024 +		vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range );
 20.2025 +
 20.2026 +		// merge results->first+second range into bytes 0-7, third+fourth range into bytes 8-15
 20.2027 +		vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
 20.2028 +							       		    0x13, 0x17, 0x1B, 0x1F,
 20.2029 +							       		    0x00, 0x00, 0x00, 0x00,
 20.2030 +							       		    0x00, 0x00, 0x00, 0x00 };
 20.2031 +
 20.2032 +		vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00,
 20.2033 +							       		    0x00, 0x00, 0x00, 0x00,
 20.2034 +									    0x03, 0x07, 0x0B, 0x0F,
 20.2035 +							       		    0x13, 0x17, 0x1B, 0x1F };
 20.2036 +
 20.2037 +		vector unsigned char vuc_result_first_second =
 20.2038 +						spu_shuffle( (vector unsigned char) vui_result_first_range,
 20.2039 +								 (vector unsigned char) vui_result_second_range,
 20.2040 +								vuc_mask_merge_result_first_second );
 20.2041 +
 20.2042 +		vector unsigned char vuc_result_third_fourth =
 20.2043 +						spu_shuffle( (vector unsigned char) vui_result_third_range,
 20.2044 +								 (vector unsigned char) vui_result_fourth_range,
 20.2045 +								vuc_mask_merge_result_third_fourth );
 20.2046 +
 20.2047 +		// store result: OR of both shuffles (NOTE(review): relies on the 0x00 mask entries selecting a zero byte - confirm)
 20.2048 +		*((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
 20.2049 +							vuc_result_third_fourth );
 20.2050 +		dst += 16;
 20.2051 +	}
 20.2052 +}
 20.2053 +
    21.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    21.2 +++ b/src/video/ps3/spulibs/fb_writer.c	Mon Sep 07 04:51:29 2009 +0000
    21.3 @@ -0,0 +1,193 @@
    21.4 +/*
    21.5 + * SDL - Simple DirectMedia Layer
    21.6 + * CELL BE Support for PS3 Framebuffer
    21.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
    21.8 + *
    21.9 + * This library is free software; you can redistribute it and/or modify it
   21.10 + * under the terms of the GNU Lesser General Public License as published
   21.11 + * by the Free Software Foundation; either version 2.1 of the License, or
   21.12 + * (at your option) any later version.
   21.13 + *
   21.14 + * This library is distributed in the hope that it will be useful, but
   21.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
   21.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   21.17 + * Lesser General Public License for more details.
   21.18 + *
   21.19 + * You should have received a copy of the GNU Lesser General Public
   21.20 + * License along with this library; if not, write to the Free Software
   21.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
   21.22 + * USA
   21.23 + *
   21.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
   21.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
   21.26 + *  SPE code based on research by:
   21.27 + *  Rene Becker
   21.28 + *  Thimo Emmerich
   21.29 + */
   21.30 +
   21.31 +#include "spu_common.h"
   21.32 +
   21.33 +#include <spu_intrinsics.h>
   21.34 +#include <spu_mfcio.h>
   21.35 +#include <stdio.h>
   21.36 +#include <string.h>
   21.37 +
   21.38 +// Debugging: define DEBUG to make deprintf() print to stdout and flush. NOTE(review): that variant expands to two statements, so do not use it as an unbraced if/else body
   21.39 +//#define DEBUG
   21.40 +
   21.41 +#ifdef DEBUG
   21.42 +#define deprintf(fmt, args... ) \
   21.43 +	fprintf( stdout, fmt, ##args ); \
   21.44 +	fflush( stdout );
   21.45 +#else
   21.46 +#define deprintf( fmt, args... )
   21.47 +#endif
   21.48 +
   21.49 +void cpy_to_fb(unsigned int);
   21.50 +
   21.51 +/* fb_writer_spu parms: 128-byte aligned local copy, filled by the DMA get in main() */
   21.52 +static volatile struct fb_writer_parms_t parms __attribute__ ((aligned(128)));
   21.53 +
   21.54 +/* Code running on SPU: mailbox-driven loop, one cpy_to_fb() call per SPU_START until SPU_EXIT arrives */
   21.55 +int main(unsigned long long spe_id __attribute__ ((unused)), unsigned long long argp __attribute__ ((unused)))
   21.56 +{
   21.57 +	deprintf("[SPU] fb_writer_spu is up... (on SPE #%llu)\n", spe_id);
   21.58 +	uint32_t ea_mfc, mbox;
   21.59 +	// send ready message: write SPU_READY to the outbound mailbox
   21.60 +	spu_write_out_mbox(SPU_READY);
   21.61 +
   21.62 +	while (1) {
   21.63 +		/* Check mailbox: read the next command word from the inbound mailbox */
   21.64 +		mbox = spu_read_in_mbox();
   21.65 +		deprintf("[SPU] Message is %u\n", mbox);
   21.66 +		switch (mbox) {
   21.67 +			case SPU_EXIT:
   21.68 +				deprintf("[SPU] fb_writer goes down...\n");
   21.69 +				return 0;
   21.70 +			case SPU_START:
   21.71 +				break;
   21.72 +			default:
   21.73 +				deprintf("[SPU] Cannot handle message\n");
   21.74 +				continue;
   21.75 +		}
   21.76 +
   21.77 +		/* Tag Manager setup: reserve a group of 5 MFC tags, released again after the copy */
   21.78 +		unsigned int tags;
   21.79 +		tags = mfc_multi_tag_reserve(5);
   21.80 +		if (tags == MFC_TAG_INVALID) {
   21.81 +			deprintf("[SPU] Failed to reserve mfc tags on fb_writer\n");
   21.82 +			return 0;
   21.83 +		}
   21.84 +
   21.85 +		/* Framebuffer parms: the next mailbox word is the address the parms struct is DMA-ed in from */
   21.86 +		ea_mfc = spu_read_in_mbox();
   21.87 +		deprintf("[SPU] Message on fb_writer is %u\n", ea_mfc);
   21.88 +		spu_mfcdma32(&parms, (unsigned int)ea_mfc,
   21.89 +				sizeof(struct fb_writer_parms_t), tags,
   21.90 +				MFC_GET_CMD);
   21.91 +		deprintf("[SPU] argp = %u\n", (unsigned int)argp);
   21.92 +		DMA_WAIT_TAG(tags);
   21.93 +
   21.94 +		/* Copy parms.data to framebuffer */
   21.95 +		deprintf("[SPU] Copying to framebuffer started\n");
   21.96 +		cpy_to_fb(tags);
   21.97 +		deprintf("[SPU] Copying to framebuffer done!\n");
   21.98 +
   21.99 +		mfc_multi_tag_release(tags, 5);
  21.100 +		deprintf("[SPU] fb_writer_spu... done!\n");
  21.101 +		/* Send FIN msg: write SPU_FIN to the outbound mailbox once this request is finished */
  21.102 +		spu_write_out_mbox(SPU_FIN);
  21.103 +	}
  21.104 +
  21.105 +	return 0;
  21.106 +}
  21.107 +
  21.108 +void cpy_to_fb(unsigned int tag_id_base)
  21.109 +{
  21.110 +	unsigned int i;
  21.111 +	unsigned char current_buf;
  21.112 +	uint8_t *in = parms.data;
  21.113 +
  21.114 +	/* Align fb pointer which was centered before */
  21.115 +	uint8_t *fb =
  21.116 +	    (unsigned char *)((unsigned int)parms.center & 0xFFFFFFF0);
  21.117 +
  21.118 +	uint32_t bounded_input_height = parms.bounded_input_height;
  21.119 +	uint32_t bounded_input_width = parms.bounded_input_width;
  21.120 +	uint32_t fb_pixel_size = parms.fb_pixel_size;
  21.121 +
  21.122 +	uint32_t out_line_stride = parms.out_line_stride;
  21.123 +	uint32_t in_line_stride = parms.in_line_stride;
  21.124 +	uint32_t in_line_size = bounded_input_width * fb_pixel_size;
  21.125 +
  21.126 +	current_buf = 0;
  21.127 +
  21.128 +	/* Local store buffer */
  21.129 +	static volatile uint8_t buf[4][BUFFER_SIZE]
  21.130 +	    __attribute__ ((aligned(128)));
  21.131 +	/* do 4-times multibuffering (one buffer and DMA tag per row), process each row in two steps: GET from input, then PUT to framebuffer */
  21.132 +	for (i = 0; i < bounded_input_height >> 2; i++) {
  21.133 +		/* first buffer */
  21.134 +		DMA_WAIT_TAG(tag_id_base + 1);
  21.135 +		// retrieve buffer
  21.136 +		spu_mfcdma32(buf[0], (unsigned int)in, in_line_size,
  21.137 +			     tag_id_base + 1, MFC_GETB_CMD);
  21.138 +		DMA_WAIT_TAG(tag_id_base + 1);
  21.139 +		// store buffer
  21.140 +		spu_mfcdma32(buf[0], (unsigned int)fb, in_line_size,
  21.141 +			     tag_id_base + 1, MFC_PUTB_CMD);
  21.142 +		in += in_line_stride;
  21.143 +		fb += out_line_stride;
  21.144 +		deprintf("[SPU] 1st buffer copied in=0x%x, fb=0x%x\n", in,
  21.145 +		       fb);
  21.146 +
  21.147 +		/* second buffer */
  21.148 +		DMA_WAIT_TAG(tag_id_base + 2);
  21.149 +		// retrieve buffer
  21.150 +		spu_mfcdma32(buf[1], (unsigned int)in, in_line_size,
  21.151 +			     tag_id_base + 2, MFC_GETB_CMD);
  21.152 +		DMA_WAIT_TAG(tag_id_base + 2);
  21.153 +		// store buffer
  21.154 +		spu_mfcdma32(buf[1], (unsigned int)fb, in_line_size,
  21.155 +			     tag_id_base + 2, MFC_PUTB_CMD);
  21.156 +		in += in_line_stride;
  21.157 +		fb += out_line_stride;
  21.158 +		deprintf("[SPU] 2nd buffer copied in=0x%x, fb=0x%x\n", in,
  21.159 +		       fb);
  21.160 +
  21.161 +		/* third buffer */
  21.162 +		DMA_WAIT_TAG(tag_id_base + 3);
  21.163 +		// retrieve buffer
  21.164 +		spu_mfcdma32(buf[2], (unsigned int)in, in_line_size,
  21.165 +			     tag_id_base + 3, MFC_GETB_CMD);
  21.166 +		DMA_WAIT_TAG(tag_id_base + 3);
  21.167 +		// store buffer
  21.168 +		spu_mfcdma32(buf[2], (unsigned int)fb, in_line_size,
  21.169 +			     tag_id_base + 3, MFC_PUTB_CMD);
  21.170 +		in += in_line_stride;
  21.171 +		fb += out_line_stride;
  21.172 +		deprintf("[SPU] 3rd buffer copied in=0x%x, fb=0x%x\n", in,
  21.173 +		       fb);
  21.174 +
  21.175 +		/* fourth buffer */
  21.176 +		DMA_WAIT_TAG(tag_id_base + 4);
  21.177 +		// retrieve buffer
  21.178 +		spu_mfcdma32(buf[3], (unsigned int)in, in_line_size,
  21.179 +			     tag_id_base + 4, MFC_GETB_CMD);
  21.180 +		DMA_WAIT_TAG(tag_id_base + 4);
  21.181 +		// store buffer
  21.182 +		spu_mfcdma32(buf[3], (unsigned int)fb, in_line_size,
  21.183 +			     tag_id_base + 4, MFC_PUTB_CMD);
  21.184 +		in += in_line_stride;
  21.185 +		fb += out_line_stride;
  21.186 +		deprintf("[SPU] 4th buffer copied in=0x%x, fb=0x%x\n", in,
  21.187 +		       fb);
  21.188 +		deprintf("[SPU] Loop #%i, bounded_input_height=%i\n", i,
  21.189 +		       bounded_input_height >> 2);
  21.190 +	}
  21.191 +	DMA_WAIT_TAG(tag_id_base + 2);
  21.192 +	DMA_WAIT_TAG(tag_id_base + 3);
  21.193 +	DMA_WAIT_TAG(tag_id_base + 4);
  21.194 +}
  21.195 +
  21.196 +
    22.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    22.2 +++ b/src/video/ps3/spulibs/spu_common.h	Mon Sep 07 04:51:29 2009 +0000
    22.3 @@ -0,0 +1,108 @@
    22.4 +/*
    22.5 + * SDL - Simple DirectMedia Layer
    22.6 + * CELL BE Support for PS3 Framebuffer
    22.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
    22.8 + *
    22.9 + * This library is free software; you can redistribute it and/or modify it
   22.10 + * under the terms of the GNU Lesser General Public License as published
   22.11 + * by the Free Software Foundation; either version 2.1 of the License, or
   22.12 + * (at your option) any later version.
   22.13 + *
   22.14 + * This library is distributed in the hope that it will be useful, but
   22.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
   22.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   22.17 + * Lesser General Public License for more details.
   22.18 + *
   22.19 + * You should have received a copy of the GNU Lesser General Public
   22.20 + * License along with this library; if not, write to the Free Software
   22.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
   22.22 + * USA
   22.23 + *
   22.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
   22.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
   22.26 + *  SPE code based on research by:
   22.27 + *  Rene Becker
   22.28 + *  Thimo Emmerich
   22.29 + */
   22.30 +
   22.31 +/* Common definitions/macros for SPUs */
   22.32 +
   22.33 +#ifndef _SPU_COMMON_H
   22.34 +#define _SPU_COMMON_H
   22.35 +
   22.36 +#include <stdio.h>
   22.37 +#include <stdint.h>
   22.38 +#include <string.h>
   22.39 +
   22.40 +/* Tag management */
   22.41 +#define DMA_WAIT_TAG(_tag)     \
   22.42 +    mfc_write_tag_mask(1<<(_tag)); \
   22.43 +    mfc_read_tag_status_all();
   22.44 +
   22.45 +/* SPU mailbox messages */
   22.46 +#define SPU_READY	0
   22.47 +#define SPU_START	1
   22.48 +#define SPU_FIN		2
   22.49 +#define SPU_EXIT	3
   22.50 +
   22.51 +/* Tags */
   22.52 +#define RETR_BUF	0
   22.53 +#define STR_BUF		1
   22.54 +#define TAG_INIT	2
   22.55 +
   22.56 +/* Buffersizes */
   22.57 +#define MAX_HDTV_WIDTH 1920
   22.58 +#define MAX_HDTV_HEIGHT 1080
   22.59 +/* One stride of HDTV */
   22.60 +#define BUFFER_SIZE 7680
   22.61 +
   22.62 +/* fb_writer ppu/spu exchange parms */
   22.63 +struct fb_writer_parms_t {
   22.64 +	uint8_t *data;
   22.65 +	uint8_t *center;
   22.66 +	uint32_t out_line_stride;
   22.67 +	uint32_t in_line_stride;
   22.68 +	uint32_t bounded_input_height;
   22.69 +	uint32_t bounded_input_width;
   22.70 +	uint32_t fb_pixel_size;
   22.71 +
   22.72 +	/* This padding is to fulfill the need for 16 byte alignment. On parm change, update! */
   22.73 +	char padding[4];
   22.74 +} __attribute__((aligned(128)));
   22.75 +
   22.76 +/* yuv2rgb ppu/spu exchange parms */
   22.77 +struct yuv2rgb_parms_t {
   22.78 +	uint8_t* y_plane;
   22.79 +	uint8_t* v_plane;
   22.80 +	uint8_t* u_plane;
   22.81 +
   22.82 +	uint8_t* dstBuffer;
   22.83 +
   22.84 +	unsigned int src_pixel_width;
   22.85 +	unsigned int src_pixel_height;
   22.86 +
   22.87 +	/* This padding is to fulfill the need for 16 byte alignment. On parm change, update! */
   22.88 +	char padding[128 - ((4 * sizeof(uint8_t *) + 2 * sizeof(unsigned int)) & 0x7F)];
   22.89 +} __attribute__((aligned(128)));
   22.90 +
   22.91 +/* bilin_scaler ppu/spu exchange parms */
   22.92 +struct scale_parms_t {
   22.93 +	uint8_t* y_plane;
   22.94 +	uint8_t* v_plane;
   22.95 +	uint8_t* u_plane;
   22.96 +
   22.97 +	uint8_t* dstBuffer;
   22.98 +
   22.99 +	unsigned int src_pixel_width;
  22.100 +	unsigned int src_pixel_height;
  22.101 +
  22.102 +	unsigned int dst_pixel_width;
  22.103 +	unsigned int dst_pixel_height;
  22.104 +
  22.105 +	/* This padding is to fulfill the need for 16 byte alignment. On parm change, update! */
  22.106 +	char padding[128 - ((4 * sizeof(uint8_t *) + 4 * sizeof(unsigned int)) & 0x7F)];
  22.107 +} __attribute__((aligned(128)));
  22.108 +
  22.109 +#endif /* _SPU_COMMON_H */
  22.110 +
  22.111 +
    23.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    23.2 +++ b/src/video/ps3/spulibs/yuv2rgb.c	Mon Sep 07 04:51:29 2009 +0000
    23.3 @@ -0,0 +1,662 @@
    23.4 +/*
    23.5 + * SDL - Simple DirectMedia Layer
    23.6 + * CELL BE Support for PS3 Framebuffer
    23.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
    23.8 + *
    23.9 + * This library is free software; you can redistribute it and/or modify it
   23.10 + * under the terms of the GNU Lesser General Public License as published
   23.11 + * by the Free Software Foundation; either version 2.1 of the License, or
   23.12 + * (at your option) any later version.
   23.13 + *
   23.14 + * This library is distributed in the hope that it will be useful, but
   23.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
   23.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   23.17 + * Lesser General Public License for more details.
   23.18 + *
   23.19 + * You should have received a copy of the GNU Lesser General Public
   23.20 + * License along with this library; if not, write to the Free Software
   23.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
   23.22 + * USA
   23.23 + *
   23.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
   23.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
   23.26 + *  SPE code based on research by:
   23.27 + *  Rene Becker
   23.28 + *  Thimo Emmerich
   23.29 + */
   23.30 +
   23.31 +#include "spu_common.h"
   23.32 +
   23.33 +#include <spu_intrinsics.h>
   23.34 +#include <spu_mfcio.h>
   23.35 +
   23.36 +// Debugging
   23.37 +//#define DEBUG
   23.38 +
   23.39 +// Test environment for /2 resolutions
   23.40 +//#define TESTING
   23.41 +
   23.42 +#ifdef DEBUG
   23.43 +#define deprintf(fmt, args... ) \
   23.44 +	fprintf( stdout, fmt, ##args ); \
   23.45 +	fflush( stdout );
   23.46 +#else
   23.47 +#define deprintf( fmt, args... )
   23.48 +#endif
   23.49 +
   23.50 +struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128)));
   23.51 +
   23.52 +/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored
   23.53 + * there might be the need to retrieve misaligned data, adjust
   23.54 + * incoming v and u plane to be able to handle this (add 128)
   23.55 + */
   23.56 +unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128)));
   23.57 +unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
   23.58 +unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
   23.59 +
   23.60 +/* A maximum of 4 lines BGRA are stored, 4 byte per pixel */
   23.61 +unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128)));
   23.62 +
   23.63 +/* some vectors needed by the float to int conversion */
   23.64 +static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
   23.65 +static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
   23.66 +
   23.67 +void yuv_to_rgb_w16();
   23.68 +void yuv_to_rgb_w32();
   23.69 +
   23.70 +void yuv_to_rgb_w2_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr, unsigned int width);
   23.71 +void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);
   23.72 +
   23.73 +
   23.74 +int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused)))
   23.75 +{
   23.76 +	deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id);
   23.77 +	uint32_t ea_mfc, mbox;
   23.78 +	// send ready message
   23.79 +	spu_write_out_mbox(SPU_READY);
   23.80 +
   23.81 +	while (1) {
   23.82 +		/* Check mailbox */
   23.83 +		mbox = spu_read_in_mbox();
   23.84 +		deprintf("[SPU] Message is %u\n", mbox);
   23.85 +		switch (mbox) {
   23.86 +			case SPU_EXIT:
   23.87 +				deprintf("[SPU] yuv2rgb_converter goes down...\n");
   23.88 +				return 0;
   23.89 +			case SPU_START:
   23.90 +				break;
   23.91 +			default:
   23.92 +				deprintf("[SPU] Cannot handle message\n");
   23.93 +				continue;
   23.94 +		}
   23.95 +
   23.96 +		/* Tag Manager setup */
   23.97 +		unsigned int tag_id;
   23.98 +		tag_id = mfc_multi_tag_reserve(1);
   23.99 +		if (tag_id == MFC_TAG_INVALID) {
  23.100 +			deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n");
  23.101 +			return 0;
  23.102 +		}
  23.103 +
  23.104 +		/* DMA transfer for the input parameters */
  23.105 +		ea_mfc = spu_read_in_mbox();
  23.106 +		deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc);
  23.107 +		spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD);
  23.108 +		DMA_WAIT_TAG(tag_id);
  23.109 +
  23.110 +		/* There are alignment issues that involve handling of special cases
  23.111 +		 * a width of 32 results in a width of 16 in the chrominance
  23.112 +		 * --> choose the proper handling to optimize the performance
  23.113 +		 */
  23.114 +		deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height);
  23.115 +		if (!(parms_converter.src_pixel_width & 0x1f)) {
  23.116 +			deprintf("[SPU] Using yuv_to_rgb_w16\n");
  23.117 +			yuv_to_rgb_w16();
  23.118 +		} else {
  23.119 +			deprintf("[SPU] Using yuv_to_rgb_w32\n");
  23.120 +			yuv_to_rgb_w32();
  23.121 +		}
  23.122 +
  23.123 +		mfc_multi_tag_release(tag_id, 1);
  23.124 +		deprintf("[SPU] yuv2rgb_spu... done!\n");
  23.125 +		/* Send FIN message */
  23.126 +		spu_write_out_mbox(SPU_FIN);
  23.127 +	}
  23.128 +
  23.129 +	return 0;
  23.130 +}
  23.131 +
  23.132 +
  23.133 +/*
  23.134 + * float_to_char()
  23.135 + *
  23.136 + * converts a float to a character using saturated
  23.137 + * arithmetic
  23.138 + *
  23.139 + * @param s float for conversion
  23.140 + * @returns converted character
  23.141 + */
  23.142 +inline static unsigned char float_to_char(float s) {
  23.143 +	vector float vec_s = spu_splats(s);
  23.144 +	vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
  23.145 +	vec_s = spu_sel(vec_s, vec_0_1, select_1);
  23.146 +
  23.147 +	vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
  23.148 +	vec_s = spu_sel(vec_s, vec_255, select_2);
  23.149 +	return (unsigned char) spu_extract(vec_s,0);
  23.150 +}
  23.151 +
  23.152 +
  23.153 +/*
  23.154 + * vfloat_to_vuint()
  23.155 + *
  23.156 + * converts a float vector to an unsigned int vector using saturated
  23.157 + * arithmetic
  23.158 + *
  23.159 + * @param vec_s float vector for conversion
  23.160 + * @returns converted unsigned int vector
  23.161 + */
  23.162 +inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
  23.163 +	vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
  23.164 +	vec_s = spu_sel(vec_s, vec_0_1, select_1);
  23.165 +
  23.166 +	vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
  23.167 +	vec_s = spu_sel(vec_s, vec_255, select_2);
  23.168 +	return spu_convtu(vec_s,0);
  23.169 +}
  23.170 +
  23.171 +
  23.172 +void yuv_to_rgb_w16() {
  23.173 +	// Pixel dimensions of the picture
  23.174 +	uint32_t width, height;
  23.175 +
  23.176 +	// Extract parameters
  23.177 +	width = parms_converter.src_pixel_width;
  23.178 +	height = parms_converter.src_pixel_height;
  23.179 +
  23.180 +	// Plane data management
  23.181 +	// Y
  23.182 +	unsigned char* ram_addr_y = parms_converter.y_plane;
  23.183 +	// V
  23.184 +	unsigned char* ram_addr_v = parms_converter.v_plane;
  23.185 +	// U
  23.186 +	unsigned char* ram_addr_u = parms_converter.u_plane;
  23.187 +
  23.188 +	// BGRA
  23.189 +	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
  23.190 +
  23.191 +	// Strides
  23.192 +	unsigned int stride_y = width;
  23.193 +	unsigned int stride_vu = width>>1;
  23.194 +
  23.195 +	// Buffer management
  23.196 +	unsigned int buf_idx = 0;
  23.197 +	unsigned int size_4lines_y = stride_y<<2;
  23.198 +	unsigned int size_2lines_y = stride_y<<1;
  23.199 +	unsigned int size_2lines_vu = stride_vu<<1;
  23.200 +
  23.201 +	// 2*width*4byte_per_pixel
  23.202 +	unsigned int size_2lines_bgra = width<<3;
  23.203 +
  23.204 +
  23.205 +	// start double-buffered processing
  23.206 +	// 4 lines y
  23.207 +	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
  23.208 +
  23.209 +	// 2 lines v
  23.210 +	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
  23.211 +
  23.212 +	// 2 lines u
  23.213 +	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
  23.214 +
  23.215 +	// Wait for these transfers to be completed
  23.216 +	DMA_WAIT_TAG((RETR_BUF + buf_idx));
  23.217 +
  23.218 +	unsigned int i;
  23.219 +	for(i=0; i<(height>>2)-1; i++) {
  23.220 +
  23.221 +		buf_idx^=1;
  23.222 +
  23.223 +		// 4 lines y
  23.224 +		spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
  23.225 +
  23.226 +		// 2 lines v
  23.227 +		spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
  23.228 +
  23.229 +		// 2 lines u
  23.230 +		spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
  23.231 +
  23.232 +		DMA_WAIT_TAG((RETR_BUF + buf_idx));
  23.233 +
  23.234 +		buf_idx^=1;
  23.235 +
  23.236 +
  23.237 +		// Convert YUV to BGRA, store it back (first two lines)
  23.238 +#ifndef TESTING
  23.239 +		yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
  23.240 +
  23.241 +		// Next two lines
  23.242 +		yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
  23.243 +				v_plane[buf_idx] + stride_vu,
  23.244 +				u_plane[buf_idx] + stride_vu,
  23.245 +				bgra + size_2lines_bgra,
  23.246 +				width);
  23.247 +#else
  23.248 +		yuv_to_rgb_w2_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
  23.249 +
  23.250 +		// Next two lines
  23.251 +		yuv_to_rgb_w2_line(y_plane[buf_idx] + size_2lines_y,
  23.252 +				v_plane[buf_idx] + stride_vu,
  23.253 +				u_plane[buf_idx] + stride_vu,
  23.254 +				bgra + size_2lines_bgra,
  23.255 +				width);
  23.256 +#endif
  23.257 +
  23.258 +		// Wait for previous storing transfer to be completed
  23.259 +		DMA_WAIT_TAG(STR_BUF);
  23.260 +
  23.261 +		// Store converted lines in two steps->max transfer size 16384
  23.262 +		spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  23.263 +		ram_addr_bgra += size_2lines_bgra;
  23.264 +		spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  23.265 +		ram_addr_bgra += size_2lines_bgra;
  23.266 +
  23.267 +		// Move 4 lines
  23.268 +		ram_addr_y += size_4lines_y;
  23.269 +		ram_addr_v += size_2lines_vu;
  23.270 +		ram_addr_u += size_2lines_vu;
  23.271 +
  23.272 +		buf_idx^=1;
  23.273 +	}
  23.274 +
  23.275 +#ifndef TESTING
  23.276 +	// Convert YUV to BGRA, store it back (first two lines)
  23.277 +	yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
  23.278 +
  23.279 +	// Next two lines
  23.280 +	yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
  23.281 +			v_plane[buf_idx] + stride_vu,
  23.282 +			u_plane[buf_idx] + stride_vu,
  23.283 +			bgra + size_2lines_bgra,
  23.284 +			width);
  23.285 +#else
  23.286 +	// Convert YUV to BGRA, store it back (first two lines)
  23.287 +	yuv_to_rgb_w2_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
  23.288 +
  23.289 +	// Next two lines
  23.290 +	yuv_to_rgb_w2_line(y_plane[buf_idx] + size_2lines_y,
  23.291 +			v_plane[buf_idx] + stride_vu,
  23.292 +			u_plane[buf_idx] + stride_vu,
  23.293 +			bgra + size_2lines_bgra,
  23.294 +			width);
  23.295 +#endif
  23.296 +
  23.297 +	// Wait for previous storing transfer to be completed
  23.298 +	DMA_WAIT_TAG(STR_BUF);
  23.299 +	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  23.300 +	ram_addr_bgra += size_2lines_bgra;
  23.301 +	spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  23.302 +
  23.303 +	// wait for previous storing transfer to be completed
  23.304 +	DMA_WAIT_TAG(STR_BUF);
  23.305 +
  23.306 +}
  23.307 +
  23.308 +
  23.309 +void yuv_to_rgb_w32() {
  23.310 +	// Pixel dimensions of the picture
  23.311 +	uint32_t width, height;
  23.312 +
  23.313 +	// Extract parameters
  23.314 +	width = parms_converter.src_pixel_width;
  23.315 +	height = parms_converter.src_pixel_height;
  23.316 +
  23.317 +	// Plane data management
  23.318 +	// Y
  23.319 +	unsigned char* ram_addr_y = parms_converter.y_plane;
  23.320 +	// V
  23.321 +	unsigned char* ram_addr_v = parms_converter.v_plane;
  23.322 +	// U
  23.323 +	unsigned char* ram_addr_u = parms_converter.u_plane;
  23.324 +
  23.325 +	// BGRA
  23.326 +	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
  23.327 +
  23.328 +	// Strides
  23.329 +	unsigned int stride_y = width;
  23.330 +	unsigned int stride_vu = width>>1;
  23.331 +
  23.332 +	// Buffer management
  23.333 +	unsigned int buf_idx = 0;
  23.334 +	unsigned int size_4lines_y = stride_y<<2;
  23.335 +	unsigned int size_2lines_y = stride_y<<1;
  23.336 +	unsigned int size_2lines_vu = stride_vu<<1;
  23.337 +
  23.338 +	// 2*width*4byte_per_pixel
  23.339 +	unsigned int size_2lines_bgra = width<<3;
  23.340 +
  23.341 +	// start double-buffered processing
  23.342 +	// 4 lines y
  23.343 +	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
  23.344 +	// 2 lines v
  23.345 +	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
  23.346 +	// 2 lines u
  23.347 +	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
  23.348 +
  23.349 +	// Wait for these transfers to be completed
  23.350 +	DMA_WAIT_TAG((RETR_BUF + buf_idx));
  23.351 +
  23.352 +	unsigned int i;
  23.353 +	for(i=0; i < (height>>2)-1; i++) {
  23.354 +		buf_idx^=1;
  23.355 +		// 4 lines y
  23.356 +		spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
  23.357 +		deprintf("4lines = %d\n", size_4lines_y);
  23.358 +		// 2 lines v
  23.359 +		spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
  23.360 +		deprintf("2lines = %d\n", size_2lines_vu);
  23.361 +		// 2 lines u
  23.362 +		spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
  23.363 +		deprintf("2lines = %d\n", size_2lines_vu);
  23.364 +
  23.365 +		DMA_WAIT_TAG((RETR_BUF + buf_idx));
  23.366 +
  23.367 +		buf_idx^=1;
  23.368 +
  23.369 +		// Convert YUV to BGRA, store it back (first two lines)
  23.370 +		yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
  23.371 +
  23.372 +		// Next two lines
  23.373 +		yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
  23.374 +				v_plane[buf_idx] + stride_vu,
  23.375 +				u_plane[buf_idx] + stride_vu,
  23.376 +				bgra + size_2lines_bgra,
  23.377 +				width);
  23.378 +
  23.379 +		// Wait for previous storing transfer to be completed
  23.380 +		DMA_WAIT_TAG(STR_BUF);
  23.381 +
  23.382 +		// Store converted lines in two steps->max transfer size 16384
  23.383 +		spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  23.384 +		ram_addr_bgra += size_2lines_bgra;
  23.385 +		spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  23.386 +		ram_addr_bgra += size_2lines_bgra;
  23.387 +
  23.388 +		// Move 4 lines
  23.389 +		ram_addr_y += size_4lines_y;
  23.390 +		ram_addr_v += size_2lines_vu;
  23.391 +		ram_addr_u += size_2lines_vu;
  23.392 +
  23.393 +		buf_idx^=1;
  23.394 +	}
  23.395 +
  23.396 +	// Convert YUV to BGRA, store it back (first two lines)
  23.397 +	yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
  23.398 +
  23.399 +	// Next two lines
  23.400 +	yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
  23.401 +			v_plane[buf_idx] + stride_vu,
  23.402 +			u_plane[buf_idx] + stride_vu,
  23.403 +			bgra + size_2lines_bgra,
  23.404 +			width);
  23.405 +
  23.406 +	// Wait for previous storing transfer to be completed
  23.407 +	DMA_WAIT_TAG(STR_BUF);
  23.408 +	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  23.409 +	ram_addr_bgra += size_2lines_bgra;
  23.410 +	spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  23.411 +
  23.412 +	// Wait for previous storing transfer to be completed
  23.413 +	DMA_WAIT_TAG(STR_BUF);
  23.414 +}
  23.415 +
  23.416 +
  23.417 +/* Some vectors needed by the yuv 2 rgb conversion algorithm */
  23.418 +const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f };
  23.419 +const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  23.420 +const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 };
  23.421 +const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 };
  23.422 +const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B };
  23.423 +const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F };
  23.424 +
  23.425 +const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f};
  23.426 +const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f};
  23.427 +const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f};
  23.428 +const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f};
  23.429 +
  23.430 +const vector unsigned int vec_alpha =  { 255 << 24, 255 << 24, 255 << 24, 255 << 24 };
  23.431 +
  23.432 +const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 };
  23.433 +const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F };
  23.434 +
  23.435 +
  23.436 +#ifdef TESTING
  23.437 +/*
  23.438 + * yuv_to_rgb_w2_line()
  23.439 + *
  23.440 + * - converts x * 4 pixels from YUV to RGB
  23.441 + * - two lines of YUV are taken as input.
  23.442 + * - width has to be a multiple of 2 (= 4 pixel)
  23.443 + *
  23.444 + * @param y_addr address of the y plane (local store)
  23.445 + * @param v_addr address of the v plane (local store)
  23.446 + * @param u_addr address of the u plane (local store)
  23.447 + * @param bgra_addr_char address of the bgra output buffer (local store)
  23.448 + * @param width the width of a line in pixel
  23.449 + */
  23.450 +void yuv_to_rgb_w2_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_char, unsigned int width) {
  23.451 +	// each pixel is stored as an integer
  23.452 +	unsigned int* bgra_addr = (unsigned int*) bgra_addr_char;
  23.453 +
  23.454 +	unsigned int x;
  23.455 +	// Go through each line in steps of 2, because every U and V value is connected to 4 pixels Y (YUV 4:2:0)
  23.456 +	for(x = 0; x < width; x+=2) {
  23.457 +		// Get the 4 Y, 1 U and 1 V values
  23.458 +		const unsigned char Y_1 = *(y_addr + x);
  23.459 +		const unsigned char Y_2 = *(y_addr + x + 1);
  23.460 +		const unsigned char Y_3 = *(y_addr + x + width);
  23.461 +		const unsigned char Y_4 = *(y_addr + x + width + 1);
  23.462 +		const unsigned char U = *(u_addr + (x >> 1));
  23.463 +		const unsigned char V = *(v_addr + (x >> 1));
  23.464 +
  23.465 +		// Start converting
  23.466 +		float V_minus_128 = (float)((float)V - 128.0f);
  23.467 +		float U_minus_128 = (float)((float)U - 128.0f);
  23.468 +
  23.469 +		float R_precalculate = 1.403f * V_minus_128;
  23.470 +		float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128);
  23.471 +		float B_precalculate = 1.773f * U_minus_128;
  23.472 +
  23.473 +		// Cast the results
  23.474 +		const unsigned char R_1 = float_to_char((Y_1 + R_precalculate));
  23.475 +		const unsigned char R_2 = float_to_char((Y_2 + R_precalculate));
  23.476 +		const unsigned char R_3 = float_to_char((Y_3 + R_precalculate));
  23.477 +		const unsigned char R_4 = float_to_char((Y_4 + R_precalculate));
  23.478 +		const unsigned char G_1 = float_to_char((Y_1 + G_precalculate));
  23.479 +		const unsigned char G_2 = float_to_char((Y_2 + G_precalculate));
  23.480 +		const unsigned char G_3 = float_to_char((Y_3 + G_precalculate));
  23.481 +		const unsigned char G_4 = float_to_char((Y_4 + G_precalculate));
  23.482 +		const unsigned char B_1 = float_to_char((Y_1 + B_precalculate));
  23.483 +		const unsigned char B_2 = float_to_char((Y_2 + B_precalculate));
  23.484 +		const unsigned char B_3 = float_to_char((Y_3 + B_precalculate));
  23.485 +		const unsigned char B_4 = float_to_char((Y_4 + B_precalculate));
  23.486 +
  23.487 +		// Write back
  23.488 +		*(bgra_addr + x) = (B_1 << 0)| (G_1 << 8) | (R_1 << 16) | (255 << 24);
  23.489 +		*(bgra_addr + x + 1) = (B_2 << 0)| (G_2 << 8) | (R_2 << 16) | (255 << 24);
  23.490 +		*(bgra_addr + x + width) = (B_3 << 0)| (G_3 << 8) | (R_3 << 16) | (255 << 24);
  23.491 +		*(bgra_addr + x + width + 1) = (B_4 << 0)| (G_4 << 8) | (R_4 << 16) | (255 << 24);
  23.492 +	}
  23.493 +}
  23.494 +#endif
  23.495 +
  23.496 +
  23.497 +/*
  23.498 + * yuv_to_rgb_w32_line()
  23.499 + *
  23.500 + * processes two lines of yuv-input, 32 pixels per loop pass; width has to be a multiple of 32
  23.501 + * the second line is read at y_addr + width and written at bgra_addr + width, reusing the u/v terms of the first
  23.502 + *
  23.503 + * @param y_addr address of the y plane in local store (two consecutive lines are read)
  23.504 + * @param v_addr address of the v plane in local store (read at offset x / 2)
  23.505 + * @param u_addr address of the u plane in local store (read at offset x / 2)
  23.506 + * @param bgra_addr_ address of the bgra output buffer, accessed as one unsigned int per pixel
  23.507 + * @param width the width of one line in pixels
  23.508 + */
  23.509 +void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
  23.510 +	// each output pixel is stored as one unsigned int
  23.511 +	unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
  23.512 +
  23.513 +	unsigned int x;
  23.514 +	for(x = 0; x < width; x+=32) {
  23.515 +		// Walk the line in chunks (32 pixels per pass here); each u and v value applies to 4 pixels (two high, two wide)
  23.516 +		// Load 32 y bytes from each of the two lines, plus one vector each of u and v starting at offset x / 2
  23.517 +		const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x));
  23.518 +		const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16));
  23.519 +		const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width));
  23.520 +		const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16));
  23.521 +		const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1)));
  23.522 +		const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1)));
  23.523 +		// Widen u and v to float through the vec_char2int_* shuffles, then add vec_minus_128 (masks and constants are defined outside this excerpt)
  23.524 +		const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128);
  23.525 +		const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128);
  23.526 +		const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128);
  23.527 +		const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128);
  23.528 +
  23.529 +		const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128);
  23.530 +		const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128);
  23.531 +		const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128);
  23.532 +		const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128);
  23.533 +		// Widen the y bytes the same way: Y_1..Y_8 come from the first line, Y_9..Y_16 from the second line
  23.534 +		vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0);
  23.535 +		vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0);
  23.536 +		vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0);
  23.537 +		vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0);
  23.538 +		vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0);
  23.539 +		vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0);
  23.540 +		vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0);
  23.541 +		vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0);
  23.542 +		vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0);
  23.543 +		vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0);
  23.544 +		vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0);
  23.545 +		vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0);
  23.546 +		vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0);
  23.547 +		vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0);
  23.548 +		vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0);
  23.549 +		vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0);
  23.550 +		// Red term: vec_R_precalc_coeff * (v + vec_minus_128)
  23.551 +		const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1);
  23.552 +		const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2);
  23.553 +		const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3);
  23.554 +		const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4);
  23.555 +		// Each product vector yields two vectors via vec_select_floats_upper/lower; NOTE(review): presumably each chroma term is duplicated for two neighbouring pixels - confirm against the mask definitions
  23.556 +		const vector float R1_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_upper);
  23.557 +		const vector float R2_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_lower);
  23.558 +		const vector float R3_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_upper);
  23.559 +		const vector float R4_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_lower);
  23.560 +		const vector float R5_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_upper);
  23.561 +		const vector float R6_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_lower);
  23.562 +		const vector float R7_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_upper);
  23.563 +		const vector float R8_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_lower);
  23.564 +
  23.565 +		// Green term: vec_Gu_precalc_coeff * u + vec_Gv_precalc_coeff * v (u and v already offset by vec_minus_128)
  23.566 +		const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff));
  23.567 +		const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff));
  23.568 +		const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff));
  23.569 +		const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff));
  23.570 +
  23.571 +		const vector float G1_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_upper);
  23.572 +		const vector float G2_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_lower);
  23.573 +		const vector float G3_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_upper);
  23.574 +		const vector float G4_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_lower);
  23.575 +		const vector float G5_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_upper);
  23.576 +		const vector float G6_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_lower);
  23.577 +		const vector float G7_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_upper);
  23.578 +		const vector float G8_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_lower);
  23.579 +
  23.580 +		// Blue term: vec_B_precalc_coeff * (u + vec_minus_128)
  23.581 +		const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1);
  23.582 +		const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2);
  23.583 +		const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3);
  23.584 +		const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4);
  23.585 +
  23.586 +		const vector float B1_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_upper);
  23.587 +		const vector float B2_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_lower);
  23.588 +		const vector float B3_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_upper);
  23.589 +		const vector float B4_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_lower);
  23.590 +		const vector float B5_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_upper);
  23.591 +		const vector float B6_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_lower);
  23.592 +		const vector float B7_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_upper);
  23.593 +		const vector float B8_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_lower);
  23.594 +
  23.595 +		// Add the chroma terms to y and convert with vfloat_to_vuint(); the second line (Y_9..Y_16) reuses terms 1..8 of the first line
  23.596 +		const vector unsigned int  R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate));
  23.597 +		const vector unsigned int  R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate));
  23.598 +		const vector unsigned int  R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate));
  23.599 +		const vector unsigned int  R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate));
  23.600 +		const vector unsigned int  R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate));
  23.601 +		const vector unsigned int  R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate));
  23.602 +		const vector unsigned int  R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate));
  23.603 +		const vector unsigned int  R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate));
  23.604 +		const vector unsigned int  R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate));
  23.605 +		const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate));
  23.606 +		const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate));
  23.607 +		const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate));
  23.608 +		const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate));
  23.609 +		const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate));
  23.610 +		const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate));
  23.611 +		const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate));
  23.612 +
  23.613 +		const vector unsigned int  G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate));
  23.614 +		const vector unsigned int  G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate));
  23.615 +		const vector unsigned int  G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate));
  23.616 +		const vector unsigned int  G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate));
  23.617 +		const vector unsigned int  G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate));
  23.618 +		const vector unsigned int  G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate));
  23.619 +		const vector unsigned int  G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate));
  23.620 +		const vector unsigned int  G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate));
  23.621 +		const vector unsigned int  G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate));
  23.622 +		const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate));
  23.623 +		const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate));
  23.624 +		const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate));
  23.625 +		const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate));
  23.626 +		const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate));
  23.627 +		const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate));
  23.628 +		const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate));
  23.629 +
  23.630 +		const vector unsigned int  B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate));
  23.631 +		const vector unsigned int  B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate));
  23.632 +		const vector unsigned int  B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate));
  23.633 +		const vector unsigned int  B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate));
  23.634 +		const vector unsigned int  B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate));
  23.635 +		const vector unsigned int  B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate));
  23.636 +		const vector unsigned int  B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate));
  23.637 +		const vector unsigned int  B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate));
  23.638 +		const vector unsigned int  B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate));
  23.639 +		const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate));
  23.640 +		const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate));
  23.641 +		const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate));
  23.642 +		const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate));
  23.643 +		const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate));
  23.644 +		const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate));
  23.645 +		const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate));
  23.646 +		// Pack and store 4 pixels per vector: B unshifted, G shifted left 1 byte, R 2 bytes, OR'ed with vec_alpha. NOTE(review): spu_slqwbyte shifts the whole quadword, so this presumably relies on vfloat_to_vuint() leaving only the low byte of each word set - confirm
  23.647 +		*((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha,  B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1)));
  23.648 +		*((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha,  B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1)));
  23.649 +		*((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha,  B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1)));
  23.650 +		*((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha,  B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1)));
  23.651 +		*((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha,  B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1)));
  23.652 +		*((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha,  B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1)));
  23.653 +		*((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha,  B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1)));
  23.654 +		*((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha,  B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1)));
  23.655 +		*((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha,  B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1)));
  23.656 +		*((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1)));
  23.657 +		*((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1)));
  23.658 +		*((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1)));
  23.659 +		*((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1)));
  23.660 +		*((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1)));
  23.661 +		*((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1)));
  23.662 +		*((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1)));
  23.663 +	}
  23.664 +}
  23.665 +