Hello. SDL-1.2
authorSam Lantinga <slouken@libsdl.org>
Thu, 02 Apr 2009 04:06:55 +0000
branchSDL-1.2
changeset 41653b8ac3d311a2
parent 4164 7a4c511c980a
child 4166 72a00fe65ffe
Hello.

This patch provides basic support for video on the Sony PS3
Linux framebuffer. Scaling, format-conversion, and drawing are
done from the SPEs, so there is little performance impact to
PPE applications. This is by no means production quality code,
but it is a very good start and a good example of how to use the
PS3's hardware capabilities to accelerate video playback on
the box.

The driver has been verified to work with ffplay, mplayer and xine.
This piece of software has been developed at the IBM R&D Lab
in Boeblingen, Germany and is now returned to the community.

Enjoy !

Signed-off-by: D.Herrendoerfer < d.herrendoerfer [at] de [dot] ibm [dot] com >
README.PS3
configure.in
include/SDL_config.h.in
src/video/SDL_sysvideo.h
src/video/SDL_video.c
src/video/ps3/SDL_ps3events.c
src/video/ps3/SDL_ps3events_c.h
src/video/ps3/SDL_ps3video.c
src/video/ps3/SDL_ps3video.h
src/video/ps3/SDL_ps3yuv.c
src/video/ps3/SDL_ps3yuv_c.h
src/video/ps3/spulibs/Makefile
src/video/ps3/spulibs/bilin_scaler.c
src/video/ps3/spulibs/fb_writer.c
src/video/ps3/spulibs/spu_common.h
src/video/ps3/spulibs/yuv2rgb_converter.c
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/README.PS3	Thu Apr 02 04:06:55 2009 +0000
     1.3 @@ -0,0 +1,29 @@
     1.4 +
     1.5 +SDL on Sony Playstation3
     1.6 +------------------------
     1.7 +
     1.8 +Installation:
     1.9 +  First, you have to install the Cell SDK
    1.10 +  - Download the Cell SDK installer RPM and ISO images to
    1.11 +    a temporary directory such as /tmp/cellsdk.
    1.12 +  - Mount the image: mount -o loop CellSDK-Devel-Fedora_3.1.0.0.0.iso /tmp/cellsdk
    1.13 +  - Install the SDK installer: rpm -ivh cell-install-3.1.0-0.0.noarch.rpm
    1.14 +  - Install the SDK: cd /opt/cell && ./cellsdk --iso /tmp/cellsdkiso install
    1.15 +
    1.16 +  You need to install the SPU-libs before installing SDL
    1.17 +  - Go to SDL-1.2/src/video/ps3/spulibs/
    1.18 +  - Run make && make install
    1.19 +
    1.20 +  Finally, install SDL
    1.21 +  - Go to SDL-1.2/ and build SDL like any other GNU style package.
    1.22 +  e.g.
    1.23 +    - Build the configure-script with ./autogen.sh
    1.24 +    - Configure SDL for your needs: ./configure --enable-video-ps3 ...
    1.25 +    - Build and install it: make && make install
    1.26 +
    1.27 +
    1.28 +Todo:
    1.29 +  - mouse/keyboard/controller support
    1.30 +
    1.31 +Have fun!
    1.32 +  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
     2.1 --- a/configure.in	Tue Mar 17 03:58:53 2009 +0000
     2.2 +++ b/configure.in	Thu Apr 02 04:06:55 2009 +0000
     2.3 @@ -1287,6 +1287,33 @@
     2.4      fi
     2.5  }
     2.6  
     2.7 +dnl See if we're running on PlayStation 3 Cell hardware
     2.8 +CheckPS3()
     2.9 +{
    2.10 +  AC_ARG_ENABLE(video-ps3,
    2.11 +                AC_HELP_STRING([--enable-video-ps3], [use PlayStation 3 Cell driver [[default=yes]]]),
    2.12 +                , enable_video_ps3=yes)
    2.13 +  if test x$enable_video = xyes -a x$enable_video_ps3 = xyes; then
    2.14 +    AC_MSG_CHECKING(for PlayStation 3 Cell support)
    2.15 +    video_ps3=no
    2.16 +    AC_TRY_COMPILE([
    2.17 +      #include <linux/fb.h>
    2.18 +      #include <asm/ps3fb.h>
    2.19 +    ],[
    2.20 +    ],[
    2.21 +      video_ps3=yes
    2.22 +    ])
    2.23 +    AC_MSG_RESULT($video_ps3)
    2.24 +    if test x$video_ps3 = xyes; then
    2.25 +      AC_DEFINE(SDL_VIDEO_DRIVER_PS3)
    2.26 +      SOURCES="$SOURCES $srcdir/src/video/ps3/*.c"
    2.27 +      EXTRA_CFLAGS="$EXTRA_CFLAGS -I/opt/cell/sdk/usr/include"
    2.28 +      EXTRA_LDFLAGS="$EXTRA_LDFLAGS -lbilin_scaler_spu -lfb_writer_spu -lyuv2rgb_spu -L/opt/cell/sdk/usr/lib -lspe2"
    2.29 +      have_video=yes
    2.30 +    fi
    2.31 +  fi
    2.32 +}
    2.33 +
    2.34  dnl Find the GGI includes
    2.35  CheckGGI()
    2.36  {
    2.37 @@ -2251,6 +2278,7 @@
    2.38          CheckFBCON
    2.39          CheckDirectFB
    2.40          CheckPS2GS
    2.41 +        CheckPS3
    2.42          CheckGGI
    2.43          CheckSVGA
    2.44          CheckVGL
     3.1 --- a/include/SDL_config.h.in	Tue Mar 17 03:58:53 2009 +0000
     3.2 +++ b/include/SDL_config.h.in	Thu Apr 02 04:06:55 2009 +0000
     3.3 @@ -269,6 +269,7 @@
     3.4  #undef SDL_VIDEO_DRIVER_PHOTON
     3.5  #undef SDL_VIDEO_DRIVER_PICOGUI
     3.6  #undef SDL_VIDEO_DRIVER_PS2GS
     3.7 +#undef SDL_VIDEO_DRIVER_PS3
     3.8  #undef SDL_VIDEO_DRIVER_QTOPIA
     3.9  #undef SDL_VIDEO_DRIVER_QUARTZ
    3.10  #undef SDL_VIDEO_DRIVER_RISCOS
     4.1 --- a/src/video/SDL_sysvideo.h	Tue Mar 17 03:58:53 2009 +0000
     4.2 +++ b/src/video/SDL_sysvideo.h	Thu Apr 02 04:06:55 2009 +0000
     4.3 @@ -347,6 +347,9 @@
     4.4  #if SDL_VIDEO_DRIVER_PS2GS
     4.5  extern VideoBootStrap PS2GS_bootstrap;
     4.6  #endif
     4.7 +#if SDL_VIDEO_DRIVER_PS3
     4.8 +extern VideoBootStrap PS3_bootstrap;
     4.9 +#endif
    4.10  #if SDL_VIDEO_DRIVER_GGI
    4.11  extern VideoBootStrap GGI_bootstrap;
    4.12  #endif
     5.1 --- a/src/video/SDL_video.c	Tue Mar 17 03:58:53 2009 +0000
     5.2 +++ b/src/video/SDL_video.c	Thu Apr 02 04:06:55 2009 +0000
     5.3 @@ -63,6 +63,9 @@
     5.4  #if SDL_VIDEO_DRIVER_PS2GS
     5.5  	&PS2GS_bootstrap,
     5.6  #endif
     5.7 +#if SDL_VIDEO_DRIVER_PS3
     5.8 +	&PS3_bootstrap,
     5.9 +#endif
    5.10  #if SDL_VIDEO_DRIVER_GGI
    5.11  	&GGI_bootstrap,
    5.12  #endif
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/src/video/ps3/SDL_ps3events.c	Thu Apr 02 04:06:55 2009 +0000
     6.3 @@ -0,0 +1,44 @@
     6.4 +/*
     6.5 + * SDL - Simple DirectMedia Layer
     6.6 + * CELL BE Support for PS3 Framebuffer
     6.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
     6.8 + *
     6.9 + * This library is free software; you can redistribute it and/or modify it
    6.10 + * under the terms of the GNU Lesser General Public License as published
    6.11 + * by the Free Software Foundation; either version 2.1 of the License, or
    6.12 + * (at your option) any later version.
    6.13 + *
    6.14 + * This library is distributed in the hope that it will be useful, but
    6.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
    6.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    6.17 + * Lesser General Public License for more details.
    6.18 + *
    6.19 + * You should have received a copy of the GNU Lesser General Public
    6.20 + * License along with this library; if not, write to the Free Software
    6.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
    6.22 + * USA
    6.23 + *
    6.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
    6.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
    6.26 + *  SPE code based on research by:
    6.27 + *  Rene Becker
    6.28 + *  Thimo Emmerich
    6.29 + */
    6.30 +
    6.31 +#include "SDL_config.h"
    6.32 +
    6.33 +#include "../../events/SDL_sysevents.h"
    6.34 +#include "../../events/SDL_events_c.h"
    6.35 +#include "SDL_ps3video.h"
    6.36 +#include "SDL_ps3events_c.h"
    6.37 +
    6.38 +void PS3_PumpEvents(_THIS)
    6.39 +{
    6.40 +	return;
    6.41 +}
    6.42 +
    6.43 +void PS3_InitOSKeymap(_THIS)
    6.44 +{
    6.45 +        return;
    6.46 +}
    6.47 +
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/src/video/ps3/SDL_ps3events_c.h	Thu Apr 02 04:06:55 2009 +0000
     7.3 @@ -0,0 +1,41 @@
     7.4 +/*
     7.5 + * SDL - Simple DirectMedia Layer
     7.6 + * CELL BE Support for PS3 Framebuffer
     7.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
     7.8 + *
     7.9 + * This library is free software; you can redistribute it and/or modify it
    7.10 + * under the terms of the GNU Lesser General Public License as published
    7.11 + * by the Free Software Foundation; either version 2.1 of the License, or
    7.12 + * (at your option) any later version.
    7.13 + *
    7.14 + * This library is distributed in the hope that it will be useful, but
    7.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
    7.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    7.17 + * Lesser General Public License for more details.
    7.18 + *
    7.19 + * You should have received a copy of the GNU Lesser General Public
    7.20 + * License along with this library; if not, write to the Free Software
    7.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
    7.22 + * USA
    7.23 + *
    7.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
    7.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
    7.26 + *  SPE code based on research by:
    7.27 + *  Rene Becker
    7.28 + *  Thimo Emmerich
    7.29 + */
    7.30 +
    7.31 +#include "SDL_config.h"
    7.32 +
    7.33 +#ifndef _SDL_ps3events_h
    7.34 +#define _SDL_ps3events_h
    7.35 +
    7.36 +#include "SDL_ps3video.h"
    7.37 +
    7.38 +extern void PS3_InitOSKeymap(_THIS);
    7.39 +extern void PS3_PumpEvents(_THIS);
    7.40 +
    7.41 +extern void enable_cursor(int enable);
    7.42 +
    7.43 +#endif /* _SDL_ps3events_h */
    7.44 +
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/src/video/ps3/SDL_ps3video.c	Thu Apr 02 04:06:55 2009 +0000
     8.3 @@ -0,0 +1,621 @@
     8.4 +/*
     8.5 + * SDL - Simple DirectMedia Layer
     8.6 + * CELL BE Support for PS3 Framebuffer
     8.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
     8.8 + *
     8.9 + * This library is free software; you can redistribute it and/or modify it
    8.10 + * under the terms of the GNU Lesser General Public License as published
    8.11 + * by the Free Software Foundation; either version 2.1 of the License, or
    8.12 + * (at your option) any later version.
    8.13 + *
    8.14 + * This library is distributed in the hope that it will be useful, but
    8.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
    8.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    8.17 + * Lesser General Public License for more details.
    8.18 + *
    8.19 + * You should have received a copy of the GNU Lesser General Public
    8.20 + * License along with this library; if not, write to the Free Software
    8.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
    8.22 + * USA
    8.23 + *
    8.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
    8.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
    8.26 + *  SPE code based on research by:
    8.27 + *  Rene Becker
    8.28 + *  Thimo Emmerich
    8.29 + */
    8.30 +
    8.31 +#include "SDL_config.h"
    8.32 +
    8.33 +#include "SDL_video.h"
    8.34 +#include "../SDL_sysvideo.h"
    8.35 +#include "SDL_ps3events_c.h"
    8.36 +#include "SDL_ps3video.h"
    8.37 +#include "SDL_ps3yuv_c.h"
    8.38 +#include "spulibs/spu_common.h"
    8.39 +
    8.40 +#include <fcntl.h>
    8.41 +#include <stdlib.h>
    8.42 +#include <sys/ioctl.h>
    8.43 +#include <linux/kd.h>
    8.44 +#include <sys/mman.h>
    8.45 +
    8.46 +#include <linux/fb.h>
    8.47 +#include <asm/ps3fb.h>
    8.48 +#include <libspe2.h>
    8.49 +#include <malloc.h>
    8.50 +
    8.51 +/* SDL_VideoDevice functions */
    8.52 +static int PS3_Available();
    8.53 +static SDL_VideoDevice *PS3_CreateDevice(int devindex);
    8.54 +static int PS3_VideoInit(_THIS, SDL_PixelFormat * vformat);
    8.55 +static void PS3_VideoQuit(_THIS);
    8.56 +static void PS3_DeleteDevice(SDL_VideoDevice * device);
    8.57 +static SDL_Surface *PS3_SetVideoMode(_THIS, SDL_Surface * current, int width, int height, int bpp, Uint32 flags);
    8.58 +static SDL_Rect **PS3_ListModes(_THIS, SDL_PixelFormat * format, Uint32 flags);
    8.59 +
    8.60 +/* Hardware surface functions */
    8.61 +static int PS3_AllocHWSurface(_THIS, SDL_Surface * surface);
    8.62 +static void PS3_FreeHWSurface(_THIS, SDL_Surface * surface);
    8.63 +static int PS3_LockHWSurface(_THIS, SDL_Surface * surface);
    8.64 +static void PS3_UnlockHWSurface(_THIS, SDL_Surface * surface);
    8.65 +static int PS3_FlipDoubleBuffer(_THIS, SDL_Surface * surface);
    8.66 +static void PS3_DoubleBufferUpdate(_THIS, int numrects, SDL_Rect * rects);
    8.67 +
    8.68 +/* SPU specific functions */
    8.69 +int SPE_Start(_THIS, spu_data_t * spe_data);
    8.70 +int SPE_Stop(_THIS, spu_data_t * spe_data);
    8.71 +int SPE_Boot(_THIS, spu_data_t * spe_data);
    8.72 +int SPE_Shutdown(_THIS, spu_data_t * spe_data);
    8.73 +int SPE_SendMsg(_THIS, spu_data_t * spe_data, unsigned int msg);
    8.74 +int SPE_WaitForMsg(_THIS, spu_data_t * spe_data, unsigned int msg);
    8.75 +void SPE_RunContext(void *thread_argp);
    8.76 +
    8.77 +/* Helpers */
    8.78 +void enable_cursor(int enable);
    8.79 +
    8.80 +/* Stores the SPE executable name of fb_writer_spu */
    8.81 +extern spe_program_handle_t fb_writer_spu;
    8.82 +
    8.83 +/* SDL PS3 bootstrap function for checking availability */
    8.84 +static int PS3_Available()
    8.85 +{
    8.86 +	return 1;
    8.87 +}
    8.88 +
    8.89 +/* SDL PS3 bootstrap function for creating the device */
    8.90 +static SDL_VideoDevice *PS3_CreateDevice(int devindex)
    8.91 +{
    8.92 +	SDL_VideoDevice *this;
    8.93 +
    8.94 +	/* Initialise SDL_VideoDevice */
    8.95 +	this = (SDL_VideoDevice *) SDL_malloc(sizeof(SDL_VideoDevice));
    8.96 +	if (this) {
    8.97 +		memset(this, 0, sizeof *this);
    8.98 +		this->hidden = (struct SDL_PrivateVideoData *)
    8.99 +		    SDL_malloc(sizeof(struct SDL_PrivateVideoData));
   8.100 +	}
   8.101 +	/* Error handling */
   8.102 +	if ((this == NULL) || (this->hidden == NULL)) {
   8.103 +		SDL_OutOfMemory();
   8.104 +		if (this)
   8.105 +			SDL_free(this);
   8.106 +		return 0;
   8.107 +	}
   8.108 +	memset(this->hidden, 0, sizeof(struct SDL_PrivateVideoData));
   8.109 +
   8.110 +	/* Set the function pointers */
   8.111 +	this->VideoInit = PS3_VideoInit;
   8.112 +	this->ListModes = PS3_ListModes;
   8.113 +	this->SetVideoMode = PS3_SetVideoMode;
   8.114 +	this->SetColors = 0;
   8.115 +	this->CreateYUVOverlay = PS3_CreateYUVOverlay;
   8.116 +	this->UpdateRects = 0;
   8.117 +	this->VideoQuit = PS3_VideoQuit;
   8.118 +	this->AllocHWSurface = PS3_AllocHWSurface;
   8.119 +	this->CheckHWBlit = 0;
   8.120 +	this->FillHWRect = 0;
   8.121 +	this->SetHWColorKey = 0;
   8.122 +	this->SetHWAlpha = 0;
   8.123 +	this->LockHWSurface = PS3_LockHWSurface;
   8.124 +	this->UnlockHWSurface = PS3_UnlockHWSurface;
   8.125 +	this->FlipHWSurface = PS3_FlipDoubleBuffer;
   8.126 +	this->FreeHWSurface = PS3_FreeHWSurface;
   8.127 +	this->SetCaption = 0;
   8.128 +	this->SetIcon = 0;
   8.129 +	this->IconifyWindow = 0;
   8.130 +	this->GrabInput = 0;
   8.131 +	this->GetWMInfo = 0;
   8.132 +	this->InitOSKeymap = PS3_InitOSKeymap;
   8.133 +	this->PumpEvents = PS3_PumpEvents;
   8.134 +
   8.135 +	this->free = PS3_DeleteDevice;
   8.136 +
   8.137 +	return this;
   8.138 +}
   8.139 +
   8.140 +
   8.141 +/* Bootstrapping (see SDL_sysvideo.h) */
   8.142 +VideoBootStrap PS3_bootstrap = {
   8.143 +	"ps3", "PS3 Cell SPU Driver",
   8.144 +	PS3_Available, PS3_CreateDevice
   8.145 +};
   8.146 +
   8.147 +
   8.148 +/* Delete the device: release the private data and the device itself, both obtained via SDL_malloc() in PS3_CreateDevice() */
   8.149 +static void PS3_DeleteDevice(SDL_VideoDevice * device)
   8.150 +{
   8.151 +	SDL_free(device->hidden);
   8.152 +	SDL_free(device);
   8.153 +}
   8.154 +
   8.155 +
   8.156 +/* Initialise the PS3 video device */
   8.157 +static int PS3_VideoInit(_THIS, SDL_PixelFormat * vformat)
   8.158 +{
   8.159 +	/* Hide the cursor */
   8.160 +	enable_cursor(0);
   8.161 +
   8.162 +	/* Create SPU fb_parms and thread structure */
   8.163 +	fb_parms = (struct fb_writer_parms_t *)
   8.164 +	    memalign(16, sizeof(struct fb_writer_parms_t));
   8.165 +	fb_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t));
   8.166 +	if (fb_parms == NULL || fb_thread_data == NULL) {
   8.167 +		SDL_OutOfMemory();
   8.168 +		return -1;
   8.169 +	}
   8.170 +	fb_thread_data->program = fb_writer_spu;
   8.171 +	fb_thread_data->program_name = "fb_writer_spu";
   8.172 +	fb_thread_data->argp = (void *)fb_parms;
   8.173 +	fb_thread_data->keepalive = 1;
   8.174 +	fb_thread_data->booted = 0;
   8.175 +
   8.176 +	SPE_Start(this, fb_thread_data);
   8.177 +
   8.178 +	/* Open the device */
   8.179 +	fb_dev_fd = open(PS3_DEV_FB, O_RDWR);
   8.180 +	if (fb_dev_fd < 0) {
   8.181 +		SDL_SetError("[PS3] Unable to open device %s", PS3_DEV_FB);
   8.182 +		return -1;
   8.183 +	}
   8.184 +
   8.185 +	/* Get vscreeninfo */
   8.186 +	if (ioctl(fb_dev_fd, FBIOGET_VSCREENINFO, &fb_vinfo)) {
   8.187 +		SDL_SetError("[PS3] Can't get VSCREENINFO");
   8.188 +		if (fb_dev_fd >= 0)
   8.189 +			close(fb_dev_fd);
   8.190 +		fb_dev_fd = -1;
   8.191 +		return -1;
   8.192 +	}
   8.193 +
   8.194 +	/* Fill in our hardware acceleration capabilities */
   8.195 +	this->info.current_w = fb_vinfo.xres;
   8.196 +	this->info.current_h = fb_vinfo.yres;
   8.197 +	this->info.wm_available = 0;
   8.198 +	this->info.hw_available = 1;
   8.199 +
   8.200 +	/* Backup the original vinfo to restore later */
   8.201 +	fb_orig_vinfo = fb_vinfo;
   8.202 +
   8.203 +	/* 16 and 15 bpp is reported as 16 bpp */
   8.204 +	fb_bits_per_pixel = fb_vinfo.bits_per_pixel;
   8.205 +	if (fb_bits_per_pixel == 16)
   8.206 +		fb_bits_per_pixel =
   8.207 +		    fb_vinfo.red.length + fb_vinfo.green.length +
   8.208 +		    fb_vinfo.blue.length;
   8.209 +
   8.210 +	/* Set SDL_PixelFormat */
   8.211 +	vformat->BitsPerPixel = fb_vinfo.bits_per_pixel;
   8.212 +
   8.213 +	fb_vinfo.xres_virtual = fb_vinfo.xres;
   8.214 +	fb_vinfo.yres_virtual = fb_vinfo.yres;
   8.215 +
   8.216 +	/* Put vscreeninfo */
   8.217 +	if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
   8.218 +		SDL_SetError("[PS3] Can't put VSCREENINFO");
   8.219 +		if (fb_dev_fd >= 0)
   8.220 +			close(fb_dev_fd);
   8.221 +		fb_dev_fd = -1;
   8.222 +		return -1;
   8.223 +	}
   8.224 +
   8.225 +	s_fb_pixel_size = fb_vinfo.bits_per_pixel / 8;
   8.226 +
   8.227 +	s_writeable_width = fb_vinfo.xres;
   8.228 +	s_writeable_height = fb_vinfo.yres;
   8.229 +
   8.230 +	/* Get ps3 screeninfo */
   8.231 +	if (ioctl(fb_dev_fd, PS3FB_IOCTL_SCREENINFO, (unsigned long)&res) < 0) {
   8.232 +		SDL_SetError("[PS3] PS3FB_IOCTL_SCREENINFO failed");
   8.233 +	}
   8.234 +	deprintf(1, "[PS3] xres:%d yres:%d xoff:%d yoff:%d\n", res.xres, res.yres, res.xoff, res.yoff);
   8.235 +
   8.236 +	/* Only use double buffering if enough fb memory is available */
   8.237 +	if (res.num_frames < 2) {
   8.238 +		double_buffering = 0;
   8.239 +	} else {
   8.240 +		double_buffering = 1;
   8.241 +	}
   8.242 +
   8.243 +	real_width = res.xres;
   8.244 +	real_height = res.yres;
   8.245 +
   8.246 +	/*
   8.247 +	 * Take control of frame buffer from kernel, for details see
   8.248 +	 * http://felter.org/wesley/files/ps3/linux-20061110-docs/ApplicationProgrammingEnvironment.html
   8.249 +	 * kernel will no longer flip the screen itself
   8.250 +	 */
   8.251 +	ioctl(fb_dev_fd, PS3FB_IOCTL_ON, 0);
   8.252 +
   8.253 +	/* Unblank screen */
   8.254 +	ioctl(fb_dev_fd, FBIOBLANK, 0);
   8.255 +
   8.256 +	return 0;
   8.257 +}
   8.258 +
   8.259 +
   8.260 +/* List available PS3 resolutions */
   8.261 +static SDL_Rect **PS3_ListModes(_THIS, SDL_PixelFormat * format, Uint32 flags)
   8.262 +{
   8.263 +	/* A list of video resolutions that we query for (sorted largest to
   8.264 +	 * smallest)
   8.265 +	 */
   8.266 +	static SDL_Rect PS3_resolutions[] = {
   8.267 +		{0, 0, 1920, 1080}, // 1080p 16:9 HD
   8.268 +		{0, 0, 1600, 1200}, // UXGA
   8.269 +		{0, 0, 1280, 1024}, // SXGA
   8.270 +		{0, 0, 1280, 720},  // 720p 16:9 HD
   8.271 +		{0, 0, 1024, 768},  // XGA
   8.272 +		{0, 0, 1024, 576},  // 576p 16:9
   8.273 +		{0, 0, 853, 480},   // 480p 16:9
   8.274 +		{0, 0, 720, 576},   // 576p 4:3 (PAL)
   8.275 +		{0, 0, 720, 480},   // 480p 16:9 (NTSC)
   8.276 +	};
   8.277 +	static SDL_Rect *PS3_modes[] = {
   8.278 +		&PS3_resolutions[0],
   8.279 +		&PS3_resolutions[1],
   8.280 +		&PS3_resolutions[2],
   8.281 +		&PS3_resolutions[3],
   8.282 +		&PS3_resolutions[4],
   8.283 +		&PS3_resolutions[5],
   8.284 +		&PS3_resolutions[6],
   8.285 +		&PS3_resolutions[7],
   8.286 +		&PS3_resolutions[8],
   8.287 +		NULL
   8.288 +	};
   8.289 +	SDL_Rect **modes = PS3_modes;
   8.290 +
   8.291 +	return modes;
   8.292 +}
   8.293 +
   8.294 +
   8.295 +/* Set the video mode: map and blank the framebuffer, centre the output and allocate a 16-byte aligned pixel buffer; returns NULL on failure */
   8.296 +static SDL_Surface *PS3_SetVideoMode(_THIS, SDL_Surface * current, int width, int height, int bpp, Uint32 flags)
   8.297 +{
   8.298 +	s_bounded_input_width = width < s_writeable_width ? width : s_writeable_width;
   8.299 +	s_bounded_input_height = height < s_writeable_height ? height : s_writeable_height;
   8.300 +	s_bounded_input_width_offset = (s_writeable_width - s_bounded_input_width) >> 1;
   8.301 +	s_bounded_input_height_offset = (s_writeable_height - s_bounded_input_height) >> 1;
   8.302 +	s_input_line_length = width * s_fb_pixel_size;
   8.303 +
   8.304 +	current->flags |= flags;
   8.305 +
   8.306 +	if (ioctl(fb_dev_fd, FBIOGET_FSCREENINFO, &fb_finfo)) {
   8.307 +		SDL_SetError("[PS3] Can't get fixed screeninfo");
   8.308 +		return NULL;
   8.309 +	}
   8.310 +
   8.311 +	if (fb_finfo.type != FB_TYPE_PACKED_PIXELS) {
   8.312 +		SDL_SetError("[PS3] type %d not supported",
   8.313 +			     (int)fb_finfo.type);	/* type is an integer code, not a string */
   8.314 +		return NULL;
   8.315 +	}
   8.316 +
   8.317 +	/* Note: on PS3, fb_finfo.smem_len is enough for double buffering */
   8.318 +	if ((frame_buffer =
   8.319 +	     (uint8_t *) mmap(0, fb_finfo.smem_len,
   8.320 +			      PROT_READ | PROT_WRITE, MAP_SHARED,
   8.321 +			      fb_dev_fd, 0)) == (uint8_t *) - 1) {
   8.322 +		SDL_SetError("[PS3] Can't mmap for %s", PS3_DEV_FB);
   8.323 +		return NULL;
   8.324 +	} else {
   8.325 +		current->flags |= SDL_DOUBLEBUF;
   8.326 +	}
   8.327 +	if (!SDL_ReallocFormat(current, fb_bits_per_pixel, 0, 0, 0, 0)) {
   8.328 +		return (NULL);
   8.329 +	}
   8.330 +
   8.331 +	/* Blank screen */
   8.332 +	memset(frame_buffer, 0x00, fb_finfo.smem_len);
   8.333 +
   8.334 +	/* Centering */
   8.335 +	s_center[0] =
   8.336 +	    frame_buffer + s_bounded_input_width_offset * s_fb_pixel_size +
   8.337 +	    s_bounded_input_height_offset * fb_finfo.line_length;
   8.338 +	s_center[1] = s_center[0] + real_height * fb_finfo.line_length;
   8.339 +	s_center_index = 0;
   8.340 +
   8.341 +	current->flags |= SDL_FULLSCREEN;
   8.342 +	current->w = width;
   8.343 +	current->h = height;
   8.344 +	current->pitch = SDL_CalculatePitch(current);
   8.345 +
   8.346 +	/* Alloc aligned mem for current->pixels */
   8.347 +	s_pixels = memalign(16, current->h * current->pitch);
   8.348 +	current->pixels = (void *)s_pixels;
   8.349 +	if (!current->pixels) {
   8.350 +		SDL_OutOfMemory();
   8.351 +		return NULL;
   8.352 +	}
   8.353 +
   8.354 +	/* Set the update rectangle function */
   8.355 +	this->UpdateRects = PS3_DoubleBufferUpdate;
   8.356 +
   8.357 +	return current;
   8.358 +}
   8.359 +
   8.360 +
   8.361 +/* Copy screen to framebuffer and flip */
   8.362 +void PS3_DoubleBufferUpdate(_THIS, int numrects, SDL_Rect * rects)
   8.363 +{
   8.364 +	if (converter_thread_data && converter_thread_data->booted)
   8.365 +		SPE_WaitForMsg(this, converter_thread_data, SPU_FIN);
   8.366 +
   8.367 +	/* Adjust centering */
   8.368 +	s_bounded_input_width_offset = (s_writeable_width - s_bounded_input_width) >> 1;
   8.369 +	s_bounded_input_height_offset = (s_writeable_height - s_bounded_input_height) >> 1;
   8.370 +	s_center[0] = frame_buffer + s_bounded_input_width_offset * s_fb_pixel_size +
   8.371 +		s_bounded_input_height_offset * fb_finfo.line_length;
   8.372 +	s_center[1] = s_center[0] + real_height * fb_finfo.line_length;
   8.373 +
   8.374 +	/* Set SPU parms for copying the surface to framebuffer */
   8.375 +	fb_parms->data = (unsigned char *)s_pixels;
   8.376 +	fb_parms->center = s_center[s_center_index];
   8.377 +	fb_parms->out_line_stride = fb_finfo.line_length;
   8.378 +	fb_parms->in_line_stride = s_input_line_length;
   8.379 +	fb_parms->bounded_input_height = s_bounded_input_height;
   8.380 +	fb_parms->bounded_input_width = s_bounded_input_width;
   8.381 +	fb_parms->fb_pixel_size = s_fb_pixel_size;
   8.382 +
   8.383 +	deprintf(3, "[PS3->SPU] fb_thread_data->argp = 0x%x\n", fb_thread_data->argp);
   8.384 +
   8.385 +	/* Copying.. */
   8.386 +	SPE_SendMsg(this, fb_thread_data, SPU_START);
   8.387 +	SPE_SendMsg(this, fb_thread_data, (unsigned int)fb_thread_data->argp);
   8.388 +
   8.389 +	SPE_WaitForMsg(this, fb_thread_data, SPU_FIN);
   8.390 +
   8.391 +	/* Flip the pages */
   8.392 +	if (double_buffering)
   8.393 +		s_center_index = s_center_index ^ 0x01;
   8.394 +	PS3_FlipDoubleBuffer(this, this->screen);
   8.395 +}
   8.396 +
   8.397 +
   8.398 +/* Enable/Disable cursor */
   8.399 +void enable_cursor(int enable)
   8.400 +{
   8.401 +	int fd = open("/dev/console", O_RDWR | O_NONBLOCK);
   8.402 +	if (fd >= 0) {
   8.403 +		ioctl(fd, KDSETMODE, enable ? KD_TEXT : KD_GRAPHICS);
   8.404 +		close(fd);
   8.405 +	}
   8.406 +}
   8.407 +
   8.408 +
   8.409 +static int PS3_AllocHWSurface(_THIS, SDL_Surface * surface)
   8.410 +{
   8.411 +	return -1;
   8.412 +}
   8.413 +
   8.414 +
   8.415 +static void PS3_FreeHWSurface(_THIS, SDL_Surface * surface)
   8.416 +{
   8.417 +	return;
   8.418 +}
   8.419 +
   8.420 +
   8.421 +static int PS3_LockHWSurface(_THIS, SDL_Surface * surface)
   8.422 +{
   8.423 +	return 0;
   8.424 +}
   8.425 +
   8.426 +
   8.427 +static void PS3_UnlockHWSurface(_THIS, SDL_Surface * surface)
   8.428 +{
   8.429 +	return;
   8.430 +}
   8.431 +
   8.432 +
   8.433 +/* Blit/Flip buffer to the screen. Must be called after each frame! */
   8.434 +int PS3_FlipDoubleBuffer(_THIS, SDL_Surface * surface)
   8.435 +{
   8.436 +	unsigned long crt = 0;
   8.437 +	/* Wait for vsync */
   8.438 +	deprintf(1, "[PS3] Wait for vsync\n");
   8.439 +	ioctl(fb_dev_fd, FBIO_WAITFORVSYNC, &crt);
   8.440 +	/* Page flip */
   8.441 +	deprintf(1, "[PS3] Page flip to buffer #%u 0x%x\n", s_center_index, s_center[s_center_index]);
   8.442 +	ioctl(fb_dev_fd, PS3FB_IOCTL_FSEL, (unsigned long)&s_center_index);
   8.443 +	return 1;
   8.444 +}
   8.445 +
   8.446 +
   8.447 +/* Start the SPE thread: boot the context if needed and run it in a new pthread. Returns 0 on success, -1 (SDL error set) on failure. */
   8.448 +int SPE_Start(_THIS, spu_data_t * spe_data)
   8.449 +{
   8.450 +	deprintf(2, "[PS3->SPU] Start SPE: %s\n", spe_data->program_name);
   8.451 +	if (!(spe_data->booted) && SPE_Boot(this, spe_data) < 0)
   8.452 +		return -1;	/* no usable context; SPE_Boot() already set the error */
   8.453 +
   8.454 +	/* To allow re-running of context, spe_ctx_entry has to be set before each call */
   8.455 +	spe_data->entry = SPE_DEFAULT_ENTRY;
   8.456 +	spe_data->error_code = 0;
   8.457 +
   8.458 +	/* Create SPE thread and run */
   8.459 +	deprintf(2, "[PS3->SPU] Create Thread: %s\n", spe_data->program_name);
   8.460 +	if (pthread_create
   8.461 +	    (&spe_data->thread, NULL, (void *)&SPE_RunContext, (void *)spe_data)) {
   8.462 +		deprintf(2, "[PS3->SPU] Could not create pthread for spe: %s\n", spe_data->program_name);
   8.463 +		SDL_SetError("[PS3->SPU] Could not create pthread for spe");
   8.464 +		return -1;
   8.465 +	}
   8.466 +	if (spe_data->keepalive)	/* keepalive programs are waited on for SPU_READY */
   8.467 +		SPE_WaitForMsg(this, spe_data, SPU_READY);
   8.468 +	return 0;
   8.469 +}
   8.470 +
   8.471 +
   8.472 +/* Stop the SPE thread */
   8.473 +int SPE_Stop(_THIS, spu_data_t * spe_data)
   8.474 +{
   8.475 +	deprintf(2, "[PS3->SPU] Stop SPE: %s\n", spe_data->program_name);
   8.476 +	/* Wait for SPE thread to complete */
   8.477 +	deprintf(2, "[PS3->SPU] Wait for SPE thread to complete: %s\n", spe_data->program_name);
   8.478 +	if (pthread_join(spe_data->thread, NULL)) {
   8.479 +		deprintf(2, "[PS3->SPU] Failed joining the thread: %s\n", spe_data->program_name);
   8.480 +		SDL_SetError("[PS3->SPU] Failed joining the thread");
   8.481 +		return -1;
   8.482 +	}
   8.483 +
   8.484 +	return 0;
   8.485 +}
   8.486 +
   8.487 +
   8.488 +/* Create SPE context and load program */
   8.489 +int SPE_Boot(_THIS, spu_data_t * spe_data)
   8.490 +{
   8.491 +	/* Create SPE context */
   8.492 +	deprintf(2, "[PS3->SPU] Create SPE Context: %s\n", spe_data->program_name);
   8.493 +	spe_data->ctx = spe_context_create(0, NULL);
   8.494 +	if (spe_data->ctx == NULL) {
   8.495 +		deprintf(2, "[PS3->SPU] Failed creating SPE context: %s\n", spe_data->program_name);
   8.496 +		SDL_SetError("[PS3->SPU] Failed creating SPE context");
   8.497 +		return -1;
   8.498 +	}
   8.499 +
   8.500 +	/* Load SPE object into SPE local store */
   8.501 +	deprintf(2, "[PS3->SPU] Load Program into SPE: %s\n", spe_data->program_name);
   8.502 +	if (spe_program_load(spe_data->ctx, &spe_data->program)) {
   8.503 +		deprintf(2, "[PS3->SPU] Failed loading program into SPE context: %s\n", spe_data->program_name);
   8.504 +		SDL_SetError
   8.505 +		    ("[PS3->SPU] Failed loading program into SPE context");
   8.506 +		return -1;
   8.507 +	}
   8.508 +	spe_data->booted = 1;
   8.509 +	deprintf(2, "[PS3->SPU] SPE boot successful\n");
   8.510 +
   8.511 +	return 0;
   8.512 +}
   8.513 +
   8.514 +/* (Stop and) shutdown the SPE */
   8.515 +int SPE_Shutdown(_THIS, spu_data_t * spe_data)
   8.516 +{
   8.517 +	if (spe_data->keepalive && spe_data->booted) {
   8.518 +		SPE_SendMsg(this, spe_data, SPU_EXIT);
   8.519 +		SPE_Stop(this, spe_data);
   8.520 +	}
   8.521 +
   8.522 +	/* Destroy SPE context */
   8.523 +	deprintf(2, "[PS3->SPU] Destroy SPE context: %s\n", spe_data->program_name);
   8.524 +	if (spe_context_destroy(spe_data->ctx)) {
   8.525 +		deprintf(2, "[PS3->SPU] Failed destroying context: %s\n", spe_data->program_name);
   8.526 +		SDL_SetError("[PS3->SPU] Failed destroying context");
   8.527 +		return -1;
   8.528 +	}
   8.529 +	deprintf(2, "[PS3->SPU] SPE shutdown successful: %s\n", spe_data->program_name);
   8.530 +	return 0;
   8.531 +}
   8.532 +
   8.533 +
   8.534 +/* Send message to the SPE via mailbox */
   8.535 +int SPE_SendMsg(_THIS, spu_data_t * spe_data, unsigned int msg)
   8.536 +{
   8.537 +	deprintf(2, "[PS3->SPU] Sending message %u to %s\n", msg, spe_data->program_name);
   8.538 +	/* Send one message, block until message was sent */
   8.539 +	unsigned int spe_in_mbox_msgs[1];
   8.540 +	spe_in_mbox_msgs[0] = msg;
   8.541 +	int in_mbox_write = spe_in_mbox_write(spe_data->ctx, spe_in_mbox_msgs, 1, SPE_MBOX_ALL_BLOCKING);
   8.542 +
   8.543 +	if (1 > in_mbox_write) {
   8.544 +		deprintf(2, "[PS3->SPU] No message could be written to %s\n", spe_data->program_name);
   8.545 +		SDL_SetError("[PS3->SPU] No message could be written");
   8.546 +		return -1;
   8.547 +	}
   8.548 +	return 0;
   8.549 +}
   8.550 +
   8.551 +
   8.552 +/* Read 1 message from SPE, block until at least 1 message was received */
   8.553 +int SPE_WaitForMsg(_THIS, spu_data_t * spe_data, unsigned int msg)
   8.554 +{
   8.555 +	deprintf(2, "[PS3->SPU] Waiting for message from %s\n", spe_data->program_name);
   8.556 +	unsigned int out_messages[1];
   8.557 +	while (!spe_out_mbox_status(spe_data->ctx));
   8.558 +	int mbox_read = spe_out_mbox_read(spe_data->ctx, out_messages, 1);
   8.559 +	deprintf(2, "[PS3->SPU] Got message from %s, message was %u\n", spe_data->program_name, out_messages[0]);
   8.560 +	if (out_messages[0] == msg)
   8.561 +		return 0;
   8.562 +	else
   8.563 +		return -1;
   8.564 +}
   8.565 +
   8.566 +
   8.567 +/* Runs the SPE program described by thread_argp (a spu_data_t *), then ends the calling thread */
   8.568 +void SPE_RunContext(void *thread_argp)
   8.569 +{
   8.570 +	/* spe->argp is the argument pointer handed on to the SPE program */
   8.571 +	spu_data_t *spe = (spu_data_t *) thread_argp;
   8.572 +	deprintf(3, "[PS3->SPU] void* argp=0x%x\n", (unsigned int)spe->argp);
   8.573 +
   8.574 +	/* Execute the context; a negative result terminates the whole process */
   8.575 +	deprintf(2, "[PS3->SPU] Run SPE program: %s\n", spe->program_name);
   8.576 +	int run_status = spe_context_run(spe->ctx, &spe->entry, 0,
   8.577 +					 (void *)spe->argp, NULL, NULL);
   8.578 +	if (run_status < 0) {
   8.579 +		deprintf(2, "[PS3->SPU] Failed running SPE context: %s\n", spe->program_name);
   8.580 +		SDL_SetError("[PS3->SPU] Failed running SPE context: %s", spe->program_name);
   8.581 +		exit(1);
   8.582 +	}
   8.583 +
   8.584 +	pthread_exit(NULL);
   8.585 +}
   8.586 +
   8.587 +
   8.588 +/* Quits the video driver: restores the fb mode, unmaps the fb and releases the SPE state */
   8.589 +static void PS3_VideoQuit(_THIS)
   8.590 +{
   8.591 +	if (fb_dev_fd > 0) {
   8.592 +		/* Restore the original video mode */
   8.593 +		if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_orig_vinfo))
   8.594 +			SDL_SetError("[PS3] Can't restore original fb_var_screeninfo");
   8.595 +
   8.596 +		/* Give control of frame buffer to kernel */
   8.597 +		ioctl(fb_dev_fd, PS3FB_IOCTL_OFF, 0);
   8.598 +		close(fb_dev_fd);
   8.599 +		fb_dev_fd = -1;
   8.600 +	}
   8.601 +
   8.602 +	if (frame_buffer) {
   8.603 +		munmap(frame_buffer, fb_finfo.smem_len);
   8.604 +		frame_buffer = 0;
   8.605 +	}
   8.606 +
   8.607 +	if (fb_thread_data) {	/* shut the SPE down first; fb_parms may still be in use by it */
   8.608 +		SPE_Shutdown(this, fb_thread_data);
   8.609 +		free((void *)fb_thread_data);
   8.610 +		fb_thread_data = NULL;
   8.611 +	}
   8.612 +	free((void *)fb_parms);	/* free(NULL) is a no-op */
   8.613 +	fb_parms = NULL;	/* a repeated quit must not free it again */
   8.614 +	if (this->screen) {
   8.615 +		if (double_buffering && this->screen->pixels) {
   8.616 +			free(this->screen->pixels);
   8.617 +		}
   8.618 +		this->screen->pixels = NULL;
   8.619 +	}
   8.620 +
   8.621 +	enable_cursor(1);
   8.622 +	deprintf(1, "[PS3] VideoQuit\n");
   8.623 +}
   8.624 +
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/src/video/ps3/SDL_ps3video.h	Thu Apr 02 04:06:55 2009 +0000
     9.3 @@ -0,0 +1,165 @@
     9.4 +/*
     9.5 + * SDL - Simple DirectMedia Layer
     9.6 + * CELL BE Support for PS3 Framebuffer
     9.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
     9.8 + *
     9.9 + * This library is free software; you can redistribute it and/or modify it
    9.10 + * under the terms of the GNU Lesser General Public License as published
    9.11 + * by the Free Software Foundation; either version 2.1 of the License, or
    9.12 + * (at your option) any later version.
    9.13 + *
    9.14 + * This library is distributed in the hope that it will be useful, but
    9.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
    9.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    9.17 + * Lesser General Public License for more details.
    9.18 + *
    9.19 + * You should have received a copy of the GNU Lesser General Public
    9.20 + * License along with this library; if not, write to the Free Software
    9.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
    9.22 + * USA
    9.23 + *
    9.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
    9.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
    9.26 + *  SPE code based on research by:
    9.27 + *  Rene Becker
    9.28 + *  Thimo Emmerich
    9.29 + */
    9.30 +
    9.31 +#include "SDL_config.h"
    9.32 +#include "../SDL_sysvideo.h"
    9.33 +#include "SDL_mouse.h"
    9.34 +#include "SDL_mutex.h"
    9.35 +#include "spulibs/spu_common.h"
    9.36 +
    9.37 +#include <libspe2.h>
    9.38 +#include <pthread.h>
    9.39 +#include <linux/types.h>
    9.40 +#include <linux/fb.h>
    9.41 +#include <asm/ps3fb.h>
    9.42 +#include <linux/vt.h>
    9.43 +#include <termios.h>
    9.44 +
    9.45 +#ifndef _SDL_ps3video_h
    9.46 +#define _SDL_ps3video_h
    9.47 +
    9.48 +/* Debugging
    9.49 + * 0: No debug messages
    9.50 + * 1: Video debug messages
    9.51 + * 2: SPE debug messages
    9.52 + * 3: Memory addresses
    9.53 + */
    9.54 +#define DEBUG_LEVEL 0
    9.55 +
    9.56 +#ifdef DEBUG_LEVEL /* always taken here: DEBUG_LEVEL is defined just above */
    9.57 +#define deprintf( level, fmt, args... ) \
    9.58 +    do \
    9.59 +{ \
    9.60 +    if ( (unsigned)(level) <= DEBUG_LEVEL ) \
    9.61 +    { \
    9.62 +        fprintf( stdout, fmt, ##args ); \
    9.63 +        fflush( stdout ); \
    9.64 +    } \
    9.65 +} while ( 0 )
    9.66 +#else /* without DEBUG_LEVEL, deprintf() expands to nothing */
    9.67 +#define deprintf( level, fmt, args... )
    9.68 +#endif /* DEBUG_LEVEL */
    9.69 +
    9.70 +/* Framebuffer device */
    9.71 +#define PS3_DEV_FB "/dev/fb0"
    9.72 +
    9.73 +/* Hidden "this" pointer for the video functions */
    9.74 +#define _THIS   SDL_VideoDevice * this
    9.75 +
    9.76 +/* SPU thread data: per-SPE-program state handed to the SPE_* helper functions */
    9.77 +typedef struct spu_data {
    9.78 +    spe_context_ptr_t ctx; /* context passed to spe_context_run() / spe_context_destroy() */
    9.79 +    pthread_t thread; /* presumably the thread running SPE_RunContext -- TODO confirm */
    9.80 +    spe_program_handle_t program; /* handle of the SPE program to execute */
    9.81 +    char * program_name; /* name used in debug and error messages */
    9.82 +    unsigned int booted; /* initialised to 0 when the data is set up */
    9.83 +    unsigned int keepalive; /* set per program at setup; NOTE(review): semantics live in SPE_Start/SPE_Stop */
    9.84 +    unsigned int entry; /* passed by address to spe_context_run() */
    9.85 +    int error_code;
    9.86 +    void * argp; /* argument pointer handed to the SPE program */
    9.87 +} spu_data_t;
    9.88 +
    9.89 +/* Private video driver data needed for Cell support (reached through this->hidden) */
    9.90 +struct SDL_PrivateVideoData
    9.91 +{
    9.92 +    const char * const fb_dev_name; /* FB-device name */
    9.93 +    int fb_dev_fd; /* Descriptor-handle for fb_dev_name */
    9.94 +    uint8_t * frame_buffer; /* mmap'd access to fbdev */
    9.95 +
    9.96 +    /* SPE threading stuff */
    9.97 +    spu_data_t * fb_thread_data;
    9.98 +    spu_data_t * scaler_thread_data;
    9.99 +    spu_data_t * converter_thread_data;
   9.100 +
   9.101 +    /* screeninfo (from linux/fb.h) */
   9.102 +    struct fb_fix_screeninfo fb_finfo;
   9.103 +    struct fb_var_screeninfo fb_vinfo;
   9.104 +    struct fb_var_screeninfo fb_orig_vinfo; /* mode restored by PS3_VideoQuit */
   9.105 +
   9.106 +    /* screeninfo (from asm/ps3fb.h) */
   9.107 +    struct ps3fb_ioctl_res res;
   9.108 +
   9.109 +    unsigned int double_buffering; /* when non-zero, PS3_VideoQuit frees this->screen->pixels */
   9.110 +    uint32_t real_width;      // real width of screen
   9.111 +    uint32_t real_height;     // real height of screen
   9.112 +
   9.113 +    uint32_t s_fb_pixel_size;   // 32:  4  24:  3  16:  2  15:  2
   9.114 +    uint32_t fb_bits_per_pixel;   // 32: 32  24: 24  16: 16  15: 15
   9.115 +
   9.116 +    uint32_t config_count;
   9.117 +
   9.118 +    uint32_t s_input_line_length;   // precalculated: input_width * fb_pixel_size
   9.119 +    uint32_t s_bounded_input_width; // width of input (bounded by writeable width)
   9.120 +    uint32_t s_bounded_input_height;// height of input (bounded by writeable height)
   9.121 +    uint32_t s_bounded_input_width_offset;  // offset from the left side (used for centering)
   9.122 +    uint32_t s_bounded_input_height_offset; // offset from the upper side (used for centering)
   9.123 +    uint32_t s_writeable_width; // width of screen which is writeable
   9.124 +    uint32_t s_writeable_height;    // height of screen which is writeable
   9.125 +
   9.126 +    uint8_t * s_center[2]; // where to begin writing our image (centered?)
   9.127 +    uint32_t s_center_index;
   9.128 +
   9.129 +    volatile void * s_pixels __attribute__((aligned(128))); /* NOTE(review): the attribute aligns this pointer member, not the buffer it points to */
   9.130 +
   9.131 +    /* Framebuffer data */
   9.132 +    volatile struct fb_writer_parms_t * fb_parms __attribute__((aligned(128))); /* freed in PS3_VideoQuit */
   9.133 +};
   9.134 +/* Shorthand accessors for the fields of this->hidden; each needs a variable named `this` in scope */
   9.135 +#define fb_dev_name     (this->hidden->fb_dev_name)
   9.136 +#define fb_dev_fd       (this->hidden->fb_dev_fd)
   9.137 +#define frame_buffer       (this->hidden->frame_buffer)
   9.138 +#define fb_thread_data      (this->hidden->fb_thread_data)
   9.139 +#define scaler_thread_data      (this->hidden->scaler_thread_data)
   9.140 +#define converter_thread_data      (this->hidden->converter_thread_data)
   9.141 +#define fb_parms           (this->hidden->fb_parms)
   9.142 +#define SDL_nummodes		(this->hidden->SDL_nummodes) /* NOTE(review): struct SDL_PrivateVideoData has no such member */
   9.143 +#define SDL_modelist		(this->hidden->SDL_modelist) /* NOTE(review): struct SDL_PrivateVideoData has no such member */
   9.144 +#define SDL_videomode		(this->hidden->SDL_videomode) /* NOTE(review): struct SDL_PrivateVideoData has no such member */
   9.145 +#define fb_finfo        (this->hidden->fb_finfo)
   9.146 +#define fb_vinfo        (this->hidden->fb_vinfo)
   9.147 +#define fb_orig_vinfo   (this->hidden->fb_orig_vinfo)
   9.148 +#define res             (this->hidden->res) /* NOTE(review): very generic name; clashes with any identifier called res */
   9.149 +#define double_buffering (this->hidden->double_buffering)
   9.150 +#define real_width      (this->hidden->real_width)
   9.151 +#define real_height     (this->hidden->real_height)
   9.152 +#define s_fb_pixel_size   (this->hidden->s_fb_pixel_size)
   9.153 +#define fb_bits_per_pixel (this->hidden->fb_bits_per_pixel)
   9.154 +#define config_count (this->hidden->config_count)
   9.155 +#define s_input_line_length (this->hidden->s_input_line_length)
   9.156 +#define s_bounded_input_width (this->hidden->s_bounded_input_width)
   9.157 +#define s_bounded_input_height (this->hidden->s_bounded_input_height)
   9.158 +#define s_bounded_input_width_offset (this->hidden->s_bounded_input_width_offset)
   9.159 +#define s_bounded_input_height_offset (this->hidden->s_bounded_input_height_offset)
   9.160 +#define s_writeable_width (this->hidden->s_writeable_width)
   9.161 +#define s_writeable_height (this->hidden->s_writeable_height)
   9.162 +#define s_center          (this->hidden->s_center)
   9.163 +#define s_center_index    (this->hidden->s_center_index)
   9.164 +#define s_pixels           (this->hidden->s_pixels)
   9.165 +
   9.166 +#endif /* _SDL_ps3video_h */
   9.167 +
   9.168 +
    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/src/video/ps3/SDL_ps3yuv.c	Thu Apr 02 04:06:55 2009 +0000
    10.3 @@ -0,0 +1,340 @@
    10.4 +/*
    10.5 + * SDL - Simple DirectMedia Layer
    10.6 + * CELL BE Support for PS3 Framebuffer
    10.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
    10.8 + *
    10.9 + * This library is free software; you can redistribute it and/or modify it
   10.10 + * under the terms of the GNU Lesser General Public License as published
   10.11 + * by the Free Software Foundation; either version 2.1 of the License, or
   10.12 + * (at your option) any later version.
   10.13 + *
   10.14 + * This library is distributed in the hope that it will be useful, but
   10.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
   10.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   10.17 + * Lesser General Public License for more details.
   10.18 + *
   10.19 + * You should have received a copy of the GNU Lesser General Public
   10.20 + * License along with this library; if not, write to the Free Software
   10.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
   10.22 + * USA
   10.23 + *
   10.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
   10.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
   10.26 + *  SPE code based on research by:
   10.27 + *  Rene Becker
   10.28 + *  Thimo Emmerich
   10.29 + */
   10.30 +
   10.31 +#include "SDL_config.h"
   10.32 +
   10.33 +#include "SDL_video.h"
   10.34 +#include "SDL_ps3video.h"
   10.35 +#include "SDL_ps3yuv_c.h"
   10.36 +#include "../SDL_yuvfuncs.h"
   10.37 +#include "spulibs/spu_common.h"
   10.38 +
   10.39 +/* Handles of the SPE programs (YUV->RGB converter and bilinear scaler) */
   10.40 +extern spe_program_handle_t yuv2rgb_spu;
   10.41 +extern spe_program_handle_t bilin_scaler_spu;
   10.42 +
   10.43 +int SPE_Start(_THIS, spu_data_t * spe_data);
   10.44 +int SPE_Stop(_THIS, spu_data_t * spe_data);
   10.45 +int SPE_Boot(_THIS, spu_data_t * spe_data);
   10.46 +int SPE_Shutdown(_THIS, spu_data_t * spe_data);
   10.47 +int SPE_SendMsg(_THIS, spu_data_t * spe_data, unsigned int msg);
   10.48 +int SPE_WaitForMsg(_THIS, spu_data_t * spe_data, unsigned int msg);
   10.49 +void SPE_RunContext(void *thread_argp);
   10.50 +
   10.51 +
   10.52 +/* Overlay callbacks; PS3_CreateYUVOverlay stores this table in overlay->hwfuncs */
   10.53 +static struct private_yuvhwfuncs ps3_yuvfuncs = {
   10.54 +  PS3_LockYUVOverlay,
   10.55 +  PS3_UnlockYUVOverlay,
   10.56 +  PS3_DisplayYUVOverlay,
   10.57 +  PS3_FreeYUVOverlay
   10.58 +};
   10.59 +
   10.60 +/* Per-overlay driver data, allocated in PS3_CreateYUVOverlay and released in PS3_FreeYUVOverlay */
   10.61 +struct private_yuvhwdata {
   10.62 +	SDL_Surface *display; /* surface the overlay is shown on (passed to SDL_UpdateRects) */
   10.63 +	SDL_Surface *stretch; /* set to NULL at creation */
   10.64 +    volatile void * pixels __attribute__((aligned(128))); /* YUV buffer: width*height luma plus half that for chroma */
   10.65 +
   10.66 +	/* These are just so we don't have to allocate them separately */
   10.67 +	Uint16 pitches[3];
   10.68 +	Uint8 * planes[3];
   10.69 +
   10.70 +	unsigned int scale; /* 1 when the src and dst sizes differ in PS3_DisplayYUVOverlay, else 0 */
   10.71 +
   10.72 +	/* Scaled YUV picture */
   10.73 +	Uint8 * scaler_out __attribute__((aligned(128))); /* allocated and freed per PS3_DisplayYUVOverlay call */
   10.74 +
   10.75 +	/* YUV2RGB converter data */
   10.76 +    volatile struct yuv2rgb_parms_t * converter_parms __attribute__((aligned(128)));
   10.77 +
   10.78 +	/* Scaler data */
   10.79 +    volatile struct scale_parms_t * scaler_parms __attribute__((aligned(128)));
   10.80 +
   10.81 +	Uint8 locked; /* set by PS3_LockYUVOverlay, cleared by PS3_UnlockYUVOverlay */
   10.82 +};
   10.83 +
   10.84 +/* Creates a YV12/IYUV overlay for display and starts the converter SPE; returns NULL on error */
   10.85 +SDL_Overlay *PS3_CreateYUVOverlay(_THIS, int width, int height, Uint32 format, SDL_Surface *display) {
   10.86 +	/* Only RGB packed pixel conversion supported */
   10.87 +	if ((display->format->BytesPerPixel != 2) &&
   10.88 +			(display->format->BytesPerPixel != 3) &&
   10.89 +			(display->format->BytesPerPixel != 4))
   10.90 +	{
   10.91 +		SDL_SetError ("Can't use YUV data on non 16/24/32 bit surfaces");
   10.92 +		return NULL;
   10.93 +	}
   10.94 +
   10.95 +	/* Double-check the requested format. Only YV12 and IYUV are supported */
   10.96 +	switch (format) {
   10.97 +	    case SDL_IYUV_OVERLAY:
   10.98 +		case SDL_YV12_OVERLAY:
   10.99 +			/* Supported YUV format */
  10.100 +			break;
  10.101 +		default:
  10.102 +			SDL_SetError("Unsupported YUV format");
  10.103 +			return NULL;
  10.104 +	}
  10.105 +
  10.106 +	SDL_Overlay* overlay;
  10.107 +	struct private_yuvhwdata* hwdata;
  10.108 +
  10.109 +	/* Create the overlay structure */
  10.110 +	overlay = (SDL_Overlay *) SDL_calloc(1, sizeof(SDL_Overlay));
  10.111 +	if (overlay == NULL) {
  10.112 +		SDL_OutOfMemory();
  10.113 +		return NULL;
  10.114 +	}
  10.115 +	SDL_memset(overlay, 0, (sizeof *overlay)); /* NOTE(review): redundant after SDL_calloc above */
  10.116 +
  10.117 +	/* Set the basic attributes */
  10.118 +	overlay->format = format;
  10.119 +	overlay->w = width;
  10.120 +	overlay->h = height;
  10.121 +	overlay->hwdata = NULL;
  10.122 +
  10.123 +	/* Set up the PS3 YUV surface function structure */
  10.124 +	overlay->hwfuncs = &ps3_yuvfuncs;
  10.125 +
  10.126 +	/* Create the pixel data and lookup tables */
  10.127 +	hwdata = (struct private_yuvhwdata *) SDL_calloc(1, sizeof(struct private_yuvhwdata));
  10.128 +	if (hwdata == NULL) {
  10.129 +		SDL_OutOfMemory();
  10.130 +		SDL_FreeYUVOverlay(overlay);
  10.131 +		return NULL;
  10.132 +	}
  10.133 +	overlay->hwdata = hwdata;
  10.134 +
  10.135 +	hwdata->stretch = NULL;
  10.136 +	hwdata->display = display;
  10.137 +
  10.138 +	/* Create SPU parms structure */
  10.139 +	hwdata->converter_parms = (struct yuv2rgb_parms_t *) memalign(16, sizeof(struct yuv2rgb_parms_t));
  10.140 +	hwdata->scaler_parms = (struct scale_parms_t *) memalign(16, sizeof(struct scale_parms_t));
  10.141 +	if (hwdata->converter_parms == NULL || hwdata->scaler_parms == NULL) {
  10.142 +		SDL_FreeYUVOverlay(overlay);
  10.143 +		SDL_OutOfMemory();
  10.144 +		return(NULL);
  10.145 +	}
  10.146 +
  10.147 +	/* Set up the SPEs (these two pointers live in this->hidden, not in the overlay) */
  10.148 +	scaler_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t));
  10.149 +	converter_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t));
  10.150 +	if (converter_thread_data == NULL || scaler_thread_data == NULL) {
  10.151 +		SDL_FreeYUVOverlay(overlay);
  10.152 +		SDL_OutOfMemory();
  10.153 +		return(NULL);
  10.154 +	}
  10.155 +
  10.156 +	scaler_thread_data->program = bilin_scaler_spu;
  10.157 +	scaler_thread_data->program_name = "bilin_scaler_spu";
  10.158 +	scaler_thread_data->keepalive = 0;
  10.159 +	scaler_thread_data->booted = 0;
  10.160 +
  10.161 +	converter_thread_data->program = yuv2rgb_spu;
  10.162 +	converter_thread_data->program_name = "yuv2rgb_spu";
  10.163 +	converter_thread_data->keepalive = 1;
  10.164 +	converter_thread_data->booted = 0;
  10.165 +
  10.166 +	SPE_Start(this, converter_thread_data); /* NOTE(review): the return value is not checked */
  10.167 +
  10.168 +	hwdata->pixels = (Uint8 *) memalign(16, width * height + ((width * height) >> 1));
  10.169 +	if (hwdata->pixels == NULL) {
  10.170 +		SDL_FreeYUVOverlay(overlay);
  10.171 +		SDL_OutOfMemory();
  10.172 +		return(NULL);
  10.173 +	}
  10.174 +
  10.175 +	/* Find the pitch and offset values for the overlay */
  10.176 +	overlay->pitches = hwdata->pitches;
  10.177 +	overlay->pixels = hwdata->planes;
  10.178 +	switch (format) {
  10.179 +	    case SDL_YV12_OVERLAY:
  10.180 +	    case SDL_IYUV_OVERLAY:
  10.181 +			overlay->pitches[0] = overlay->w;
  10.182 +			overlay->pitches[1] = overlay->pitches[0] / 2;
  10.183 +			overlay->pitches[2] = overlay->pitches[0] / 2;
  10.184 +			overlay->pixels[0] = (Uint8 *)hwdata->pixels;
  10.185 +			overlay->pixels[1] = overlay->pixels[0] +
  10.186 +				overlay->pitches[0] * overlay->h;
  10.187 +			overlay->pixels[2] = overlay->pixels[1] +
  10.188 +				overlay->pitches[1] * overlay->h / 2;
  10.189 +			overlay->planes = 3;
  10.190 +		break;
  10.191 +	    default:
  10.192 +		/* We should never get here (caught above) */
  10.193 +		break;
  10.194 +	}
  10.195 +
  10.196 +	/* We're all done.. */
  10.197 +	return overlay;
  10.198 +}
  10.199 +
  10.200 +/* Marks the overlay as locked; returns 0 on success, -1 if the overlay or its driver data is missing */
  10.201 +int PS3_LockYUVOverlay(_THIS, SDL_Overlay *overlay) {
  10.202 +	if ((overlay == NULL) || (overlay->hwdata == NULL)) {
  10.203 +		return -1;
  10.204 +	}
  10.205 +	overlay->hwdata->locked = 1;
  10.206 +
  10.207 +	return 0;
  10.208 +}
  10.209 +
  10.210 +/* Clears the locked flag; does nothing if the overlay or its driver data is missing */
  10.211 +void PS3_UnlockYUVOverlay(_THIS, SDL_Overlay *overlay) {
  10.212 +	if ((overlay == NULL) || (overlay->hwdata == NULL)) {
  10.213 +		return;
  10.214 +	}
  10.215 +	overlay->hwdata->locked = 0;
  10.216 +
  10.217 +	return;
  10.218 +}
  10.219 +
  10.220 +/* Scales the overlay if src and dst sizes differ, converts it to RGB on the SPE; returns 0 or -1 */
  10.221 +int PS3_DisplayYUVOverlay(_THIS, SDL_Overlay *overlay, SDL_Rect *src, SDL_Rect *dst) {
  10.222 +	if ((overlay == NULL) || (overlay->hwdata == NULL)) {
  10.223 +		return -1;
  10.224 +	}
  10.225 +
  10.226 +	Uint8 *lum, *Cr, *Cb;
  10.227 +	struct private_yuvhwdata *hwdata;
  10.228 +	SDL_Surface *display;
  10.229 +
  10.230 +	hwdata = overlay->hwdata;
  10.231 +	display = hwdata->display;
  10.232 +
  10.233 +	/* Do we have to scale? */
  10.234 +	if ((src->w != dst->w) || (src->h != dst->h) ) {
  10.235 +		hwdata->scale = 1;
  10.236 +		deprintf(1, "[PS3] We need to scale\n");
  10.237 +	} else {
  10.238 +		hwdata->scale = 0;
  10.239 +		deprintf(1, "[PS3] No scaling\n");
  10.240 +	}
  10.241 +
  10.242 +	/* Find out where the various portions of the image are */
  10.243 +	switch (overlay->format) {
  10.244 +		case SDL_YV12_OVERLAY:
  10.245 +			lum = (Uint8 *)overlay->pixels[0];
  10.246 +			Cr =  (Uint8 *)overlay->pixels[1];
  10.247 +			Cb =  (Uint8 *)overlay->pixels[2];
  10.248 +			break;
  10.249 +		case SDL_IYUV_OVERLAY:
  10.250 +			lum = (Uint8 *)overlay->pixels[0];
  10.251 +			Cr =  (Uint8 *)overlay->pixels[2];
  10.252 +			Cb =  (Uint8 *)overlay->pixels[1];
  10.253 +			break;
  10.254 +		default:
  10.255 +			SDL_SetError("Unsupported YUV format in blit");
  10.256 +			return -1;
  10.257 +	}
  10.258 +
  10.259 +	if (hwdata->scale) {
  10.260 +		/* Alloc mem for scaled YUV picture */
  10.261 +		hwdata->scaler_out = (Uint8 *) memalign(16, dst->w * dst->h + ((dst->w * dst->h) >> 1));
  10.262 +		if (hwdata->scaler_out == NULL) {
  10.263 +			/* The overlay belongs to the caller: report the error, but do not free it here */
  10.264 +			SDL_OutOfMemory();
  10.265 +			return -1;
  10.266 +		}
  10.267 +
  10.268 +		/* Set parms for scaling */
  10.269 +		hwdata->scaler_parms->src_pixel_width = src->w;
  10.270 +		hwdata->scaler_parms->src_pixel_height = src->h;
  10.271 +		hwdata->scaler_parms->dst_pixel_width = dst->w;
  10.272 +		hwdata->scaler_parms->dst_pixel_height = dst->h;
  10.273 +		hwdata->scaler_parms->y_plane = lum;
  10.274 +		hwdata->scaler_parms->v_plane = Cr;
  10.275 +		hwdata->scaler_parms->u_plane = Cb;
  10.276 +		hwdata->scaler_parms->dstBuffer = hwdata->scaler_out;
  10.277 +		scaler_thread_data->argp = (void *)hwdata->scaler_parms;
  10.278 +
  10.279 +		/* Scale the YUV overlay to given size */
  10.280 +		SPE_Start(this, scaler_thread_data);
  10.281 +		SPE_Stop(this, scaler_thread_data);
  10.282 +
  10.283 +		/* Set parms for converting after scaling */
  10.284 +		hwdata->converter_parms->y_plane = hwdata->scaler_out;
  10.285 +		hwdata->converter_parms->v_plane = hwdata->scaler_out + dst->w * dst->h;
  10.286 +		hwdata->converter_parms->u_plane = hwdata->scaler_out + dst->w * dst->h + ((dst->w * dst->h) >> 2);
  10.287 +	} else {
  10.288 +		/* Set parms for converting */
  10.289 +		hwdata->converter_parms->y_plane = lum;
  10.290 +		hwdata->converter_parms->v_plane = Cr;
  10.291 +		hwdata->converter_parms->u_plane = Cb;
  10.292 +	}
  10.293 +
  10.294 +	hwdata->converter_parms->src_pixel_width = dst->w;
  10.295 +	hwdata->converter_parms->src_pixel_height = dst->h;
  10.296 +	hwdata->converter_parms->dstBuffer = (Uint8 *) s_pixels;
  10.297 +	converter_thread_data->argp = (void *)hwdata->converter_parms;
  10.298 +
  10.299 +	/* Convert YUV overlay to RGB: start command first, then the address of the parms block */
  10.300 +	SPE_SendMsg(this, converter_thread_data, SPU_START);
  10.301 +	SPE_SendMsg(this, converter_thread_data, (unsigned int)converter_thread_data->argp);
  10.302 +
  10.303 +	/* Centering */
  10.304 +	s_bounded_input_width = dst->w;
  10.305 +	s_bounded_input_height = dst->h;
  10.306 +
  10.307 +	/* UpdateRects() will do the rest.. */
  10.308 +	SDL_UpdateRects(display, 1, dst);
  10.309 +
  10.310 +	if (hwdata->scale)
  10.311 +		SDL_free((void *)hwdata->scaler_out);
  10.312 +
  10.313 +	return 0;
  10.314 +}
  10.315 +
  10.316 +/* Releases the driver data of an overlay and the SPE state set up by PS3_CreateYUVOverlay */
  10.317 +void PS3_FreeYUVOverlay(_THIS, SDL_Overlay *overlay) {
  10.318 +	/* Nothing to release without an overlay or its driver data */
  10.319 +	if ((overlay == NULL) || (overlay->hwdata == NULL)) {
  10.320 +		return;
  10.321 +	}
  10.322 +
  10.323 +	struct private_yuvhwdata * hwdata = overlay->hwdata;
  10.324 +
  10.325 +	/* The SPE thread data lives in this->hidden; clear it so it cannot be freed twice */
  10.326 +	if (scaler_thread_data)
  10.327 +		SDL_free(scaler_thread_data);
  10.328 +	scaler_thread_data = NULL;
  10.329 +	if (converter_thread_data) {
  10.330 +		SPE_Shutdown(this, converter_thread_data);
  10.331 +		SDL_free(converter_thread_data);
  10.332 +		converter_thread_data = NULL;
  10.333 +	}
  10.334 +
  10.335 +	/* The parameter blocks were allocated in PS3_CreateYUVOverlay and were leaked before */
  10.336 +	SDL_free((void *)hwdata->converter_parms);
  10.337 +	SDL_free((void *)hwdata->scaler_parms);
  10.338 +	SDL_free((void *)hwdata->pixels);
  10.339 +	SDL_free(hwdata);
  10.340 +	overlay->hwdata = NULL;
  10.341 +	return;
  10.342 +}
  10.343 +
    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/src/video/ps3/SDL_ps3yuv_c.h	Thu Apr 02 04:06:55 2009 +0000
    11.3 @@ -0,0 +1,44 @@
    11.4 +/*
    11.5 + * SDL - Simple DirectMedia Layer
    11.6 + * CELL BE Support for PS3 Framebuffer
    11.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
    11.8 + *
    11.9 + * This library is free software; you can redistribute it and/or modify it
   11.10 + * under the terms of the GNU Lesser General Public License as published
   11.11 + * by the Free Software Foundation; either version 2.1 of the License, or
   11.12 + * (at your option) any later version.
   11.13 + *
   11.14 + * This library is distributed in the hope that it will be useful, but
   11.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
   11.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   11.17 + * Lesser General Public License for more details.
   11.18 + *
   11.19 + * You should have received a copy of the GNU Lesser General Public
   11.20 + * License along with this library; if not, write to the Free Software
   11.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
   11.22 + * USA
   11.23 + *
   11.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
   11.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
   11.26 + *  SPE code based on research by:
   11.27 + *  Rene Becker
   11.28 + *  Thimo Emmerich
   11.29 + */
   11.30 +
   11.31 +#include "SDL_config.h"
   11.32 +
   11.33 +#ifndef _SDL_ps3yuv_h
   11.34 +#define _SDL_ps3yuv_h
   11.35 +
   11.36 +/* This is the PS3 implementation of YUV video overlays */
   11.37 +
   11.38 +#include "SDL_video.h"
   11.39 +
   11.40 +extern SDL_Overlay *PS3_CreateYUVOverlay(_THIS, int width, int height, Uint32 format, SDL_Surface *display);
   11.41 +extern int PS3_DisplayYUVOverlay(_THIS, SDL_Overlay *overlay, SDL_Rect *src, SDL_Rect *dst);
   11.42 +extern int PS3_LockYUVOverlay(_THIS, SDL_Overlay *overlay);
   11.43 +extern void PS3_UnlockYUVOverlay(_THIS, SDL_Overlay *overlay);
   11.44 +extern void PS3_FreeYUVOverlay(_THIS, SDL_Overlay *overlay);
   11.45 +
   11.46 +#endif /* _SDL_ps3yuv_h */
   11.47 +
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/src/video/ps3/spulibs/Makefile	Thu Apr 02 04:06:55 2009 +0000
    12.3 @@ -0,0 +1,83 @@
    12.4 +# This Makefile is for building the CELL BE SPU libs
    12.5 +# libfb_writer_spu.so, libyuv2rgb_spu.so, libbilin_scaler_spu.so
    12.6 +
    12.7 +# Toolchain
    12.8 +SPU_GCC=/usr/bin/spu-gcc
    12.9 +PPU_GCC=/usr/bin/gcc
   12.10 +PPU_EMBEDSPU=/usr/bin/embedspu
   12.11 +PPU_AR=/usr/bin/ar
   12.12 +PPU_LD=/usr/bin/ld
   12.13 +INSTALL=/usr/bin/install
   12.14 +
   12.15 +SPU_CFLAGS=-W -Wall -Winline -Wno-main -I. -I /usr/spu/include -I /opt/cell/sdk/usr/spu/include -finline-limit=10000 -Winline -ftree-vectorize -funroll-loops -fmodulo-sched -ffast-math -fPIC -O2
   12.16 +
   12.17 +# Usually /usr/lib, depending on your distribution
   12.18 +PREFIX=/usr/lib
   12.19 +
   12.20 +
   12.21 +all: libfb_writer_spu.a libfb_writer_spu.so \
   12.22 +				libyuv2rgb_spu.so libyuv2rgb_spu.a \
   12.23 +				libbilin_scaler_spu.so libbilin_scaler_spu.a
   12.24 +
   12.25 +
   12.26 +# fb_writer
   12.27 +fb_writer_spu-embed.o: fb_writer.c spu_common.h
   12.28 +	$(SPU_GCC) $(SPU_CFLAGS) -o fb_writer_spu fb_writer.c -lm
   12.29 +	$(PPU_EMBEDSPU) -m32 fb_writer_spu fb_writer_spu fb_writer_spu-embed.o
   12.30 +
   12.31 +libfb_writer_spu.so: fb_writer_spu-embed.o
   12.32 +	$(PPU_LD) -o libfb_writer_spu.so -shared -soname=libfb_writer_spu.so fb_writer_spu-embed.o
   12.33 +
   12.34 +libfb_writer_spu.a: fb_writer_spu-embed.o
   12.35 +	$(PPU_AR) -qcs libfb_writer_spu.a fb_writer_spu-embed.o
   12.36 +
   12.37 +
   12.38 +# yuv2rgb_converter
   12.39 +yuv2rgb_spu-embed.o: yuv2rgb_converter.c spu_common.h
   12.40 +	$(SPU_GCC) $(SPU_CFLAGS) -o yuv2rgb_spu yuv2rgb_converter.c -lm
   12.41 +	$(PPU_EMBEDSPU) -m32 yuv2rgb_spu yuv2rgb_spu yuv2rgb_spu-embed.o
   12.42 +
   12.43 +libyuv2rgb_spu.a: yuv2rgb_spu-embed.o
   12.44 +	$(PPU_AR) -qcs libyuv2rgb_spu.a yuv2rgb_spu-embed.o
   12.45 +
   12.46 +libyuv2rgb_spu.so: yuv2rgb_spu-embed.o
   12.47 +	$(PPU_LD) -o libyuv2rgb_spu.so -shared -soname=libyuv2rgb_spu.so yuv2rgb_spu-embed.o
   12.48 +
   12.49 +
   12.50 +# bilin_scaler
   12.51 +bilin_scaler_spu-embed.o: bilin_scaler.c spu_common.h
   12.52 +	$(SPU_GCC) $(SPU_CFLAGS) -o bilin_scaler_spu bilin_scaler.c -lm
   12.53 +	$(PPU_EMBEDSPU) -m32 bilin_scaler_spu bilin_scaler_spu bilin_scaler_spu-embed.o
   12.54 +
   12.55 +libbilin_scaler_spu.a: bilin_scaler_spu-embed.o
   12.56 +	$(PPU_AR) -qcs libbilin_scaler_spu.a bilin_scaler_spu-embed.o
   12.57 +
   12.58 +libbilin_scaler_spu.so: bilin_scaler_spu-embed.o
   12.59 +	$(PPU_LD) -o libbilin_scaler_spu.so -shared -soname=libbilin_scaler_spu.so bilin_scaler_spu-embed.o
   12.60 +
   12.61 +install: libfb_writer_spu.a libfb_writer_spu.so \
   12.62 +				libyuv2rgb_spu.so libyuv2rgb_spu.a \
   12.63 +				libbilin_scaler_spu.so libbilin_scaler_spu.a
   12.64 +	$(INSTALL) -c -m 0755 libfb_writer_spu.so $(PREFIX)/.
   12.65 +	$(INSTALL) -c -m 0644 libfb_writer_spu.a $(PREFIX)/.
   12.66 +	$(INSTALL) -c -m 0755 libyuv2rgb_spu.so $(PREFIX)/.
   12.67 +	$(INSTALL) -c -m 0644 libyuv2rgb_spu.a $(PREFIX)/.
   12.68 +	$(INSTALL) -c -m 0755 libbilin_scaler_spu.so $(PREFIX)/.
   12.69 +	$(INSTALL) -c -m 0644 libbilin_scaler_spu.a $(PREFIX)/.
   12.70 +
   12.71 +
   12.72 +# No prerequisites: listing the installed files made "make uninstall" fail
   12.73 +# whenever one of them was already gone, which defeats the purpose of rm -f.
   12.74 +uninstall:
   12.75 +	rm -f $(PREFIX)/libfb_writer_spu.a
   12.76 +	rm -f $(PREFIX)/libfb_writer_spu.so
   12.77 +	rm -f $(PREFIX)/libyuv2rgb_spu.so
   12.78 +	rm -f $(PREFIX)/libyuv2rgb_spu.a
   12.79 +	rm -f $(PREFIX)/libbilin_scaler_spu.so
   12.80 +	rm -f $(PREFIX)/libbilin_scaler_spu.a
   12.81 +
   12.82 +
   12.83 +clean:
   12.84 +	rm -f bilin_scaler_spu-embed.o libbilin_scaler_spu.so libbilin_scaler_spu.a bilin_scaler_spu
   12.85 +	rm -f yuv2rgb_spu-embed.o libyuv2rgb_spu.so libyuv2rgb_spu.a yuv2rgb_spu
   12.86 +	rm -f fb_writer_spu-embed.o libfb_writer_spu.so libfb_writer_spu.a fb_writer_spu
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/src/video/ps3/spulibs/bilin_scaler.c	Thu Apr 02 04:06:55 2009 +0000
    13.3 @@ -0,0 +1,2050 @@
    13.4 +/*
    13.5 + * SDL - Simple DirectMedia Layer
    13.6 + * CELL BE Support for PS3 Framebuffer
    13.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
    13.8 + *
    13.9 + * This library is free software; you can redistribute it and/or modify it
   13.10 + * under the terms of the GNU Lesser General Public License as published
   13.11 + * by the Free Software Foundation; either version 2.1 of the License, or
   13.12 + * (at your option) any later version.
   13.13 + *
   13.14 + * This library is distributed in the hope that it will be useful, but
   13.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
   13.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13.17 + * Lesser General Public License for more details.
   13.18 + *
   13.19 + * You should have received a copy of the GNU Lesser General Public
   13.20 + * License along with this library; if not, write to the Free Software
   13.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
   13.22 + * USA
   13.23 + *
   13.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
   13.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
   13.26 + *  SPE code based on research by:
   13.27 + *  Rene Becker
   13.28 + *  Thimo Emmerich
   13.29 + */
   13.30 +
   13.31 +#include "spu_common.h"
   13.32 +
   13.33 +#include <spu_intrinsics.h>
   13.34 +#include <spu_mfcio.h>
   13.35 +
   13.36 +// Debugging
   13.37 +//#define DEBUG
   13.38 +
   13.39 +#ifdef DEBUG
   13.40 +#define deprintf(fmt, args... ) \
   13.41 +	fprintf( stdout, fmt, ##args ); \
   13.42 +	fflush( stdout );
   13.43 +#else
   13.44 +#define deprintf( fmt, args... )
   13.45 +#endif
   13.46 +
   13.47 +struct scale_parms_t parms __attribute__((aligned(128)));
   13.48 +
   13.49 +/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored
   13.50 + * there might be the need to retrieve misaligned data, adjust
   13.51 + * incoming v and u plane to be able to handle this (add 128)
   13.52 + */
   13.53 +unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128)));
   13.54 +unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
   13.55 +unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
   13.56 +
   13.57 +/* temp-buffer for scaling: 4 lines Y, therefore 2 lines V, 2 lines U */
   13.58 +unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
   13.59 +unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
   13.60 +unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
   13.61 +
   13.62 +/* some vectors needed by the float to int conversion */
   13.63 +static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
   13.64 +static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
   13.65 +
   13.66 +void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
   13.67 +void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
   13.68 +
   13.69 +void scale_srcw16_dstw16();
   13.70 +void scale_srcw16_dstw32();
   13.71 +void scale_srcw32_dstw16();
   13.72 +void scale_srcw32_dstw32();
   13.73 +
   13.74 +int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
   13.75 +{
   13.76 +	deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);
   13.77 +	/* pull the scaling parameters in from the address passed in argp and wait for them */
   13.78 +	spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
   13.79 +	DMA_WAIT_TAG(TAG_INIT);
   13.80 +
   13.81 +	deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
   13.82 +			parms.dst_pixel_width, parms.dst_pixel_height);
   13.83 +
   13.84 +	/* a width that is not a multiple of 32 selects the corresponding "w16" code path */
   13.85 +	const int src_needs_w16 = (parms.src_pixel_width & 0x1F) != 0;
   13.86 +	const int dst_needs_w16 = (parms.dst_pixel_width & 0x1F) != 0;
   13.87 +
   13.88 +	if(src_needs_w16 && dst_needs_w16) {
   13.89 +		deprintf("[SPU] Using scale_srcw16_dstw16\n");
   13.90 +		scale_srcw16_dstw16();
   13.91 +	} else if(src_needs_w16) {
   13.92 +		deprintf("[SPU] Using scale_srcw16_dstw32\n");
   13.93 +		scale_srcw16_dstw32();
   13.94 +	} else if(dst_needs_w16) {
   13.95 +		deprintf("[SPU] Using scale_srcw32_dstw16\n");
   13.96 +		scale_srcw32_dstw16();
   13.97 +	} else {
   13.98 +		deprintf("[SPU] Using scale_srcw32_dstw32\n");
   13.99 +		scale_srcw32_dstw32();
  13.100 +	}
  13.101 +	deprintf("[SPU] bilin_scaler_spu... done!\n");
  13.102 +
  13.103 +	return 0;
  13.104 +}
  13.105 +
  13.106 +
  13.107 +/*
  13.108 + * vfloat_to_vuint()
  13.109 + *
  13.110 + * converts a float vector to an unsigned int vector using saturated
  13.111 + * arithmetic
  13.112 + *
  13.113 + * @param vec_s float vector for conversion
  13.114 + * @returns converted unsigned int vector
  13.115 + */
  13.116 +inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
  13.117 +	/* lower bound: every lane that is smaller than 0.1 is replaced by 0.1 */
  13.118 +	vector float clamped = spu_sel(vec_s, vec_0_1, spu_cmpgt(vec_0_1, vec_s));
  13.119 +	/* upper bound: every lane that is larger than 255 is replaced by 255 */
  13.120 +	clamped = spu_sel(clamped, vec_255, spu_cmpgt(clamped, vec_255));
  13.121 +
  13.122 +	return spu_convtu(clamped, 0);
  13.123 +}
  13.124 +
  13.125 +
  13.126 +/*
  13.127 + * scale_srcw16_dstw16()
  13.128 + *
  13.129 + * called by main() when neither the source nor the destination width is a
  13.130 + * multiple of 32; scales the Y, V and U planes with double buffered DMA and
  13.131 + * stores the scaled planes to RAM at parms.dstBuffer (Y, then V, then U)
  13.132 + */
  13.133 +void scale_srcw16_dstw16(void) {
  13.134 +	// extract parameters
  13.135 +	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
  13.136 +
  13.137 +	unsigned int src_width = parms.src_pixel_width;
  13.138 +	unsigned int src_height = parms.src_pixel_height;
  13.139 +	unsigned int dst_width = parms.dst_pixel_width;
  13.140 +	unsigned int dst_height = parms.dst_pixel_height;
  13.141 +
  13.142 +	// YVU (the v and u planes have half the width of the y plane)
  13.143 +	unsigned int src_linestride_y = src_width;
  13.144 +	unsigned int src_dbl_linestride_y = src_width<<1;
  13.145 +	unsigned int src_linestride_vu = src_width>>1;
  13.146 +	unsigned int src_dbl_linestride_vu = src_width;
  13.147 +
  13.148 +	// scaled YVU
  13.149 +	unsigned int scaled_src_linestride_y = dst_width;
  13.150 +
  13.151 +	// ram addresses
  13.152 +	unsigned char* src_addr_y = parms.y_plane;
  13.153 +	unsigned char* src_addr_v = parms.v_plane;
  13.154 +	unsigned char* src_addr_u = parms.u_plane;
  13.155 +
  13.156 +	// for handling misalignment, addresses are precalculated
  13.157 +	unsigned char* precalc_src_addr_v = src_addr_v;
  13.158 +	unsigned char* precalc_src_addr_u = src_addr_u;
  13.159 +
  13.160 +	unsigned int dst_picture_size = dst_width*dst_height;
  13.161 +
  13.162 +	// Sizes for destination (bytes per put: two Y lines, one half-width V/U line)
  13.163 +	unsigned int dst_dbl_linestride_y = dst_width<<1;
  13.164 +	unsigned int dst_dbl_linestride_vu = dst_width>>1;
  13.165 +
  13.166 +	// Perform address calculation for Y, V and U in main memory with dst_addr as base
  13.167 +	unsigned char* dst_addr_main_memory_y = dst_addr;
  13.168 +	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
  13.169 +	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
  13.170 +
  13.171 +	// calculate scale factors
  13.172 +	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
  13.173 +	float y_scale = (float)src_height/(float)dst_height;
  13.174 +
  13.175 +	// double buffered processing
  13.176 +	// buffer switching
  13.177 +	unsigned int curr_src_idx = 0;
  13.178 +	unsigned int curr_dst_idx = 0;
  13.179 +	unsigned int next_src_idx, next_dst_idx;
  13.180 +
  13.181 +	// 2 lines y as output, upper and lowerline
  13.182 +	unsigned int curr_interpl_y_upper = 0;
  13.183 +	unsigned int next_interpl_y_upper;
  13.184 +	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
  13.185 +	// only 1 line v/u output, both planes have the same dimension
  13.186 +	unsigned int curr_interpl_vu = 0;
  13.187 +	unsigned int next_interpl_vu;
  13.188 +
  13.189 +	// weights, calculated in every loop iteration
  13.190 +	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
  13.191 +	vector float vf_next_NSweight_y_upper;
  13.192 +	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
  13.193 +	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
  13.194 +	vector float vf_next_NSweight_vu;
  13.195 +
  13.196 +	// line indices for the src picture
  13.197 +	float curr_src_y_upper = 0.0f, next_src_y_upper;
  13.198 +	float curr_src_y_lower, next_src_y_lower;
  13.199 +	float curr_src_vu = 0.0f, next_src_vu;
  13.200 +
  13.201 +	// line indices for the dst picture
  13.202 +	unsigned int dst_y=0, dst_vu=0;
  13.203 +
  13.204 +	// offset for the v and u plane to handle misalignment
  13.205 +	unsigned int curr_lsoff_v = 0, next_lsoff_v;
  13.206 +	unsigned int curr_lsoff_u = 0, next_lsoff_u;
  13.207 +
  13.208 +	// calculate lower line indices
  13.209 +	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
  13.210 +	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
  13.211 +	// lower line weight (fractional part of the source line position)
  13.212 +	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
  13.213 +
  13.214 +
  13.215 +	// start partially double buffered processing
  13.216 +	// get initial data, 2 sets of y, 1 set v, 1 set u
  13.217 +	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
  13.218 +	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
  13.219 +			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
  13.220 +			src_dbl_linestride_y,
  13.221 +			RETR_BUF,
  13.222 +			0, 0 );
  13.223 +	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
  13.224 +	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
  13.225 +
  13.226 +	/* iteration loop
  13.227 +	 * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
  13.228 +	 * the scaled output is 2 lines y, 1 line v, 1 line u
  13.229 +	 * which is stored to RAM; the last output line pair is handled after the loop
  13.230 +	 */
  13.231 +	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
  13.232 +		dst_y = dst_vu<<1;
  13.233 +
  13.234 +		// calculate next indices
  13.235 +		next_src_vu = ((float)dst_vu+1)*y_scale;
  13.236 +		next_src_y_upper = ((float)dst_y+2)*y_scale;
  13.237 +		next_src_y_lower = ((float)dst_y+3)*y_scale;
  13.238 +
  13.239 +		next_interpl_vu = (unsigned int) next_src_vu;
  13.240 +		next_interpl_y_upper = (unsigned int) next_src_y_upper;
  13.241 +		next_interpl_y_lower = (unsigned int) next_src_y_lower;
  13.242 +
  13.243 +		// calculate weight NORTH-SOUTH
  13.244 +		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
  13.245 +		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
  13.246 +		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
  13.247 +
  13.248 +		// get next lines (into the buffer that is not being worked on)
  13.249 +		next_src_idx = curr_src_idx^1;
  13.250 +		next_dst_idx = curr_dst_idx^1;
  13.251 +
  13.252 +		// 4 lines y
  13.253 +		mfc_get( y_plane[next_src_idx],
  13.254 +				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
  13.255 +				src_dbl_linestride_y,
  13.256 +				RETR_BUF+next_src_idx,
  13.257 +				0, 0 );
  13.258 +		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
  13.259 +				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
  13.260 +				src_dbl_linestride_y,
  13.261 +				RETR_BUF+next_src_idx,
  13.262 +				0, 0 );
  13.263 +
  13.264 +		// 2 lines v (fetched from the 16-byte aligned address below the wanted one)
  13.265 +		precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
  13.266 +		next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
  13.267 +		mfc_get( v_plane[next_src_idx],
  13.268 +				((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
  13.269 +				src_dbl_linestride_vu+(next_lsoff_v<<1),
  13.270 +				RETR_BUF+next_src_idx,
  13.271 +				0, 0 );
  13.272 +		// 2 lines u (size is padded with this plane's own offset, not the v offset)
  13.273 +		precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
  13.274 +		next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
  13.275 +		mfc_get( u_plane[next_src_idx],
  13.276 +				((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
  13.277 +				src_dbl_linestride_vu+(next_lsoff_u<<1),
  13.278 +				RETR_BUF+next_src_idx,
  13.279 +				0, 0 );
  13.280 +
  13.281 +		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
  13.282 +
  13.283 +		// scaling
  13.284 +		// work line y_upper
  13.285 +		bilinear_scale_line_w16( y_plane[curr_src_idx],
  13.286 +				scaled_y_plane[curr_src_idx],
  13.287 +				dst_width,
  13.288 +				vf_x_scale,
  13.289 +				vf_curr_NSweight_y_upper,
  13.290 +				src_linestride_y );
  13.291 +		// work line y_lower
  13.292 +		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
  13.293 +				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
  13.294 +				dst_width,
  13.295 +				vf_x_scale,
  13.296 +				vf_curr_NSweight_y_lower,
  13.297 +				src_linestride_y );
  13.298 +		// work line v
  13.299 +		bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
  13.300 +				scaled_v_plane[curr_src_idx],
  13.301 +				dst_width>>1,
  13.302 +				vf_x_scale,
  13.303 +				vf_curr_NSweight_vu,
  13.304 +				src_linestride_vu );
  13.305 +		// work line u
  13.306 +		bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
  13.307 +				scaled_u_plane[curr_src_idx],
  13.308 +				dst_width>>1,
  13.309 +				vf_x_scale,
  13.310 +				vf_curr_NSweight_vu,
  13.311 +				src_linestride_vu );
  13.312 +
  13.313 +
  13.314 +		// Store the result back to main memory into a destination buffer in YUV format
  13.315 +		//---------------------------------------------------------------------------------------------
  13.316 +		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  13.317 +
  13.318 +		// Perform three DMA transfers to 3 different locations in the main memory!
  13.319 +		// dst_width:	Pixel width of destination image
  13.320 +		// dst_addr:	Destination address in main memory
  13.321 +		// dst_vu:	Counter which is incremented one by one
  13.322 +		// dst_y:	Counter which is twice as large as dst_vu (dst_y = 2*dst_vu)
  13.323 +		mfc_put(	scaled_y_plane[curr_src_idx],					// What from local store (addr)
  13.324 +				(unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
  13.325 +				dst_dbl_linestride_y,						// Two Y lines (depending on the width of the destination resolution)
  13.326 +				STR_BUF+curr_dst_idx,						// Tag
  13.327 +				0, 0 );
  13.328 +
  13.329 +		mfc_put(	scaled_v_plane[curr_src_idx],					// What from local store (addr)
  13.330 +				(unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  13.331 +				dst_dbl_linestride_vu,						// One V line of dst_width/2 bytes
  13.332 +				STR_BUF+curr_dst_idx,						// Tag
  13.333 +				0, 0 );
  13.334 +
  13.335 +		mfc_put(	scaled_u_plane[curr_src_idx],					// What from local store (addr)
  13.336 +				(unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  13.337 +				dst_dbl_linestride_vu,						// One U line of dst_width/2 bytes
  13.338 +				STR_BUF+curr_dst_idx,						// Tag
  13.339 +				0, 0 );
  13.340 +		//---------------------------------------------------------------------------------------------
  13.341 +
  13.342 +
  13.343 +		// update for next cycle: everything prefetched as "next" becomes "curr"
  13.344 +		curr_src_idx = next_src_idx;
  13.345 +		curr_dst_idx = next_dst_idx;
  13.346 +
  13.347 +		curr_interpl_y_upper = next_interpl_y_upper;
  13.348 +		curr_interpl_y_lower = next_interpl_y_lower;
  13.349 +		curr_interpl_vu = next_interpl_vu;
  13.350 +
  13.351 +		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
  13.352 +		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
  13.353 +		vf_curr_NSweight_vu = vf_next_NSweight_vu;
  13.354 +
  13.355 +		curr_src_y_upper = next_src_y_upper;
  13.356 +		curr_src_y_lower = next_src_y_lower;
  13.357 +		curr_src_vu = next_src_vu;
  13.358 +
  13.359 +		curr_lsoff_v = next_lsoff_v;
  13.360 +		curr_lsoff_u = next_lsoff_u;
  13.361 +	}
  13.362 +
  13.363 +	// last output line pair: nothing left to prefetch, scale and store what the final iteration fetched
  13.364 +
  13.365 +	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
  13.366 +
  13.367 +	// scaling
  13.368 +	// work line y_upper
  13.369 +	bilinear_scale_line_w16( y_plane[curr_src_idx],
  13.370 +			scaled_y_plane[curr_src_idx],
  13.371 +			dst_width,
  13.372 +			vf_x_scale,
  13.373 +			vf_curr_NSweight_y_upper,
  13.374 +			src_linestride_y );
  13.375 +	// work line y_lower
  13.376 +	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
  13.377 +			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
  13.378 +			dst_width,
  13.379 +			vf_x_scale,
  13.380 +			vf_curr_NSweight_y_lower,
  13.381 +			src_linestride_y );
  13.382 +	// work line v
  13.383 +	bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
  13.384 +			scaled_v_plane[curr_src_idx],
  13.385 +			dst_width>>1,
  13.386 +			vf_x_scale,
  13.387 +			vf_curr_NSweight_vu,
  13.388 +			src_linestride_vu );
  13.389 +	// work line u
  13.390 +	bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
  13.391 +			scaled_u_plane[curr_src_idx],
  13.392 +			dst_width>>1,
  13.393 +			vf_x_scale,
  13.394 +			vf_curr_NSweight_vu,
  13.395 +			src_linestride_vu );
  13.396 +
  13.397 +
  13.398 +	// Store the result back to main memory into a destination buffer in YUV format
  13.399 +	//---------------------------------------------------------------------------------------------
  13.400 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  13.401 +
  13.402 +	// Perform three DMA transfers to 3 different locations in the main memory!
  13.403 +	// dst_width:	Pixel width of destination image
  13.404 +	// dst_addr:	Destination address in main memory
  13.405 +	// dst_vu:	Counter which is incremented one by one
  13.406 +	// dst_y:	Counter which is twice as large as dst_vu (dst_y = 2*dst_vu)
  13.407 +	mfc_put(	scaled_y_plane[curr_src_idx],					// What from local store (addr)
  13.408 +			(unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
  13.409 +			dst_dbl_linestride_y,						// Two Y lines (depending on the width of the destination resolution)
  13.410 +			STR_BUF+curr_dst_idx,						// Tag
  13.411 +			0, 0 );
  13.412 +
  13.413 +	mfc_put(	scaled_v_plane[curr_src_idx],					// What from local store (addr)
  13.414 +			(unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  13.415 +			dst_dbl_linestride_vu,						// One V line of dst_width/2 bytes
  13.416 +			STR_BUF+curr_dst_idx,						// Tag
  13.417 +			0, 0 );
  13.418 +
  13.419 +	mfc_put(	scaled_u_plane[curr_src_idx],					// What from local store (addr)
  13.420 +			(unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  13.421 +			dst_dbl_linestride_vu,						// One U line of dst_width/2 bytes
  13.422 +			STR_BUF+curr_dst_idx,						// Tag
  13.423 +			0, 0 );
  13.424 +
  13.425 +	// wait for completion
  13.426 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  13.427 +	//---------------------------------------------------------------------------------------------
  13.428 +}
  13.429 +
  13.430 +
  13.431 +/*
  13.432 + * scale_srcw16_dstw32()
  13.433 + *
  13.434 + * called by main() when the source width is not a multiple of 32 but the
  13.435 + * destination width is; scales the Y, V and U planes with double buffered
  13.436 + * DMA and stores the scaled planes (still YUV, no RGB conversion is done
  13.437 + * here) to RAM at parms.dstBuffer (Y, then V, then U)
  13.438 + */
  13.439 +void scale_srcw16_dstw32(void) {
  13.440 +	// extract parameters
  13.441 +	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
  13.442 +
  13.443 +	unsigned int src_width = parms.src_pixel_width;
  13.444 +	unsigned int src_height = parms.src_pixel_height;
  13.445 +	unsigned int dst_width = parms.dst_pixel_width;
  13.446 +	unsigned int dst_height = parms.dst_pixel_height;
  13.447 +
  13.448 +	// YVU (the v and u planes have half the width of the y plane)
  13.449 +	unsigned int src_linestride_y = src_width;
  13.450 +	unsigned int src_dbl_linestride_y = src_width<<1;
  13.451 +	unsigned int src_linestride_vu = src_width>>1;
  13.452 +	unsigned int src_dbl_linestride_vu = src_width;
  13.453 +	// scaled YVU
  13.454 +	unsigned int scaled_src_linestride_y = dst_width;
  13.455 +
  13.456 +	// ram addresses
  13.457 +	unsigned char* src_addr_y = parms.y_plane;
  13.458 +	unsigned char* src_addr_v = parms.v_plane;
  13.459 +	unsigned char* src_addr_u = parms.u_plane;
  13.460 +
  13.461 +	unsigned int dst_picture_size = dst_width*dst_height;
  13.462 +
  13.463 +	// Sizes for destination (bytes per put: two Y lines, one half-width V/U line)
  13.464 +	unsigned int dst_dbl_linestride_y = dst_width<<1;
  13.465 +	unsigned int dst_dbl_linestride_vu = dst_width>>1;
  13.466 +
  13.467 +	// Perform address calculation for Y, V and U in main memory with dst_addr as base
  13.468 +	unsigned char* dst_addr_main_memory_y = dst_addr;
  13.469 +	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
  13.470 +	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
  13.471 +
  13.472 +
  13.473 +	// for handling misalignment, addresses are precalculated
  13.474 +	unsigned char* precalc_src_addr_v = src_addr_v;
  13.475 +	unsigned char* precalc_src_addr_u = src_addr_u;
  13.476 +
  13.477 +	// calculate scale factors
  13.478 +	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
  13.479 +	float y_scale = (float)src_height/(float)dst_height;
  13.480 +
  13.481 +	// double buffered processing
  13.482 +	// buffer switching
  13.483 +	unsigned int curr_src_idx = 0;
  13.484 +	unsigned int curr_dst_idx = 0;
  13.485 +	unsigned int next_src_idx, next_dst_idx;
  13.486 +
  13.487 +	// 2 lines y as output, upper and lowerline
  13.488 +	unsigned int curr_interpl_y_upper = 0;
  13.489 +	unsigned int next_interpl_y_upper;
  13.490 +	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
  13.491 +	// only 1 line v/u output, both planes have the same dimension
  13.492 +	unsigned int curr_interpl_vu = 0;
  13.493 +	unsigned int next_interpl_vu;
  13.494 +
  13.495 +	// weights, calculated in every loop iteration
  13.496 +	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
  13.497 +	vector float vf_next_NSweight_y_upper;
  13.498 +	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
  13.499 +	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
  13.500 +	vector float vf_next_NSweight_vu;
  13.501 +
  13.502 +	// line indices for the src picture
  13.503 +	float curr_src_y_upper = 0.0f, next_src_y_upper;
  13.504 +	float curr_src_y_lower, next_src_y_lower;
  13.505 +	float curr_src_vu = 0.0f, next_src_vu;
  13.506 +
  13.507 +	// line indices for the dst picture
  13.508 +	unsigned int dst_y=0, dst_vu=0;
  13.509 +
  13.510 +	// offset for the v and u plane to handle misalignment
  13.511 +	unsigned int curr_lsoff_v = 0, next_lsoff_v;
  13.512 +	unsigned int curr_lsoff_u = 0, next_lsoff_u;
  13.513 +
  13.514 +	// calculate lower line indices
  13.515 +	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
  13.516 +	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
  13.517 +	// lower line weight (fractional part of the source line position)
  13.518 +	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
  13.519 +
  13.520 +
  13.521 +	// start partially double buffered processing
  13.522 +	// get initial data, 2 sets of y, 1 set v, 1 set u
  13.523 +	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
  13.524 +	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
  13.525 +			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
  13.526 +			src_dbl_linestride_y,
  13.527 +			RETR_BUF,
  13.528 +			0, 0 );
  13.529 +	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
  13.530 +	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
  13.531 +
  13.532 +	// iteration loop
  13.533 +	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
  13.534 +	// the scaled output is 2 lines y, 1 line v, 1 line u
  13.535 +	// which is stored to RAM; the last output line pair is handled after the loop
  13.536 +	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
  13.537 +		dst_y = dst_vu<<1;
  13.538 +
  13.539 +		// calculate next indices
  13.540 +		next_src_vu = ((float)dst_vu+1)*y_scale;
  13.541 +		next_src_y_upper = ((float)dst_y+2)*y_scale;
  13.542 +		next_src_y_lower = ((float)dst_y+3)*y_scale;
  13.543 +
  13.544 +		next_interpl_vu = (unsigned int) next_src_vu;
  13.545 +		next_interpl_y_upper = (unsigned int) next_src_y_upper;
  13.546 +		next_interpl_y_lower = (unsigned int) next_src_y_lower;
  13.547 +
  13.548 +		// calculate weight NORTH-SOUTH
  13.549 +		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
  13.550 +		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
  13.551 +		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
  13.552 +
  13.553 +		// get next lines (into the buffer that is not being worked on)
  13.554 +		next_src_idx = curr_src_idx^1;
  13.555 +		next_dst_idx = curr_dst_idx^1;
  13.556 +
  13.557 +		// 4 lines y
  13.558 +		mfc_get( y_plane[next_src_idx],
  13.559 +				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
  13.560 +				src_dbl_linestride_y,
  13.561 +				RETR_BUF+next_src_idx,
  13.562 +				0, 0 );
  13.563 +		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
  13.564 +				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
  13.565 +				src_dbl_linestride_y,
  13.566 +				RETR_BUF+next_src_idx,
  13.567 +				0, 0 );
  13.568 +
  13.569 +		// 2 lines v (fetched from the 16-byte aligned address below the wanted one)
  13.570 +		precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
  13.571 +		next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
  13.572 +		mfc_get( v_plane[next_src_idx],
  13.573 +				((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
  13.574 +				src_dbl_linestride_vu+(next_lsoff_v<<1),
  13.575 +				RETR_BUF+next_src_idx,
  13.576 +				0, 0 );
  13.577 +		// 2 lines u (size is padded with this plane's own offset, not the v offset)
  13.578 +		precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
  13.579 +		next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
  13.580 +		mfc_get( u_plane[next_src_idx],
  13.581 +				((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
  13.582 +				src_dbl_linestride_vu+(next_lsoff_u<<1),
  13.583 +				RETR_BUF+next_src_idx,
  13.584 +				0, 0 );
  13.585 +
  13.586 +		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
  13.587 +
  13.588 +		// scaling
  13.589 +		// work line y_upper
  13.590 +		bilinear_scale_line_w16( y_plane[curr_src_idx],
  13.591 +				scaled_y_plane[curr_src_idx],
  13.592 +				dst_width,
  13.593 +				vf_x_scale,
  13.594 +				vf_curr_NSweight_y_upper,
  13.595 +				src_linestride_y );
  13.596 +		// work line y_lower
  13.597 +		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
  13.598 +				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
  13.599 +				dst_width,
  13.600 +				vf_x_scale,
  13.601 +				vf_curr_NSweight_y_lower,
  13.602 +				src_linestride_y );
  13.603 +		// work line v
  13.604 +		bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
  13.605 +				scaled_v_plane[curr_src_idx],
  13.606 +				dst_width>>1,
  13.607 +				vf_x_scale,
  13.608 +				vf_curr_NSweight_vu,
  13.609 +				src_linestride_vu );
  13.610 +		// work line u
  13.611 +		bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
  13.612 +				scaled_u_plane[curr_src_idx],
  13.613 +				dst_width>>1,
  13.614 +				vf_x_scale,
  13.615 +				vf_curr_NSweight_vu,
  13.616 +				src_linestride_vu );
  13.617 +
  13.618 +		//---------------------------------------------------------------------------------------------
  13.619 +		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  13.620 +
  13.621 +		// Perform three DMA transfers to 3 different locations in the main memory!
  13.622 +		// dst_width:	Pixel width of destination image
  13.623 +		// dst_addr:	Destination address in main memory
  13.624 +		// dst_vu:	Counter which is incremented one by one
  13.625 +		// dst_y:	Counter which is twice as large as dst_vu (dst_y = 2*dst_vu)
  13.626 +
  13.627 +		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
  13.628 +				(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
  13.629 +				dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
  13.630 +				STR_BUF+curr_dst_idx,								// Tag
  13.631 +				0, 0 );
  13.632 +
  13.633 +		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
  13.634 +				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  13.635 +				dst_dbl_linestride_vu,								// One V line of dst_width/2 bytes
  13.636 +				STR_BUF+curr_dst_idx,								// Tag
  13.637 +				0, 0 );
  13.638 +
  13.639 +		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
  13.640 +				(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  13.641 +				dst_dbl_linestride_vu,								// One U line of dst_width/2 bytes
  13.642 +				STR_BUF+curr_dst_idx,								// Tag
  13.643 +				0, 0 );
  13.644 +		//---------------------------------------------------------------------------------------------
  13.645 +
  13.646 +
  13.647 +		// update for next cycle: everything prefetched as "next" becomes "curr"
  13.648 +		curr_src_idx = next_src_idx;
  13.649 +		curr_dst_idx = next_dst_idx;
  13.650 +
  13.651 +		curr_interpl_y_upper = next_interpl_y_upper;
  13.652 +		curr_interpl_y_lower = next_interpl_y_lower;
  13.653 +		curr_interpl_vu = next_interpl_vu;
  13.654 +
  13.655 +		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
  13.656 +		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
  13.657 +		vf_curr_NSweight_vu = vf_next_NSweight_vu;
  13.658 +
  13.659 +		curr_src_y_upper = next_src_y_upper;
  13.660 +		curr_src_y_lower = next_src_y_lower;
  13.661 +		curr_src_vu = next_src_vu;
  13.662 +
  13.663 +		curr_lsoff_v = next_lsoff_v;
  13.664 +		curr_lsoff_u = next_lsoff_u;
  13.665 +	}
  13.666 +
  13.667 +	// last output line pair: nothing left to prefetch, scale and store what the final iteration fetched
  13.668 +
  13.669 +	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
  13.670 +
  13.671 +	// scaling
  13.672 +	// work line y_upper
  13.673 +	bilinear_scale_line_w16( y_plane[curr_src_idx],
  13.674 +			scaled_y_plane[curr_src_idx],
  13.675 +			dst_width,
  13.676 +			vf_x_scale,
  13.677 +			vf_curr_NSweight_y_upper,
  13.678 +			src_linestride_y );
  13.679 +	// work line y_lower
  13.680 +	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
  13.681 +			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
  13.682 +			dst_width,
  13.683 +			vf_x_scale,
  13.684 +			vf_curr_NSweight_y_lower,
  13.685 +			src_linestride_y );
  13.686 +	// work line v
  13.687 +	bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
  13.688 +			scaled_v_plane[curr_src_idx],
  13.689 +			dst_width>>1,
  13.690 +			vf_x_scale,
  13.691 +			vf_curr_NSweight_vu,
  13.692 +			src_linestride_vu );
  13.693 +	// work line u
  13.694 +	bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
  13.695 +			scaled_u_plane[curr_src_idx],
  13.696 +			dst_width>>1,
  13.697 +			vf_x_scale,
  13.698 +			vf_curr_NSweight_vu,
  13.699 +			src_linestride_vu );
  13.700 +
  13.701 +	//---------------------------------------------------------------------------------------------
  13.702 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  13.703 +
  13.704 +	// Perform three DMA transfers to 3 different locations in the main memory!
  13.705 +	// dst_width:	Pixel width of destination image
  13.706 +	// dst_addr:	Destination address in main memory
  13.707 +	// dst_vu:	Counter which is incremented one by one
  13.708 +	// dst_y:	Counter which is twice as large as dst_vu (dst_y = 2*dst_vu)
  13.709 +
  13.710 +	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
  13.711 +			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
  13.712 +			dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
  13.713 +			STR_BUF+curr_dst_idx,								// Tag
  13.714 +			0, 0 );
  13.715 +
  13.716 +	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
  13.717 +			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  13.718 +			dst_dbl_linestride_vu,								// One V line of dst_width/2 bytes
  13.719 +			STR_BUF+curr_dst_idx,								// Tag
  13.720 +			0, 0 );
  13.721 +
  13.722 +	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
  13.723 +			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  13.724 +			dst_dbl_linestride_vu,								// One U line of dst_width/2 bytes
  13.725 +			STR_BUF+curr_dst_idx,								// Tag
  13.726 +			0, 0 );
  13.727 +
  13.728 +	// wait for completion
  13.729 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  13.730 +	//---------------------------------------------------------------------------------------------
  13.731 +}
  13.732 +
  13.733 +
  13.734 +/*
  13.735 + * scale_srcw32_dstw16()
  13.736 + *
  13.737 + * scales the planar YVU source picture (variant for src width 32, dst width 16)
  13.738 + * each pass produces 2 scaled lines y, 1 line v and 1 line u
  13.739 + * no yuv2rgb conversion is done here, the output stays planar YVU
  13.740 + * result stored in RAM at parms.dstBuffer, laid out as Y plane, V plane, U plane
  13.741 + */
  13.742 +void scale_srcw32_dstw16() {
  13.743 +	// extract parameters
  13.744 +	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
  13.745 +
  13.746 +	unsigned int src_width = parms.src_pixel_width;
  13.747 +	unsigned int src_height = parms.src_pixel_height;
  13.748 +	unsigned int dst_width = parms.dst_pixel_width;
  13.749 +	unsigned int dst_height = parms.dst_pixel_height;
  13.750 +
  13.751 +	// YVU source strides in bytes (v/u lines are half as wide as y lines)
  13.752 +	unsigned int src_linestride_y = src_width;
  13.753 +	unsigned int src_dbl_linestride_y = src_width<<1;
  13.754 +	unsigned int src_linestride_vu = src_width>>1;
  13.755 +	unsigned int src_dbl_linestride_vu = src_width;
  13.756 +	// scaled YVU: offset of the second y output line inside the scaled buffer
  13.757 +	unsigned int scaled_src_linestride_y = dst_width;
  13.758 +
  13.759 +	// ram addresses
  13.760 +	unsigned char* src_addr_y = parms.y_plane;
  13.761 +	unsigned char* src_addr_v = parms.v_plane;
  13.762 +	unsigned char* src_addr_u = parms.u_plane;
  13.763 +
  13.764 +	unsigned int dst_picture_size = dst_width*dst_height;
  13.765 +
  13.766 +	// Sizes for destination: bytes stored per pass (2 lines y, 1 line of dst_width/2 for v and u)
  13.767 +	unsigned int dst_dbl_linestride_y = dst_width<<1;
  13.768 +	unsigned int dst_dbl_linestride_vu = dst_width>>1;
  13.769 +
  13.770 +	// Perform address calculation for Y, V and U in main memory with dst_addr as base
  13.771 +	unsigned char* dst_addr_main_memory_y = dst_addr;
  13.772 +	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
  13.773 +	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
  13.774 +
  13.775 +	// calculate scale factors
  13.776 +	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
  13.777 +	float y_scale = (float)src_height/(float)dst_height;
  13.778 +
  13.779 +	// double buffered processing
  13.780 +	// buffer switching
  13.781 +	unsigned int curr_src_idx = 0;
  13.782 +	unsigned int curr_dst_idx = 0;
  13.783 +	unsigned int next_src_idx, next_dst_idx;
  13.784 +
  13.785 +	// 2 lines y as output, upper and lowerline
  13.786 +	unsigned int curr_interpl_y_upper = 0;
  13.787 +	unsigned int next_interpl_y_upper;
  13.788 +	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
  13.789 +	// only 1 line v/u output, both planes have the same dimension
  13.790 +	unsigned int curr_interpl_vu = 0;
  13.791 +	unsigned int next_interpl_vu;
  13.792 +
  13.793 +	// weights, calculated in every loop iteration
  13.794 +	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
  13.795 +	vector float vf_next_NSweight_y_upper;
  13.796 +	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
  13.797 +	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
  13.798 +	vector float vf_next_NSweight_vu;
  13.799 +
  13.800 +	// line indices for the src picture
  13.801 +	float curr_src_y_upper = 0.0f, next_src_y_upper;
  13.802 +	float curr_src_y_lower, next_src_y_lower;
  13.803 +	float curr_src_vu = 0.0f, next_src_vu;
  13.804 +
  13.805 +	// line indices for the dst picture
  13.806 +	unsigned int dst_y=0, dst_vu=0;
  13.807 +
  13.808 +	// calculate lower line indices
  13.809 +	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
  13.810 +	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
  13.811 +	// lower line weight
  13.812 +	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
  13.813 +
  13.814 +
  13.815 +	// start partially double buffered processing
  13.816 +	// get initial data, 2 sets of y, 1 set v, 1 set u
  13.817 +	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
  13.818 +	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
  13.819 +			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
  13.820 +			src_dbl_linestride_y,
  13.821 +			RETR_BUF,
  13.822 +			0, 0 );
  13.823 +	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
  13.824 +	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
  13.825 +
  13.826 +	// iteration loop
  13.827 +	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
  13.828 +	// the scaled output is 2 lines y, 1 line v, 1 line u
  13.829 +	// the scaled YVU output is stored to RAM
  13.830 +	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
  13.831 +		dst_y = dst_vu<<1;
  13.832 +
  13.833 +		// calculate next indices
  13.834 +		next_src_vu = ((float)dst_vu+1)*y_scale;
  13.835 +		next_src_y_upper = ((float)dst_y+2)*y_scale;
  13.836 +		next_src_y_lower = ((float)dst_y+3)*y_scale;
  13.837 +
  13.838 +		next_interpl_vu = (unsigned int) next_src_vu;
  13.839 +		next_interpl_y_upper = (unsigned int) next_src_y_upper;
  13.840 +		next_interpl_y_lower = (unsigned int) next_src_y_lower;
  13.841 +
  13.842 +		// calculate weight NORTH-SOUTH
  13.843 +		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
  13.844 +		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
  13.845 +		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
  13.846 +
  13.847 +		// get next lines
  13.848 +		next_src_idx = curr_src_idx^1;
  13.849 +		next_dst_idx = curr_dst_idx^1;
  13.850 +
  13.851 +		// 4 lines y
  13.852 +		mfc_get( y_plane[next_src_idx],
  13.853 +				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
  13.854 +				src_dbl_linestride_y,
  13.855 +				RETR_BUF+next_src_idx,
  13.856 +				0, 0 );
  13.857 +		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
  13.858 +				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
  13.859 +				src_dbl_linestride_y,
  13.860 +				RETR_BUF+next_src_idx,
  13.861 +				0, 0 );
  13.862 +
  13.863 +		// 2 lines v
  13.864 +		mfc_get( v_plane[next_src_idx],
  13.865 +				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
  13.866 +				src_dbl_linestride_vu,
  13.867 +				RETR_BUF+next_src_idx,
  13.868 +				0, 0 );
  13.869 +		// 2 lines u
  13.870 +		mfc_get( u_plane[next_src_idx],
  13.871 +				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
  13.872 +				src_dbl_linestride_vu,
  13.873 +				RETR_BUF+next_src_idx,
  13.874 +				0, 0 );
  13.875 +
  13.876 +		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
  13.877 +
  13.878 +		// scaling
  13.879 +		// work line y_upper
  13.880 +		bilinear_scale_line_w16( y_plane[curr_src_idx],
  13.881 +				scaled_y_plane[curr_src_idx],
  13.882 +				dst_width,
  13.883 +				vf_x_scale,
  13.884 +				vf_curr_NSweight_y_upper,
  13.885 +				src_linestride_y );
  13.886 +		// work line y_lower
  13.887 +		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
  13.888 +				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
  13.889 +				dst_width,
  13.890 +				vf_x_scale,
  13.891 +				vf_curr_NSweight_y_lower,
  13.892 +				src_linestride_y );
  13.893 +		// work line v
  13.894 +		bilinear_scale_line_w16( v_plane[curr_src_idx],
  13.895 +				scaled_v_plane[curr_src_idx],
  13.896 +				dst_width>>1,
  13.897 +				vf_x_scale,
  13.898 +				vf_curr_NSweight_vu,
  13.899 +				src_linestride_vu );
  13.900 +		// work line u
  13.901 +		bilinear_scale_line_w16( u_plane[curr_src_idx],
  13.902 +				scaled_u_plane[curr_src_idx],
  13.903 +				dst_width>>1,
  13.904 +				vf_x_scale,
  13.905 +				vf_curr_NSweight_vu,
  13.906 +				src_linestride_vu );
  13.907 +
  13.908 +		//---------------------------------------------------------------------------------------------
  13.909 +		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  13.910 +
  13.911 +		// Perform three DMA transfers to 3 different locations in the main memory!
  13.912 +		// dst_width:	Pixel width of destination image
  13.913 +		// dst_addr:	Destination address in main memory
  13.914 +		// dst_vu:	Counter which is incremented one by one
  13.915 +		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
  13.916 +
  13.917 +		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
  13.918 +				(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
  13.919 +				dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
  13.920 +				STR_BUF+curr_dst_idx,								// Tag
  13.921 +				0, 0 );
  13.922 +
  13.923 +		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
  13.924 +				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  13.925 +				dst_dbl_linestride_vu,								// One V line of dst_width/2 (depending on the width of the destination resolution)
  13.926 +				STR_BUF+curr_dst_idx,								// Tag
  13.927 +				0, 0 );
  13.928 +
  13.929 +		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
  13.930 +				(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
  13.931 +				dst_dbl_linestride_vu,								// One U line of dst_width/2 (depending on the width of the destination resolution)
  13.932 +				STR_BUF+curr_dst_idx,								// Tag
  13.933 +				0, 0 );
  13.934 +		//---------------------------------------------------------------------------------------------
  13.935 +
  13.936 +
  13.937 +		// update for next cycle: the state of the prefetched lines becomes the current state
  13.938 +		curr_src_idx = next_src_idx;
  13.939 +		curr_dst_idx = next_dst_idx;
  13.940 +
  13.941 +		curr_interpl_y_upper = next_interpl_y_upper;
  13.942 +		curr_interpl_y_lower = next_interpl_y_lower;
  13.943 +		curr_interpl_vu = next_interpl_vu;
  13.944 +		// the NORTH-SOUTH weights belong to the prefetched lines and must advance with them
  13.945 +		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
  13.946 +		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
  13.947 +		vf_curr_NSweight_vu = vf_next_NSweight_vu;
  13.948 +
  13.949 +		curr_src_y_upper = next_src_y_upper;
  13.950 +		curr_src_y_lower = next_src_y_lower;
  13.951 +		curr_src_vu = next_src_vu;
  13.952 +	}
  13.953 +
  13.954 +	// last pass: nothing is prefetched anymore, only scale and store the lines already requested
  13.955 +
  13.956 +	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
  13.957 +
  13.958 +	// scaling
  13.959 +	// work line y_upper
  13.960 +	bilinear_scale_line_w16( y_plane[curr_src_idx],
  13.961 +			scaled_y_plane[curr_src_idx],
  13.962 +			dst_width,
  13.963 +			vf_x_scale,
  13.964 +			vf_curr_NSweight_y_upper,
  13.965 +			src_linestride_y );
  13.966 +	// work line y_lower
  13.967 +	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
  13.968 +			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
  13.969 +			dst_width,
  13.970 +			vf_x_scale,
  13.971 +			vf_curr_NSweight_y_lower,
  13.972 +			src_linestride_y );
  13.973 +	// work line v
  13.974 +	bilinear_scale_line_w16( v_plane[curr_src_idx],
  13.975 +			scaled_v_plane[curr_src_idx],
  13.976 +			dst_width>>1,
  13.977 +			vf_x_scale,
  13.978 +			vf_curr_NSweight_vu,
  13.979 +			src_linestride_vu );
  13.980 +	// work line u
  13.981 +	bilinear_scale_line_w16( u_plane[curr_src_idx],
  13.982 +			scaled_u_plane[curr_src_idx],
  13.983 +			dst_width>>1,
  13.984 +			vf_x_scale,
  13.985 +			vf_curr_NSweight_vu,
  13.986 +			src_linestride_vu );
  13.987 +
  13.988 +
  13.989 +	//---------------------------------------------------------------------------------------------
  13.990 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
  13.991 +
  13.992 +	// Perform three DMA transfers to 3 different locations in the main memory!
  13.993 +	// dst_width:	Pixel width of destination image
  13.994 +	// dst_addr:	Destination address in main memory
  13.995 +	// dst_vu:	Counter which is incremented one by one
  13.996 +	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
  13.997 +
  13.998 +	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
  13.999 +			(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 13.1000 +			dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
 13.1001 +			STR_BUF+curr_dst_idx,								// Tag
 13.1002 +			0, 0 );
 13.1003 +
 13.1004 +	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
 13.1005 +			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 13.1006 +			dst_dbl_linestride_vu,								// One V line of dst_width/2 (depending on the width of the destination resolution)
 13.1007 +			STR_BUF+curr_dst_idx,								// Tag
 13.1008 +			0, 0 );
 13.1009 +
 13.1010 +	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
 13.1011 +			(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 13.1012 +			dst_dbl_linestride_vu,								// One U line of dst_width/2 (depending on the width of the destination resolution)
 13.1013 +			STR_BUF+curr_dst_idx,								// Tag
 13.1014 +			0, 0 );
 13.1015 +
 13.1016 +	// wait for completion
 13.1017 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 13.1018 +	//---------------------------------------------------------------------------------------------
 13.1019 +}
 13.1020 +
 13.1021 +
 13.1022 +/**
 13.1023 + * scale_srcw32_dstw32()
 13.1024 + *
 13.1025 + * scales the planar YVU source picture (variant for src width 32, dst width 32)
 13.1026 + * each pass produces 2 scaled lines y, 1 line v and 1 line u
 13.1027 + * no yuv2rgb conversion is done here, the output stays planar YVU
 13.1028 + * result stored in RAM at parms.dstBuffer, laid out as Y plane, V plane, U plane
 13.1029 + */
 13.1030 +void scale_srcw32_dstw32() {
 13.1031 +	// extract parameters
 13.1032 +	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
 13.1033 +
 13.1034 +	unsigned int src_width = parms.src_pixel_width;
 13.1035 +	unsigned int src_height = parms.src_pixel_height;
 13.1036 +	unsigned int dst_width = parms.dst_pixel_width;
 13.1037 +	unsigned int dst_height = parms.dst_pixel_height;
 13.1038 +
 13.1039 +	// YVU source strides in bytes (v/u lines are half as wide as y lines)
 13.1040 +	unsigned int src_linestride_y = src_width;
 13.1041 +	unsigned int src_dbl_linestride_y = src_width<<1;
 13.1042 +	unsigned int src_linestride_vu = src_width>>1;
 13.1043 +	unsigned int src_dbl_linestride_vu = src_width;
 13.1044 +
 13.1045 +	// scaled YVU: offset of the second y output line inside the scaled buffer
 13.1046 +	unsigned int scaled_src_linestride_y = dst_width;
 13.1047 +
 13.1048 +	// ram addresses
 13.1049 +	unsigned char* src_addr_y = parms.y_plane;
 13.1050 +	unsigned char* src_addr_v = parms.v_plane;
 13.1051 +	unsigned char* src_addr_u = parms.u_plane;
 13.1052 +
 13.1053 +	unsigned int dst_picture_size = dst_width*dst_height;
 13.1054 +
 13.1055 +	// Sizes for destination: bytes stored per pass (2 lines y, 1 line of dst_width/2 for v and u)
 13.1056 +	unsigned int dst_dbl_linestride_y = dst_width<<1;
 13.1057 +	unsigned int dst_dbl_linestride_vu = dst_width>>1;
 13.1058 +
 13.1059 +	// Perform address calculation for Y, V and U in main memory with dst_addr as base
 13.1060 +	unsigned char* dst_addr_main_memory_y = dst_addr;
 13.1061 +	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
 13.1062 +	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
 13.1063 +
 13.1064 +	// calculate scale factors
 13.1065 +	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
 13.1066 +	float y_scale = (float)src_height/(float)dst_height;
 13.1067 +
 13.1068 +	// double buffered processing
 13.1069 +	// buffer switching
 13.1070 +	unsigned int curr_src_idx = 0;
 13.1071 +	unsigned int curr_dst_idx = 0;
 13.1072 +	unsigned int next_src_idx, next_dst_idx;
 13.1073 +
 13.1074 +	// 2 lines y as output, upper and lowerline
 13.1075 +	unsigned int curr_interpl_y_upper = 0;
 13.1076 +	unsigned int next_interpl_y_upper;
 13.1077 +	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
 13.1078 +	// only 1 line v/u output, both planes have the same dimension
 13.1079 +	unsigned int curr_interpl_vu = 0;
 13.1080 +	unsigned int next_interpl_vu;
 13.1081 +
 13.1082 +	// weights, calculated in every loop iteration
 13.1083 +	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
 13.1084 +	vector float vf_next_NSweight_y_upper;
 13.1085 +	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
 13.1086 +	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
 13.1087 +	vector float vf_next_NSweight_vu;
 13.1088 +
 13.1089 +	// line indices for the src picture
 13.1090 +	float curr_src_y_upper = 0.0f, next_src_y_upper;
 13.1091 +	float curr_src_y_lower, next_src_y_lower;
 13.1092 +	float curr_src_vu = 0.0f, next_src_vu;
 13.1093 +
 13.1094 +	// line indices for the dst picture
 13.1095 +	unsigned int dst_y=0, dst_vu=0;
 13.1096 +
 13.1097 +	// calculate lower line indices
 13.1098 +	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
 13.1099 +	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
 13.1100 +	// lower line weight
 13.1101 +	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
 13.1102 +
 13.1103 +
 13.1104 +	// start partially double buffered processing
 13.1105 +	// get initial data, 2 sets of y, 1 set v, 1 set u
 13.1106 +	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
 13.1107 +	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
 13.1108 +			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
 13.1109 +			src_dbl_linestride_y,
 13.1110 +			RETR_BUF,
 13.1111 +			0, 0 );
 13.1112 +	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 13.1113 +	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
 13.1114 +
 13.1115 +	// iteration loop
 13.1116 +	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
 13.1117 +	// the scaled output is 2 lines y, 1 line v, 1 line u
 13.1118 +	// the scaled YVU output is stored to RAM
 13.1119 +	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
 13.1120 +		dst_y = dst_vu<<1;
 13.1121 +
 13.1122 +		// calculate next indices
 13.1123 +		next_src_vu = ((float)dst_vu+1)*y_scale;
 13.1124 +		next_src_y_upper = ((float)dst_y+2)*y_scale;
 13.1125 +		next_src_y_lower = ((float)dst_y+3)*y_scale;
 13.1126 +
 13.1127 +		next_interpl_vu = (unsigned int) next_src_vu;
 13.1128 +		next_interpl_y_upper = (unsigned int) next_src_y_upper;
 13.1129 +		next_interpl_y_lower = (unsigned int) next_src_y_lower;
 13.1130 +
 13.1131 +		// calculate weight NORTH-SOUTH
 13.1132 +		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
 13.1133 +		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
 13.1134 +		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
 13.1135 +
 13.1136 +		// get next lines
 13.1137 +		next_src_idx = curr_src_idx^1;
 13.1138 +		next_dst_idx = curr_dst_idx^1;
 13.1139 +
 13.1140 +		// 4 lines y
 13.1141 +		mfc_get( y_plane[next_src_idx],
 13.1142 +				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
 13.1143 +				src_dbl_linestride_y,
 13.1144 +				RETR_BUF+next_src_idx,
 13.1145 +				0, 0 );
 13.1146 +		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
 13.1147 +				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
 13.1148 +				src_dbl_linestride_y,
 13.1149 +				RETR_BUF+next_src_idx,
 13.1150 +				0, 0 );
 13.1151 +
 13.1152 +		// 2 lines v
 13.1153 +		mfc_get( v_plane[next_src_idx],
 13.1154 +				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
 13.1155 +				src_dbl_linestride_vu,
 13.1156 +				RETR_BUF+next_src_idx,
 13.1157 +				0, 0 );
 13.1158 +		// 2 lines u
 13.1159 +		mfc_get( u_plane[next_src_idx],
 13.1160 +				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
 13.1161 +				src_dbl_linestride_vu,
 13.1162 +				RETR_BUF+next_src_idx,
 13.1163 +				0, 0 );
 13.1164 +
 13.1165 +		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
 13.1166 +
 13.1167 +		// scaling
 13.1168 +		// work line y_upper
 13.1169 +		bilinear_scale_line_w16( y_plane[curr_src_idx],
 13.1170 +				scaled_y_plane[curr_src_idx],
 13.1171 +				dst_width,
 13.1172 +				vf_x_scale,
 13.1173 +				vf_curr_NSweight_y_upper,
 13.1174 +				src_linestride_y );
 13.1175 +		// work line y_lower
 13.1176 +		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 13.1177 +				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 13.1178 +				dst_width,
 13.1179 +				vf_x_scale,
 13.1180 +				vf_curr_NSweight_y_lower,
 13.1181 +				src_linestride_y );
 13.1182 +		// work line v
 13.1183 +		bilinear_scale_line_w16( v_plane[curr_src_idx],
 13.1184 +				scaled_v_plane[curr_src_idx],
 13.1185 +				dst_width>>1,
 13.1186 +				vf_x_scale,
 13.1187 +				vf_curr_NSweight_vu,
 13.1188 +				src_linestride_vu );
 13.1189 +		// work line u
 13.1190 +		bilinear_scale_line_w16( u_plane[curr_src_idx],
 13.1191 +				scaled_u_plane[curr_src_idx],
 13.1192 +				dst_width>>1,
 13.1193 +				vf_x_scale,
 13.1194 +				vf_curr_NSweight_vu,
 13.1195 +				src_linestride_vu );
 13.1196 +
 13.1197 +
 13.1198 +
 13.1199 +		// Store the result back to main memory into a destination buffer in YUV format
 13.1200 +		//---------------------------------------------------------------------------------------------
 13.1201 +		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 13.1202 +
 13.1203 +		// Perform three DMA transfers to 3 different locations in the main memory!
 13.1204 +		// dst_width:	Pixel width of destination image
 13.1205 +		// dst_addr:	Destination address in main memory
 13.1206 +		// dst_vu:	Counter which is incremented one by one
 13.1207 +		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
 13.1208 +
 13.1209 +		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
 13.1210 +				(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 13.1211 +				dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
 13.1212 +				STR_BUF+curr_dst_idx,								// Tag
 13.1213 +				0, 0 );
 13.1214 +
 13.1215 +		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
 13.1216 +				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 13.1217 +				dst_dbl_linestride_vu,								// One V line of dst_width/2 (depending on the width of the destination resolution)
 13.1218 +				STR_BUF+curr_dst_idx,								// Tag
 13.1219 +				0, 0 );
 13.1220 +
 13.1221 +		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
 13.1222 +				(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 13.1223 +				dst_dbl_linestride_vu,								// One U line of dst_width/2 (depending on the width of the destination resolution)
 13.1224 +				STR_BUF+curr_dst_idx,								// Tag
 13.1225 +				0, 0 );
 13.1226 +		//---------------------------------------------------------------------------------------------
 13.1227 +
 13.1228 +
 13.1229 +		// update for next cycle: the state of the prefetched lines becomes the current state
 13.1230 +		curr_src_idx = next_src_idx;
 13.1231 +		curr_dst_idx = next_dst_idx;
 13.1232 +
 13.1233 +		curr_interpl_y_upper = next_interpl_y_upper;
 13.1234 +		curr_interpl_y_lower = next_interpl_y_lower;
 13.1235 +		curr_interpl_vu = next_interpl_vu;
 13.1236 +		// the NORTH-SOUTH weights belong to the prefetched lines and must advance with them
 13.1237 +		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
 13.1238 +		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
 13.1239 +		vf_curr_NSweight_vu = vf_next_NSweight_vu;
 13.1240 +
 13.1241 +		curr_src_y_upper = next_src_y_upper;
 13.1242 +		curr_src_y_lower = next_src_y_lower;
 13.1243 +		curr_src_vu = next_src_vu;
 13.1244 +	}
 13.1245 +
 13.1246 +	// last pass: nothing is prefetched anymore, only scale and store the lines already requested
 13.1247 +
 13.1248 +	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
 13.1249 +
 13.1250 +	// scaling
 13.1251 +	// work line y_upper
 13.1252 +	bilinear_scale_line_w16( y_plane[curr_src_idx],
 13.1253 +			scaled_y_plane[curr_src_idx],
 13.1254 +			dst_width,
 13.1255 +			vf_x_scale,
 13.1256 +			vf_curr_NSweight_y_upper,
 13.1257 +			src_linestride_y );
 13.1258 +	// work line y_lower
 13.1259 +	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
 13.1260 +			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
 13.1261 +			dst_width,
 13.1262 +			vf_x_scale,
 13.1263 +			vf_curr_NSweight_y_lower,
 13.1264 +			src_linestride_y );
 13.1265 +	// work line v
 13.1266 +	bilinear_scale_line_w16( v_plane[curr_src_idx],
 13.1267 +			scaled_v_plane[curr_src_idx],
 13.1268 +			dst_width>>1,
 13.1269 +			vf_x_scale,
 13.1270 +			vf_curr_NSweight_vu,
 13.1271 +			src_linestride_vu );
 13.1272 +	// work line u
 13.1273 +	bilinear_scale_line_w16( u_plane[curr_src_idx],
 13.1274 +			scaled_u_plane[curr_src_idx],
 13.1275 +			dst_width>>1,
 13.1276 +			vf_x_scale,
 13.1277 +			vf_curr_NSweight_vu,
 13.1278 +			src_linestride_vu );
 13.1279 +
 13.1280 +
 13.1281 +	// Store the result back to main memory into a destination buffer in YUV format
 13.1282 +	//---------------------------------------------------------------------------------------------
 13.1283 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 13.1284 +
 13.1285 +	// Perform three DMA transfers to 3 different locations in the main memory!
 13.1286 +	// dst_width:	Pixel width of destination image
 13.1287 +	// dst_addr:	Destination address in main memory
 13.1288 +	// dst_vu:	Counter which is incremented one by one
 13.1289 +	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
 13.1290 +
 13.1291 +	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
 13.1292 +			(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
 13.1293 +			dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
 13.1294 +			STR_BUF+curr_dst_idx,								// Tag
 13.1295 +			0, 0 );
 13.1296 +
 13.1297 +	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
 13.1298 +			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 13.1299 +			dst_dbl_linestride_vu,								// One V line of dst_width/2 (depending on the width of the destination resolution)
 13.1300 +			STR_BUF+curr_dst_idx,								// Tag
 13.1301 +			0, 0 );
 13.1302 +
 13.1303 +	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
 13.1304 +			(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
 13.1305 +			dst_dbl_linestride_vu,								// One U line of dst_width/2 (depending on the width of the destination resolution)
 13.1306 +			STR_BUF+curr_dst_idx,								// Tag
 13.1307 +			0, 0 );
 13.1308 +
 13.1309 +	// wait for completion
 13.1310 +	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
 13.1311 +	//---------------------------------------------------------------------------------------------
 13.1312 +}
 13.1313 +
 13.1314 +
 13.1315 +/*
 13.1316 + * bilinear_scale_line_w8()
 13.1317 + *
 13.1318 + * processes a line of yuv-input, width has to be a multiple of 8
 13.1319 + * scaled yuv-output is written to local store buffer
 13.1320 + *
 13.1321 + * @param src buffer for 2 lines input
 13.1322 + * @param dst_ buffer for 1 line output
 13.1323 + * @param dst_width the width of the destination line
 13.1324 + * @param vf_x_scale a float vector, at each entry is the x_scale-factor
 13.1325 + * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
 13.1326 + * @param src_linestride the stride of the srcline
 13.1327 + */
 13.1328 +void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
 13.1329 +
 13.1330 +	unsigned char* dst = dst_;
 13.1331 +
 13.1332 +	unsigned int dst_x;
 13.1333 +	for( dst_x=0; dst_x<dst_width; dst_x+=8) {
 13.1334 +		// address calculation for loading the 4 surrounding pixel of each calculated
 13.1335 +		// destination pixel
 13.1336 +		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
 13.1337 +		// lower range->first 4 pixel
 13.1338 +		// upper range->next 4 pixel
 13.1339 +		vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 };
 13.1340 +		vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 };
 13.1341 +		vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range );
 13.1342 +		vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range );
 13.1343 +
 13.1344 +		// calculate weight EAST-WEST
 13.1345 +		vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 );
 13.1346 +		vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 );
 13.1347 +		vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale );
 13.1348 +		vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale );
 13.1349 +		vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 );
 13.1350 +		vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 );
 13.1351 +		vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 );
 13.1352 +		vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 );
 13.1353 +		vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range );
 13.1354 +		vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range );
 13.1355 +
 13.1356 +		// calculate address offset
 13.1357 +		//
 13.1358 +		// pixel NORTH WEST
 13.1359 +		vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range;
 13.1360 +		vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range;
 13.1361 +
 13.1362 +		// pixel NORTH EAST-->(offpixelNW+1)
 13.1363 +		vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
 13.1364 +		vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 );
 13.1365 +		vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 );
 13.1366 +
 13.1367 +		// SOUTH-WEST-->(offpixelNW+src_linestride)
 13.1368 +		vector unsigned int vui_srclinestride = spu_splats( src_linestride );
 13.1369 +		vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range );
 13.1370 +		vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range );
 13.1371 +
 13.1372 +		// SOUTH-EAST-->(offpixelNW+src_linestride+1)
 13.1373 +		vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range );
 13.1374 +		vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range );
 13.1375 +
 13.1376 +		// calculate each address
 13.1377 +		vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
 13.1378 +		vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range );
 13.1379 +		vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range );
 13.1380 +		vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range );
 13.1381 +		vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range );
 13.1382 +
 13.1383 +		vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range );
 13.1384 +		vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range );
 13.1385 +		vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range );
 13.1386 +		vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range );
 13.1387 +
 13.1388 +		// get each pixel
 13.1389 +		//
 13.1390 +		// scalar load, afterwards insertion into the right position
 13.1391 +		// NORTH WEST
 13.1392 +		vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
 13.1393 +		vector unsigned char vuc_pixel_NW_lower_range = spu_insert(
 13.1394 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 );
 13.1395 +		vuc_pixel_NW_lower_range = spu_insert(
 13.1396 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )),
 13.1397 +				vuc_pixel_NW_lower_range, 7 );
 13.1398 +		vuc_pixel_NW_lower_range = spu_insert(
 13.1399 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )),
 13.1400 +				vuc_pixel_NW_lower_range, 11 );
 13.1401 +		vuc_pixel_NW_lower_range = spu_insert(
 13.1402 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )),
 13.1403 +				vuc_pixel_NW_lower_range, 15 );
 13.1404 +
 13.1405 +		vector unsigned char vuc_pixel_NW_upper_range = spu_insert(
 13.1406 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 );
 13.1407 +		vuc_pixel_NW_upper_range = spu_insert(
 13.1408 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )),
 13.1409 +				vuc_pixel_NW_upper_range, 7 );
 13.1410 +		vuc_pixel_NW_upper_range = spu_insert(
 13.1411 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )),
 13.1412 +				vuc_pixel_NW_upper_range, 11 );
 13.1413 +		vuc_pixel_NW_upper_range = spu_insert(
 13.1414 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )),
 13.1415 +				vuc_pixel_NW_upper_range, 15 );
 13.1416 +
 13.1417 +		// NORTH EAST
 13.1418 +		vector unsigned char vuc_pixel_NE_lower_range = spu_insert(
 13.1419 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 );
 13.1420 +		vuc_pixel_NE_lower_range = spu_insert(
 13.1421 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )),
 13.1422 +				vuc_pixel_NE_lower_range, 7 );
 13.1423 +		vuc_pixel_NE_lower_range = spu_insert(
 13.1424 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )),
 13.1425 +				vuc_pixel_NE_lower_range, 11 );
 13.1426 +		vuc_pixel_NE_lower_range = spu_insert(
 13.1427 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )),
 13.1428 +				vuc_pixel_NE_lower_range, 15 );
 13.1429 +
 13.1430 +		vector unsigned char vuc_pixel_NE_upper_range = spu_insert(
 13.1431 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 );
 13.1432 +		vuc_pixel_NE_upper_range = spu_insert(
 13.1433 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )),
 13.1434 +				vuc_pixel_NE_upper_range, 7 );
 13.1435 +		vuc_pixel_NE_upper_range = spu_insert(
 13.1436 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )),
 13.1437 +				vuc_pixel_NE_upper_range, 11 );
 13.1438 +		vuc_pixel_NE_upper_range = spu_insert(
 13.1439 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )),
 13.1440 +				vuc_pixel_NE_upper_range, 15 );
 13.1441 +
 13.1442 +
 13.1443 +		// SOUTH WEST
 13.1444 +		vector unsigned char vuc_pixel_SW_lower_range = spu_insert(
 13.1445 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 );
 13.1446 +		vuc_pixel_SW_lower_range = spu_insert(
 13.1447 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )),
 13.1448 +				vuc_pixel_SW_lower_range, 7 );
 13.1449 +		vuc_pixel_SW_lower_range = spu_insert(
 13.1450 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )),
 13.1451 +				vuc_pixel_SW_lower_range, 11 );
 13.1452 +		vuc_pixel_SW_lower_range = spu_insert(
 13.1453 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )),
 13.1454 +				vuc_pixel_SW_lower_range, 15 );
 13.1455 +
 13.1456 +		vector unsigned char vuc_pixel_SW_upper_range = spu_insert(
 13.1457 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 );
 13.1458 +		vuc_pixel_SW_upper_range = spu_insert(
 13.1459 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )),
 13.1460 +				vuc_pixel_SW_upper_range, 7 );
 13.1461 +		vuc_pixel_SW_upper_range = spu_insert(
 13.1462 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )),
 13.1463 +				vuc_pixel_SW_upper_range, 11 );
 13.1464 +		vuc_pixel_SW_upper_range = spu_insert(
 13.1465 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )),
 13.1466 +				vuc_pixel_SW_upper_range, 15 );
 13.1467 +
 13.1468 +		// SOUTH EAST
 13.1469 +		vector unsigned char vuc_pixel_SE_lower_range = spu_insert(
 13.1470 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 );
 13.1471 +		vuc_pixel_SE_lower_range = spu_insert(
 13.1472 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )),
 13.1473 +				vuc_pixel_SE_lower_range, 7 );
 13.1474 +		vuc_pixel_SE_lower_range = spu_insert(
 13.1475 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )),
 13.1476 +				vuc_pixel_SE_lower_range, 11 );
 13.1477 +		vuc_pixel_SE_lower_range = spu_insert(
 13.1478 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )),
 13.1479 +				vuc_pixel_SE_lower_range, 15 );
 13.1480 +
 13.1481 +		vector unsigned char vuc_pixel_SE_upper_range = spu_insert(
 13.1482 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 );
 13.1483 +		vuc_pixel_SE_upper_range = spu_insert(
 13.1484 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )),
 13.1485 +				vuc_pixel_SE_upper_range, 7 );
 13.1486 +		vuc_pixel_SE_upper_range = spu_insert(
 13.1487 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )),
 13.1488 +				vuc_pixel_SE_upper_range, 11 );
 13.1489 +		vuc_pixel_SE_upper_range = spu_insert(
 13.1490 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )),
 13.1491 +				vuc_pixel_SE_upper_range, 15 );
 13.1492 +
 13.1493 +
 13.1494 +		// convert to float
 13.1495 +		vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 );
 13.1496 +		vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 );
 13.1497 +
 13.1498 +		vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 );
 13.1499 +		vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 );
 13.1500 +
 13.1501 +		vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 );
 13.1502 +		vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 );
 13.1503 +
 13.1504 +		vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 );
 13.1505 +		vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 );
 13.1506 +
 13.1507 +
 13.1508 +
 13.1509 +		// first linear interpolation: EWtop
 13.1510 +		// EWtop = NW + EWweight*(NE-NW)
 13.1511 +		//
 13.1512 +		// lower range
 13.1513 +		vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range );
 13.1514 +		vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range,
 13.1515 +								vf_EWtop_lower_range_tmp,
 13.1516 +								vf_pixel_NW_lower_range );
 13.1517 +
 13.1518 +		// upper range
 13.1519 +		vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range );
 13.1520 +		vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range,
 13.1521 +								vf_EWtop_upper_range_tmp,
 13.1522 +								vf_pixel_NW_upper_range );
 13.1523 +
 13.1524 +
 13.1525 +
 13.1526 +		// second linear interpolation: EWbottom
 13.1527 +		// EWbottom = SW + EWweight*(SE-SW)
 13.1528 +		//
 13.1529 +		// lower range
 13.1530 +		vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range );
 13.1531 +		vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range,
 13.1532 +								vf_EWbottom_lower_range_tmp,
 13.1533 +								vf_pixel_SW_lower_range );
 13.1534 +
 13.1535 +		// upper range
 13.1536 +		vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range );
 13.1537 +		vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range,
 13.1538 +								vf_EWbottom_upper_range_tmp,
 13.1539 +								vf_pixel_SW_upper_range );
 13.1540 +
 13.1541 +
 13.1542 +
 13.1543 +		// third linear interpolation: the bilinear interpolated value
 13.1544 +		// result = EWtop + NSweight*(EWbottom-EWtop);
 13.1545 +		//
 13.1546 +		// lower range
 13.1547 +		vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range );
 13.1548 +		vector float vf_result_lower_range = spu_madd( vf_NSweight,
 13.1549 +								vf_result_lower_range_tmp,
 13.1550 +								vf_EWtop_lower_range );
 13.1551 +
 13.1552 +		// upper range
 13.1553 +		vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range );
 13.1554 +		vector float vf_result_upper_range = spu_madd( vf_NSweight,
 13.1555 +								vf_result_upper_range_tmp,
 13.1556 +								vf_EWtop_upper_range );
 13.1557 +
 13.1558 +
 13.1559 +		// convert back: using saturated arithmetic
 13.1560 +		vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range );
 13.1561 +		vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range );
 13.1562 +
 13.1563 +		// merge results->lower,upper
 13.1564 +		vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F,
 13.1565 +							       0x13, 0x17, 0x1B, 0x1F,
 13.1566 +							       0x00, 0x00, 0x00, 0x00,
 13.1567 +							       0x00, 0x00, 0x00, 0x00 };
 13.1568 +
 13.1569 +		vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range,
 13.1570 +								(vector unsigned char) vui_result_upper_range,
 13.1571 +								vuc_mask_merge_result );
 13.1572 +
 13.1573 +		// partial storing
 13.1574 +		vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00,
 13.1575 +						      0x00, 0x00, 0x00, 0x00,
 13.1576 +						      0xFF, 0xFF, 0xFF, 0xFF,
 13.1577 +						      0xFF, 0xFF, 0xFF, 0xFF };
 13.1578 +
 13.1579 +
 13.1580 +		// get currently stored data
 13.1581 +		vector unsigned char vuc_orig = *((vector unsigned char*)dst);
 13.1582 +
 13.1583 +		// clear currently stored data
 13.1584 +		vuc_orig = spu_and( vuc_orig,
 13.1585 +				spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) );
 13.1586 +
 13.1587 +		// rotate result according to storing address
 13.1588 +		vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F );
 13.1589 +
 13.1590 +		// store result
 13.1591 +		*((vector unsigned char*)dst) = spu_or( vuc_result,
 13.1592 +							vuc_orig );
 13.1593 +		dst += 8;
 13.1594 +	}
 13.1595 +}
 13.1596 +
 13.1597 +
 13.1598 +/*
 13.1599 + * bilinear_scale_line_w16()
 13.1600 + *
 13.1601 + * Processes one line of YUV input; the destination width has to be a multiple of 16.
 13.1602 + * The scaled YUV output is written to a local store buffer.
 13.1603 + *
 13.1604 + * @param src buffer holding the 2 input lines
 13.1605 + * @param dst_ buffer for the 1 output line
 13.1606 + * @param dst_width the width of the destination line
 13.1607 + * @param vf_x_scale a float vector holding the x scale factor in each entry
 13.1608 + * @param vf_NSweight a float vector holding the NORTH/SOUTH weight for the current line in each entry
 13.1609 + * @param src_linestride the stride of the source line
 13.1610 + */
 13.1611 +void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
 13.1612 +
 13.1613 +	unsigned char* dst = dst_;
 13.1614 +
 13.1615 +	unsigned int dst_x;
 13.1616 +	for( dst_x=0; dst_x<dst_width; dst_x+=16) {
 13.1617 +		// address calculation for loading the 4 surrounding pixels of each calculated
 13.1618 +		// destination pixel
 13.1619 +		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
 13.1620 +		// parallelised processing
 13.1621 +		// first range->pixel 1 2 3 4
 13.1622 +		// second range->pixel 5 6 7 8
 13.1623 +		// third range->pixel 9 10 11 12
 13.1624 +		// fourth range->pixel 13 14 15 16
 13.1625 +		vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
 13.1626 +		vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
 13.1627 +		vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
 13.1628 +		vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
 13.1629 +		vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range );
 13.1630 +		vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range );
 13.1631 +		vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range );
 13.1632 +		vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range );
 13.1633 +
 13.1634 +		// calculate weight EAST-WEST
 13.1635 +		vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 );
 13.1636 +		vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 );
 13.1637 +		vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 );
 13.1638 +		vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 );
 13.1639 +		vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale );
 13.1640 +		vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale );
 13.1641 +		vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale );
 13.1642 +		vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale );
 13.1643 +		vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 );
 13.1644 +		vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 );
 13.1645 +		vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 );
 13.1646 +		vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 );
 13.1647 +		vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 );
 13.1648 +		vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 );
 13.1649 +		vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 );
 13.1650 +		vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 );
 13.1651 +		vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range );
 13.1652 +		vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range );
 13.1653 +		vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range );
 13.1654 +		vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range );
 13.1655 +
 13.1656 +		// calculate address offset
 13.1657 +		//
 13.1658 +		// pixel NORTH WEST
 13.1659 +		vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range;
 13.1660 +		vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range;
 13.1661 +		vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range;
 13.1662 +		vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range;
 13.1663 +
 13.1664 +		// pixel NORTH EAST-->(offpixelNW+1)
 13.1665 +		vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
 13.1666 +		vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 );
 13.1667 +		vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 );
 13.1668 +		vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 );
 13.1669 +		vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 );
 13.1670 +
 13.1671 +		// SOUTH-WEST-->(offpixelNW+src_linestride)
 13.1672 +		vector unsigned int vui_srclinestride = spu_splats( src_linestride );
 13.1673 +		vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range );
 13.1674 +		vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range );
 13.1675 +		vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range );
 13.1676 +		vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range );
 13.1677 +
 13.1678 +		// SOUTH-EAST-->(offpixelNW+src_linestride+1)
 13.1679 +		vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range );
 13.1680 +		vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range );
 13.1681 +		vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range );
 13.1682 +		vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range );
 13.1683 +
 13.1684 +		// calculate each address
 13.1685 +		vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
 13.1686 +		vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range );
 13.1687 +		vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range );
 13.1688 +		vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range );
 13.1689 +		vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range );
 13.1690 +
 13.1691 +		vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range );
 13.1692 +		vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range );
 13.1693 +		vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range );
 13.1694 +		vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range );
 13.1695 +
 13.1696 +		vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range );
 13.1697 +		vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range );
 13.1698 +		vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range );
 13.1699 +		vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range );
 13.1700 +
 13.1701 +		vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range );
 13.1702 +		vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range );
 13.1703 +		vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range );
 13.1704 +		vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range );
 13.1705 +
 13.1706 +
 13.1707 +		// get each pixel
 13.1708 +		//
 13.1709 +		// scalar load, afterwards insertion into the right position
 13.1710 +		// NORTH WEST
 13.1711 +		// first range
 13.1712 +		vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
 13.1713 +		vector unsigned char vuc_pixel_NW_first_range = spu_insert(
 13.1714 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 );
 13.1715 +		vuc_pixel_NW_first_range = spu_insert(
 13.1716 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )),
 13.1717 +				vuc_pixel_NW_first_range, 7 );
 13.1718 +		vuc_pixel_NW_first_range = spu_insert(
 13.1719 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )),
 13.1720 +				vuc_pixel_NW_first_range, 11 );
 13.1721 +		vuc_pixel_NW_first_range = spu_insert(
 13.1722 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )),
 13.1723 +				vuc_pixel_NW_first_range, 15 );
 13.1724 +		// second range
 13.1725 +		vector unsigned char vuc_pixel_NW_second_range = spu_insert(
 13.1726 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 );
 13.1727 +		vuc_pixel_NW_second_range = spu_insert(
 13.1728 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )),
 13.1729 +				vuc_pixel_NW_second_range, 7 );
 13.1730 +		vuc_pixel_NW_second_range = spu_insert(
 13.1731 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )),
 13.1732 +				vuc_pixel_NW_second_range, 11 );
 13.1733 +		vuc_pixel_NW_second_range = spu_insert(
 13.1734 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )),
 13.1735 +				vuc_pixel_NW_second_range, 15 );
 13.1736 +		// third range
 13.1737 +		vector unsigned char vuc_pixel_NW_third_range = spu_insert(
 13.1738 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 );
 13.1739 +		vuc_pixel_NW_third_range = spu_insert(
 13.1740 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )),
 13.1741 +				vuc_pixel_NW_third_range, 7 );
 13.1742 +		vuc_pixel_NW_third_range = spu_insert(
 13.1743 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )),
 13.1744 +				vuc_pixel_NW_third_range, 11 );
 13.1745 +		vuc_pixel_NW_third_range = spu_insert(
 13.1746 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )),
 13.1747 +				vuc_pixel_NW_third_range, 15 );
 13.1748 +		// fourth range
 13.1749 +		vector unsigned char vuc_pixel_NW_fourth_range = spu_insert(
 13.1750 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 );
 13.1751 +		vuc_pixel_NW_fourth_range = spu_insert(
 13.1752 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )),
 13.1753 +				vuc_pixel_NW_fourth_range, 7 );
 13.1754 +		vuc_pixel_NW_fourth_range = spu_insert(
 13.1755 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )),
 13.1756 +				vuc_pixel_NW_fourth_range, 11 );
 13.1757 +		vuc_pixel_NW_fourth_range = spu_insert(
 13.1758 +				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )),
 13.1759 +				vuc_pixel_NW_fourth_range, 15 );
 13.1760 +
 13.1761 +		// NORTH EAST
 13.1762 +		// first range
 13.1763 +		vector unsigned char vuc_pixel_NE_first_range = spu_insert(
 13.1764 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 );
 13.1765 +		vuc_pixel_NE_first_range = spu_insert(
 13.1766 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )),
 13.1767 +				vuc_pixel_NE_first_range, 7 );
 13.1768 +		vuc_pixel_NE_first_range = spu_insert(
 13.1769 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )),
 13.1770 +				vuc_pixel_NE_first_range, 11 );
 13.1771 +		vuc_pixel_NE_first_range = spu_insert(
 13.1772 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )),
 13.1773 +				vuc_pixel_NE_first_range, 15 );
 13.1774 +		// second range
 13.1775 +		vector unsigned char vuc_pixel_NE_second_range = spu_insert(
 13.1776 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 );
 13.1777 +		vuc_pixel_NE_second_range = spu_insert(
 13.1778 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )),
 13.1779 +				vuc_pixel_NE_second_range, 7 );
 13.1780 +		vuc_pixel_NE_second_range = spu_insert(
 13.1781 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )),
 13.1782 +				vuc_pixel_NE_second_range, 11 );
 13.1783 +		vuc_pixel_NE_second_range = spu_insert(
 13.1784 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )),
 13.1785 +				vuc_pixel_NE_second_range, 15 );
 13.1786 +		// third range
 13.1787 +		vector unsigned char vuc_pixel_NE_third_range = spu_insert(
 13.1788 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 );
 13.1789 +		vuc_pixel_NE_third_range = spu_insert(
 13.1790 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )),
 13.1791 +				vuc_pixel_NE_third_range, 7 );
 13.1792 +		vuc_pixel_NE_third_range = spu_insert(
 13.1793 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )),
 13.1794 +				vuc_pixel_NE_third_range, 11 );
 13.1795 +		vuc_pixel_NE_third_range = spu_insert(
 13.1796 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )),
 13.1797 +				vuc_pixel_NE_third_range, 15 );
 13.1798 +		// fourth range
 13.1799 +		vector unsigned char vuc_pixel_NE_fourth_range = spu_insert(
 13.1800 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 );
 13.1801 +		vuc_pixel_NE_fourth_range = spu_insert(
 13.1802 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )),
 13.1803 +				vuc_pixel_NE_fourth_range, 7 );
 13.1804 +		vuc_pixel_NE_fourth_range = spu_insert(
 13.1805 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )),
 13.1806 +				vuc_pixel_NE_fourth_range, 11 );
 13.1807 +		vuc_pixel_NE_fourth_range = spu_insert(
 13.1808 +				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )),
 13.1809 +				vuc_pixel_NE_fourth_range, 15 );
 13.1810 +
 13.1811 +		// SOUTH WEST
 13.1812 +		// first range
 13.1813 +		vector unsigned char vuc_pixel_SW_first_range = spu_insert(
 13.1814 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 );
 13.1815 +		vuc_pixel_SW_first_range = spu_insert(
 13.1816 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )),
 13.1817 +				vuc_pixel_SW_first_range, 7 );
 13.1818 +		vuc_pixel_SW_first_range = spu_insert(
 13.1819 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )),
 13.1820 +				vuc_pixel_SW_first_range, 11 );
 13.1821 +		vuc_pixel_SW_first_range = spu_insert(
 13.1822 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )),
 13.1823 +				vuc_pixel_SW_first_range, 15 );
 13.1824 +		// second range
 13.1825 +		vector unsigned char vuc_pixel_SW_second_range = spu_insert(
 13.1826 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 );
 13.1827 +		vuc_pixel_SW_second_range = spu_insert(
 13.1828 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )),
 13.1829 +				vuc_pixel_SW_second_range, 7 );
 13.1830 +		vuc_pixel_SW_second_range = spu_insert(
 13.1831 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )),
 13.1832 +				vuc_pixel_SW_second_range, 11 );
 13.1833 +		vuc_pixel_SW_second_range = spu_insert(
 13.1834 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )),
 13.1835 +				vuc_pixel_SW_second_range, 15 );
 13.1836 +		// third range
 13.1837 +		vector unsigned char vuc_pixel_SW_third_range = spu_insert(
 13.1838 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 );
 13.1839 +		vuc_pixel_SW_third_range = spu_insert(
 13.1840 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )),
 13.1841 +				vuc_pixel_SW_third_range, 7 );
 13.1842 +		vuc_pixel_SW_third_range = spu_insert(
 13.1843 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )),
 13.1844 +				vuc_pixel_SW_third_range, 11 );
 13.1845 +		vuc_pixel_SW_third_range = spu_insert(
 13.1846 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )),
 13.1847 +				vuc_pixel_SW_third_range, 15 );
 13.1848 +		// fourth range
 13.1849 +		vector unsigned char vuc_pixel_SW_fourth_range = spu_insert(
 13.1850 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 );
 13.1851 +		vuc_pixel_SW_fourth_range = spu_insert(
 13.1852 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )),
 13.1853 +				vuc_pixel_SW_fourth_range, 7 );
 13.1854 +		vuc_pixel_SW_fourth_range = spu_insert(
 13.1855 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )),
 13.1856 +				vuc_pixel_SW_fourth_range, 11 );
 13.1857 +		vuc_pixel_SW_fourth_range = spu_insert(
 13.1858 +				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )),
 13.1859 +				vuc_pixel_SW_fourth_range, 15 );
 13.1860 +
 13.1861 +		// SOUTH EAST
 13.1862 +		// first range
 13.1863 +		vector unsigned char vuc_pixel_SE_first_range = spu_insert(
 13.1864 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 );
 13.1865 +		vuc_pixel_SE_first_range = spu_insert(
 13.1866 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )),
 13.1867 +				vuc_pixel_SE_first_range, 7 );
 13.1868 +		vuc_pixel_SE_first_range = spu_insert(
 13.1869 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )),
 13.1870 +				vuc_pixel_SE_first_range, 11 );
 13.1871 +		vuc_pixel_SE_first_range = spu_insert(
 13.1872 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )),
 13.1873 +				vuc_pixel_SE_first_range, 15 );
 13.1874 +		// second range
 13.1875 +		vector unsigned char vuc_pixel_SE_second_range = spu_insert(
 13.1876 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 );
 13.1877 +		vuc_pixel_SE_second_range = spu_insert(
 13.1878 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )),
 13.1879 +				vuc_pixel_SE_second_range, 7 );
 13.1880 +		vuc_pixel_SE_second_range = spu_insert(
 13.1881 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )),
 13.1882 +				vuc_pixel_SE_second_range, 11 );
 13.1883 +		vuc_pixel_SE_second_range = spu_insert(
 13.1884 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )),
 13.1885 +				vuc_pixel_SE_second_range, 15 );
 13.1886 +		// third range
 13.1887 +		vector unsigned char vuc_pixel_SE_third_range = spu_insert(
 13.1888 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 );
 13.1889 +		vuc_pixel_SE_third_range = spu_insert(
 13.1890 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )),
 13.1891 +				vuc_pixel_SE_third_range, 7 );
 13.1892 +		vuc_pixel_SE_third_range = spu_insert(
 13.1893 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )),
 13.1894 +				vuc_pixel_SE_third_range, 11 );
 13.1895 +		vuc_pixel_SE_third_range = spu_insert(
 13.1896 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )),
 13.1897 +				vuc_pixel_SE_third_range, 15 );
 13.1898 +		// fourth range
 13.1899 +		vector unsigned char vuc_pixel_SE_fourth_range = spu_insert(
 13.1900 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 );
 13.1901 +		vuc_pixel_SE_fourth_range = spu_insert(
 13.1902 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )),
 13.1903 +				vuc_pixel_SE_fourth_range, 7 );
 13.1904 +		vuc_pixel_SE_fourth_range = spu_insert(
 13.1905 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )),
 13.1906 +				vuc_pixel_SE_fourth_range, 11 );
 13.1907 +		vuc_pixel_SE_fourth_range = spu_insert(
 13.1908 +				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )),
 13.1909 +				vuc_pixel_SE_fourth_range, 15 );
 13.1910 +
 13.1911 +
 13.1912 +
 13.1913 +		// convert to float
 13.1914 +		vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 );
 13.1915 +		vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 );
 13.1916 +		vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 );
 13.1917 +		vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 );
 13.1918 +
 13.1919 +		vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 );
 13.1920 +		vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 );
 13.1921 +		vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 );
 13.1922 +		vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 );
 13.1923 +
 13.1924 +		vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 );
 13.1925 +		vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 );
 13.1926 +		vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 );
 13.1927 +		vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 );
 13.1928 +
 13.1929 +		vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 );
 13.1930 +		vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 );
 13.1931 +		vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 );
 13.1932 +		vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 );
 13.1933 +
 13.1934 +		// first linear interpolation: EWtop
 13.1935 +		// EWtop = NW + EWweight*(NE-NW)
 13.1936 +		//
 13.1937 +		// first range
 13.1938 +		vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range );
 13.1939 +		vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range,
 13.1940 +								vf_EWtop_first_range_tmp,
 13.1941 +								vf_pixel_NW_first_range );
 13.1942 +
 13.1943 +		// second range
 13.1944 +		vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range );
 13.1945 +		vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range,
 13.1946 +								vf_EWtop_second_range_tmp,
 13.1947 +								vf_pixel_NW_second_range );
 13.1948 +
 13.1949 +		// third range
 13.1950 +		vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range );
 13.1951 +		vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range,
 13.1952 +								vf_EWtop_third_range_tmp,
 13.1953 +								vf_pixel_NW_third_range );
 13.1954 +
 13.1955 +		// fourth range
 13.1956 +		vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range );
 13.1957 +		vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range,
 13.1958 +								vf_EWtop_fourth_range_tmp,
 13.1959 +								vf_pixel_NW_fourth_range );
 13.1960 +
 13.1961 +
 13.1962 +
 13.1963 +		// second linear interpolation: EWbottom
 13.1964 +		// EWbottom = SW + EWweight*(SE-SW)
 13.1965 +		//
 13.1966 +		// first range
 13.1967 +		vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range );
 13.1968 +		vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range,
 13.1969 +								vf_EWbottom_first_range_tmp,
 13.1970 +								vf_pixel_SW_first_range );
 13.1971 +
 13.1972 +		// second range
 13.1973 +		vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range );
 13.1974 +		vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range,
 13.1975 +								vf_EWbottom_second_range_tmp,
 13.1976 +								vf_pixel_SW_second_range );
 13.1977 +		// third range
 13.1978 +		vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range );
 13.1979 +		vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range,
 13.1980 +								vf_EWbottom_third_range_tmp,
 13.1981 +								vf_pixel_SW_third_range );
 13.1982 +
 13.1983 +		// fourth range
 13.1984 +		vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range );
 13.1985 +		vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range,
 13.1986 +								vf_EWbottom_fourth_range_tmp,
 13.1987 +								vf_pixel_SW_fourth_range );
 13.1988 +
 13.1989 +
 13.1990 +
 13.1991 +		// third linear interpolation: the bilinear interpolated value
 13.1992 +		// result = EWtop + NSweight*(EWbottom-EWtop);
 13.1993 +		//
 13.1994 +		// first range
 13.1995 +		vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range );
 13.1996 +		vector float vf_result_first_range = spu_madd( vf_NSweight,
 13.1997 +								vf_result_first_range_tmp,
 13.1998 +								vf_EWtop_first_range );
 13.1999 +
 13.2000 +		// second range
 13.2001 +		vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range );
 13.2002 +		vector float vf_result_second_range = spu_madd( vf_NSweight,
 13.2003 +								vf_result_second_range_tmp,
 13.2004 +								vf_EWtop_second_range );
 13.2005 +
 13.2006 +		// third range
 13.2007 +		vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range );
 13.2008 +		vector float vf_result_third_range = spu_madd( vf_NSweight,
 13.2009 +								vf_result_third_range_tmp,
 13.2010 +								vf_EWtop_third_range );
 13.2011 +
 13.2012 +		// fourth range
 13.2013 +		vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range );
 13.2014 +		vector float vf_result_fourth_range = spu_madd( vf_NSweight,
 13.2015 +								vf_result_fourth_range_tmp,
 13.2016 +								vf_EWtop_fourth_range );
 13.2017 +
 13.2018 +
 13.2019 +
 13.2020 +		// convert back: using saturated arithmetic
 13.2021 +		vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range );
 13.2022 +		vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range );
 13.2023 +		vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range );
 13.2024 +		vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range );
 13.2025 +
 13.2026 +		// merge results->lower,upper
 13.2027 +		vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
 13.2028 +							       		    0x13, 0x17, 0x1B, 0x1F,
 13.2029 +							       		    0x00, 0x00, 0x00, 0x00,
 13.2030 +							       		    0x00, 0x00, 0x00, 0x00 };
 13.2031 +
 13.2032 +		vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00,
 13.2033 +							       		    0x00, 0x00, 0x00, 0x00,
 13.2034 +									    0x03, 0x07, 0x0B, 0x0F,
 13.2035 +							       		    0x13, 0x17, 0x1B, 0x1F };
 13.2036 +
 13.2037 +		vector unsigned char vuc_result_first_second =
 13.2038 +						spu_shuffle( (vector unsigned char) vui_result_first_range,
 13.2039 +								 (vector unsigned char) vui_result_second_range,
 13.2040 +								vuc_mask_merge_result_first_second );
 13.2041 +
 13.2042 +		vector unsigned char vuc_result_third_fourth =
 13.2043 +						spu_shuffle( (vector unsigned char) vui_result_third_range,
 13.2044 +								 (vector unsigned char) vui_result_fourth_range,
 13.2045 +								vuc_mask_merge_result_third_fourth );
 13.2046 +
 13.2047 +		// store result
 13.2048 +		*((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
 13.2049 +							vuc_result_third_fourth );
 13.2050 +		dst += 16;
 13.2051 +	}
 13.2052 +}
 13.2053 +
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/src/video/ps3/spulibs/fb_writer.c	Thu Apr 02 04:06:55 2009 +0000
    14.3 @@ -0,0 +1,193 @@
    14.4 +/*
    14.5 + * SDL - Simple DirectMedia Layer
    14.6 + * CELL BE Support for PS3 Framebuffer
    14.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
    14.8 + *
    14.9 + * This library is free software; you can redistribute it and/or modify it
   14.10 + * under the terms of the GNU Lesser General Public License as published
   14.11 + * by the Free Software Foundation; either version 2.1 of the License, or
   14.12 + * (at your option) any later version.
   14.13 + *
   14.14 + * This library is distributed in the hope that it will be useful, but
   14.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
   14.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   14.17 + * Lesser General Public License for more details.
   14.18 + *
   14.19 + * You should have received a copy of the GNU Lesser General Public
   14.20 + * License along with this library; if not, write to the Free Software
   14.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
   14.22 + * USA
   14.23 + *
   14.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
   14.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
   14.26 + *  SPE code based on research by:
   14.27 + *  Rene Becker
   14.28 + *  Thimo Emmerich
   14.29 + */
   14.30 +
   14.31 +#include "spu_common.h"
   14.32 +
   14.33 +#include <spu_intrinsics.h>
   14.34 +#include <spu_mfcio.h>
   14.35 +#include <stdio.h>
   14.36 +#include <string.h>
   14.37 +
   14.38 +// Debugging
   14.39 +//#define DEBUG
   14.40 +
   14.41 +#ifdef DEBUG
   14.42 +#define deprintf(fmt, args... ) \
   14.43 +	fprintf( stdout, fmt, ##args ); \
   14.44 +	fflush( stdout );
   14.45 +#else
   14.46 +#define deprintf( fmt, args... )
   14.47 +#endif
   14.48 +
   14.49 +void cpy_to_fb(unsigned int);	/* defined after main() */
   14.50 +
   14.51 +/* fb_writer_spu parms: filled by DMA in main(), read by cpy_to_fb() */
   14.52 +static volatile struct fb_writer_parms_t parms __attribute__ ((aligned(128)));
   14.53 +
   14.54 +/* SPU entry point: serve one framebuffer copy per SPU_START message until SPU_EXIT arrives */
   14.55 +int main(unsigned long long spe_id __attribute__ ((unused)), unsigned long long argp __attribute__ ((unused)))
   14.56 +{
   14.57 +	uint32_t parms_ea, msg;
   14.58 +
   14.59 +	deprintf("[SPU] fb_writer_spu is up... (on SPE #%llu)\n", spe_id);
   14.60 +	// announce readiness on the outbound mailbox
   14.61 +	spu_write_out_mbox(SPU_READY);
   14.62 +
   14.63 +	for (;;) {
   14.64 +		/* Fetch the next command word */
   14.65 +		msg = spu_read_in_mbox();
   14.66 +		deprintf("[SPU] Message is %u\n", msg);
   14.67 +		if (msg == SPU_EXIT) {
   14.68 +			deprintf("[SPU] fb_writer goes down...\n");
   14.69 +			return 0;
   14.70 +		}
   14.71 +		if (msg != SPU_START) {
   14.72 +			/* neither exit nor start: skip it and read the next word */
   14.73 +			deprintf("[SPU] Cannot handle message\n");
   14.74 +			continue;
   14.75 +		}
   14.76 +
   14.77 +		/* Tag Manager setup: tag_base serves the parms transfer and is handed to cpy_to_fb() */
   14.78 +		unsigned int tag_base = mfc_multi_tag_reserve(5);
   14.79 +		if (tag_base == MFC_TAG_INVALID) {
   14.80 +			deprintf("[SPU] Failed to reserve mfc tags on fb_writer\n");
   14.81 +			return 0;
   14.82 +		}
   14.83 +
   14.84 +		/* Framebuffer parms: the next mailbox word is the address of the */
   14.85 +		/* parameter block, which is transferred into the local parms copy */
   14.86 +		parms_ea = spu_read_in_mbox();
   14.87 +		deprintf("[SPU] Message on fb_writer is %u\n", parms_ea);
   14.88 +		spu_mfcdma32(&parms, (unsigned int)parms_ea,
   14.89 +				sizeof(struct fb_writer_parms_t), tag_base,
   14.90 +				MFC_GET_CMD);
   14.91 +		deprintf("[SPU] argp = %u\n", (unsigned int)argp);
   14.92 +		DMA_WAIT_TAG(tag_base);
   14.93 +
   14.94 +		/* Copy the lines at parms.data to the framebuffer */
   14.95 +		deprintf("[SPU] Copying to framebuffer started\n");
   14.96 +		cpy_to_fb(tag_base);
   14.97 +		deprintf("[SPU] Copying to framebuffer done!\n");
   14.98 +
   14.99 +		mfc_multi_tag_release(tag_base, 5);
  14.100 +		deprintf("[SPU] fb_writer_spu... done!\n");
  14.101 +		/* Report completion of this request */
  14.102 +		spu_write_out_mbox(SPU_FIN);
  14.103 +	}
  14.104 +
  14.105 +	return 0;
  14.106 +}
  14.107 +
  14.108 +/*
  14.109 + * Copy the picture at parms.data into the framebuffer, one line per DMA.
  14.110 + *
  14.111 + * Every line is fetched into a local-store buffer and then written to the
  14.112 + * destination derived from parms.center (rounded down to 16 bytes). Four
  14.113 + * lines are handled per loop pass, each with its own buffer and tag, for
  14.114 + * bounded_input_height >> 2 passes.
  14.115 + * NOTE(review): trailing lines of a height that is not a multiple of 4 are
  14.116 + * not copied - confirm that callers never pass such a height.
  14.117 + *
  14.118 + * @param tag_id_base base of the reserved MFC tags; tags +1 .. +4 are used
  14.119 + */
  14.120 +void cpy_to_fb(unsigned int tag_id_base)
  14.121 +{
  14.122 +	unsigned int i;
  14.123 +	uint8_t *in = parms.data;
  14.124 +
  14.125 +	/* Align fb pointer which was centered before */
  14.126 +	uint8_t *fb = (unsigned char *)((unsigned int)parms.center & 0xFFFFFFF0);
  14.127 +
  14.128 +	uint32_t bounded_input_height = parms.bounded_input_height;
  14.129 +	uint32_t bounded_input_width = parms.bounded_input_width;
  14.130 +	uint32_t fb_pixel_size = parms.fb_pixel_size;
  14.131 +
  14.132 +	uint32_t out_line_stride = parms.out_line_stride;
  14.133 +	uint32_t in_line_stride = parms.in_line_stride;
  14.134 +	uint32_t in_line_size = bounded_input_width * fb_pixel_size;
  14.135 +
  14.136 +	/* Local store buffer */
  14.137 +	static volatile uint8_t buf[4][BUFFER_SIZE] __attribute__ ((aligned(128)));
  14.138 +
  14.139 +	/* Four lines per pass. A buffer's put is only waited for when that
  14.140 +	 * buffer is reused in the next pass (or after the loop). */
  14.141 +	for (i = 0; i < bounded_input_height >> 2; i++) {
  14.142 +		/* first buffer */
  14.143 +		DMA_WAIT_TAG(tag_id_base + 1);
  14.144 +		// retrieve buffer
  14.145 +		spu_mfcdma32(buf[0], (unsigned int)in, in_line_size, tag_id_base + 1, MFC_GETB_CMD);
  14.146 +		DMA_WAIT_TAG(tag_id_base + 1);
  14.147 +		// store buffer
  14.148 +		spu_mfcdma32(buf[0], (unsigned int)fb, in_line_size, tag_id_base + 1, MFC_PUTB_CMD);
  14.149 +		in += in_line_stride;
  14.150 +		fb += out_line_stride;
  14.151 +		deprintf("[SPU] 1st buffer copied in=0x%x, fb=0x%x\n", in, fb);
  14.152 +
  14.153 +		/* second buffer */
  14.154 +		DMA_WAIT_TAG(tag_id_base + 2);
  14.155 +		// retrieve buffer
  14.156 +		spu_mfcdma32(buf[1], (unsigned int)in, in_line_size, tag_id_base + 2, MFC_GETB_CMD);
  14.157 +		DMA_WAIT_TAG(tag_id_base + 2);
  14.158 +		// store buffer
  14.159 +		spu_mfcdma32(buf[1], (unsigned int)fb, in_line_size, tag_id_base + 2, MFC_PUTB_CMD);
  14.160 +		in += in_line_stride;
  14.161 +		fb += out_line_stride;
  14.162 +		deprintf("[SPU] 2nd buffer copied in=0x%x, fb=0x%x\n", in, fb);
  14.163 +
  14.164 +		/* third buffer */
  14.165 +		DMA_WAIT_TAG(tag_id_base + 3);
  14.166 +		// retrieve buffer
  14.167 +		spu_mfcdma32(buf[2], (unsigned int)in, in_line_size, tag_id_base + 3, MFC_GETB_CMD);
  14.168 +		DMA_WAIT_TAG(tag_id_base + 3);
  14.169 +		// store buffer
  14.170 +		spu_mfcdma32(buf[2], (unsigned int)fb, in_line_size, tag_id_base + 3, MFC_PUTB_CMD);
  14.171 +		in += in_line_stride;
  14.172 +		fb += out_line_stride;
  14.173 +		deprintf("[SPU] 3rd buffer copied in=0x%x, fb=0x%x\n", in, fb);
  14.174 +
  14.175 +		/* fourth buffer */
  14.176 +		DMA_WAIT_TAG(tag_id_base + 4);
  14.177 +		// retrieve buffer
  14.178 +		spu_mfcdma32(buf[3], (unsigned int)in, in_line_size, tag_id_base + 4, MFC_GETB_CMD);
  14.179 +		DMA_WAIT_TAG(tag_id_base + 4);
  14.180 +		// store buffer
  14.181 +		spu_mfcdma32(buf[3], (unsigned int)fb, in_line_size, tag_id_base + 4, MFC_PUTB_CMD);
  14.182 +		in += in_line_stride;
  14.183 +		fb += out_line_stride;
  14.184 +		deprintf("[SPU] 4th buffer copied in=0x%x, fb=0x%x\n", in, fb);
  14.185 +		deprintf("[SPU] Loop #%i, bounded_input_height=%i\n", i, bounded_input_height >> 2);
  14.186 +	}
  14.187 +
  14.188 +	/* Wait for the last put of every buffer - including the first one -
  14.189 +	 * so that no transfer is still in flight when the caller reports FIN. */
  14.190 +	DMA_WAIT_TAG(tag_id_base + 1);
  14.191 +	DMA_WAIT_TAG(tag_id_base + 2);
  14.192 +	DMA_WAIT_TAG(tag_id_base + 3);
  14.193 +	DMA_WAIT_TAG(tag_id_base + 4);
  14.194 +}
  14.195 +
  14.196 +
    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/src/video/ps3/spulibs/spu_common.h	Thu Apr 02 04:06:55 2009 +0000
    15.3 @@ -0,0 +1,108 @@
    15.4 +/*
    15.5 + * SDL - Simple DirectMedia Layer
    15.6 + * CELL BE Support for PS3 Framebuffer
    15.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
    15.8 + *
    15.9 + * This library is free software; you can redistribute it and/or modify it
   15.10 + * under the terms of the GNU Lesser General Public License as published
   15.11 + * by the Free Software Foundation; either version 2.1 of the License, or
   15.12 + * (at your option) any later version.
   15.13 + *
   15.14 + * This library is distributed in the hope that it will be useful, but
   15.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
   15.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   15.17 + * Lesser General Public License for more details.
   15.18 + *
   15.19 + * You should have received a copy of the GNU Lesser General Public
   15.20 + * License along with this library; if not, write to the Free Software
   15.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
   15.22 + * USA
   15.23 + *
   15.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
   15.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
   15.26 + *  SPE code based on research by:
   15.27 + *  Rene Becker
   15.28 + *  Thimo Emmerich
   15.29 + */
   15.30 +
   15.31 +/* Common definitions/macros for SPUs */
   15.32 +
   15.33 +#ifndef _SPU_COMMON_H
   15.34 +#define _SPU_COMMON_H
   15.35 +
   15.36 +#include <stdio.h>
   15.37 +#include <stdint.h>
   15.38 +#include <string.h>
   15.39 +
   15.40 +/* Tag management */
   15.41 +#define DMA_WAIT_TAG(_tag)     \
   15.42 +    mfc_write_tag_mask(1<<(_tag)); \
   15.43 +    mfc_read_tag_status_all();
   15.44 +
   15.45 +/* SPU mailbox messages: READY and FIN are written by the SPU programs, START and EXIT are read by them */
   15.46 +#define SPU_READY	0
   15.47 +#define SPU_START	1
   15.48 +#define SPU_FIN		2
   15.49 +#define SPU_EXIT	3
   15.50 +
   15.51 +/* Tags. NOTE(review): RETR_BUF + 1 has the same value as STR_BUF - confirm this overlap is intended */
   15.52 +#define RETR_BUF	0
   15.53 +#define STR_BUF		1
   15.54 +#define TAG_INIT	2
   15.55 +
   15.56 +/* Buffersizes */
   15.57 +#define MAX_HDTV_WIDTH 1920
   15.58 +#define MAX_HDTV_HEIGHT 1080
   15.59 +/* One stride of HDTV: 1920 pixels * 4 bytes per pixel */
   15.60 +#define BUFFER_SIZE 7680
   15.61 +
   15.62 +/* fb_writer ppu/spu exchange parms; fetched by fb_writer's main() and consumed by cpy_to_fb() */
   15.63 +struct fb_writer_parms_t {
   15.64 +	uint8_t *data;	/* source picture; lines are read from here */
   15.65 +	uint8_t *center;	/* framebuffer destination; cpy_to_fb() rounds it down to 16 bytes */
   15.66 +	uint32_t out_line_stride;	/* added to the destination pointer after each line */
   15.67 +	uint32_t in_line_stride;	/* added to the source pointer after each line */
   15.68 +	uint32_t bounded_input_height;	/* number of lines to copy */
   15.69 +	uint32_t bounded_input_width;	/* multiplied by fb_pixel_size to get the bytes per line */
   15.70 +	uint32_t fb_pixel_size;	/* presumably bytes per framebuffer pixel - see bounded_input_width */
   15.71 +
   15.72 +	/* This padding is to fulfill the need for 16 byte alignment. On parm change, update! */
   15.73 +	char padding[4];
   15.74 +} __attribute__((aligned(128)));
   15.75 +
   15.76 +/* yuv2rgb ppu/spu exchange parms; fetched by yuv2rgb_converter's main() */
   15.77 +struct yuv2rgb_parms_t {
   15.78 +	uint8_t* y_plane;	/* source of the Y lines (width bytes per line) */
   15.79 +	uint8_t* v_plane;	/* source of the V lines (width/2 bytes per line) */
   15.80 +	uint8_t* u_plane;	/* source of the U lines (width/2 bytes per line) */
   15.81 +
   15.82 +	uint8_t* dstBuffer;	/* destination of the converted BGRA lines (4 bytes per pixel) */
   15.83 +
   15.84 +	unsigned int src_pixel_width;	/* picture width; also selects the w16 or w32 conversion path */
   15.85 +	unsigned int src_pixel_height;	/* picture height; processed in groups of 4 lines */
   15.86 +
   15.87 +	/* This padding fills the struct up to a multiple of 128 bytes. On parm change, update! */
   15.88 +	char padding[128 - ((4 * sizeof(uint8_t *) + 2 * sizeof(unsigned int)) & 0x7F)];
   15.89 +} __attribute__((aligned(128)));
   15.90 +
   15.91 +/* bilin_scaler ppu/spu exchange parms: the yuv2rgb_parms_t fields plus the destination dimensions */
   15.92 +struct scale_parms_t {
   15.93 +	uint8_t* y_plane;
   15.94 +	uint8_t* v_plane;
   15.95 +	uint8_t* u_plane;
   15.96 +
   15.97 +	uint8_t* dstBuffer;
   15.98 +
   15.99 +	unsigned int src_pixel_width;
  15.100 +	unsigned int src_pixel_height;
  15.101 +
  15.102 +	unsigned int dst_pixel_width;
  15.103 +	unsigned int dst_pixel_height;
  15.104 +
  15.105 +	/* This padding fills the struct up to a multiple of 128 bytes. On parm change, update! */
  15.106 +	char padding[128 - ((4 * sizeof(uint8_t *) + 4 * sizeof(unsigned int)) & 0x7F)];
  15.107 +} __attribute__((aligned(128)));
  15.108 +
  15.109 +#endif /* _SPU_COMMON_H */
  15.110 +
  15.111 +
    16.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.2 +++ b/src/video/ps3/spulibs/yuv2rgb_converter.c	Thu Apr 02 04:06:55 2009 +0000
    16.3 @@ -0,0 +1,629 @@
    16.4 +/*
    16.5 + * SDL - Simple DirectMedia Layer
    16.6 + * CELL BE Support for PS3 Framebuffer
    16.7 + * Copyright (C) 2008, 2009 International Business Machines Corporation
    16.8 + *
    16.9 + * This library is free software; you can redistribute it and/or modify it
   16.10 + * under the terms of the GNU Lesser General Public License as published
   16.11 + * by the Free Software Foundation; either version 2.1 of the License, or
   16.12 + * (at your option) any later version.
   16.13 + *
   16.14 + * This library is distributed in the hope that it will be useful, but
   16.15 + * WITHOUT ANY WARRANTY; without even the implied warranty of
   16.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   16.17 + * Lesser General Public License for more details.
   16.18 + *
   16.19 + * You should have received a copy of the GNU Lesser General Public
   16.20 + * License along with this library; if not, write to the Free Software
   16.21 + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
   16.22 + * USA
   16.23 + *
   16.24 + *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
   16.25 + *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
   16.26 + *  SPE code based on research by:
   16.27 + *  Rene Becker
   16.28 + *  Thimo Emmerich
   16.29 + */
   16.30 +
   16.31 +#include "spu_common.h"
   16.32 +
   16.33 +#include <spu_intrinsics.h>
   16.34 +#include <spu_mfcio.h>
   16.35 +
   16.36 +// Debugging
   16.37 +//#define DEBUG
   16.38 +
   16.39 +#ifdef DEBUG
   16.40 +#define deprintf(fmt, args... ) \
   16.41 +	fprintf( stdout, fmt, ##args ); \
   16.42 +	fflush( stdout );
   16.43 +#else
   16.44 +#define deprintf( fmt, args... )
   16.45 +#endif
   16.46 +
   16.47 +struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128)));	/* local copy of the request, filled by DMA in main() */
   16.48 +
   16.49 +/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored (split over two buffers).
   16.50 + * There might be the need to retrieve misaligned data, adjust
   16.51 + * incoming v and u plane to be able to handle this (add 128)
   16.52 + */
   16.53 +unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128)));
   16.54 +unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
   16.55 +unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
   16.56 +
   16.57 +/* A maximum of 4 lines BGRA are stored, 4 byte per pixel */
   16.58 +unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128)));
   16.59 +
   16.60 +/* Clamp bounds used by float_to_char() and vfloat_to_vuint(): upper bound 255, lower bound 0.1 */
   16.61 +static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
   16.62 +static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
   16.63 +
   16.64 +void yuv_to_rgb_w16();
   16.65 +void yuv_to_rgb_w32();
   16.66 +
   16.67 +void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr, unsigned int width);
   16.68 +void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);
   16.69 +
   16.70 +/* SPU entry point: convert one picture per SPU_START message until SPU_EXIT arrives */
   16.71 +int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused)))
   16.72 +{
   16.73 +	deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id);
   16.74 +	uint32_t ea_mfc, mbox;
   16.75 +	// send ready message
   16.76 +	spu_write_out_mbox(SPU_READY);
   16.77 +
   16.78 +	while (1) {
   16.79 +		/* Check mailbox for the next command word */
   16.80 +		mbox = spu_read_in_mbox();
   16.81 +		deprintf("[SPU] Message is %u\n", mbox);
   16.82 +		switch (mbox) {
   16.83 +			case SPU_EXIT:
   16.84 +				deprintf("[SPU] yuv2rgb_converter goes down...\n");
   16.85 +				return 0;
   16.86 +			case SPU_START:
   16.87 +				break;
   16.88 +			default:
   16.89 +				deprintf("[SPU] Cannot handle message\n");
   16.90 +				continue;
   16.91 +		}
   16.92 +
   16.93 +		/* Tag Manager setup: this tag is used for the parameter transfer below */
   16.94 +		unsigned int tag_id;
   16.95 +		tag_id = mfc_multi_tag_reserve(1);
   16.96 +		if (tag_id == MFC_TAG_INVALID) {
   16.97 +			deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n");
   16.98 +			return 0;
   16.99 +		}
  16.100 +
  16.101 +		/* DMA transfer for the input parameters; their address is the next mailbox word */
  16.102 +		ea_mfc = spu_read_in_mbox();
  16.103 +		deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc);
  16.104 +		spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD);
  16.105 +		DMA_WAIT_TAG(tag_id);
  16.106 +
  16.107 +		/* There are alignment issues that involve handling of special cases
  16.108 +		 * a width of 32 results in a width of 16 in the chrominance
  16.109 +		 * --> widths that are a multiple of 32 take the w32 path, all others the w16 path
  16.110 +		 */
  16.111 +		deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height);
  16.112 +		if (parms_converter.src_pixel_width & 0x1f) {
  16.113 +			deprintf("[SPU] Using yuv_to_rgb_w16\n");
  16.114 +			yuv_to_rgb_w16();
  16.115 +		} else {
  16.116 +			deprintf("[SPU] Using yuv_to_rgb_w32\n");
  16.117 +			yuv_to_rgb_w32();
  16.118 +		}
  16.119 +
  16.120 +		mfc_multi_tag_release(tag_id, 1);
  16.121 +		deprintf("[SPU] yuv2rgb_spu... done!\n");
  16.122 +		/* Send FIN message */
  16.123 +		spu_write_out_mbox(SPU_FIN);
  16.124 +	}
  16.125 +
  16.126 +	return 0;
  16.127 +}
  16.128 +
  16.129 +
  16.130 +/*
  16.131 + * float_to_char()
  16.132 + *
  16.133 + * Clamps a float to the range [0.1, 255] and converts it to
  16.134 + * an unsigned char (saturated conversion).
  16.135 + *
  16.136 + * @param s float for conversion
  16.137 + * @returns converted character
  16.138 + */
  16.139 +inline static unsigned char float_to_char(float s) {
  16.140 +	vector float value = spu_splats(s);
  16.141 +
  16.142 +	// lower bound: anything below 0.1 is replaced by 0.1
  16.143 +	value = spu_sel(value, vec_0_1, spu_cmpgt(vec_0_1, value));
  16.144 +	// upper bound: anything above 255 is replaced by 255
  16.145 +	value = spu_sel(value, vec_255, spu_cmpgt(value, vec_255));
  16.146 +	return (unsigned char) spu_extract(value,0);
  16.147 +}
  16.148 +
  16.149 +
  16.150 +/*
  16.151 + * vfloat_to_vuint()
  16.152 + *
  16.153 + * Clamps each element of a float vector to [0.1, 255] and converts the
  16.154 + * result to an unsigned int vector (saturated conversion).
  16.155 + *
  16.156 + * @param vec_s float vector for conversion
  16.157 + * @returns converted unsigned int vector
  16.158 + */
  16.159 +inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
  16.160 +	// lower bound: elements below 0.1 are replaced by 0.1
  16.161 +	vector float clamped = spu_sel(vec_s, vec_0_1, spu_cmpgt(vec_0_1, vec_s));
  16.162 +
  16.163 +	// upper bound: elements above 255 are replaced by 255
  16.164 +	clamped = spu_sel(clamped, vec_255, spu_cmpgt(clamped, vec_255));
  16.165 +	return spu_convtu(clamped,0);
  16.166 +}
  16.167 +
  16.168 +
  16.169 +/*
  16.170 + * Convert the picture described by parms_converter to BGRA, four lines at
  16.171 + * a time, using yuv_to_rgb_w16_line(). main() selects this variant when
  16.172 + * the width is not a multiple of 32.
  16.173 + *
  16.174 + * Source lines are fetched alternately into the two halves of
  16.175 + * y_plane/v_plane/u_plane; converted pixels are staged in bgra and then
  16.176 + * written to parms_converter.dstBuffer.
  16.177 + */
  16.178 +void yuv_to_rgb_w16() {
  16.179 +	// Pixel dimensions of the picture
  16.180 +	uint32_t width = parms_converter.src_pixel_width;
  16.181 +	uint32_t height = parms_converter.src_pixel_height;
  16.182 +
  16.183 +	// Number of complete 4-line groups. Without at least one group there is
  16.184 +	// nothing to do, and (groups - 1) below would wrap around.
  16.185 +	unsigned int groups = height>>2;
  16.186 +	if (groups == 0) {
  16.187 +		return;
  16.188 +	}
  16.189 +
  16.190 +	// Plane data management: Y, V, U source and BGRA destination
  16.191 +	unsigned char* ram_addr_y = parms_converter.y_plane;
  16.192 +	unsigned char* ram_addr_v = parms_converter.v_plane;
  16.193 +	unsigned char* ram_addr_u = parms_converter.u_plane;
  16.194 +	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
  16.195 +
  16.196 +	// Strides
  16.197 +	unsigned int stride_y = width;
  16.198 +	unsigned int stride_vu = width>>1;
  16.199 +
  16.200 +	// Buffer management
  16.201 +	unsigned int buf_idx = 0;
  16.202 +	unsigned int size_4lines_y = stride_y<<2;
  16.203 +	unsigned int size_2lines_y = stride_y<<1;
  16.204 +	unsigned int size_2lines_vu = stride_vu<<1;
  16.205 +
  16.206 +	// 2*width*4byte_per_pixel
  16.207 +	unsigned int size_2lines_bgra = width<<3;
  16.208 +
  16.209 +	// start double-buffered processing: 4 lines y, 2 lines v, 2 lines u
  16.210 +	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
  16.211 +	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
  16.212 +	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
  16.213 +
  16.214 +	// Wait for these transfers to be completed
  16.215 +	DMA_WAIT_TAG((RETR_BUF + buf_idx));
  16.216 +
  16.217 +	unsigned int i;
  16.218 +	for(i=0; i<groups-1; i++) {
  16.219 +		buf_idx^=1;
  16.220 +
  16.221 +		// Fetch the next group (4 lines y, 2 lines v, 2 lines u) into the other buffer
  16.222 +		spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
  16.223 +		spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
  16.224 +		spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
  16.225 +
  16.226 +		DMA_WAIT_TAG((RETR_BUF + buf_idx));
  16.227 +
  16.228 +		buf_idx^=1;
  16.229 +
  16.230 +		// The stores issued for the previous group read from bgra: wait
  16.231 +		// for them before the conversion below overwrites that buffer
  16.232 +		DMA_WAIT_TAG(STR_BUF);
  16.233 +
  16.234 +		// Convert YUV to BGRA, store it back (first two lines)
  16.235 +		yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
  16.236 +
  16.237 +		// Next two lines
  16.238 +		yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
  16.239 +				v_plane[buf_idx] + stride_vu,
  16.240 +				u_plane[buf_idx] + stride_vu,
  16.241 +				bgra + size_2lines_bgra,
  16.242 +				width);
  16.243 +
  16.244 +		// Store converted lines in two steps->max transfer size 16384
  16.245 +		spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  16.246 +		ram_addr_bgra += size_2lines_bgra;
  16.247 +		spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  16.248 +		ram_addr_bgra += size_2lines_bgra;
  16.249 +
  16.250 +		// Move 4 lines
  16.251 +		ram_addr_y += size_4lines_y;
  16.252 +		ram_addr_v += size_2lines_vu;
  16.253 +		ram_addr_u += size_2lines_vu;
  16.254 +
  16.255 +		buf_idx^=1;
  16.256 +	}
  16.257 +
  16.258 +	// Last group: already fetched, only convert and store. As above, the
  16.259 +	// previous stores must have left bgra before it is overwritten.
  16.260 +	DMA_WAIT_TAG(STR_BUF);
  16.261 +
  16.262 +	// Convert YUV to BGRA, store it back (first two lines)
  16.263 +	yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
  16.264 +
  16.265 +	// Next two lines
  16.266 +	yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
  16.267 +			v_plane[buf_idx] + stride_vu,
  16.268 +			u_plane[buf_idx] + stride_vu,
  16.269 +			bgra + size_2lines_bgra,
  16.270 +			width);
  16.271 +
  16.272 +	// Store converted lines in two steps
  16.273 +	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  16.274 +	ram_addr_bgra += size_2lines_bgra;
  16.275 +	spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  16.276 +
  16.277 +	// wait for the final storing transfers to be completed
  16.278 +	DMA_WAIT_TAG(STR_BUF);
  16.279 +
  16.280 +}
  16.281 +
  16.282 +
  16.283 +void yuv_to_rgb_w32() {
  16.284 +	// Converts the planar YUV picture described by parms_converter to BGRA,
  16.285 +	// four picture lines per step: the planes are fetched by DMA, converted
  16.286 +	// by yuv_to_rgb_w32_line() and stored to parms_converter.dstBuffer.
  16.287 +	// Pixel dimensions of the picture
  16.288 +	const uint32_t width = parms_converter.src_pixel_width;
  16.289 +	const uint32_t height = parms_converter.src_pixel_height;
  16.290 +
  16.291 +	// Current positions in the source planes (Y, V, U)
  16.292 +	unsigned char* ram_addr_y = parms_converter.y_plane;
  16.293 +	unsigned char* ram_addr_v = parms_converter.v_plane;
  16.294 +	unsigned char* ram_addr_u = parms_converter.u_plane;
  16.295 +
  16.296 +	// Current position in the BGRA destination
  16.297 +	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
  16.298 +
  16.299 +	// Strides in bytes, the V and U planes are half as wide as Y
  16.300 +	unsigned int stride_y = width;
  16.301 +	unsigned int stride_vu = width>>1;
  16.302 +
  16.303 +	// Buffer management: one step covers 4 lines of Y and 2 lines of V and U
  16.304 +	unsigned int buf_idx = 0;
  16.305 +	unsigned int size_4lines_y = stride_y<<2;
  16.306 +	unsigned int size_2lines_y = stride_y<<1;
  16.307 +	unsigned int size_2lines_vu = stride_vu<<1;
  16.308 +
  16.309 +	// 2*width*4byte_per_pixel
  16.310 +	unsigned int size_2lines_bgra = width<<3;
  16.311 +
  16.312 +	// Number of 4-line groups. The first group is fetched below, the loop
  16.313 +	// fetches the remaining ones; counting from 1 (instead of up to
  16.314 +	// (height>>2)-1) cannot wrap around when height < 4.
  16.315 +	const unsigned int groups = height>>2;
  16.316 +
  16.317 +	// Start double-buffered processing: fetch the first group and wait for it
  16.318 +	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
  16.319 +	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
  16.320 +	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
  16.321 +	DMA_WAIT_TAG((RETR_BUF + buf_idx));
  16.322 +
  16.323 +	unsigned int i;
  16.324 +	for(i = 1; i < groups; i++) {
  16.325 +		// Fetch the next group (4 lines y, 2 lines v, 2 lines u) into the other buffer
  16.326 +		buf_idx^=1;
  16.327 +		spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
  16.328 +		deprintf("4lines = %d\n", size_4lines_y);
  16.329 +		spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
  16.330 +		deprintf("2lines = %d\n", size_2lines_vu);
  16.331 +		spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
  16.332 +		deprintf("2lines = %d\n", size_2lines_vu);
  16.333 +
  16.334 +		DMA_WAIT_TAG((RETR_BUF + buf_idx));
  16.335 +
  16.336 +		// Switch back to the group that was fetched before
  16.337 +		buf_idx^=1;
  16.338 +
  16.339 +		// The same bgra area is reused for every group (it is not indexed by
  16.340 +		// buf_idx), so the previous stores must be done before it is overwritten
  16.341 +		DMA_WAIT_TAG(STR_BUF);
  16.342 +
  16.343 +		// Convert YUV to BGRA (first two lines)
  16.344 +		yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
  16.345 +
  16.346 +		// Next two lines
  16.347 +		yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
  16.348 +				v_plane[buf_idx] + stride_vu,
  16.349 +				u_plane[buf_idx] + stride_vu,
  16.350 +				bgra + size_2lines_bgra,
  16.351 +				width);
  16.352 +
  16.353 +		// Store converted lines in two steps->max transfer size 16384
  16.354 +		spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  16.355 +		ram_addr_bgra += size_2lines_bgra;
  16.356 +		spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  16.357 +		ram_addr_bgra += size_2lines_bgra;
  16.358 +
  16.359 +		// Move 4 lines
  16.360 +		ram_addr_y += size_4lines_y;
  16.361 +		ram_addr_v += size_2lines_vu;
  16.362 +		ram_addr_u += size_2lines_vu;
  16.363 +
  16.364 +		// Continue with the group fetched in this iteration
  16.365 +		buf_idx^=1;
  16.366 +	}
  16.367 +
  16.368 +	// Last group: wait until the previous stores no longer read from bgra
  16.369 +	DMA_WAIT_TAG(STR_BUF);
  16.370 +
  16.371 +	// Convert YUV to BGRA (first two lines)
  16.372 +	yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
  16.373 +
  16.374 +	// Next two lines
  16.375 +	yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
  16.376 +			v_plane[buf_idx] + stride_vu,
  16.377 +			u_plane[buf_idx] + stride_vu,
  16.378 +			bgra + size_2lines_bgra,
  16.379 +			width);
  16.380 +
  16.381 +	// Store the last four converted lines, again in two steps
  16.382 +	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  16.383 +	ram_addr_bgra += size_2lines_bgra;
  16.384 +	spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
  16.385 +
  16.386 +	// Do not return before the final stores have completed
  16.387 +	DMA_WAIT_TAG(STR_BUF);
  16.388 +}
  16.389 +
  16.390 +
  16.391 +/* Constant vectors for the yuv 2 rgb conversion; as shuffle patterns, vec_char2int_* widen bytes 0-3, 4-7, 8-11 and 12-15 of the second operand to 32 bit words (the 0x00 entries select byte 0 of the first operand) */
  16.392 +const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f };
  16.393 +const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
  16.394 +const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 };
  16.395 +const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 };
  16.396 +const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B };
  16.397 +const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F };
  16.398 +/* Coefficients of the chroma terms: R = Y + 1.403*V, G = Y - 0.344*U - 0.714*V, B = Y + 1.773*U (U and V after subtracting 128) */
  16.399 +const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f};
  16.400 +const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f};
  16.401 +const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f};
  16.402 +const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f};
  16.403 +/* Alpha value 255 in the top byte of every pixel; 255u keeps the shift unsigned, since 255 << 24 does not fit into a signed int */
  16.404 +const vector unsigned int vec_alpha =  { 255u << 24, 255u << 24, 255u << 24, 255u << 24 };
  16.405 +/* Shuffle patterns that duplicate each float of the upper / lower half of a vector: (a,b,c,d) -> (a,a,b,b) / (c,c,d,d) */
  16.406 +const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 };
  16.407 +const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F };
  16.408 +
  16.409 +
  16.410 +/*
  16.411 + * yuv_to_rgb_w16_line()
  16.412 + *
  16.413 + * converts two adjacent lines of yuv-input to bgra, width has to be a multiple of 16
  16.414 + * the second line is read and written at an offset of width pixels, both lines share one line of u and v
  16.415 + *
  16.416 + * @param y_addr address of the y plane in local store
  16.417 + * @param v_addr address of the v plane in local store
  16.418 + * @param u_addr address of the u plane in local store
  16.419 + * @param bgra_addr_ address of the bgra output buffer
  16.420 + * @param width the width in pixel
  16.421 + */
  16.422 +void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
  16.423 +	// each pixel is stored as an integer
  16.424 +	unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
  16.425 +
  16.426 +	unsigned int x;
  16.427 +	for(x = 0; x < width; x+=2) {
  16.428 +		// Step through the line two pixels at a time, since each u and v value applies to 4 pixels (two high, two wide)
  16.429 +		const unsigned char Y_1 = *(y_addr + x);
  16.430 +		const unsigned char Y_2 = *(y_addr + x + 1);
  16.431 +		const unsigned char Y_3 = *(y_addr + x + width);
  16.432 +		const unsigned char Y_4 = *(y_addr + x + width + 1);
  16.433 +		const unsigned char U = *(u_addr + (x >> 1));
  16.434 +		const unsigned char V = *(v_addr + (x >> 1));
  16.435 +
  16.436 +		float V_minus_128 = (float)((float)V - 128.0f);
  16.437 +		float U_minus_128 = (float)((float)U - 128.0f);
  16.438 +		// Chroma contribution, identical for all four pixels of the 2x2 block
  16.439 +		float R_precalculate = 1.403f * V_minus_128;
  16.440 +		float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128);
  16.441 +		float B_precalculate = 1.773f * U_minus_128;
  16.442 +		// Add the luma of each pixel (float_to_char() is defined elsewhere, presumably it clamps to 0..255 - confirm)
  16.443 +		const unsigned char R_1 = float_to_char((Y_1 + R_precalculate));
  16.444 +		const unsigned char R_2 = float_to_char((Y_2 + R_precalculate));
  16.445 +		const unsigned char R_3 = float_to_char((Y_3 + R_precalculate));
  16.446 +		const unsigned char R_4 = float_to_char((Y_4 + R_precalculate));
  16.447 +		const unsigned char G_1 = float_to_char((Y_1 + G_precalculate));
  16.448 +		const unsigned char G_2 = float_to_char((Y_2 + G_precalculate));
  16.449 +		const unsigned char G_3 = float_to_char((Y_3 + G_precalculate));
  16.450 +		const unsigned char G_4 = float_to_char((Y_4 + G_precalculate));
  16.451 +		const unsigned char B_1 = float_to_char((Y_1 + B_precalculate));
  16.452 +		const unsigned char B_2 = float_to_char((Y_2 + B_precalculate));
  16.453 +		const unsigned char B_3 = float_to_char((Y_3 + B_precalculate));
  16.454 +		const unsigned char B_4 = float_to_char((Y_4 + B_precalculate));
  16.455 +		// Pack alpha(255) | r | g | b into one integer; 255u avoids shifting a signed int beyond its range
  16.456 +		*(bgra_addr + x) = (B_1 << 0)| (G_1 << 8) | (R_1 << 16) | (255u << 24);
  16.457 +		*(bgra_addr + x + 1) = (B_2 << 0)| (G_2 << 8) | (R_2 << 16) | (255u << 24);
  16.458 +		*(bgra_addr + x + width) = (B_3 << 0)| (G_3 << 8) | (R_3 << 16) | (255u << 24);
  16.459 +		*(bgra_addr + x + width + 1) = (B_4 << 0)| (G_4 << 8) | (R_4 << 16) | (255u << 24);
  16.460 +	}
  16.461 +}
  16.462 +
  16.463 +
  16.464 +/*
  16.465 + * yuv_to_rgb_w32_line()
  16.466 + *
  16.467 + * converts two adjacent lines of yuv-input to bgra, 32 pixels of both lines per pass; width has to be a multiple of 32
  16.468 + * the second line is read and written at an offset of width pixels, both lines share one line of u and v
  16.469 + *
  16.470 + * @param y_addr address of the y plane in local store
  16.471 + * @param v_addr address of the v plane in local store
  16.472 + * @param u_addr address of the u plane in local store
  16.473 + * @param bgra_addr_ address of the bgra output buffer
  16.474 + * @param width the width in pixel
  16.475 + */
  16.476 +void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
  16.477 +	// each pixel is stored as an integer
  16.478 +	unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
  16.479 +
  16.480 +	unsigned int x;
  16.481 +	for(x = 0; x < width; x+=32) {
  16.482 +		// Each u and v value applies to 4 pixels (two high, two wide); one pass loads 32 y values per line and 16 u and v values
  16.483 +		// NOTE(review): the vector accesses presumably need 16 byte aligned addresses - confirm
  16.484 +		const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x));
  16.485 +		const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16));
  16.486 +		const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width));
  16.487 +		const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16));
  16.488 +		const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1)));
  16.489 +		const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1)));
  16.490 +		// Widen u and v to float, four values per vector, and subtract 128
  16.491 +		const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128);
  16.492 +		const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128);
  16.493 +		const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128);
  16.494 +		const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128);
  16.495 +
  16.496 +		const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128);
  16.497 +		const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128);
  16.498 +		const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128);
  16.499 +		const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128);
  16.500 +		// Widen the y values to float: Y_1..Y_8 belong to the first line, Y_9..Y_16 to the second line
  16.501 +		vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0);
  16.502 +		vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0);
  16.503 +		vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0);
  16.504 +		vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0);
  16.505 +		vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0);
  16.506 +		vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0);
  16.507 +		vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0);
  16.508 +		vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0);
  16.509 +		vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0);
  16.510 +		vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0);
  16.511 +		vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0);
  16.512 +		vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0);
  16.513 +		vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0);
  16.514 +		vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0);
  16.515 +		vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0);
  16.516 +		vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0);
  16.517 +		// Red chroma term: 1.403 * v
  16.518 +		const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1);
  16.519 +		const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2);
  16.520 +		const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3);
  16.521 +		const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4);
  16.522 +		// Duplicate every term so that it lines up with two horizontally adjacent pixels
  16.523 +		const vector float R1_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_upper);
  16.524 +		const vector float R2_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_lower);
  16.525 +		const vector float R3_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_upper);
  16.526 +		const vector float R4_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_lower);
  16.527 +		const vector float R5_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_upper);
  16.528 +		const vector float R6_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_lower);
  16.529 +		const vector float R7_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_upper);
  16.530 +		const vector float R8_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_lower);
  16.531 +
  16.532 +		// Green chroma term: -0.344 * u - 0.714 * v, duplicated the same way
  16.533 +		const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff));
  16.534 +		const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff));
  16.535 +		const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff));
  16.536 +		const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff));
  16.537 +
  16.538 +		const vector float G1_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_upper);
  16.539 +		const vector float G2_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_lower);
  16.540 +		const vector float G3_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_upper);
  16.541 +		const vector float G4_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_lower);
  16.542 +		const vector float G5_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_upper);
  16.543 +		const vector float G6_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_lower);
  16.544 +		const vector float G7_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_upper);
  16.545 +		const vector float G8_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_lower);
  16.546 +
  16.547 +		// Blue chroma term: 1.773 * u, duplicated the same way
  16.548 +		const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1);
  16.549 +		const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2);
  16.550 +		const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3);
  16.551 +		const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4);
  16.552 +
  16.553 +		const vector float B1_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_upper);
  16.554 +		const vector float B2_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_lower);
  16.555 +		const vector float B3_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_upper);
  16.556 +		const vector float B4_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_lower);
  16.557 +		const vector float B5_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_upper);
  16.558 +		const vector float B6_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_lower);
  16.559 +		const vector float B7_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_upper);
  16.560 +		const vector float B8_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_lower);
  16.561 +
  16.562 +		// Add luma; the first line (1..8) and the second line (9..16) reuse the same chroma terms. vfloat_to_vuint() is defined elsewhere, presumably it clamps to 0..255 - confirm
  16.563 +		const vector unsigned int  R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate));
  16.564 +		const vector unsigned int  R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate));
  16.565 +		const vector unsigned int  R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate));
  16.566 +		const vector unsigned int  R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate));
  16.567 +		const vector unsigned int  R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate));
  16.568 +		const vector unsigned int  R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate));
  16.569 +		const vector unsigned int  R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate));
  16.570 +		const vector unsigned int  R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate));
  16.571 +		const vector unsigned int  R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate));
  16.572 +		const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate));
  16.573 +		const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate));
  16.574 +		const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate));
  16.575 +		const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate));
  16.576 +		const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate));
  16.577 +		const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate));
  16.578 +		const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate));
  16.579 +
  16.580 +		const vector unsigned int  G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate));
  16.581 +		const vector unsigned int  G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate));
  16.582 +		const vector unsigned int  G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate));
  16.583 +		const vector unsigned int  G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate));
  16.584 +		const vector unsigned int  G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate));
  16.585 +		const vector unsigned int  G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate));
  16.586 +		const vector unsigned int  G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate));
  16.587 +		const vector unsigned int  G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate));
  16.588 +		const vector unsigned int  G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate));
  16.589 +		const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate));
  16.590 +		const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate));
  16.591 +		const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate));
  16.592 +		const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate));
  16.593 +		const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate));
  16.594 +		const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate));
  16.595 +		const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate));
  16.596 +
  16.597 +		const vector unsigned int  B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate));
  16.598 +		const vector unsigned int  B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate));
  16.599 +		const vector unsigned int  B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate));
  16.600 +		const vector unsigned int  B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate));
  16.601 +		const vector unsigned int  B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate));
  16.602 +		const vector unsigned int  B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate));
  16.603 +		const vector unsigned int  B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate));
  16.604 +		const vector unsigned int  B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate));
  16.605 +		const vector unsigned int  B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate));
  16.606 +		const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate));
  16.607 +		const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate));
  16.608 +		const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate));
  16.609 +		const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate));
  16.610 +		const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate));
  16.611 +		const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate));
  16.612 +		const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate));
  16.613 +		// Pack four pixels per store: alpha | r shifted by 2 bytes | g shifted by 1 byte | b. NOTE(review): assumes every channel value fits into one byte - confirm
  16.614 +		*((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha,  B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1)));
  16.615 +		*((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha,  B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1)));
  16.616 +		*((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha,  B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1)));
  16.617 +		*((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha,  B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1)));
  16.618 +		*((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha,  B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1)));
  16.619 +		*((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha,  B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1)));
  16.620 +		*((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha,  B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1)));
  16.621 +		*((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha,  B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1)));
  16.622 +		*((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha,  B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1)));
  16.623 +		*((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1)));
  16.624 +		*((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1)));
  16.625 +		*((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1)));
  16.626 +		*((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1)));
  16.627 +		*((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1)));
  16.628 +		*((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1)));
  16.629 +		*((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1)));
  16.630 +	}
  16.631 +}
  16.632 +