From 2ea07f153114e00e97d6e5cbc0f9b25012b6430f Mon Sep 17 00:00:00 2001
From: Sam Lantinga <slouken@libsdl.org>
Date: Thu, 2 Apr 2009 04:06:55 +0000
Subject: [PATCH] Hello.

This patch provides basic support for video on the Sony PS3
Linux framebuffer. Scaling, format-conversion, and drawing are
done from the SPEs, so there is little performance impact to
PPE applications. This is by no means production quality code,
but it is a very good start and a good example of how to use the
PS3's hardware capabilities to accelerate video playback on
the box.

The driver has been verified to work with ffplay, mplayer and xine.
This piece of software has been developed at the IBM R&D Lab
in Boeblingen, Germany and is now returned to the community.

Enjoy !

Signed-off-by: D.Herrendoerfer < d.herrendoerfer [at] de [dot] ibm [dot] com >
---
 README.PS3                                |   29 +
 configure.in                              |   28 +
 include/SDL_config.h.in                   |    1 +
 src/video/SDL_sysvideo.h                  |    3 +
 src/video/SDL_video.c                     |    3 +
 src/video/ps3/SDL_ps3events.c             |   44 +
 src/video/ps3/SDL_ps3events_c.h           |   41 +
 src/video/ps3/SDL_ps3video.c              |  621 +++++++
 src/video/ps3/SDL_ps3video.h              |  165 ++
 src/video/ps3/SDL_ps3yuv.c                |  340 ++++
 src/video/ps3/SDL_ps3yuv_c.h              |   44 +
 src/video/ps3/spulibs/Makefile            |   83 +
 src/video/ps3/spulibs/bilin_scaler.c      | 2050 +++++++++++++++++++++
 src/video/ps3/spulibs/fb_writer.c         |  193 ++
 src/video/ps3/spulibs/spu_common.h        |  108 ++
 src/video/ps3/spulibs/yuv2rgb_converter.c |  629 +++++++
 16 files changed, 4382 insertions(+)
 create mode 100644 README.PS3
 create mode 100644 src/video/ps3/SDL_ps3events.c
 create mode 100644 src/video/ps3/SDL_ps3events_c.h
 create mode 100644 src/video/ps3/SDL_ps3video.c
 create mode 100644 src/video/ps3/SDL_ps3video.h
 create mode 100644 src/video/ps3/SDL_ps3yuv.c
 create mode 100644 src/video/ps3/SDL_ps3yuv_c.h
 create mode 100644 src/video/ps3/spulibs/Makefile
 create mode 100644 src/video/ps3/spulibs/bilin_scaler.c
 create mode 100644 src/video/ps3/spulibs/fb_writer.c
 create mode 100644 src/video/ps3/spulibs/spu_common.h
 create mode 100644 src/video/ps3/spulibs/yuv2rgb_converter.c

diff --git a/README.PS3 b/README.PS3
new file mode 100644
index 000000000..c66467d39
--- /dev/null
+++ b/README.PS3
@@ -0,0 +1,29 @@
+
+SDL on Sony Playstation3
+------------------------
+
+Installation:
+  First, you have to install the Cell SDK
+  - Download the Cell SDK installer RPM and ISO images to
+    a temporary directory such as /tmp/cellsdk.
+  - Mount the image: mount -o loop CellSDK-Devel-Fedora_3.1.0.0.0.iso /tmp/cellsdk
+  - Install the SDK installer: rpm -ivh cell-install-3.1.0-0.0.noarch.rpm
+  - Install the SDK: cd /opt/cell && ./cellsdk --iso /tmp/cellsdkiso install
+
+  You need to install the SPU-libs before installing SDL
+  - Go to SDL-1.2/src/video/ps3/spulibs/
+  - Run make && make install
+
+  Finally, install SDL
+  - Go to SDL-1.2/ and build SDL like any other GNU style package.
+  e.g.
+    - Build the configure-script with ./autogen.sh
+    - Configure SDL for your needs: ./configure --enable-video-ps3 ...
+    - Build and install it: make && make install
+
+
+Todo:
+  - mouse/keyboard/controller support
+
+Have fun!
+  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
diff --git a/configure.in b/configure.in
index a641df1ec..75fad3262 100644
--- a/configure.in
+++ b/configure.in
@@ -1287,6 +1287,33 @@ AC_HELP_STRING([--enable-video-ps2gs], [use PlayStation 2 GS video driver [[defa
     fi
 }
 
+dnl See if the PlayStation 3 framebuffer headers (linux/fb.h, asm/ps3fb.h) compile
+CheckPS3()
+{
+  AC_ARG_ENABLE(video-ps3,
+                AC_HELP_STRING([--enable-video-ps3], [use PlayStation 3 Cell driver [[default=yes]]]),
+                , enable_video_ps3=yes)
+  if test x$enable_video = xyes -a x$enable_video_ps3 = xyes; then
+    AC_MSG_CHECKING(for PlayStation 3 Cell support)
+    video_ps3=no
+    AC_TRY_COMPILE([
+      #include <linux/fb.h>
+      #include <asm/ps3fb.h>
+    ],[
+    ],[
+      video_ps3=yes
+    ])
+    AC_MSG_RESULT($video_ps3)
+    if test x$video_ps3 = xyes; then
+      AC_DEFINE(SDL_VIDEO_DRIVER_PS3)
+      SOURCES="$SOURCES $srcdir/src/video/ps3/*.c"
+      EXTRA_CFLAGS="$EXTRA_CFLAGS -I/opt/cell/sdk/usr/include"
+      EXTRA_LDFLAGS="$EXTRA_LDFLAGS -lbilin_scaler_spu -lfb_writer_spu -lyuv2rgb_spu -L/opt/cell/sdk/usr/lib -lspe2"
+      have_video=yes
+    fi
+  fi
+}
+
 dnl Find the GGI includes
 CheckGGI()
 {
@@ -2251,6 +2278,7 @@ case "$host" in
         CheckFBCON
         CheckDirectFB
         CheckPS2GS
+        CheckPS3
         CheckGGI
         CheckSVGA
         CheckVGL
diff --git a/include/SDL_config.h.in b/include/SDL_config.h.in
index 43b584b5d..a63618fb9 100644
--- a/include/SDL_config.h.in
+++ b/include/SDL_config.h.in
@@ -269,6 +269,7 @@
 #undef SDL_VIDEO_DRIVER_PHOTON
 #undef SDL_VIDEO_DRIVER_PICOGUI
 #undef SDL_VIDEO_DRIVER_PS2GS
+#undef SDL_VIDEO_DRIVER_PS3
 #undef SDL_VIDEO_DRIVER_QTOPIA
 #undef SDL_VIDEO_DRIVER_QUARTZ
 #undef SDL_VIDEO_DRIVER_RISCOS
diff --git a/src/video/SDL_sysvideo.h b/src/video/SDL_sysvideo.h
index 2ecc4074d..fea819e10 100644
--- a/src/video/SDL_sysvideo.h
+++ b/src/video/SDL_sysvideo.h
@@ -347,6 +347,9 @@ extern VideoBootStrap DirectFB_bootstrap;
 #if SDL_VIDEO_DRIVER_PS2GS
 extern VideoBootStrap PS2GS_bootstrap;
 #endif
+#if SDL_VIDEO_DRIVER_PS3
+extern VideoBootStrap PS3_bootstrap;
+#endif
 #if SDL_VIDEO_DRIVER_GGI
 extern VideoBootStrap GGI_bootstrap;
 #endif
diff --git a/src/video/SDL_video.c b/src/video/SDL_video.c
index ce723c9a5..4d250b39c 100644
--- a/src/video/SDL_video.c
+++ b/src/video/SDL_video.c
@@ -63,6 +63,9 @@ static VideoBootStrap *bootstrap[] = {
 #if SDL_VIDEO_DRIVER_PS2GS
 	&PS2GS_bootstrap,
 #endif
+#if SDL_VIDEO_DRIVER_PS3
+	&PS3_bootstrap,
+#endif
 #if SDL_VIDEO_DRIVER_GGI
 	&GGI_bootstrap,
 #endif
diff --git a/src/video/ps3/SDL_ps3events.c b/src/video/ps3/SDL_ps3events.c
new file mode 100644
index 000000000..e39efcc4f
--- /dev/null
+++ b/src/video/ps3/SDL_ps3events.c
@@ -0,0 +1,44 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+
+#include "SDL_config.h"
+
+#include "../../events/SDL_sysevents.h"
+#include "../../events/SDL_events_c.h"
+#include "SDL_ps3video.h"
+#include "SDL_ps3events_c.h"
+
+void PS3_PumpEvents(_THIS)
+{
+	/* Intentionally a no-op: this driver does not pump any events */
+}
+
+void PS3_InitOSKeymap(_THIS)
+{
+	/* Intentionally a no-op: this driver sets up no keymap */
+}
+
diff --git a/src/video/ps3/SDL_ps3events_c.h b/src/video/ps3/SDL_ps3events_c.h
new file mode 100644
index 000000000..fd11209af
--- /dev/null
+++ b/src/video/ps3/SDL_ps3events_c.h
@@ -0,0 +1,41 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+
+#include "SDL_config.h"
+
+#ifndef _SDL_ps3events_h
+#define _SDL_ps3events_h
+
+#include "SDL_ps3video.h"
+
+extern void PS3_InitOSKeymap(_THIS);
+extern void PS3_PumpEvents(_THIS);
+
+extern void enable_cursor(int enable);
+
+#endif /* _SDL_ps3events_h */
+
diff --git a/src/video/ps3/SDL_ps3video.c b/src/video/ps3/SDL_ps3video.c
new file mode 100644
index 000000000..d5519e051
--- /dev/null
+++ b/src/video/ps3/SDL_ps3video.c
@@ -0,0 +1,621 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+
+#include "SDL_config.h"
+
+#include "SDL_video.h"
+#include "../SDL_sysvideo.h"
+#include "SDL_ps3events_c.h"
+#include "SDL_ps3video.h"
+#include "SDL_ps3yuv_c.h"
+#include "spulibs/spu_common.h"
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <linux/kd.h>
+#include <sys/mman.h>
+
+#include <linux/fb.h>
+#include <asm/ps3fb.h>
+#include <libspe2.h>
+#include <malloc.h>
+
+/* SDL_VideoDevice functions */
+static int PS3_Available();
+static SDL_VideoDevice *PS3_CreateDevice(int devindex);
+static int PS3_VideoInit(_THIS, SDL_PixelFormat * vformat);
+static void PS3_VideoQuit(_THIS);
+static void PS3_DeleteDevice(SDL_VideoDevice * device);
+static SDL_Surface *PS3_SetVideoMode(_THIS, SDL_Surface * current, int width, int height, int bpp, Uint32 flags);
+static SDL_Rect **PS3_ListModes(_THIS, SDL_PixelFormat * format, Uint32 flags);
+
+/* Hardware surface functions */
+static int PS3_AllocHWSurface(_THIS, SDL_Surface * surface);
+static void PS3_FreeHWSurface(_THIS, SDL_Surface * surface);
+static int PS3_LockHWSurface(_THIS, SDL_Surface * surface);
+static void PS3_UnlockHWSurface(_THIS, SDL_Surface * surface);
+static int PS3_FlipDoubleBuffer(_THIS, SDL_Surface * surface);
+static void PS3_DoubleBufferUpdate(_THIS, int numrects, SDL_Rect * rects);
+
+/* SPU specific functions */
+int SPE_Start(_THIS, spu_data_t * spe_data);
+int SPE_Stop(_THIS, spu_data_t * spe_data);
+int SPE_Boot(_THIS, spu_data_t * spe_data);
+int SPE_Shutdown(_THIS, spu_data_t * spe_data);
+int SPE_SendMsg(_THIS, spu_data_t * spe_data, unsigned int msg);
+int SPE_WaitForMsg(_THIS, spu_data_t * spe_data, unsigned int msg);
+void SPE_RunContext(void *thread_argp);
+
+/* Helpers */
+void enable_cursor(int enable);
+
+/* Stores the SPE executable name of fb_writer_spu */
+extern spe_program_handle_t fb_writer_spu;
+
+/* SDL PS3 bootstrap availability check: always reports the driver as available, nothing is probed */
+static int PS3_Available()
+{
+	return 1;
+}
+
+/* Allocate a zeroed SDL_VideoDevice plus its private data and hook up the PS3 driver entry points */
+static SDL_VideoDevice *PS3_CreateDevice(int devindex)
+{
+	SDL_VideoDevice *this = (SDL_VideoDevice *) SDL_malloc(sizeof(SDL_VideoDevice));
+
+	/* Device structure itself */
+	if (this == NULL) {
+		SDL_OutOfMemory();
+		return NULL;
+	}
+	memset(this, 0, sizeof *this);
+	/* Driver-private data; on failure the device must not be leaked */
+	this->hidden = (struct SDL_PrivateVideoData *)
+	    SDL_malloc(sizeof(struct SDL_PrivateVideoData));
+	if (this->hidden == NULL) {
+		SDL_OutOfMemory();
+		SDL_free(this);
+		return NULL;
+	}
+	memset(this->hidden, 0, sizeof(struct SDL_PrivateVideoData));
+
+	/* Entry points implemented by this driver */
+	this->VideoInit = PS3_VideoInit;
+	this->VideoQuit = PS3_VideoQuit;
+	this->ListModes = PS3_ListModes;
+	this->SetVideoMode = PS3_SetVideoMode;
+	this->CreateYUVOverlay = PS3_CreateYUVOverlay;
+	this->AllocHWSurface = PS3_AllocHWSurface;
+	this->FreeHWSurface = PS3_FreeHWSurface;
+	this->LockHWSurface = PS3_LockHWSurface;
+	this->UnlockHWSurface = PS3_UnlockHWSurface;
+	this->FlipHWSurface = PS3_FlipDoubleBuffer;
+	this->InitOSKeymap = PS3_InitOSKeymap;
+	this->PumpEvents = PS3_PumpEvents;
+	this->free = PS3_DeleteDevice;
+	/* Entry points left unset (UpdateRects is installed later by PS3_SetVideoMode) */
+	this->SetColors = NULL;
+	this->UpdateRects = NULL;
+	this->CheckHWBlit = NULL;
+	this->FillHWRect = NULL;
+	this->SetHWColorKey = NULL;
+	this->SetHWAlpha = NULL;
+	this->SetCaption = NULL;
+	this->SetIcon = NULL;
+	this->IconifyWindow = NULL;
+	this->GrabInput = NULL;
+	this->GetWMInfo = NULL;
+
+	return this;
+}
+
+
+/* Bootstrapping (see SDL_sysvideo.h) */
+VideoBootStrap PS3_bootstrap = {
+	"ps3", "PS3 Cell SPU Driver",
+	PS3_Available, PS3_CreateDevice
+};
+
+
+/* Delete the device: both blocks come from SDL_malloc() in PS3_CreateDevice, so release them with SDL_free() */
+static void PS3_DeleteDevice(SDL_VideoDevice * device)
+{
+	SDL_free(device->hidden);
+	SDL_free(device);
+}
+
+
+/* Initialise the PS3 video device: start the fb_writer SPE, open and configure the framebuffer; returns 0 or -1 */
+static int PS3_VideoInit(_THIS, SDL_PixelFormat * vformat)
+{
+	/* Hide the cursor */
+	enable_cursor(0);
+
+	/* Create SPU fb_parms (16-byte aligned) and thread structure */
+	fb_parms = (struct fb_writer_parms_t *)
+	    memalign(16, sizeof(struct fb_writer_parms_t));
+	fb_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t));
+	if (fb_parms == NULL || fb_thread_data == NULL) {
+		SDL_OutOfMemory();
+		return -1;
+	}
+	fb_thread_data->program = fb_writer_spu;
+	fb_thread_data->program_name = "fb_writer_spu";
+	fb_thread_data->argp = (void *)fb_parms;
+	fb_thread_data->keepalive = 1;
+	fb_thread_data->booted = 0;
+
+	SPE_Start(this, fb_thread_data);	/* NOTE(review): the result is not checked */
+
+	/* Open the device */
+	fb_dev_fd = open(PS3_DEV_FB, O_RDWR);
+	if (fb_dev_fd < 0) {
+		SDL_SetError("[PS3] Unable to open device %s", PS3_DEV_FB);
+		return -1;
+	}
+
+	/* Get vscreeninfo */
+	if (ioctl(fb_dev_fd, FBIOGET_VSCREENINFO, &fb_vinfo)) {
+		SDL_SetError("[PS3] Can't get VSCREENINFO");
+		if (fb_dev_fd >= 0)
+			close(fb_dev_fd);
+		fb_dev_fd = -1;
+		return -1;
+	}
+
+	/* Fill in our hardware acceleration capabilities */
+	this->info.current_w = fb_vinfo.xres;
+	this->info.current_h = fb_vinfo.yres;
+	this->info.wm_available = 0;
+	this->info.hw_available = 1;
+
+	/* Backup the original vinfo to restore later (PS3_VideoQuit writes it back) */
+	fb_orig_vinfo = fb_vinfo;
+
+	/* 16 and 15 bpp is reported as 16 bpp */
+	fb_bits_per_pixel = fb_vinfo.bits_per_pixel;
+	if (fb_bits_per_pixel == 16)
+		fb_bits_per_pixel =
+		    fb_vinfo.red.length + fb_vinfo.green.length +
+		    fb_vinfo.blue.length;
+
+	/* Set SDL_PixelFormat (uses the raw bits_per_pixel, not the 15/16 bpp corrected value above) */
+	vformat->BitsPerPixel = fb_vinfo.bits_per_pixel;
+
+	fb_vinfo.xres_virtual = fb_vinfo.xres;
+	fb_vinfo.yres_virtual = fb_vinfo.yres;
+
+	/* Put vscreeninfo */
+	if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) {
+		SDL_SetError("[PS3] Can't put VSCREENINFO");
+		if (fb_dev_fd >= 0)
+			close(fb_dev_fd);
+		fb_dev_fd = -1;
+		return -1;
+	}
+
+	s_fb_pixel_size = fb_vinfo.bits_per_pixel / 8;
+
+	s_writeable_width = fb_vinfo.xres;
+	s_writeable_height = fb_vinfo.yres;
+
+	/* Get ps3 screeninfo; NOTE(review): a failure only sets the SDL error, res is still read below */
+	if (ioctl(fb_dev_fd, PS3FB_IOCTL_SCREENINFO, (unsigned long)&res) < 0) {
+		SDL_SetError("[PS3] PS3FB_IOCTL_SCREENINFO failed");
+	}
+	deprintf(1, "[PS3] xres:%d yres:%d xoff:%d yoff:%d\n", res.xres, res.yres, res.xoff, res.yoff);
+
+	/* Only use double buffering if enough fb memory is available */
+	if (res.num_frames < 2) {
+		double_buffering = 0;
+	} else {
+		double_buffering = 1;
+	}
+
+	real_width = res.xres;
+	real_height = res.yres;
+
+	/*
+	 * Take control of frame buffer from kernel, for details see
+	 * http://felter.org/wesley/files/ps3/linux-20061110-docs/ApplicationProgrammingEnvironment.html
+	 * kernel will no longer flip the screen itself
+	 */
+	ioctl(fb_dev_fd, PS3FB_IOCTL_ON, 0);
+
+	/* Unblank screen */
+	ioctl(fb_dev_fd, FBIOBLANK, 0);
+
+	return 0;
+}
+
+
+/* Return the static, NULL-terminated list of supported resolutions; format and flags are ignored */
+static SDL_Rect **PS3_ListModes(_THIS, SDL_PixelFormat * format, Uint32 flags)
+{
+	/* Fixed table of video resolutions reported to SDL (sorted largest to
+	 * smallest); nothing is queried from the framebuffer here
+	 */
+	static SDL_Rect PS3_resolutions[] = {
+		{0, 0, 1920, 1080}, // 1080p 16:9 HD
+		{0, 0, 1600, 1200}, // UXGA
+		{0, 0, 1280, 1024}, // SXGA
+		{0, 0, 1280, 720},  // 720p 16:9 HD
+		{0, 0, 1024, 768},  // XGA
+		{0, 0, 1024, 576},  // 576p 16:9
+		{0, 0, 853, 480},   // 480p 16:9
+		{0, 0, 720, 576},   // 576p 4:3 (PAL)
+		{0, 0, 720, 480},   // 480p 16:9 (NTSC)
+	};
+	static SDL_Rect *PS3_modes[] = {
+		&PS3_resolutions[0],
+		&PS3_resolutions[1],
+		&PS3_resolutions[2],
+		&PS3_resolutions[3],
+		&PS3_resolutions[4],
+		&PS3_resolutions[5],
+		&PS3_resolutions[6],
+		&PS3_resolutions[7],
+		&PS3_resolutions[8],
+		NULL
+	};
+	SDL_Rect **modes = PS3_modes;
+
+	return modes;
+}
+
+
+/* Set the video mode: map the framebuffer, centre the surface on it and allocate its pixels; returns current or NULL */
+static SDL_Surface *PS3_SetVideoMode(_THIS, SDL_Surface * current, int width, int height, int bpp, Uint32 flags)
+{
+	s_bounded_input_width = width < s_writeable_width ? width : s_writeable_width;
+	s_bounded_input_height = height < s_writeable_height ? height : s_writeable_height;
+	s_bounded_input_width_offset = (s_writeable_width - s_bounded_input_width) >> 1;
+	s_bounded_input_height_offset = (s_writeable_height - s_bounded_input_height) >> 1;
+	s_input_line_length = width * s_fb_pixel_size;
+
+	current->flags |= flags;
+
+	if (ioctl(fb_dev_fd, FBIOGET_FSCREENINFO, &fb_finfo)) {
+		SDL_SetError("[PS3] Can't get fixed screeninfo");
+		return NULL;
+	}
+
+	if (fb_finfo.type != FB_TYPE_PACKED_PIXELS) {
+		SDL_SetError("[PS3] type %u not supported", (unsigned int)fb_finfo.type);
+		return NULL;
+	}
+
+	/* Note: on PS3, fb_finfo.smem_len is enough for double buffering */
+	frame_buffer = (uint8_t *) mmap(0, fb_finfo.smem_len,
+					PROT_READ | PROT_WRITE, MAP_SHARED,
+					fb_dev_fd, 0);
+	if (frame_buffer == (uint8_t *) MAP_FAILED) {
+		/* Never leave MAP_FAILED behind: PS3_VideoQuit munmap()s any non-zero frame_buffer */
+		frame_buffer = NULL;
+		SDL_SetError("[PS3] Can't mmap for %s", PS3_DEV_FB);
+		return NULL;
+	}
+	current->flags |= SDL_DOUBLEBUF;
+	if (!SDL_ReallocFormat(current, fb_bits_per_pixel, 0, 0, 0, 0)) {
+		return (NULL);
+	}
+
+	/* Blank screen */
+	memset(frame_buffer, 0x00, fb_finfo.smem_len);
+
+	/* Centering: s_center[1] is the same position one real_height further down */
+	s_center[0] =
+	    frame_buffer + s_bounded_input_width_offset * s_fb_pixel_size +
+	    s_bounded_input_height_offset * fb_finfo.line_length;
+	s_center[1] = s_center[0] + real_height * fb_finfo.line_length;
+	s_center_index = 0;
+
+	current->flags |= SDL_FULLSCREEN;
+	current->w = width;
+	current->h = height;
+	current->pitch = SDL_CalculatePitch(current);
+
+	/* Alloc aligned mem for current->pixels */
+	s_pixels = memalign(16, current->h * current->pitch);
+	current->pixels = (void *)s_pixels;
+	if (!current->pixels) {
+		SDL_OutOfMemory();
+		return NULL;
+	}
+
+	/* Set the update rectangle function */
+	this->UpdateRects = PS3_DoubleBufferUpdate;
+
+	return current;
+}
+
+
+/* Copy screen to framebuffer via the fb_writer SPE and flip; numrects and rects are not used */
+void PS3_DoubleBufferUpdate(_THIS, int numrects, SDL_Rect * rects)
+{
+	if (converter_thread_data && converter_thread_data->booted)
+		SPE_WaitForMsg(this, converter_thread_data, SPU_FIN);
+
+	/* Adjust centering */
+	s_bounded_input_width_offset = (s_writeable_width - s_bounded_input_width) >> 1;
+	s_bounded_input_height_offset = (s_writeable_height - s_bounded_input_height) >> 1;
+	s_center[0] = frame_buffer + s_bounded_input_width_offset * s_fb_pixel_size +
+		s_bounded_input_height_offset * fb_finfo.line_length;
+	s_center[1] = s_center[0] + real_height * fb_finfo.line_length;
+
+	/* Set SPU parms for copying the surface to framebuffer */
+	fb_parms->data = (unsigned char *)s_pixels;
+	fb_parms->center = s_center[s_center_index];
+	fb_parms->out_line_stride = fb_finfo.line_length;
+	fb_parms->in_line_stride = s_input_line_length;
+	fb_parms->bounded_input_height = s_bounded_input_height;
+	fb_parms->bounded_input_width = s_bounded_input_width;
+	fb_parms->fb_pixel_size = s_fb_pixel_size;
+
+	deprintf(3, "[PS3->SPU] fb_thread_data->argp = 0x%x\n", fb_thread_data->argp);
+
+	/* Copying: send SPU_START, then the parameter block address as an unsigned int message */
+	SPE_SendMsg(this, fb_thread_data, SPU_START);
+	SPE_SendMsg(this, fb_thread_data, (unsigned int)fb_thread_data->argp);
+
+	SPE_WaitForMsg(this, fb_thread_data, SPU_FIN);
+
+	/* Flip the pages (the index only toggles when double buffering is on) */
+	if (double_buffering)
+		s_center_index = s_center_index ^ 0x01;
+	PS3_FlipDoubleBuffer(this, this->screen);
+}
+
+
+/* Switch /dev/console to KD_TEXT (enable != 0) or KD_GRAPHICS; silently skipped if it cannot be opened */
+void enable_cursor(int enable)
+{
+	int console = open("/dev/console", O_RDWR | O_NONBLOCK);
+	if (console < 0)
+		return;
+	ioctl(console, KDSETMODE, enable ? KD_TEXT : KD_GRAPHICS);
+	close(console);
+}
+
+
+static int PS3_AllocHWSurface(_THIS, SDL_Surface * surface)
+{
+	return -1;	/* always fails: this driver hands out no hardware surfaces */
+}
+
+
+static void PS3_FreeHWSurface(_THIS, SDL_Surface * surface)
+{
+	/* No-op: PS3_AllocHWSurface never succeeds, so there is nothing to release */
+}
+
+
+static int PS3_LockHWSurface(_THIS, SDL_Surface * surface)
+{
+	return 0;	/* nothing is locked; always reports success */
+}
+
+
+static void PS3_UnlockHWSurface(_THIS, SDL_Surface * surface)
+{
+	/* No-op counterpart of PS3_LockHWSurface */
+}
+
+
+/* Wait for vsync, then select frame s_center_index. Must be called after each frame! Returns 0 (SDL flip success) */
+int PS3_FlipDoubleBuffer(_THIS, SDL_Surface * surface)
+{
+	unsigned long crt = 0;
+	/* Wait for vsync */
+	deprintf(1, "[PS3] Wait for vsync\n");
+	ioctl(fb_dev_fd, FBIO_WAITFORVSYNC, &crt);
+	/* Page flip */
+	deprintf(1, "[PS3] Page flip to buffer #%u %p\n", s_center_index, (void *)s_center[s_center_index]);
+	ioctl(fb_dev_fd, PS3FB_IOCTL_FSEL, (unsigned long)&s_center_index);
+	return 0;
+}
+
+
+/* Boot the SPE if needed and run it in its own thread; returns 0 on success, -1 on failure */
+int SPE_Start(_THIS, spu_data_t * spe_data)
+{
+	deprintf(2, "[PS3->SPU] Start SPE: %s\n", spe_data->program_name);
+	/* Without a booted context there is nothing to run; SPE_Boot has set the SDL error */
+	if (!(spe_data->booted) && SPE_Boot(this, spe_data) < 0)
+		return -1;
+	/* To allow re-running of context, spe_ctx_entry has to be set before each call */
+	spe_data->entry = SPE_DEFAULT_ENTRY;
+	spe_data->error_code = 0;
+	/* Create SPE thread and run */
+	deprintf(2, "[PS3->SPU] Create Thread: %s\n", spe_data->program_name);
+	if (pthread_create
+	    (&spe_data->thread, NULL, (void *)&SPE_RunContext, (void *)spe_data)) {
+		deprintf(2, "[PS3->SPU] Could not create pthread for spe: %s\n", spe_data->program_name);
+		SDL_SetError("[PS3->SPU] Could not create pthread for spe");
+		return -1;
+	}
+	/* Keepalive programs are expected to report SPU_READY before they are used */
+	if (spe_data->keepalive)
+		return SPE_WaitForMsg(this, spe_data, SPU_READY);
+	return 0;
+}
+
+
+/* Join the SPE's pthread; returns 0 once it has finished, -1 (with the SDL error set) if joining fails */
+int SPE_Stop(_THIS, spu_data_t * spe_data)
+{
+	int join_rc;
+
+	deprintf(2, "[PS3->SPU] Stop SPE: %s\n", spe_data->program_name);
+	deprintf(2, "[PS3->SPU] Wait for SPE thread to complete: %s\n", spe_data->program_name);
+	join_rc = pthread_join(spe_data->thread, NULL);
+	if (join_rc == 0)
+		return 0;
+	deprintf(2, "[PS3->SPU] Failed joining the thread: %s\n", spe_data->program_name);
+	SDL_SetError("[PS3->SPU] Failed joining the thread");
+	return -1;
+}
+
+
+/* Create an SPE context and load spe_data->program into it; sets booted and returns 0, or -1 on failure */
+int SPE_Boot(_THIS, spu_data_t * spe_data)
+{
+	int load_rc;
+
+	/* Step 1: a fresh context for this program */
+	deprintf(2, "[PS3->SPU] Create SPE Context: %s\n", spe_data->program_name);
+	spe_data->ctx = spe_context_create(0, NULL);
+	if (!spe_data->ctx) {
+		deprintf(2, "[PS3->SPU] Failed creating SPE context: %s\n", spe_data->program_name);
+		SDL_SetError("[PS3->SPU] Failed creating SPE context");
+		return -1;
+	}
+	/* Step 2: load the SPE object into the SPE local store */
+	deprintf(2, "[PS3->SPU] Load Program into SPE: %s\n", spe_data->program_name);
+	load_rc = spe_program_load(spe_data->ctx, &spe_data->program);
+	if (load_rc != 0) {
+		deprintf(2, "[PS3->SPU] Failed loading program into SPE context: %s\n", spe_data->program_name);
+		SDL_SetError("[PS3->SPU] Failed loading program into SPE context");
+		return -1;
+	}
+	spe_data->booted = 1;
+	deprintf(2, "[PS3->SPU] SPE boot successful\n");
+	return 0;
+}
+
+/* If the program is keepalive and booted, send SPU_EXIT and join its thread; then destroy the SPE context */
+int SPE_Shutdown(_THIS, spu_data_t * spe_data)
+{
+	const int must_stop = spe_data->keepalive && spe_data->booted;
+
+	if (must_stop) {
+		SPE_SendMsg(this, spe_data, SPU_EXIT);
+		SPE_Stop(this, spe_data);
+	}
+	deprintf(2, "[PS3->SPU] Destroy SPE context: %s\n", spe_data->program_name);
+	if (spe_context_destroy(spe_data->ctx) == 0) {
+		deprintf(2, "[PS3->SPU] SPE shutdown successful: %s\n", spe_data->program_name);
+		return 0;
+	}
+	deprintf(2, "[PS3->SPU] Failed destroying context: %s\n", spe_data->program_name);
+	SDL_SetError("[PS3->SPU] Failed destroying context");
+	return -1;
+}
+
+
+/* Send one message to the SPE via its inbound mailbox; returns 0 on success, -1 if nothing was written */
+int SPE_SendMsg(_THIS, spu_data_t * spe_data, unsigned int msg)
+{
+	unsigned int outgoing = msg;
+	int written;
+
+	deprintf(2, "[PS3->SPU] Sending message %u to %s\n", msg, spe_data->program_name);
+	/* SPE_MBOX_ALL_BLOCKING: block until the message was sent */
+	written = spe_in_mbox_write(spe_data->ctx, &outgoing, 1, SPE_MBOX_ALL_BLOCKING);
+	if (written >= 1)
+		return 0;
+
+	deprintf(2, "[PS3->SPU] No message could be written to %s\n", spe_data->program_name);
+	SDL_SetError("[PS3->SPU] No message could be written");
+	return -1;
+}
+
+
+/* Busy-wait for one outbound mailbox message from the SPE; returns 0 if it equals msg, -1 otherwise */
+int SPE_WaitForMsg(_THIS, spu_data_t * spe_data, unsigned int msg)
+{
+	deprintf(2, "[PS3->SPU] Waiting for message from %s\n", spe_data->program_name);
+	unsigned int out_messages[1] = { 0 };
+	/* The loop ends on any non-zero status, i.e. a pending message or an error result */
+	while (!spe_out_mbox_status(spe_data->ctx));
+	/* Only trust the buffer if exactly one message was actually read */
+	if (spe_out_mbox_read(spe_data->ctx, out_messages, 1) != 1)
+		return -1;
+	deprintf(2, "[PS3->SPU] Got message from %s, message was %u\n", spe_data->program_name, out_messages[0]);
+	return (out_messages[0] == msg) ? 0 : -1;
+}
+
+
+/* Thread body: run the SPE program described by thread_argp (a spu_data_t *) via spe_context_run */
+void SPE_RunContext(void *thread_argp)
+{
+	spu_data_t *spe = (spu_data_t *) thread_argp;
+	int run_rc;
+
+	/* spe->argp is the pointer to argument to be passed to the SPE program */
+	deprintf(3, "[PS3->SPU] void* argp=0x%x\n", (unsigned int)spe->argp);
+	deprintf(2, "[PS3->SPU] Run SPE program: %s\n", spe->program_name);
+
+	run_rc = spe_context_run(spe->ctx, &spe->entry, 0, (void *)spe->argp, NULL, NULL);
+	if (run_rc < 0) {
+		/* A failed run terminates the whole process */
+		deprintf(2, "[PS3->SPU] Failed running SPE context: %s\n", spe->program_name);
+		SDL_SetError("[PS3->SPU] Failed running SPE context: %s", spe->program_name);
+		exit(1);
+	}
+	pthread_exit(NULL);
+}
+
+
+/* Quits the video driver: restore the framebuffer, stop the SPE and free what VideoInit/SetVideoMode allocated */
+static void PS3_VideoQuit(_THIS)
+{
+	if (fb_dev_fd > 0) {
+		/* Restore the original video mode */
+		if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_orig_vinfo))
+			SDL_SetError("[PS3] Can't restore original fb_var_screeninfo");
+
+		/* Give control of frame buffer to kernel */
+		ioctl(fb_dev_fd, PS3FB_IOCTL_OFF, 0);
+		close(fb_dev_fd);
+		fb_dev_fd = -1;
+	}
+
+	if (frame_buffer) {
+		munmap(frame_buffer, fb_finfo.smem_len);
+		frame_buffer = 0;
+	}
+
+	/* Stop the SPE before releasing the parameter block it was started with */
+	if (fb_thread_data) {
+		SPE_Shutdown(this, fb_thread_data);
+		free((void *)fb_thread_data);
+		fb_thread_data = NULL;
+	}
+	free((void *)fb_parms);
+	fb_parms = NULL;
+
+	if (this->screen) {
+		/* The pixels are memalign()ed in PS3_SetVideoMode whether or not double buffering is on */
+		free(this->screen->pixels);
+		this->screen->pixels = NULL;
+	}
+	enable_cursor(1);
+	deprintf(1, "[PS3] VideoQuit\n");
+}
+
diff --git a/src/video/ps3/SDL_ps3video.h b/src/video/ps3/SDL_ps3video.h
new file mode 100644
index 000000000..4fe5a2b42
--- /dev/null
+++ b/src/video/ps3/SDL_ps3video.h
@@ -0,0 +1,165 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+
+#include "SDL_config.h"
+#include "../SDL_sysvideo.h"
+#include "SDL_mouse.h"
+#include "SDL_mutex.h"
+#include "spulibs/spu_common.h"
+
+#include <libspe2.h>
+#include <pthread.h>
+#include <linux/types.h>
+#include <linux/fb.h>
+#include <asm/ps3fb.h>
+#include <linux/vt.h>
+#include <termios.h>
+
+#ifndef _SDL_ps3video_h
+#define _SDL_ps3video_h
+
+/* Debugging: deprintf() prints to stdout when its level is <= DEBUG_LEVEL
+ * 0: No debug messages
+ * 1: Video debug messages
+ * 2: SPE debug messages
+ * 3: Memory addresses
+ */
+#define DEBUG_LEVEL 0
+
+#ifdef DEBUG_LEVEL
+#define deprintf( level, fmt, args... ) \
+    do \
+{ \
+    if ( (unsigned)(level) <= DEBUG_LEVEL ) \
+    { \
+        fprintf( stdout, fmt, ##args ); \
+        fflush( stdout ); \
+    } \
+} while ( 0 )
+#else
+#define deprintf( level, fmt, args... )
+#endif
+
+/* Framebuffer device */
+#define PS3_DEV_FB "/dev/fb0"
+
+/* Hidden "this" pointer for the video functions */
+#define _THIS   SDL_VideoDevice * this
+
+/* SPU thread data: one SPE program, its context and the pthread that runs it */
+typedef struct spu_data {
+    spe_context_ptr_t ctx;          /* context created by spe_context_create() in SPE_Boot */
+    pthread_t thread;               /* thread created in SPE_Start, runs SPE_RunContext */
+    spe_program_handle_t program;   /* SPE program loaded into ctx by SPE_Boot */
+    char * program_name;            /* name printed in debug messages */
+    unsigned int booted;            /* set to 1 once SPE_Boot has succeeded */
+    unsigned int keepalive;         /* if set, SPE_Start waits for SPU_READY and SPE_Shutdown sends SPU_EXIT */
+    unsigned int entry;             /* entry point, reset to SPE_DEFAULT_ENTRY by SPE_Start */
+    int error_code;                 /* reset to 0 by SPE_Start */
+    void * argp;                    /* argument pointer handed to spe_context_run() */
+} spu_data_t;
+
+/* Private video driver data needed for Cell support (accessed through the macros defined below the struct) */
+struct SDL_PrivateVideoData
+{
+    const char * const fb_dev_name; /* FB-device name */
+    int fb_dev_fd; /* File descriptor of the opened framebuffer device (PS3_DEV_FB) */
+    uint8_t * frame_buffer; /* mmap'd access to fbdev */
+
+    /* SPE threading stuff */
+    spu_data_t * fb_thread_data; /* runs the fb_writer_spu program */
+    spu_data_t * scaler_thread_data;
+    spu_data_t * converter_thread_data;
+
+    /* screeninfo (from linux/fb.h) */
+    struct fb_fix_screeninfo fb_finfo;
+    struct fb_var_screeninfo fb_vinfo;
+    struct fb_var_screeninfo fb_orig_vinfo; /* copy taken in PS3_VideoInit, written back in PS3_VideoQuit */
+
+    /* screeninfo (from asm/ps3fb.h) */
+    struct ps3fb_ioctl_res res;
+
+    unsigned int double_buffering; /* 1 if res.num_frames >= 2, else 0 */
+    uint32_t real_width;      // real width of screen (res.xres)
+    uint32_t real_height;     // real height of screen (res.yres)
+
+    uint32_t s_fb_pixel_size;   // 32:  4  24:  3  16:  2  15:  2
+    uint32_t fb_bits_per_pixel;   // 32: 32  24: 24  16: 16  15: 15
+
+    uint32_t config_count;
+
+    uint32_t s_input_line_length;   // precalculated: input_width * fb_pixel_size
+    uint32_t s_bounded_input_width; // width of input (bounded by writeable width)
+    uint32_t s_bounded_input_height;// height of input (bounded by writeable height)
+    uint32_t s_bounded_input_width_offset;  // offset from the left side (used for centering)
+    uint32_t s_bounded_input_height_offset; // offset from the upper side (used for centering)
+    uint32_t s_writeable_width; // width of screen which is writeable
+    uint32_t s_writeable_height;    // height of screen which is writeable
+
+    uint8_t * s_center[2]; // where to begin writing our image (centered?)
+    uint32_t s_center_index; // which s_center entry the next copy and page flip use
+
+    volatile void * s_pixels __attribute__((aligned(128))); /* surface pixels, memalign(16) in PS3_SetVideoMode */
+
+    /* Parameter block handed to the fb_writer SPE program */
+    volatile struct fb_writer_parms_t * fb_parms __attribute__((aligned(128)));
+};
+
+#define fb_dev_name     (this->hidden->fb_dev_name)
+#define fb_dev_fd       (this->hidden->fb_dev_fd)
+#define frame_buffer       (this->hidden->frame_buffer)
+#define fb_thread_data      (this->hidden->fb_thread_data)
+#define scaler_thread_data      (this->hidden->scaler_thread_data)
+#define converter_thread_data      (this->hidden->converter_thread_data)
+#define fb_parms           (this->hidden->fb_parms)
+#define SDL_nummodes		(this->hidden->SDL_nummodes)	/* NOTE(review): no such member in SDL_PrivateVideoData */
+#define SDL_modelist		(this->hidden->SDL_modelist)	/* NOTE(review): no such member in SDL_PrivateVideoData */
+#define SDL_videomode		(this->hidden->SDL_videomode)	/* NOTE(review): no such member in SDL_PrivateVideoData */
+#define fb_finfo        (this->hidden->fb_finfo)
+#define fb_vinfo        (this->hidden->fb_vinfo)
+#define fb_orig_vinfo   (this->hidden->fb_orig_vinfo)
+#define res             (this->hidden->res)
+#define double_buffering (this->hidden->double_buffering)
+#define real_width      (this->hidden->real_width)
+#define real_height     (this->hidden->real_height)
+#define s_fb_pixel_size   (this->hidden->s_fb_pixel_size)
+#define fb_bits_per_pixel (this->hidden->fb_bits_per_pixel)
+#define config_count (this->hidden->config_count)
+#define s_input_line_length (this->hidden->s_input_line_length)
+#define s_bounded_input_width (this->hidden->s_bounded_input_width)
+#define s_bounded_input_height (this->hidden->s_bounded_input_height)
+#define s_bounded_input_width_offset (this->hidden->s_bounded_input_width_offset)
+#define s_bounded_input_height_offset (this->hidden->s_bounded_input_height_offset)
+#define s_writeable_width (this->hidden->s_writeable_width)
+#define s_writeable_height (this->hidden->s_writeable_height)
+#define s_center          (this->hidden->s_center)
+#define s_center_index    (this->hidden->s_center_index)
+#define s_pixels           (this->hidden->s_pixels)
+
+#endif /* _SDL_ps3video_h */
+
+
diff --git a/src/video/ps3/SDL_ps3yuv.c b/src/video/ps3/SDL_ps3yuv.c
new file mode 100644
index 000000000..b1e17dae6
--- /dev/null
+++ b/src/video/ps3/SDL_ps3yuv.c
@@ -0,0 +1,340 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+
+#include "SDL_config.h"
+
+#include "SDL_video.h"
+#include "SDL_ps3video.h"
+#include "SDL_ps3yuv_c.h"
+#include "../SDL_yuvfuncs.h"
+#include "spulibs/spu_common.h"
+
+/* SPE program handles for the yuv2rgb and bilin_scaler programs; declared extern here */
+extern spe_program_handle_t yuv2rgb_spu;
+extern spe_program_handle_t bilin_scaler_spu;
+/* SPE control helpers called by the overlay code below; only declared in this file */
+int SPE_Start(_THIS, spu_data_t * spe_data);
+int SPE_Stop(_THIS, spu_data_t * spe_data);
+int SPE_Boot(_THIS, spu_data_t * spe_data);
+int SPE_Shutdown(_THIS, spu_data_t * spe_data);
+int SPE_SendMsg(_THIS, spu_data_t * spe_data, unsigned int msg);
+int SPE_WaitForMsg(_THIS, spu_data_t * spe_data, unsigned int msg);
+void SPE_RunContext(void *thread_argp);
+
+
+/* Overlay operations table; PS3_CreateYUVOverlay installs it as overlay->hwfuncs */
+static struct private_yuvhwfuncs ps3_yuvfuncs = {
+  PS3_LockYUVOverlay,
+  PS3_UnlockYUVOverlay,
+  PS3_DisplayYUVOverlay,
+  PS3_FreeYUVOverlay
+};
+
+/* Per-overlay private state, allocated in PS3_CreateYUVOverlay and released in PS3_FreeYUVOverlay */
+struct private_yuvhwdata {
+	SDL_Surface *display;	/* surface handed to PS3_CreateYUVOverlay; updated on display */
+	SDL_Surface *stretch;	/* only ever set to NULL in this file */
+    volatile void * pixels __attribute__((aligned(128)));	/* YUV planes, one contiguous buffer */
+
+	/* These are just so we don't have to allocate them separately */
+	Uint16 pitches[3];
+	Uint8 * planes[3];
+	/* Non-zero when the last display call had src and dst rectangles of different size */
+	unsigned int scale;
+
+	/* Scaled YUV picture; allocated and freed inside each scaling PS3_DisplayYUVOverlay call */
+	Uint8 * scaler_out __attribute__((aligned(128)));
+
+	/* YUV2RGB converter data */
+    volatile struct yuv2rgb_parms_t * converter_parms __attribute__((aligned(128)));
+
+	/* Scaler data */
+    volatile struct scale_parms_t * scaler_parms __attribute__((aligned(128)));
+	/* Set to 1 by PS3_LockYUVOverlay and back to 0 by PS3_UnlockYUVOverlay */
+	Uint8 locked;
+};
+
+
+/* Create a YV12/IYUV overlay of width x height for `display`: allocates the
+ * overlay, its private data, the SPU parameter blocks and the pixel buffer, and
+ * starts the converter SPE. Returns NULL with the SDL error set on failure. */
+SDL_Overlay *PS3_CreateYUVOverlay(_THIS, int width, int height, Uint32 format, SDL_Surface *display) {
+	/* Only RGB packed pixel conversion supported */
+	if ((display->format->BytesPerPixel != 2) &&
+			(display->format->BytesPerPixel != 3) &&
+			(display->format->BytesPerPixel != 4))
+	{
+		SDL_SetError ("Can't use YUV data on non 16/24/32 bit surfaces");
+		return NULL;
+	}
+
+	/* Double-check the requested format: only planar YV12 and IYUV are handled */
+	switch (format) {
+		case SDL_IYUV_OVERLAY:
+		case SDL_YV12_OVERLAY:
+			/* Supported YUV format */
+			break;
+		default:
+			SDL_SetError("Unsupported YUV format");
+			return NULL;
+	}
+
+	SDL_Overlay* overlay;
+	struct private_yuvhwdata* hwdata;
+
+	/* Create the overlay structure; SDL_calloc hands it back zeroed (hwdata == NULL) */
+	overlay = (SDL_Overlay *) SDL_calloc(1, sizeof(SDL_Overlay));
+	if (overlay == NULL) {
+		SDL_OutOfMemory();
+		return NULL;
+	}
+
+	/* Set the basic attributes */
+	overlay->format = format;
+	overlay->w = width;
+	overlay->h = height;
+
+	/* Set up the PS3 YUV surface function structure */
+	overlay->hwfuncs = &ps3_yuvfuncs;
+
+	/* Create the private data; zeroed, so stretch/scaler_out/parms start as NULL */
+	hwdata = (struct private_yuvhwdata *) SDL_calloc(1, sizeof(struct private_yuvhwdata));
+	if (hwdata == NULL) {
+		SDL_OutOfMemory();
+		SDL_FreeYUVOverlay(overlay);
+		return NULL;
+	}
+	overlay->hwdata = hwdata;
+
+	hwdata->display = display;
+
+	/* Create SPU parms structure */
+	hwdata->converter_parms = (struct yuv2rgb_parms_t *) memalign(16, sizeof(struct yuv2rgb_parms_t));
+	hwdata->scaler_parms = (struct scale_parms_t *) memalign(16, sizeof(struct scale_parms_t));
+	if (hwdata->converter_parms == NULL || hwdata->scaler_parms == NULL) {
+		SDL_FreeYUVOverlay(overlay);
+		SDL_OutOfMemory();
+		return(NULL);
+	}
+
+	/* Set up the SPEs; zero-allocated so no field is undefined if the error path shuts them down */
+	scaler_thread_data = (spu_data_t *) SDL_calloc(1, sizeof(spu_data_t));
+	converter_thread_data = (spu_data_t *) SDL_calloc(1, sizeof(spu_data_t));
+	if (converter_thread_data == NULL || scaler_thread_data == NULL) {
+		SDL_FreeYUVOverlay(overlay);
+		SDL_OutOfMemory();
+		return(NULL);
+	}
+
+	scaler_thread_data->program = bilin_scaler_spu;
+	scaler_thread_data->program_name = "bilin_scaler_spu";
+	scaler_thread_data->keepalive = 0;
+	scaler_thread_data->booted = 0;
+
+	converter_thread_data->program = yuv2rgb_spu;
+	converter_thread_data->program_name = "yuv2rgb_spu";
+	converter_thread_data->keepalive = 1;
+	converter_thread_data->booted = 0;
+
+	SPE_Start(this, converter_thread_data);
+
+	hwdata->pixels = (Uint8 *) memalign(16, width * height + ((width * height) >> 1));
+	if (hwdata->pixels == NULL) {
+		SDL_FreeYUVOverlay(overlay);
+		SDL_OutOfMemory();
+		return(NULL);
+	}
+
+	/* Find the pitch and offset values for the overlay */
+	overlay->pitches = hwdata->pitches;
+	overlay->pixels = hwdata->planes;
+	switch (format) {
+		case SDL_YV12_OVERLAY:
+		case SDL_IYUV_OVERLAY:
+			overlay->pitches[0] = overlay->w;
+			overlay->pitches[1] = overlay->pitches[0] / 2;
+			overlay->pitches[2] = overlay->pitches[0] / 2;
+			overlay->pixels[0] = (Uint8 *)hwdata->pixels;
+			overlay->pixels[1] = overlay->pixels[0] +
+				overlay->pitches[0] * overlay->h;
+			overlay->pixels[2] = overlay->pixels[1] +
+				overlay->pitches[1] * overlay->h / 2;
+			overlay->planes = 3;
+			break;
+		default:
+			/* We should never get here (caught above) */
+			break;
+	}
+
+	/* We're all done.. */
+	return overlay;
+}
+
+
+int PS3_LockYUVOverlay(_THIS, SDL_Overlay *overlay) {
+	/* Mark the overlay as locked; returns -1 if there is no overlay or no private data */
+	if ((overlay == NULL) || (overlay->hwdata == NULL)) {
+		return -1;
+	}
+	overlay->hwdata->locked = 1;
+	return 0;
+}
+
+
+void PS3_UnlockYUVOverlay(_THIS, SDL_Overlay *overlay) {
+	/* Clear the locked flag; a NULL overlay or missing private data is ignored */
+	if ((overlay == NULL) || (overlay->hwdata == NULL)) {
+		return;
+	}
+
+	overlay->hwdata->locked = 0;
+}
+
+
+int PS3_DisplayYUVOverlay(_THIS, SDL_Overlay *overlay, SDL_Rect *src, SDL_Rect *dst) {
+	/* Scale (if src and dst sizes differ) and convert the overlay to RGB on the SPEs; 0 on success, -1 on error */
+	if ((overlay == NULL) || (overlay->hwdata == NULL) || (src == NULL) || (dst == NULL)) {
+		return -1;
+	}
+	Uint8 *lum, *Cr, *Cb;
+	struct private_yuvhwdata *hwdata;
+	SDL_Surface *display;
+
+	hwdata = overlay->hwdata;
+	display = hwdata->display;
+
+	/* Do we have to scale? */
+	if ((src->w != dst->w) || (src->h != dst->h) ) {
+		hwdata->scale = 1;
+		deprintf(1, "[PS3] We need to scale\n");
+	} else {
+		hwdata->scale = 0;
+		deprintf(1, "[PS3] No scaling\n");
+	}
+
+	/* Find out where the various portions of the image are */
+	switch (overlay->format) {
+		case SDL_YV12_OVERLAY:
+			lum = (Uint8 *)overlay->pixels[0];
+			Cr =  (Uint8 *)overlay->pixels[1];
+			Cb =  (Uint8 *)overlay->pixels[2];
+			break;
+		case SDL_IYUV_OVERLAY:
+			lum = (Uint8 *)overlay->pixels[0];
+			Cr =  (Uint8 *)overlay->pixels[2];
+			Cb =  (Uint8 *)overlay->pixels[1];
+			break;
+		default:
+			SDL_SetError("Unsupported YUV format in blit");
+			return -1;
+	}
+
+	if (hwdata->scale) {
+		/* Alloc mem for scaled YUV picture */
+		hwdata->scaler_out = (Uint8 *) memalign(16, dst->w * dst->h + ((dst->w * dst->h) >> 1));
+		if (hwdata->scaler_out == NULL) {
+			SDL_OutOfMemory();
+			return -1;	/* the overlay is left intact: the caller still owns and frees it */
+		}
+
+		/* Set parms for scaling */
+		hwdata->scaler_parms->src_pixel_width = src->w;
+		hwdata->scaler_parms->src_pixel_height = src->h;
+		hwdata->scaler_parms->dst_pixel_width = dst->w;
+		hwdata->scaler_parms->dst_pixel_height = dst->h;
+		hwdata->scaler_parms->y_plane = lum;
+		hwdata->scaler_parms->v_plane = Cr;
+		hwdata->scaler_parms->u_plane = Cb;
+		hwdata->scaler_parms->dstBuffer = hwdata->scaler_out;
+		scaler_thread_data->argp = (void *)hwdata->scaler_parms;
+
+		/* Scale the YUV overlay to given size */
+		SPE_Start(this, scaler_thread_data);
+		SPE_Stop(this, scaler_thread_data);
+
+		/* Set parms for converting after scaling (scaler output is Y, then V, then U) */
+		hwdata->converter_parms->y_plane = hwdata->scaler_out;
+		hwdata->converter_parms->v_plane = hwdata->scaler_out + dst->w * dst->h;
+		hwdata->converter_parms->u_plane = hwdata->scaler_out + dst->w * dst->h + ((dst->w * dst->h) >> 2);
+	} else {
+		/* Set parms for converting */
+		hwdata->converter_parms->y_plane = lum;
+		hwdata->converter_parms->v_plane = Cr;
+		hwdata->converter_parms->u_plane = Cb;
+	}
+
+	hwdata->converter_parms->src_pixel_width = dst->w;
+	hwdata->converter_parms->src_pixel_height = dst->h;
+	hwdata->converter_parms->dstBuffer = (Uint8 *) s_pixels;
+	converter_thread_data->argp = (void *)hwdata->converter_parms;
+
+	/* Convert YUV overlay to RGB */
+	SPE_SendMsg(this, converter_thread_data, SPU_START);
+	SPE_SendMsg(this, converter_thread_data, (unsigned int)converter_thread_data->argp);
+
+	/* Centering */
+	s_bounded_input_width = dst->w;
+	s_bounded_input_height = dst->h;
+
+	/* UpdateRects() will do the rest.. */
+	SDL_UpdateRects(display, 1, dst);
+
+	if (hwdata->scale) {
+		SDL_free((void *)hwdata->scaler_out);
+		hwdata->scaler_out = NULL;	/* do not keep a dangling pointer between frames */
+	}
+	return 0;
+}
+
+
+void PS3_FreeYUVOverlay(_THIS, SDL_Overlay *overlay) {
+	/* Release everything PS3_CreateYUVOverlay attached to the overlay.
+	 * The SDL_Overlay structure itself is not freed here. */
+	if ((overlay == NULL) || (overlay->hwdata == NULL)) {
+		return;
+	}
+
+	struct private_yuvhwdata * hwdata = overlay->hwdata;
+
+	/* The thread data lives in this->hidden: reset the pointers so a later
+	 * call cannot shut down or free the same blocks a second time. */
+	SDL_free(scaler_thread_data);
+	scaler_thread_data = NULL;
+	if (converter_thread_data) {
+		SPE_Shutdown(this, converter_thread_data);
+		SDL_free(converter_thread_data);
+		converter_thread_data = NULL;
+	}
+
+	/* SPU parameter blocks and pixel buffer; any of them may still be NULL */
+	SDL_free((void *)hwdata->converter_parms);
+	SDL_free((void *)hwdata->scaler_parms);
+	SDL_free((void *)hwdata->pixels);
+	SDL_free(hwdata);
+	overlay->hwdata = NULL;
+}
+
diff --git a/src/video/ps3/SDL_ps3yuv_c.h b/src/video/ps3/SDL_ps3yuv_c.h
new file mode 100644
index 000000000..49f9d7095
--- /dev/null
+++ b/src/video/ps3/SDL_ps3yuv_c.h
@@ -0,0 +1,44 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+
+#include "SDL_config.h"
+
+#ifndef _SDL_ps3yuv_h
+#define _SDL_ps3yuv_h
+
+/* Declarations for the PS3 implementation of YUV video overlays */
+
+#include "SDL_video.h"
+/* Overlay entry points; the definitions are in SDL_ps3yuv.c */
+extern SDL_Overlay *PS3_CreateYUVOverlay(_THIS, int width, int height, Uint32 format, SDL_Surface *display);
+extern int PS3_DisplayYUVOverlay(_THIS, SDL_Overlay *overlay, SDL_Rect *src, SDL_Rect *dst);
+extern int PS3_LockYUVOverlay(_THIS, SDL_Overlay *overlay);
+extern void PS3_UnlockYUVOverlay(_THIS, SDL_Overlay *overlay);
+extern void PS3_FreeYUVOverlay(_THIS, SDL_Overlay *overlay);
+
+#endif /* _SDL_ps3yuv_h */
+
diff --git a/src/video/ps3/spulibs/Makefile b/src/video/ps3/spulibs/Makefile
new file mode 100644
index 000000000..dc580d943
--- /dev/null
+++ b/src/video/ps3/spulibs/Makefile
@@ -0,0 +1,83 @@
+# This Makefile builds the CELL BE SPU libs, each as a shared and a static library:
+# libfb_writer_spu, libyuv2rgb_spu, libbilin_scaler_spu
+
+# Toolchain
+SPU_GCC=/usr/bin/spu-gcc
+PPU_GCC=/usr/bin/gcc
+PPU_EMBEDSPU=/usr/bin/embedspu
+PPU_AR=/usr/bin/ar
+PPU_LD=/usr/bin/ld
+INSTALL=/usr/bin/install
+
+SPU_CFLAGS=-W -Wall -Winline -Wno-main -I. -I /usr/spu/include -I /opt/cell/sdk/usr/spu/include -finline-limit=10000 -ftree-vectorize -funroll-loops -fmodulo-sched -ffast-math -fPIC -O2
+
+# Directory the libraries are installed into; usually /usr/lib, depending on your distribution
+PREFIX=/usr/lib
+
+.PHONY: all install uninstall clean
+all: libfb_writer_spu.a libfb_writer_spu.so \
+				libyuv2rgb_spu.so libyuv2rgb_spu.a \
+				libbilin_scaler_spu.so libbilin_scaler_spu.a
+
+
+# fb_writer: compile the SPU executable, wrap it with embedspu, then package the object as .so and .a
+fb_writer_spu-embed.o: fb_writer.c spu_common.h
+	$(SPU_GCC) $(SPU_CFLAGS) -o fb_writer_spu fb_writer.c -lm
+	$(PPU_EMBEDSPU) -m32 fb_writer_spu fb_writer_spu fb_writer_spu-embed.o
+
+libfb_writer_spu.so: fb_writer_spu-embed.o
+	$(PPU_LD) -o libfb_writer_spu.so -shared -soname=libfb_writer_spu.so fb_writer_spu-embed.o
+
+libfb_writer_spu.a: fb_writer_spu-embed.o
+	$(PPU_AR) -qcs libfb_writer_spu.a fb_writer_spu-embed.o
+
+
+# yuv2rgb_converter: same three steps, producing libyuv2rgb_spu.{so,a}
+yuv2rgb_spu-embed.o: yuv2rgb_converter.c spu_common.h
+	$(SPU_GCC) $(SPU_CFLAGS) -o yuv2rgb_spu yuv2rgb_converter.c -lm
+	$(PPU_EMBEDSPU) -m32 yuv2rgb_spu yuv2rgb_spu yuv2rgb_spu-embed.o
+
+libyuv2rgb_spu.a: yuv2rgb_spu-embed.o
+	$(PPU_AR) -qcs libyuv2rgb_spu.a yuv2rgb_spu-embed.o
+
+libyuv2rgb_spu.so: yuv2rgb_spu-embed.o
+	$(PPU_LD) -o libyuv2rgb_spu.so -shared -soname=libyuv2rgb_spu.so yuv2rgb_spu-embed.o
+
+
+# bilin_scaler: same three steps, producing libbilin_scaler_spu.{so,a}
+bilin_scaler_spu-embed.o: bilin_scaler.c spu_common.h
+	$(SPU_GCC) $(SPU_CFLAGS) -o bilin_scaler_spu bilin_scaler.c -lm
+	$(PPU_EMBEDSPU) -m32 bilin_scaler_spu bilin_scaler_spu bilin_scaler_spu-embed.o
+
+libbilin_scaler_spu.a: bilin_scaler_spu-embed.o
+	$(PPU_AR) -qcs libbilin_scaler_spu.a bilin_scaler_spu-embed.o
+
+libbilin_scaler_spu.so: bilin_scaler_spu-embed.o
+	$(PPU_LD) -o libbilin_scaler_spu.so -shared -soname=libbilin_scaler_spu.so bilin_scaler_spu-embed.o
+# Shared objects are installed 0755; static archives are plain data, so 0644
+install: libfb_writer_spu.a libfb_writer_spu.so \
+				libyuv2rgb_spu.so libyuv2rgb_spu.a \
+				libbilin_scaler_spu.so libbilin_scaler_spu.a
+	$(INSTALL) -c -m 0755 libfb_writer_spu.so $(PREFIX)/.
+	$(INSTALL) -c -m 0644 libfb_writer_spu.a $(PREFIX)/.
+	$(INSTALL) -c -m 0755 libyuv2rgb_spu.so $(PREFIX)/.
+	$(INSTALL) -c -m 0644 libyuv2rgb_spu.a $(PREFIX)/.
+	$(INSTALL) -c -m 0755 libbilin_scaler_spu.so $(PREFIX)/.
+	$(INSTALL) -c -m 0644 libbilin_scaler_spu.a $(PREFIX)/.
+
+
+# No prerequisites on purpose: listing the installed files made make stop with
+# "No rule to make target" whenever one of them was already missing.
+uninstall:
+	rm -f $(PREFIX)/libfb_writer_spu.a
+	rm -f $(PREFIX)/libfb_writer_spu.so
+	rm -f $(PREFIX)/libyuv2rgb_spu.so
+	rm -f $(PREFIX)/libyuv2rgb_spu.a
+	rm -f $(PREFIX)/libbilin_scaler_spu.so
+	rm -f $(PREFIX)/libbilin_scaler_spu.a
+
+# Remove every file the build rules above generate
+clean:
+	rm -f bilin_scaler_spu-embed.o libbilin_scaler_spu.so libbilin_scaler_spu.a bilin_scaler_spu
+	rm -f yuv2rgb_spu-embed.o libyuv2rgb_spu.so libyuv2rgb_spu.a yuv2rgb_spu
+	rm -f fb_writer_spu-embed.o libfb_writer_spu.so libfb_writer_spu.a fb_writer_spu
diff --git a/src/video/ps3/spulibs/bilin_scaler.c b/src/video/ps3/spulibs/bilin_scaler.c
new file mode 100644
index 000000000..be9b5c6e8
--- /dev/null
+++ b/src/video/ps3/spulibs/bilin_scaler.c
@@ -0,0 +1,2050 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+
+#include "spu_common.h"
+
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+
+// Debugging: uncomment DEBUG to get trace output on stdout
+//#define DEBUG
+
+#ifdef DEBUG
+#define deprintf(fmt, args... ) \
+	do { fprintf( stdout, fmt, ##args ); \
+	fflush( stdout ); } while (0)	/* one statement, so it is safe in an unbraced if/else */
+#else
+#define deprintf( fmt, args... )
+#endif
+/* Scaling parameters; main() DMAs them in from the address it receives in argp */
+struct scale_parms_t parms __attribute__((aligned(128)));
+
+/* Input buffers, two of each so one can be filled while the other is scaled.
+ * Per buffer the scaler fetches 4 lines Y and 2 lines each of V and U; the
+ * extra 128 bytes per line leave room for fetching misaligned V/U data.
+ */
+unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128)));
+unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
+unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
+
+/* output buffers for scaling, per buffer: 2 lines Y, 1 line V, 1 line U */
+unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
+unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
+unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
+
+/* clamp bounds used by vfloat_to_vuint() for the float to int conversion */
+static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
+static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
+/* Per-line scalers called by the scale_* routines; only declared at this point */
+void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
+void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
+/* One routine per source/destination width combination; main() picks by (width & 0x1f) */
+void scale_srcw16_dstw16();
+void scale_srcw16_dstw32();
+void scale_srcw32_dstw16();
+void scale_srcw32_dstw32();
+
+int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
+{
+	unsigned int src_w32, dst_w32;	/* non-zero when that width is a multiple of 32 */
+
+	deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);
+	/* fetch the scaling parameters from main memory and wait until they have arrived */
+	spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
+	DMA_WAIT_TAG(TAG_INIT);
+
+	deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
+			parms.dst_pixel_width, parms.dst_pixel_height);
+
+	/* dispatch on the alignment of the two widths */
+	src_w32 = ((parms.src_pixel_width & 0x1f) == 0);
+	dst_w32 = ((parms.dst_pixel_width & 0x1F) == 0);
+	if (!src_w32 && !dst_w32) {
+		deprintf("[SPU] Using scale_srcw16_dstw16\n");
+		scale_srcw16_dstw16();
+	} else if (!src_w32) {
+		deprintf("[SPU] Using scale_srcw16_dstw32\n");
+		scale_srcw16_dstw32();
+	} else if (!dst_w32) {
+		deprintf("[SPU] Using scale_srcw32_dstw16\n");
+		scale_srcw32_dstw16();
+	} else {
+		deprintf("[SPU] Using scale_srcw32_dstw32\n");
+		scale_srcw32_dstw32();
+	}
+	deprintf("[SPU] bilin_scaler_spu... done!\n");
+	return 0;
+}
+
+
+/*
+ * vfloat_to_vuint()
+ *
+ * clamps every lane of a float vector to the range [0.1, 255] and
+ * converts the result to an unsigned int vector
+ *
+ * @param vec_s float vector for conversion
+ * @returns converted unsigned int vector
+ */
+inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
+	/* lanes smaller than 0.1 are replaced by 0.1 */
+	vector float floored = spu_sel(vec_s, vec_0_1, spu_cmpgt(vec_0_1, vec_s));
+	/* lanes larger than 255 are replaced by 255 */
+	vector unsigned int too_big = spu_cmpgt(floored, vec_255);
+	vector float clamped = spu_sel(floored, vec_255, too_big);
+	return spu_convtu(clamped,0);
+}
+
+
+/*
+ * scale_srcw16_dstw16()
+ *
+ * bilinear YUV scaler, picked by main() when neither the source nor the
+ * destination width is a multiple of 32; Y lines are scaled with
+ * bilinear_scale_line_w16, V/U lines with bilinear_scale_line_w8; result stored in RAM
+ */
+void scale_srcw16_dstw16() {
+	// extract parameters
+	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
+
+	unsigned int src_width = parms.src_pixel_width;
+	unsigned int src_height = parms.src_pixel_height;
+	unsigned int dst_width = parms.dst_pixel_width;
+	unsigned int dst_height = parms.dst_pixel_height;
+
+	// YVU
+	unsigned int src_linestride_y = src_width;
+	unsigned int src_dbl_linestride_y = src_width<<1;
+	unsigned int src_linestride_vu = src_width>>1;
+	unsigned int src_dbl_linestride_vu = src_width;
+
+	// scaled YVU
+	unsigned int scaled_src_linestride_y = dst_width;
+
+	// ram addresses
+	unsigned char* src_addr_y = parms.y_plane;
+	unsigned char* src_addr_v = parms.v_plane;
+	unsigned char* src_addr_u = parms.u_plane;
+
+	// for handling misalignment, addresses are precalculated
+	unsigned char* precalc_src_addr_v = src_addr_v;
+	unsigned char* precalc_src_addr_u = src_addr_u;
+
+	unsigned int dst_picture_size = dst_width*dst_height;
+
+	// Sizes for destination
+	unsigned int dst_dbl_linestride_y = dst_width<<1;
+	unsigned int dst_dbl_linestride_vu = dst_width>>1;
+
+	// Perform address calculation for Y, V and U in main memory with dst_addr as base
+	unsigned char* dst_addr_main_memory_y = dst_addr;
+	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
+	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
+
+	// calculate scale factors
+	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
+	float y_scale = (float)src_height/(float)dst_height;
+
+	// double buffered processing
+	// buffer switching
+	unsigned int curr_src_idx = 0;
+	unsigned int curr_dst_idx = 0;
+	unsigned int next_src_idx, next_dst_idx;
+
+	// 2 lines y as output, upper and lowerline
+	unsigned int curr_interpl_y_upper = 0;
+	unsigned int next_interpl_y_upper;
+	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
+	// only 1 line v/u output, both planes have the same dimension
+	unsigned int curr_interpl_vu = 0;
+	unsigned int next_interpl_vu;
+
+	// weights, calculated in every loop iteration
+	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
+	vector float vf_next_NSweight_y_upper;
+	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
+	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
+	vector float vf_next_NSweight_vu;
+
+	// line indices for the src picture
+	float curr_src_y_upper = 0.0f, next_src_y_upper;
+	float curr_src_y_lower, next_src_y_lower;
+	float curr_src_vu = 0.0f, next_src_vu;
+
+	// line indices for the dst picture
+	unsigned int dst_y=0, dst_vu=0;
+
+	// offset for the v and u plane to handle misalignement
+	unsigned int curr_lsoff_v = 0, next_lsoff_v;
+	unsigned int curr_lsoff_u = 0, next_lsoff_u;
+
+	// calculate lower line indices
+	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
+	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
+	// lower line weight
+	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
+	// fewer than 2 output lines: nothing to produce, and (dst_height>>1)-1 below would wrap around
+	if( dst_height < 2 ) return;
+	// start partially double buffered processing
+	// get initial data, 2 sets of y, 1 set v, 1 set u
+	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
+	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
+			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
+			src_dbl_linestride_y,
+			RETR_BUF,
+			0, 0 );
+	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+
+	/* iteration loop
+	 * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
+	 * the scaled output is 2 lines y, 1 line v, 1 line u
+	 * the scaled YUV output is stored to RAM
+	 */
+	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
+		dst_y = dst_vu<<1;
+
+		// calculate next indices
+		next_src_vu = ((float)dst_vu+1)*y_scale;
+		next_src_y_upper = ((float)dst_y+2)*y_scale;
+		next_src_y_lower = ((float)dst_y+3)*y_scale;
+
+		next_interpl_vu = (unsigned int) next_src_vu;
+		next_interpl_y_upper = (unsigned int) next_src_y_upper;
+		next_interpl_y_lower = (unsigned int) next_src_y_lower;
+
+		// calculate weight NORTH-SOUTH
+		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
+		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
+		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
+
+		// get next lines
+		next_src_idx = curr_src_idx^1;
+		next_dst_idx = curr_dst_idx^1;
+
+		// 4 lines y
+		mfc_get( y_plane[next_src_idx],
+				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
+				src_dbl_linestride_y,
+				RETR_BUF+next_src_idx,
+				0, 0 );
+		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
+				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
+				src_dbl_linestride_y,
+				RETR_BUF+next_src_idx,
+				0, 0 );
+
+		// 2 lines v
+		precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
+		next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
+		mfc_get( v_plane[next_src_idx],
+				((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
+				src_dbl_linestride_vu+(next_lsoff_v<<1),
+				RETR_BUF+next_src_idx,
+				0, 0 );
+		// 2 lines u (the transfer size must use the u offset, not the v offset)
+		precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
+		next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
+		mfc_get( u_plane[next_src_idx],
+				((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
+				src_dbl_linestride_vu+(next_lsoff_u<<1),
+				RETR_BUF+next_src_idx,
+				0, 0 );
+
+		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+		// scaling
+		// work line y_upper
+		bilinear_scale_line_w16( y_plane[curr_src_idx],
+				scaled_y_plane[curr_src_idx],
+				dst_width,
+				vf_x_scale,
+				vf_curr_NSweight_y_upper,
+				src_linestride_y );
+		// work line y_lower
+		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+				dst_width,
+				vf_x_scale,
+				vf_curr_NSweight_y_lower,
+				src_linestride_y );
+		// work line v
+		bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
+				scaled_v_plane[curr_src_idx],
+				dst_width>>1,
+				vf_x_scale,
+				vf_curr_NSweight_vu,
+				src_linestride_vu );
+		// work line u
+		bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
+				scaled_u_plane[curr_src_idx],
+				dst_width>>1,
+				vf_x_scale,
+				vf_curr_NSweight_vu,
+				src_linestride_vu );
+
+
+		// Store the result back to main memory into a destination buffer in YUV format
+		//---------------------------------------------------------------------------------------------
+		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+		// Perform three DMA transfers to 3 different locations in the main memory!
+		// dst_width:	Pixel width of destination image
+		// dst_addr:	Destination address in main memory
+		// dst_vu:	Counter which is incremented one by one
+		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+		mfc_put(	scaled_y_plane[curr_src_idx],					// What from local store (addr)
+				(unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
+				dst_dbl_linestride_y,						// Two Y lines (depending on the width of the destination resolution)
+				STR_BUF+curr_dst_idx,						// Tag
+				0, 0 );
+
+		mfc_put(	scaled_v_plane[curr_src_idx],					// What from local store (addr)
+				(unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+				dst_dbl_linestride_vu,						// One V line of dst_width/2 bytes
+				STR_BUF+curr_dst_idx,						// Tag
+				0, 0 );
+
+		mfc_put(	scaled_u_plane[curr_src_idx],					// What from local store (addr)
+				(unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+				dst_dbl_linestride_vu,						// One U line of dst_width/2 bytes
+				STR_BUF+curr_dst_idx,						// Tag
+				0, 0 );
+		//---------------------------------------------------------------------------------------------
+
+
+		// update for next cycle
+		curr_src_idx = next_src_idx;
+		curr_dst_idx = next_dst_idx;
+
+		curr_interpl_y_upper = next_interpl_y_upper;
+		curr_interpl_y_lower = next_interpl_y_lower;
+		curr_interpl_vu = next_interpl_vu;
+
+		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
+		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
+		vf_curr_NSweight_vu = vf_next_NSweight_vu;
+
+		curr_src_y_upper = next_src_y_upper;
+		curr_src_y_lower = next_src_y_lower;
+		curr_src_vu = next_src_vu;
+
+		curr_lsoff_v = next_lsoff_v;
+		curr_lsoff_u = next_lsoff_u;
+	}
+
+
+
+	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+	// scaling of the last line pair, fetched by the final loop iteration (or the initial get)
+	// work line y_upper
+	bilinear_scale_line_w16( y_plane[curr_src_idx],
+			scaled_y_plane[curr_src_idx],
+			dst_width,
+			vf_x_scale,
+			vf_curr_NSweight_y_upper,
+			src_linestride_y );
+	// work line y_lower
+	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+			dst_width,
+			vf_x_scale,
+			vf_curr_NSweight_y_lower,
+			src_linestride_y );
+	// work line v
+	bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
+			scaled_v_plane[curr_src_idx],
+			dst_width>>1,
+			vf_x_scale,
+			vf_curr_NSweight_vu,
+			src_linestride_vu );
+	// work line u
+	bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
+			scaled_u_plane[curr_src_idx],
+			dst_width>>1,
+			vf_x_scale,
+			vf_curr_NSweight_vu,
+			src_linestride_vu );
+
+
+	// Store the result back to main memory into a destination buffer in YUV format
+	//---------------------------------------------------------------------------------------------
+	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+	// Perform three DMA transfers to 3 different locations in the main memory!
+	// dst_width:	Pixel width of destination image
+	// dst_addr:	Destination address in main memory
+	// dst_vu:	Counter which is incremented one by one
+	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+	mfc_put(	scaled_y_plane[curr_src_idx],					// What from local store (addr)
+			(unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
+			dst_dbl_linestride_y,						// Two Y lines (depending on the width of the destination resolution)
+			STR_BUF+curr_dst_idx,						// Tag
+			0, 0 );
+
+	mfc_put(	scaled_v_plane[curr_src_idx],					// What from local store (addr)
+			(unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+			dst_dbl_linestride_vu,						// One V line of dst_width/2 bytes
+			STR_BUF+curr_dst_idx,						// Tag
+			0, 0 );
+
+	mfc_put(	scaled_u_plane[curr_src_idx],					// What from local store (addr)
+			(unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+			dst_dbl_linestride_vu,						// One U line of dst_width/2 bytes
+			STR_BUF+curr_dst_idx,						// Tag
+			0, 0 );
+
+	// wait for completion
+	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+	//---------------------------------------------------------------------------------------------
+}
+
+
+/*
+ * scale_srcw16_dstw32()
+ *
+ * Bilinearly scales a planar Y/V/U image read from parms into parms.dstBuffer.
+ * Name suggests src width % 16 == 0 and dst width % 32 == 0 (TODO confirm).
+ * No yuv2rgb conversion happens here: the scaled Y, V and U planes are
+ * DMA-ed back to main memory, two Y lines plus one V and one U line per step.
+ */
+void scale_srcw16_dstw32() {
+	// extract parameters
+	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
+
+	unsigned int src_width = parms.src_pixel_width;
+	unsigned int src_height = parms.src_pixel_height;
+	unsigned int dst_width = parms.dst_pixel_width;
+	unsigned int dst_height = parms.dst_pixel_height;
+
+	// YVU
+	unsigned int src_linestride_y = src_width;
+	unsigned int src_dbl_linestride_y = src_width<<1;
+	unsigned int src_linestride_vu = src_width>>1;
+	unsigned int src_dbl_linestride_vu = src_width;
+	// scaled YVU
+	unsigned int scaled_src_linestride_y = dst_width;
+
+	// ram addresses
+	unsigned char* src_addr_y = parms.y_plane;
+	unsigned char* src_addr_v = parms.v_plane;
+	unsigned char* src_addr_u = parms.u_plane;
+
+	unsigned int dst_picture_size = dst_width*dst_height;
+
+	// Sizes for destination
+	unsigned int dst_dbl_linestride_y = dst_width<<1;	// two luma lines
+	unsigned int dst_dbl_linestride_vu = dst_width>>1;	// one chroma line (dst_width/2 bytes), despite the name
+
+	// Perform address calculation for Y, V and U in main memory with dst_addr as base
+	unsigned char* dst_addr_main_memory_y = dst_addr;
+	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
+	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
+
+
+	// for handling misalignment, addresses are precalculated
+	unsigned char* precalc_src_addr_v = src_addr_v;
+	unsigned char* precalc_src_addr_u = src_addr_u;
+
+	// calculate scale factors
+	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
+	float y_scale = (float)src_height/(float)dst_height;
+
+	// double buffered processing
+	// buffer switching
+	unsigned int curr_src_idx = 0;
+	unsigned int curr_dst_idx = 0;
+	unsigned int next_src_idx, next_dst_idx;
+
+	// 2 lines y as output, upper and lowerline
+	unsigned int curr_interpl_y_upper = 0;
+	unsigned int next_interpl_y_upper;
+	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
+	// only 1 line v/u output, both planes have the same dimension
+	unsigned int curr_interpl_vu = 0;
+	unsigned int next_interpl_vu;
+
+	// weights, calculated in every loop iteration
+	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
+	vector float vf_next_NSweight_y_upper;
+	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
+	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
+	vector float vf_next_NSweight_vu;
+
+	// line indices for the src picture
+	float curr_src_y_upper = 0.0f, next_src_y_upper;
+	float curr_src_y_lower, next_src_y_lower;
+	float curr_src_vu = 0.0f, next_src_vu;
+
+	// line indices for the dst picture
+	unsigned int dst_y=0, dst_vu=0;
+
+	// offset for the v and u plane to handle misalignment
+	unsigned int curr_lsoff_v = 0, next_lsoff_v;
+	unsigned int curr_lsoff_u = 0, next_lsoff_u;
+
+	// calculate lower line indices
+	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
+	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
+	// lower line weight
+	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
+
+
+	// start partially double buffered processing
+	// get initial data, 2 sets of y, 1 set v, 1 set u
+	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
+	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
+			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
+			src_dbl_linestride_y,
+			RETR_BUF,
+			0, 0 );
+	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+
+	// iteration loop
+	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
+	// the scaled output is 2 lines y, 1 line v, 1 line u
+	// the scaled YUV output is stored to RAM (the last line set is handled after the loop)
+	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
+		dst_y = dst_vu<<1;
+
+		// calculate next indices
+		next_src_vu = ((float)dst_vu+1)*y_scale;
+		next_src_y_upper = ((float)dst_y+2)*y_scale;
+		next_src_y_lower = ((float)dst_y+3)*y_scale;
+
+		next_interpl_vu = (unsigned int) next_src_vu;
+		next_interpl_y_upper = (unsigned int) next_src_y_upper;
+		next_interpl_y_lower = (unsigned int) next_src_y_lower;
+
+		// calculate weight NORTH-SOUTH
+		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
+		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
+		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
+
+		// get next lines
+		next_src_idx = curr_src_idx^1;
+		next_dst_idx = curr_dst_idx^1;
+
+		// 4 lines y
+		mfc_get( y_plane[next_src_idx],
+				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
+				src_dbl_linestride_y,
+				RETR_BUF+next_src_idx,
+				0, 0 );
+		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
+				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
+				src_dbl_linestride_y,
+				RETR_BUF+next_src_idx,
+				0, 0 );
+
+		// 2 lines v, fetched from the 16-byte aligned address below the wanted one
+		precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
+		next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
+		mfc_get( v_plane[next_src_idx],
+				((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
+				src_dbl_linestride_vu+(next_lsoff_v<<1),
+				RETR_BUF+next_src_idx,
+				0, 0 );
+		// 2 lines u, fetched from the 16-byte aligned address below the wanted one
+		precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
+		next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
+		mfc_get( u_plane[next_src_idx],
+				((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
+				src_dbl_linestride_vu+(next_lsoff_u<<1),	// pad by the U offset (was V's offset)
+				RETR_BUF+next_src_idx,
+				0, 0 );
+
+		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+		// scaling
+		// work line y_upper
+		bilinear_scale_line_w16( y_plane[curr_src_idx],
+				scaled_y_plane[curr_src_idx],
+				dst_width,
+				vf_x_scale,
+				vf_curr_NSweight_y_upper,
+				src_linestride_y );
+		// work line y_lower
+		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+				dst_width,
+				vf_x_scale,
+				vf_curr_NSweight_y_lower,
+				src_linestride_y );
+		// work line v
+		bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
+				scaled_v_plane[curr_src_idx],
+				dst_width>>1,
+				vf_x_scale,
+				vf_curr_NSweight_vu,
+				src_linestride_vu );
+		// work line u
+		bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
+				scaled_u_plane[curr_src_idx],
+				dst_width>>1,
+				vf_x_scale,
+				vf_curr_NSweight_vu,
+				src_linestride_vu );
+
+		//---------------------------------------------------------------------------------------------
+		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+		// Perform three DMA transfers to 3 different locations in the main memory!
+		// dst_width:	Pixel width of destination image
+		// dst_addr:	Destination address in main memory
+		// dst_vu:	Counter which is incremented one by one
+		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+
+		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
+				(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
+				dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
+				STR_BUF+curr_dst_idx,								// Tag
+				0, 0 );
+
+		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
+				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+				dst_dbl_linestride_vu,								// One V line (dst_width/2 bytes)
+				STR_BUF+curr_dst_idx,								// Tag
+				0, 0 );
+
+		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
+				(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+				dst_dbl_linestride_vu,								// One U line (dst_width/2 bytes)
+				STR_BUF+curr_dst_idx,								// Tag
+				0, 0 );
+		//---------------------------------------------------------------------------------------------
+
+
+		// update for next cycle
+		curr_src_idx = next_src_idx;
+		curr_dst_idx = next_dst_idx;
+
+		curr_interpl_y_upper = next_interpl_y_upper;
+		curr_interpl_y_lower = next_interpl_y_lower;
+		curr_interpl_vu = next_interpl_vu;
+
+		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;	// was a self-assignment: weight never advanced
+		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;	// was a self-assignment: weight never advanced
+		vf_curr_NSweight_vu = vf_next_NSweight_vu;
+
+		curr_src_y_upper = next_src_y_upper;
+		curr_src_y_lower = next_src_y_lower;
+		curr_src_vu = next_src_vu;
+
+		curr_lsoff_v = next_lsoff_v;
+		curr_lsoff_u = next_lsoff_u;
+	}
+
+
+
+	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+	// scaling of the last line set, which was fetched by the final loop iteration
+	// work line y_upper
+	bilinear_scale_line_w16( y_plane[curr_src_idx],
+			scaled_y_plane[curr_src_idx],
+			dst_width,
+			vf_x_scale,
+			vf_curr_NSweight_y_upper,
+			src_linestride_y );
+	// work line y_lower
+	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+			dst_width,
+			vf_x_scale,
+			vf_curr_NSweight_y_lower,
+			src_linestride_y );
+	// work line v
+	bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
+			scaled_v_plane[curr_src_idx],
+			dst_width>>1,
+			vf_x_scale,
+			vf_curr_NSweight_vu,
+			src_linestride_vu );
+	// work line u
+	bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
+			scaled_u_plane[curr_src_idx],
+			dst_width>>1,
+			vf_x_scale,
+			vf_curr_NSweight_vu,
+			src_linestride_vu );
+
+	//---------------------------------------------------------------------------------------------
+	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+	// Perform three DMA transfers to 3 different locations in the main memory!
+	// dst_width:	Pixel width of destination image
+	// dst_addr:	Destination address in main memory
+	// dst_vu:	Counter which is incremented one by one
+	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+
+	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
+			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
+			dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
+			STR_BUF+curr_dst_idx,								// Tag
+			0, 0 );
+
+	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
+			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+			dst_dbl_linestride_vu,								// One V line (dst_width/2 bytes)
+			STR_BUF+curr_dst_idx,								// Tag
+			0, 0 );
+
+	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
+			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+			dst_dbl_linestride_vu,								// One U line (dst_width/2 bytes)
+			STR_BUF+curr_dst_idx,								// Tag
+			0, 0 );
+
+	// wait for completion
+	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+	//---------------------------------------------------------------------------------------------
+}
+
+
+/*
+ * scale_srcw32_dstw16()
+ *
+ * Bilinearly scales a planar Y/V/U image read from parms into parms.dstBuffer.
+ * Name suggests src width % 32 == 0 and dst width % 16 == 0 (TODO confirm).
+ * No yuv2rgb conversion happens here: the scaled Y, V and U planes are
+ * DMA-ed back to main memory, two Y lines plus one V and one U line per step.
+ */
+void scale_srcw32_dstw16() {
+	// extract parameters
+	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
+
+	unsigned int src_width = parms.src_pixel_width;
+	unsigned int src_height = parms.src_pixel_height;
+	unsigned int dst_width = parms.dst_pixel_width;
+	unsigned int dst_height = parms.dst_pixel_height;
+
+	// YVU
+	unsigned int src_linestride_y = src_width;
+	unsigned int src_dbl_linestride_y = src_width<<1;
+	unsigned int src_linestride_vu = src_width>>1;
+	unsigned int src_dbl_linestride_vu = src_width;
+	// scaled YVU
+	unsigned int scaled_src_linestride_y = dst_width;
+
+	// ram addresses
+	unsigned char* src_addr_y = parms.y_plane;
+	unsigned char* src_addr_v = parms.v_plane;
+	unsigned char* src_addr_u = parms.u_plane;
+
+	unsigned int dst_picture_size = dst_width*dst_height;
+
+	// Sizes for destination
+	unsigned int dst_dbl_linestride_y = dst_width<<1;	// two luma lines
+	unsigned int dst_dbl_linestride_vu = dst_width>>1;	// one chroma line (dst_width/2 bytes), despite the name
+
+	// Perform address calculation for Y, V and U in main memory with dst_addr as base
+	unsigned char* dst_addr_main_memory_y = dst_addr;
+	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
+	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
+
+	// calculate scale factors
+	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
+	float y_scale = (float)src_height/(float)dst_height;
+
+	// double buffered processing
+	// buffer switching
+	unsigned int curr_src_idx = 0;
+	unsigned int curr_dst_idx = 0;
+	unsigned int next_src_idx, next_dst_idx;
+
+	// 2 lines y as output, upper and lowerline
+	unsigned int curr_interpl_y_upper = 0;
+	unsigned int next_interpl_y_upper;
+	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
+	// only 1 line v/u output, both planes have the same dimension
+	unsigned int curr_interpl_vu = 0;
+	unsigned int next_interpl_vu;
+
+	// weights, calculated in every loop iteration
+	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
+	vector float vf_next_NSweight_y_upper;
+	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
+	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
+	vector float vf_next_NSweight_vu;
+
+	// line indices for the src picture
+	float curr_src_y_upper = 0.0f, next_src_y_upper;
+	float curr_src_y_lower, next_src_y_lower;
+	float curr_src_vu = 0.0f, next_src_vu;
+
+	// line indices for the dst picture
+	unsigned int dst_y=0, dst_vu=0;
+
+	// calculate lower line indices
+	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
+	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
+	// lower line weight
+	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
+
+
+	// start partially double buffered processing
+	// get initial data, 2 sets of y, 1 set v, 1 set u
+	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
+	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
+			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
+			src_dbl_linestride_y,
+			RETR_BUF,
+			0, 0 );
+	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+
+	// iteration loop
+	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
+	// the scaled output is 2 lines y, 1 line v, 1 line u
+	// the scaled YUV output is stored to RAM (the last line set is handled after the loop)
+	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
+		dst_y = dst_vu<<1;
+
+		// calculate next indices
+		next_src_vu = ((float)dst_vu+1)*y_scale;
+		next_src_y_upper = ((float)dst_y+2)*y_scale;
+		next_src_y_lower = ((float)dst_y+3)*y_scale;
+
+		next_interpl_vu = (unsigned int) next_src_vu;
+		next_interpl_y_upper = (unsigned int) next_src_y_upper;
+		next_interpl_y_lower = (unsigned int) next_src_y_lower;
+
+		// calculate weight NORTH-SOUTH
+		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
+		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
+		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
+
+		// get next lines
+		next_src_idx = curr_src_idx^1;
+		next_dst_idx = curr_dst_idx^1;
+
+		// 4 lines y
+		mfc_get( y_plane[next_src_idx],
+				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
+				src_dbl_linestride_y,
+				RETR_BUF+next_src_idx,
+				0, 0 );
+		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
+				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
+				src_dbl_linestride_y,
+				RETR_BUF+next_src_idx,
+				0, 0 );
+
+		// 2 lines v
+		mfc_get( v_plane[next_src_idx],
+				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
+				src_dbl_linestride_vu,
+				RETR_BUF+next_src_idx,
+				0, 0 );
+		// 2 lines u
+		mfc_get( u_plane[next_src_idx],
+				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
+				src_dbl_linestride_vu,
+				RETR_BUF+next_src_idx,
+				0, 0 );
+
+		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+		// scaling
+		// work line y_upper
+		bilinear_scale_line_w16( y_plane[curr_src_idx],
+				scaled_y_plane[curr_src_idx],
+				dst_width,
+				vf_x_scale,
+				vf_curr_NSweight_y_upper,
+				src_linestride_y );
+		// work line y_lower
+		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+				dst_width,
+				vf_x_scale,
+				vf_curr_NSweight_y_lower,
+				src_linestride_y );
+		// work line v
+		bilinear_scale_line_w16( v_plane[curr_src_idx],
+				scaled_v_plane[curr_src_idx],
+				dst_width>>1,
+				vf_x_scale,
+				vf_curr_NSweight_vu,
+				src_linestride_vu );
+		// work line u
+		bilinear_scale_line_w16( u_plane[curr_src_idx],
+				scaled_u_plane[curr_src_idx],
+				dst_width>>1,
+				vf_x_scale,
+				vf_curr_NSweight_vu,
+				src_linestride_vu );
+
+		//---------------------------------------------------------------------------------------------
+		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+		// Perform three DMA transfers to 3 different locations in the main memory!
+		// dst_width:	Pixel width of destination image
+		// dst_addr:	Destination address in main memory
+		// dst_vu:	Counter which is incremented one by one
+		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+
+		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
+				(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
+				dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
+				STR_BUF+curr_dst_idx,								// Tag
+				0, 0 );
+
+		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
+				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+				dst_dbl_linestride_vu,								// One V line (dst_width/2 bytes)
+				STR_BUF+curr_dst_idx,								// Tag
+				0, 0 );
+
+		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
+				(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+				dst_dbl_linestride_vu,								// One U line (dst_width/2 bytes)
+				STR_BUF+curr_dst_idx,								// Tag
+				0, 0 );
+		//---------------------------------------------------------------------------------------------
+
+
+		// update for next cycle
+		curr_src_idx = next_src_idx;
+		curr_dst_idx = next_dst_idx;
+
+		curr_interpl_y_upper = next_interpl_y_upper;
+		curr_interpl_y_lower = next_interpl_y_lower;
+		curr_interpl_vu = next_interpl_vu;
+
+		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;	// was a self-assignment: weight never advanced
+		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;	// was a self-assignment: weight never advanced
+		vf_curr_NSweight_vu = vf_next_NSweight_vu;
+
+		curr_src_y_upper = next_src_y_upper;
+		curr_src_y_lower = next_src_y_lower;
+		curr_src_vu = next_src_vu;
+	}
+
+
+
+	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+	// scaling of the last line set, which was fetched by the final loop iteration
+	// work line y_upper
+	bilinear_scale_line_w16( y_plane[curr_src_idx],
+			scaled_y_plane[curr_src_idx],
+			dst_width,
+			vf_x_scale,
+			vf_curr_NSweight_y_upper,
+			src_linestride_y );
+	// work line y_lower
+	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+			dst_width,
+			vf_x_scale,
+			vf_curr_NSweight_y_lower,
+			src_linestride_y );
+	// work line v
+	bilinear_scale_line_w16( v_plane[curr_src_idx],
+			scaled_v_plane[curr_src_idx],
+			dst_width>>1,
+			vf_x_scale,
+			vf_curr_NSweight_vu,
+			src_linestride_vu );
+	// work line u
+	bilinear_scale_line_w16( u_plane[curr_src_idx],
+			scaled_u_plane[curr_src_idx],
+			dst_width>>1,
+			vf_x_scale,
+			vf_curr_NSweight_vu,
+			src_linestride_vu );
+
+
+	//---------------------------------------------------------------------------------------------
+	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+	// Perform three DMA transfers to 3 different locations in the main memory!
+	// dst_width:	Pixel width of destination image
+	// dst_addr:	Destination address in main memory
+	// dst_vu:	Counter which is incremented one by one
+	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+
+	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
+			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
+			dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
+			STR_BUF+curr_dst_idx,								// Tag
+			0, 0 );
+
+	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
+			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+			dst_dbl_linestride_vu,								// One V line (dst_width/2 bytes)
+			STR_BUF+curr_dst_idx,								// Tag
+			0, 0 );
+
+	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
+			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+			dst_dbl_linestride_vu,								// One U line (dst_width/2 bytes)
+			STR_BUF+curr_dst_idx,								// Tag
+			0, 0 );
+
+	// wait for completion
+	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+	//---------------------------------------------------------------------------------------------
+}
+
+
+/**
+ * scale_srcw32_dstw32()
+ *
+ * Bilinearly scales a planar Y/V/U image read from parms into parms.dstBuffer.
+ * Name suggests src width % 32 == 0 and dst width % 32 == 0 (TODO confirm).
+ * No yuv2rgb conversion happens here: the scaled Y, V and U planes are
+ * DMA-ed back to main memory, two Y lines plus one V and one U line per step.
+ */
+void scale_srcw32_dstw32() {
+	// extract parameters
+	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;
+
+	unsigned int src_width = parms.src_pixel_width;
+	unsigned int src_height = parms.src_pixel_height;
+	unsigned int dst_width = parms.dst_pixel_width;
+	unsigned int dst_height = parms.dst_pixel_height;
+
+	// YVU
+	unsigned int src_linestride_y = src_width;
+	unsigned int src_dbl_linestride_y = src_width<<1;
+	unsigned int src_linestride_vu = src_width>>1;
+	unsigned int src_dbl_linestride_vu = src_width;
+
+	// scaled YVU
+	unsigned int scaled_src_linestride_y = dst_width;
+
+	// ram addresses
+	unsigned char* src_addr_y = parms.y_plane;
+	unsigned char* src_addr_v = parms.v_plane;
+	unsigned char* src_addr_u = parms.u_plane;
+
+	unsigned int dst_picture_size = dst_width*dst_height;
+
+	// Sizes for destination
+	unsigned int dst_dbl_linestride_y = dst_width<<1;	// two luma lines
+	unsigned int dst_dbl_linestride_vu = dst_width>>1;	// one chroma line (dst_width/2 bytes), despite the name
+
+	// Perform address calculation for Y, V and U in main memory with dst_addr as base
+	unsigned char* dst_addr_main_memory_y = dst_addr;
+	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
+	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);
+
+	// calculate scale factors
+	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
+	float y_scale = (float)src_height/(float)dst_height;
+
+	// double buffered processing
+	// buffer switching
+	unsigned int curr_src_idx = 0;
+	unsigned int curr_dst_idx = 0;
+	unsigned int next_src_idx, next_dst_idx;
+
+	// 2 lines y as output, upper and lowerline
+	unsigned int curr_interpl_y_upper = 0;
+	unsigned int next_interpl_y_upper;
+	unsigned int curr_interpl_y_lower, next_interpl_y_lower;
+	// only 1 line v/u output, both planes have the same dimension
+	unsigned int curr_interpl_vu = 0;
+	unsigned int next_interpl_vu;
+
+	// weights, calculated in every loop iteration
+	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
+	vector float vf_next_NSweight_y_upper;
+	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
+	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
+	vector float vf_next_NSweight_vu;
+
+	// line indices for the src picture
+	float curr_src_y_upper = 0.0f, next_src_y_upper;
+	float curr_src_y_lower, next_src_y_lower;
+	float curr_src_vu = 0.0f, next_src_vu;
+
+	// line indices for the dst picture
+	unsigned int dst_y=0, dst_vu=0;
+
+	// calculate lower line indices
+	curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale;
+	curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
+	// lower line weight
+	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );
+
+
+	// start partially double buffered processing
+	// get initial data, 2 sets of y, 1 set v, 1 set u
+	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
+	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
+			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
+			src_dbl_linestride_y,
+			RETR_BUF,
+			0, 0 );
+	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
+
+	// iteration loop
+	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
+	// the scaled output is 2 lines y, 1 line v, 1 line u
+	// the scaled YUV output is stored to RAM (the last line set is handled after the loop)
+	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
+		dst_y = dst_vu<<1;
+
+		// calculate next indices
+		next_src_vu = ((float)dst_vu+1)*y_scale;
+		next_src_y_upper = ((float)dst_y+2)*y_scale;
+		next_src_y_lower = ((float)dst_y+3)*y_scale;
+
+		next_interpl_vu = (unsigned int) next_src_vu;
+		next_interpl_y_upper = (unsigned int) next_src_y_upper;
+		next_interpl_y_lower = (unsigned int) next_src_y_lower;
+
+		// calculate weight NORTH-SOUTH
+		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
+		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
+		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );
+
+		// get next lines
+		next_src_idx = curr_src_idx^1;
+		next_dst_idx = curr_dst_idx^1;
+
+		// 4 lines y
+		mfc_get( y_plane[next_src_idx],
+				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
+				src_dbl_linestride_y,
+				RETR_BUF+next_src_idx,
+				0, 0 );
+		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
+				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
+				src_dbl_linestride_y,
+				RETR_BUF+next_src_idx,
+				0, 0 );
+
+		// 2 lines v
+		mfc_get( v_plane[next_src_idx],
+				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
+				src_dbl_linestride_vu,
+				RETR_BUF+next_src_idx,
+				0, 0 );
+		// 2 lines u
+		mfc_get( u_plane[next_src_idx],
+				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
+				src_dbl_linestride_vu,
+				RETR_BUF+next_src_idx,
+				0, 0 );
+
+		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+		// scaling
+		// work line y_upper
+		bilinear_scale_line_w16( y_plane[curr_src_idx],
+				scaled_y_plane[curr_src_idx],
+				dst_width,
+				vf_x_scale,
+				vf_curr_NSweight_y_upper,
+				src_linestride_y );
+		// work line y_lower
+		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+				dst_width,
+				vf_x_scale,
+				vf_curr_NSweight_y_lower,
+				src_linestride_y );
+		// work line v
+		bilinear_scale_line_w16( v_plane[curr_src_idx],
+				scaled_v_plane[curr_src_idx],
+				dst_width>>1,
+				vf_x_scale,
+				vf_curr_NSweight_vu,
+				src_linestride_vu );
+		// work line u
+		bilinear_scale_line_w16( u_plane[curr_src_idx],
+				scaled_u_plane[curr_src_idx],
+				dst_width>>1,
+				vf_x_scale,
+				vf_curr_NSweight_vu,
+				src_linestride_vu );
+
+
+
+		// Store the result back to main memory into a destination buffer in YUV format
+		//---------------------------------------------------------------------------------------------
+		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+		// Perform three DMA transfers to 3 different locations in the main memory!
+		// dst_width:	Pixel width of destination image
+		// dst_addr:	Destination address in main memory
+		// dst_vu:	Counter which is incremented one by one
+		// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+
+		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
+				(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
+				dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
+				STR_BUF+curr_dst_idx,								// Tag
+				0, 0 );
+
+		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
+				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+				dst_dbl_linestride_vu,								// One V line (dst_width/2 bytes)
+				STR_BUF+curr_dst_idx,								// Tag
+				0, 0 );
+
+		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
+				(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+				dst_dbl_linestride_vu,								// One U line (dst_width/2 bytes)
+				STR_BUF+curr_dst_idx,								// Tag
+				0, 0 );
+		//---------------------------------------------------------------------------------------------
+
+
+		// update for next cycle
+		curr_src_idx = next_src_idx;
+		curr_dst_idx = next_dst_idx;
+
+		curr_interpl_y_upper = next_interpl_y_upper;
+		curr_interpl_y_lower = next_interpl_y_lower;
+		curr_interpl_vu = next_interpl_vu;
+
+		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;	// was a self-assignment: weight never advanced
+		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;	// was a self-assignment: weight never advanced
+		vf_curr_NSweight_vu = vf_next_NSweight_vu;
+
+		curr_src_y_upper = next_src_y_upper;
+		curr_src_y_lower = next_src_y_lower;
+		curr_src_vu = next_src_vu;
+	}
+
+
+
+	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
+
+	// scaling of the last line set, which was fetched by the final loop iteration
+	// work line y_upper
+	bilinear_scale_line_w16( y_plane[curr_src_idx],
+			scaled_y_plane[curr_src_idx],
+			dst_width,
+			vf_x_scale,
+			vf_curr_NSweight_y_upper,
+			src_linestride_y );
+	// work line y_lower
+	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
+			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
+			dst_width,
+			vf_x_scale,
+			vf_curr_NSweight_y_lower,
+			src_linestride_y );
+	// work line v
+	bilinear_scale_line_w16( v_plane[curr_src_idx],
+			scaled_v_plane[curr_src_idx],
+			dst_width>>1,
+			vf_x_scale,
+			vf_curr_NSweight_vu,
+			src_linestride_vu );
+	// work line u
+	bilinear_scale_line_w16( u_plane[curr_src_idx],
+			scaled_u_plane[curr_src_idx],
+			dst_width>>1,
+			vf_x_scale,
+			vf_curr_NSweight_vu,
+			src_linestride_vu );
+
+
+	// Store the result back to main memory into a destination buffer in YUV format
+	//---------------------------------------------------------------------------------------------
+	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+
+	// Perform three DMA transfers to 3 different locations in the main memory!
+	// dst_width:	Pixel width of destination image
+	// dst_addr:	Destination address in main memory
+	// dst_vu:	Counter which is incremented one by one
+	// dst_y:	Counter which is twice larger than dst_vu (dst_y = 2*dst_vu)
+
+	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
+			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
+			dst_dbl_linestride_y,								// Two Y lines (depending on the width of the destination resolution)
+			STR_BUF+curr_dst_idx,								// Tag
+			0, 0 );
+
+	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
+			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
+			dst_dbl_linestride_vu,								// One V line (dst_width/2 bytes)
+			STR_BUF+curr_dst_idx,								// Tag
+			0, 0 );
+
+	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
+			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
+			dst_dbl_linestride_vu,								// One U line (dst_width/2 bytes)
+			STR_BUF+curr_dst_idx,								// Tag
+			0, 0 );
+
+	// wait for completion
+	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
+	//---------------------------------------------------------------------------------------------
+}
+
+
+/*
+ * bilinear_scale_line_w8()
+ *
+ * bilinearly scales one line of yuv-input, 8 destination pixels per loop pass,
+ * so dst_width has to be a multiple of 8; scaled output is written to the local store buffer dst_
+ *
+ * @param src buffer for 2 lines input (NORTH line at src, SOUTH line at src+src_linestride)
+ * @param dst_ buffer for 1 line output
+ * @param dst_width the width of the destination line
+ * @param vf_x_scale a float vector, at each entry is the x_scale-factor
+ * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
+ * @param src_linestride the stride of the srcline
+ */
+void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
+
+	unsigned char* dst = dst_;
+
+	unsigned int dst_x;
+	for( dst_x=0; dst_x<dst_width; dst_x+=8) {
+		// address calculation for loading the 4 surrounding pixels (NW, NE, SW, SE)
+		// of each calculated destination pixel
+		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
+		// lower range -> destination pixels dst_x+0 .. dst_x+3
+		// upper range -> destination pixels dst_x+4 .. dst_x+7
+		vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 };
+		vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 };
+		vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range );
+		vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range );
+
+		// calculate weight EAST-WEST: src_x (= dst_x * x_scale) minus its unsigned-integer conversion
+		vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 );
+		vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 );
+		vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale );
+		vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale );
+		vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 );
+		vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 );
+		vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 );
+		vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 );
+		vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range );
+		vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range );
+
+		// calculate address offsets relative to src
+		//
+		// pixel NORTH WEST-->(integer source x position)
+		vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range;
+		vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range;
+
+		// pixel NORTH EAST-->(offpixelNW+1)
+		vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
+		vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 );
+		vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 );
+
+		// SOUTH-WEST-->(offpixelNW+src_linestride)
+		vector unsigned int vui_srclinestride = spu_splats( src_linestride );
+		vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range );
+		vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range );
+
+		// SOUTH-EAST-->(offpixelNW+src_linestride+1)
+		vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range );
+		vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range );
+
+		// calculate each address: src plus offset, one address per vector element
+		vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
+		vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range );
+		vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range );
+		vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range );
+		vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range );
+
+		vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range );
+		vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range );
+		vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range );
+		vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range );
+
+		// get each pixel
+		//
+		// scalar one-byte load per address, inserted at bytes 3, 7, 11, 15 (last byte of each 4-byte group)
+		// NORTH WEST (all bytes not inserted stay zero from null_vector)
+		vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+		vector unsigned char vuc_pixel_NW_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 );
+		vuc_pixel_NW_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )),
+				vuc_pixel_NW_lower_range, 7 );
+		vuc_pixel_NW_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )),
+				vuc_pixel_NW_lower_range, 11 );
+		vuc_pixel_NW_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )),
+				vuc_pixel_NW_lower_range, 15 );
+
+		vector unsigned char vuc_pixel_NW_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 );
+		vuc_pixel_NW_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )),
+				vuc_pixel_NW_upper_range, 7 );
+		vuc_pixel_NW_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )),
+				vuc_pixel_NW_upper_range, 11 );
+		vuc_pixel_NW_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )),
+				vuc_pixel_NW_upper_range, 15 );
+
+		// NORTH EAST
+		vector unsigned char vuc_pixel_NE_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 );
+		vuc_pixel_NE_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )),
+				vuc_pixel_NE_lower_range, 7 );
+		vuc_pixel_NE_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )),
+				vuc_pixel_NE_lower_range, 11 );
+		vuc_pixel_NE_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )),
+				vuc_pixel_NE_lower_range, 15 );
+
+		vector unsigned char vuc_pixel_NE_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 );
+		vuc_pixel_NE_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )),
+				vuc_pixel_NE_upper_range, 7 );
+		vuc_pixel_NE_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )),
+				vuc_pixel_NE_upper_range, 11 );
+		vuc_pixel_NE_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )),
+				vuc_pixel_NE_upper_range, 15 );
+
+
+		// SOUTH WEST
+		vector unsigned char vuc_pixel_SW_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 );
+		vuc_pixel_SW_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )),
+				vuc_pixel_SW_lower_range, 7 );
+		vuc_pixel_SW_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )),
+				vuc_pixel_SW_lower_range, 11 );
+		vuc_pixel_SW_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )),
+				vuc_pixel_SW_lower_range, 15 );
+
+		vector unsigned char vuc_pixel_SW_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 );
+		vuc_pixel_SW_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )),
+				vuc_pixel_SW_upper_range, 7 );
+		vuc_pixel_SW_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )),
+				vuc_pixel_SW_upper_range, 11 );
+		vuc_pixel_SW_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )),
+				vuc_pixel_SW_upper_range, 15 );
+
+		// SOUTH EAST
+		vector unsigned char vuc_pixel_SE_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 );
+		vuc_pixel_SE_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )),
+				vuc_pixel_SE_lower_range, 7 );
+		vuc_pixel_SE_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )),
+				vuc_pixel_SE_lower_range, 11 );
+		vuc_pixel_SE_lower_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )),
+				vuc_pixel_SE_lower_range, 15 );
+
+		vector unsigned char vuc_pixel_SE_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 );
+		vuc_pixel_SE_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )),
+				vuc_pixel_SE_upper_range, 7 );
+		vuc_pixel_SE_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )),
+				vuc_pixel_SE_upper_range, 11 );
+		vuc_pixel_SE_upper_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )),
+				vuc_pixel_SE_upper_range, 15 );
+
+
+		// convert to float: each byte vector is reinterpreted as 4 unsigned ints first
+		vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 );
+		vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 );
+
+		vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 );
+		vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 );
+
+		vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 );
+		vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 );
+
+		vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 );
+		vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 );
+
+
+
+		// first linear interpolation: EWtop (along the NORTH line)
+		// EWtop = NW + EWweight*(NE-NW)
+		//
+		// lower range
+		vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range );
+		vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range,
+								vf_EWtop_lower_range_tmp,
+								vf_pixel_NW_lower_range );
+
+		// upper range
+		vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range );
+		vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range,
+								vf_EWtop_upper_range_tmp,
+								vf_pixel_NW_upper_range );
+
+
+
+		// second linear interpolation: EWbottom (along the SOUTH line)
+		// EWbottom = SW + EWweight*(SE-SW)
+		//
+		// lower range
+		vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range );
+		vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range,
+								vf_EWbottom_lower_range_tmp,
+								vf_pixel_SW_lower_range );
+
+		// upper range
+		vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range );
+		vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range,
+								vf_EWbottom_upper_range_tmp,
+								vf_pixel_SW_upper_range );
+
+
+
+		// third linear interpolation: the bilinear interpolated value
+		// result = EWtop + NSweight*(EWbottom-EWtop);
+		//
+		// lower range
+		vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range );
+		vector float vf_result_lower_range = spu_madd( vf_NSweight,
+								vf_result_lower_range_tmp,
+								vf_EWtop_lower_range );
+
+		// upper range
+		vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range );
+		vector float vf_result_upper_range = spu_madd( vf_NSweight,
+								vf_result_upper_range_tmp,
+								vf_EWtop_upper_range );
+
+
+		// convert back: using saturated arithmetic (vfloat_to_vuint is defined elsewhere in this file)
+		vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range );
+		vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range );
+
+		// merge results: bytes 3,7,11,15 of lower, then of upper, go to bytes 0..7; bytes 8..15 take byte 0 of lower
+		vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F,
+							       0x13, 0x17, 0x1B, 0x1F,
+							       0x00, 0x00, 0x00, 0x00,
+							       0x00, 0x00, 0x00, 0x00 };
+
+		vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range,
+								(vector unsigned char) vui_result_upper_range,
+								vuc_mask_merge_result );
+
+		// partial storing: before rotation the mask clears bytes 0..7 and keeps bytes 8..15
+		vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00,
+						      0x00, 0x00, 0x00, 0x00,
+						      0xFF, 0xFF, 0xFF, 0xFF,
+						      0xFF, 0xFF, 0xFF, 0xFF };
+
+
+		// get currently stored data
+		vector unsigned char vuc_orig = *((vector unsigned char*)dst);
+
+		// clear the bytes that receive the result; the mask is rotated by the low 4 bits of dst
+		vuc_orig = spu_and( vuc_orig,
+				spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) );
+
+		// rotate result according to storing address (same rotate count as the mask)
+		vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F );
+
+		// store result merged with the preserved bytes, then advance by the 8 pixels just written
+		*((vector unsigned char*)dst) = spu_or( vuc_result,
+							vuc_orig );
+		dst += 8;
+	}
+}
+
+
+/*
+ * bilinear_scale_line_w16()
+ *
+ * processes a line of yuv-input, width has to be a multiple of 16
+ * scaled yuv-output is written to local store buffer
+ *
+ * @param src buffer for 2 lines input
+ * @param dst_ buffer for 1 line output
+ * @param dst_width the width of the destination line
+ * @param vf_x_scale a float vector, at each entry is the x_scale-factor
+ * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
+ * @param src_linestride the stride of the srcline
+ */
+void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {
+
+	unsigned char* dst = dst_;
+
+	unsigned int dst_x;
+	for( dst_x=0; dst_x<dst_width; dst_x+=16) {
+		// address calculation for loading the 4 surrounding pixel of each calculated
+		// destination pixel
+		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
+		// parallelised processing
+		// first range->pixel 1 2 3 4
+		// second range->pixel 5 6 7 8
+		// third range->pixel 9 10 11 12
+		// fourth range->pixel 13 14 15 16
+		vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
+		vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
+		vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
+		vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
+		vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range );
+		vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range );
+		vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range );
+		vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range );
+
+		// calculate weight EAST-WEST
+		vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 );
+		vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 );
+		vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 );
+		vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 );
+		vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale );
+		vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale );
+		vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale );
+		vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale );
+		vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 );
+		vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 );
+		vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 );
+		vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 );
+		vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 );
+		vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 );
+		vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 );
+		vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 );
+		vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range );
+		vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range );
+		vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range );
+		vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range );
+
+		// calculate address offset
+		//
+		// pixel NORTH WEST
+		vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range;
+		vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range;
+		vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range;
+		vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range;
+
+		// pixel NORTH EAST-->(offpixelNW+1)
+		vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
+		vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 );
+		vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 );
+		vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 );
+		vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 );
+
+		// SOUTH-WEST-->(offpixelNW+src_linestride)
+		vector unsigned int vui_srclinestride = spu_splats( src_linestride );
+		vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range );
+		vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range );
+		vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range );
+		vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range );
+
+		// SOUTH-EAST-->(offpixelNW+src_linestride+1)
+		vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range );
+		vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range );
+		vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range );
+		vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range );
+
+		// calculate each address
+		vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
+		vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range );
+		vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range );
+		vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range );
+		vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range );
+
+		vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range );
+		vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range );
+		vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range );
+		vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range );
+
+		vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range );
+		vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range );
+		vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range );
+		vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range );
+
+		vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range );
+		vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range );
+		vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range );
+		vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range );
+
+
+		// get each pixel
+		//
+		// scalar load, afterwards insertion into the right position
+		// NORTH WEST
+		// first range
+		vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+		vector unsigned char vuc_pixel_NW_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 );
+		vuc_pixel_NW_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )),
+				vuc_pixel_NW_first_range, 7 );
+		vuc_pixel_NW_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )),
+				vuc_pixel_NW_first_range, 11 );
+		vuc_pixel_NW_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )),
+				vuc_pixel_NW_first_range, 15 );
+		// second range
+		vector unsigned char vuc_pixel_NW_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 );
+		vuc_pixel_NW_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )),
+				vuc_pixel_NW_second_range, 7 );
+		vuc_pixel_NW_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )),
+				vuc_pixel_NW_second_range, 11 );
+		vuc_pixel_NW_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )),
+				vuc_pixel_NW_second_range, 15 );
+		// third range
+		vector unsigned char vuc_pixel_NW_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 );
+		vuc_pixel_NW_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )),
+				vuc_pixel_NW_third_range, 7 );
+		vuc_pixel_NW_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )),
+				vuc_pixel_NW_third_range, 11 );
+		vuc_pixel_NW_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )),
+				vuc_pixel_NW_third_range, 15 );
+		// fourth range
+		vector unsigned char vuc_pixel_NW_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 );
+		vuc_pixel_NW_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )),
+				vuc_pixel_NW_fourth_range, 7 );
+		vuc_pixel_NW_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )),
+				vuc_pixel_NW_fourth_range, 11 );
+		vuc_pixel_NW_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )),
+				vuc_pixel_NW_fourth_range, 15 );
+
+		// NORTH EAST
+		// first range
+		vector unsigned char vuc_pixel_NE_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 );
+		vuc_pixel_NE_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )),
+				vuc_pixel_NE_first_range, 7 );
+		vuc_pixel_NE_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )),
+				vuc_pixel_NE_first_range, 11 );
+		vuc_pixel_NE_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )),
+				vuc_pixel_NE_first_range, 15 );
+		// second range
+		vector unsigned char vuc_pixel_NE_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 );
+		vuc_pixel_NE_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )),
+				vuc_pixel_NE_second_range, 7 );
+		vuc_pixel_NE_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )),
+				vuc_pixel_NE_second_range, 11 );
+		vuc_pixel_NE_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )),
+				vuc_pixel_NE_second_range, 15 );
+		// third range
+		vector unsigned char vuc_pixel_NE_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 );
+		vuc_pixel_NE_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )),
+				vuc_pixel_NE_third_range, 7 );
+		vuc_pixel_NE_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )),
+				vuc_pixel_NE_third_range, 11 );
+		vuc_pixel_NE_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )),
+				vuc_pixel_NE_third_range, 15 );
+		// fourth range
+		vector unsigned char vuc_pixel_NE_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 );
+		vuc_pixel_NE_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )),
+				vuc_pixel_NE_fourth_range, 7 );
+		vuc_pixel_NE_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )),
+				vuc_pixel_NE_fourth_range, 11 );
+		vuc_pixel_NE_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )),
+				vuc_pixel_NE_fourth_range, 15 );
+
+		// SOUTH WEST
+		// first range
+		vector unsigned char vuc_pixel_SW_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 );
+		vuc_pixel_SW_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )),
+				vuc_pixel_SW_first_range, 7 );
+		vuc_pixel_SW_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )),
+				vuc_pixel_SW_first_range, 11 );
+		vuc_pixel_SW_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )),
+				vuc_pixel_SW_first_range, 15 );
+		// second range
+		vector unsigned char vuc_pixel_SW_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 );
+		vuc_pixel_SW_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )),
+				vuc_pixel_SW_second_range, 7 );
+		vuc_pixel_SW_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )),
+				vuc_pixel_SW_second_range, 11 );
+		vuc_pixel_SW_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )),
+				vuc_pixel_SW_second_range, 15 );
+		// third range
+		vector unsigned char vuc_pixel_SW_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 );
+		vuc_pixel_SW_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )),
+				vuc_pixel_SW_third_range, 7 );
+		vuc_pixel_SW_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )),
+				vuc_pixel_SW_third_range, 11 );
+		vuc_pixel_SW_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )),
+				vuc_pixel_SW_third_range, 15 );
+		// fourth range
+		vector unsigned char vuc_pixel_SW_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 );
+		vuc_pixel_SW_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )),
+				vuc_pixel_SW_fourth_range, 7 );
+		vuc_pixel_SW_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )),
+				vuc_pixel_SW_fourth_range, 11 );
+		vuc_pixel_SW_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )),
+				vuc_pixel_SW_fourth_range, 15 );
+
+		// SOUTH EAST
+		// first range
+		vector unsigned char vuc_pixel_SE_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 );
+		vuc_pixel_SE_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )),
+				vuc_pixel_SE_first_range, 7 );
+		vuc_pixel_SE_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )),
+				vuc_pixel_SE_first_range, 11 );
+		vuc_pixel_SE_first_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )),
+				vuc_pixel_SE_first_range, 15 );
+		// second range
+		vector unsigned char vuc_pixel_SE_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 );
+		vuc_pixel_SE_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )),
+				vuc_pixel_SE_second_range, 7 );
+		vuc_pixel_SE_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )),
+				vuc_pixel_SE_second_range, 11 );
+		vuc_pixel_SE_second_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )),
+				vuc_pixel_SE_second_range, 15 );
+		// third range
+		vector unsigned char vuc_pixel_SE_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 );
+		vuc_pixel_SE_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )),
+				vuc_pixel_SE_third_range, 7 );
+		vuc_pixel_SE_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )),
+				vuc_pixel_SE_third_range, 11 );
+		vuc_pixel_SE_third_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )),
+				vuc_pixel_SE_third_range, 15 );
+		// fourth range
+		vector unsigned char vuc_pixel_SE_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 );
+		vuc_pixel_SE_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )),
+				vuc_pixel_SE_fourth_range, 7 );
+		vuc_pixel_SE_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )),
+				vuc_pixel_SE_fourth_range, 11 );
+		vuc_pixel_SE_fourth_range = spu_insert(
+				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )),
+				vuc_pixel_SE_fourth_range, 15 );
+
+
+
+		// convert to float
+		vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 );
+		vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 );
+		vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 );
+		vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 );
+
+		vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 );
+		vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 );
+		vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 );
+		vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 );
+
+		vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 );
+		vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 );
+		vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 );
+		vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 );
+
+		vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 );
+		vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 );
+		vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 );
+		vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 );
+
+		// first linear interpolation: EWtop
+		// EWtop = NW + EWweight*(NE-NW)
+		//
+		// first range
+		vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range );
+		vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range,
+								vf_EWtop_first_range_tmp,
+								vf_pixel_NW_first_range );
+
+		// second range
+		vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range );
+		vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range,
+								vf_EWtop_second_range_tmp,
+								vf_pixel_NW_second_range );
+
+		// third range
+		vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range );
+		vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range,
+								vf_EWtop_third_range_tmp,
+								vf_pixel_NW_third_range );
+
+		// fourth range
+		vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range );
+		vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range,
+								vf_EWtop_fourth_range_tmp,
+								vf_pixel_NW_fourth_range );
+
+
+
+		// second linear interpolation: EWbottom
+		// EWbottom = SW + EWweight*(SE-SW)
+		//
+		// first range
+		vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range );
+		vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range,
+								vf_EWbottom_first_range_tmp,
+								vf_pixel_SW_first_range );
+
+		// second range
+		vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range );
+		vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range,
+								vf_EWbottom_second_range_tmp,
+								vf_pixel_SW_second_range );
+		// third range
+		vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range );
+		vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range,
+								vf_EWbottom_third_range_tmp,
+								vf_pixel_SW_third_range );
+
+		// fourth range
+		vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range );
+		vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range,
+								vf_EWbottom_fourth_range_tmp,
+								vf_pixel_SW_fourth_range );
+
+
+
+		// third linear interpolation: the bilinear interpolated value
+		// result = EWtop + NSweight*(EWbottom-EWtop);
+		//
+		// first range
+		vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range );
+		vector float vf_result_first_range = spu_madd( vf_NSweight,
+								vf_result_first_range_tmp,
+								vf_EWtop_first_range );
+
+		// second range
+		vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range );
+		vector float vf_result_second_range = spu_madd( vf_NSweight,
+								vf_result_second_range_tmp,
+								vf_EWtop_second_range );
+
+		// third range
+		vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range );
+		vector float vf_result_third_range = spu_madd( vf_NSweight,
+								vf_result_third_range_tmp,
+								vf_EWtop_third_range );
+
+		// fourth range
+		vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range );
+		vector float vf_result_fourth_range = spu_madd( vf_NSweight,
+								vf_result_fourth_range_tmp,
+								vf_EWtop_fourth_range );
+
+
+
+		// convert back: using saturated arithmetic
+		vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range );
+		vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range );
+		vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range );
+		vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range );
+
+		// merge results->lower,upper
+		vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
+							       		    0x13, 0x17, 0x1B, 0x1F,
+							       		    0x00, 0x00, 0x00, 0x00,
+							       		    0x00, 0x00, 0x00, 0x00 };
+
+		vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00,
+							       		    0x00, 0x00, 0x00, 0x00,
+									    0x03, 0x07, 0x0B, 0x0F,
+							       		    0x13, 0x17, 0x1B, 0x1F };
+
+		vector unsigned char vuc_result_first_second =
+						spu_shuffle( (vector unsigned char) vui_result_first_range,
+								 (vector unsigned char) vui_result_second_range,
+								vuc_mask_merge_result_first_second );
+
+		vector unsigned char vuc_result_third_fourth =
+						spu_shuffle( (vector unsigned char) vui_result_third_range,
+								 (vector unsigned char) vui_result_fourth_range,
+								vuc_mask_merge_result_third_fourth );
+
+		// store result
+		*((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
+							vuc_result_third_fourth );
+		dst += 16;
+	}
+}
+
diff --git a/src/video/ps3/spulibs/fb_writer.c b/src/video/ps3/spulibs/fb_writer.c
new file mode 100644
index 000000000..0eb51cc68
--- /dev/null
+++ b/src/video/ps3/spulibs/fb_writer.c
@@ -0,0 +1,193 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+
+#include "spu_common.h"
+
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+#include <stdio.h>
+#include <string.h>
+
+// Debugging (uncomment to trace SPU activity on stdout)
+//#define DEBUG
+/* deprintf(): printf-style trace followed by a flush; expands to nothing without DEBUG.
+ * Wrapped in do/while(0) so it stays a single statement, e.g. under an unbraced if. */
+#ifdef DEBUG
+#define deprintf(fmt, args... ) \
+	do { fprintf( stdout, fmt, ##args ); fflush( stdout ); } while (0)
+#else
+#define deprintf( fmt, args... )
+#endif
+
+void cpy_to_fb(unsigned int);
+
+/* fb_writer_spu parms */
+static volatile struct fb_writer_parms_t parms __attribute__ ((aligned(128)));
+
+/* Code running on SPU */
+/*
+ * SPU entry point of fb_writer: handles mailbox commands until SPU_EXIT arrives.
+ */
+int main(unsigned long long spe_id __attribute__ ((unused)), unsigned long long argp __attribute__ ((unused)))
+{
+	deprintf("[SPU] fb_writer_spu is up... (on SPE #%llu)\n", spe_id);
+	/* Signal readiness through the outbound mailbox */
+	spu_write_out_mbox(SPU_READY);
+
+	for (;;) {
+		/* Fetch the next command word from the inbound mailbox */
+		const uint32_t command = spu_read_in_mbox();
+		deprintf("[SPU] Message is %u\n", command);
+		if (command == SPU_EXIT) {
+			deprintf("[SPU] fb_writer goes down...\n");
+			return 0;
+		}
+		if (command != SPU_START) {
+			/* Neither exit nor start: skip it and read the next word */
+			deprintf("[SPU] Cannot handle message\n");
+			continue;
+		}
+
+		/* Tag Manager setup: base tag for the parms fetch, base+1..base+4 for cpy_to_fb */
+		const unsigned int tag_base = mfc_multi_tag_reserve(5);
+		if (tag_base == MFC_TAG_INVALID) {
+			deprintf("[SPU] Failed to reserve mfc tags on fb_writer\n");
+			return 0;
+		}
+
+		/* Framebuffer parms: the next mailbox word is used as their address */
+		const uint32_t parms_ea = spu_read_in_mbox();
+		deprintf("[SPU] Message on fb_writer is %u\n", parms_ea);
+		spu_mfcdma32(&parms, (unsigned int)parms_ea,
+				sizeof(struct fb_writer_parms_t), tag_base,
+				MFC_GET_CMD);
+		deprintf("[SPU] argp = %u\n", (unsigned int)argp);
+		DMA_WAIT_TAG(tag_base);
+
+		/* Copy parms->data to framebuffer */
+		deprintf("[SPU] Copying to framebuffer started\n");
+		cpy_to_fb(tag_base);
+		deprintf("[SPU] Copying to framebuffer done!\n");
+
+		mfc_multi_tag_release(tag_base, 5);
+		deprintf("[SPU] fb_writer_spu... done!\n");
+		/* Send FIN msg */
+		spu_write_out_mbox(SPU_FIN);
+	}
+
+	return 0;
+}
+
+/*
+ * fb_copy_line()
+ *
+ * Moves one line from main memory to the framebuffer through a local
+ * store staging buffer, using the barrier forms of GET and PUT.
+ *
+ * @param ls_buf local store staging buffer
+ * @param src address of the source line
+ * @param dst address of the destination line
+ * @param size number of bytes to transfer
+ * @param tag DMA tag group used for both transfers
+ */
+static void fb_copy_line(volatile uint8_t *ls_buf, uint8_t *src, uint8_t *dst,
+			 uint32_t size, unsigned int tag)
+{
+	/* the PUT of the previous round may still be reading ls_buf */
+	DMA_WAIT_TAG(tag);
+	spu_mfcdma32(ls_buf, (unsigned int)src, size, tag, MFC_GETB_CMD);
+	/* the line has to be complete in local store before it goes out */
+	DMA_WAIT_TAG(tag);
+	spu_mfcdma32(ls_buf, (unsigned int)dst, size, tag, MFC_PUTB_CMD);
+}
+
+/*
+ * cpy_to_fb()
+ *
+ * Copies parms.data line by line to the framebuffer at parms.center,
+ * cycling through four local store buffers with one DMA tag each.
+ * Lines are handled in groups of four, so height % 4 trailing lines are
+ * not copied. NOTE(review): presumably the PPE bounds the height - confirm.
+ *
+ * @param tag_id_base base of the reserved tags; base+1..base+4 are used
+ */
+void cpy_to_fb(unsigned int tag_id_base)
+{
+	unsigned int i;
+	uint8_t *in = parms.data;
+
+	/* Align fb pointer which was centered before */
+	uint8_t *fb =
+	    (unsigned char *)((unsigned int)parms.center & 0xFFFFFFF0);
+
+	uint32_t bounded_input_height = parms.bounded_input_height;
+	uint32_t bounded_input_width = parms.bounded_input_width;
+	uint32_t fb_pixel_size = parms.fb_pixel_size;
+
+	uint32_t out_line_stride = parms.out_line_stride;
+	uint32_t in_line_stride = parms.in_line_stride;
+	uint32_t in_line_size = bounded_input_width * fb_pixel_size;
+
+	/* Local store buffers, one line each */
+	static volatile uint8_t buf[4][BUFFER_SIZE]
+	    __attribute__ ((aligned(128)));
+
+	for (i = 0; i < bounded_input_height >> 2; i++) {
+		fb_copy_line(buf[0], in, fb, in_line_size, tag_id_base + 1);
+		in += in_line_stride;
+		fb += out_line_stride;
+		deprintf("[SPU] 1st buffer copied in=0x%x, fb=0x%x\n",
+		       (unsigned int)in, (unsigned int)fb);
+
+		fb_copy_line(buf[1], in, fb, in_line_size, tag_id_base + 2);
+		in += in_line_stride;
+		fb += out_line_stride;
+		deprintf("[SPU] 2nd buffer copied in=0x%x, fb=0x%x\n",
+		       (unsigned int)in, (unsigned int)fb);
+
+		fb_copy_line(buf[2], in, fb, in_line_size, tag_id_base + 3);
+		in += in_line_stride;
+		fb += out_line_stride;
+		deprintf("[SPU] 3rd buffer copied in=0x%x, fb=0x%x\n",
+		       (unsigned int)in, (unsigned int)fb);
+
+		fb_copy_line(buf[3], in, fb, in_line_size, tag_id_base + 4);
+		in += in_line_stride;
+		fb += out_line_stride;
+		deprintf("[SPU] 4th buffer copied in=0x%x, fb=0x%x\n",
+		       (unsigned int)in, (unsigned int)fb);
+		deprintf("[SPU] Loop #%i, bounded_input_height=%i\n", i,
+		       bounded_input_height >> 2);
+	}
+	/* Drain the last PUT of every buffer, the first one included */
+	DMA_WAIT_TAG(tag_id_base + 1);
+	DMA_WAIT_TAG(tag_id_base + 2);
+	DMA_WAIT_TAG(tag_id_base + 3);
+	DMA_WAIT_TAG(tag_id_base + 4);
+}
+
+
diff --git a/src/video/ps3/spulibs/spu_common.h b/src/video/ps3/spulibs/spu_common.h
new file mode 100644
index 000000000..42c328c83
--- /dev/null
+++ b/src/video/ps3/spulibs/spu_common.h
@@ -0,0 +1,108 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+
+/* Common definitions/macros for SPUs */
+
+#ifndef _SPU_COMMON_H
+#define _SPU_COMMON_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+/* Tag management: wait until all DMA in tag group _tag has completed. NOTE(review): expands to two statements, so it needs braces under if/for/while. */
+#define DMA_WAIT_TAG(_tag)     \
+    mfc_write_tag_mask(1<<(_tag)); \
+    mfc_read_tag_status_all();
+
+/* SPU mailbox messages */
+#define SPU_READY	0
+#define SPU_START	1
+#define SPU_FIN		2
+#define SPU_EXIT	3
+
+/* Tags */
+#define RETR_BUF	0
+#define STR_BUF		1
+#define TAG_INIT	2
+
+/* Buffersizes */
+#define MAX_HDTV_WIDTH 1920
+#define MAX_HDTV_HEIGHT 1080
+/* One stride of HDTV */
+#define BUFFER_SIZE 7680
+
+/* fb_writer ppu/spu exchange parms; fetched by DMA in the fb_writer main() and consumed by cpy_to_fb() */
+struct fb_writer_parms_t {
+	uint8_t *data;			/* source address the lines are fetched from */
+	uint8_t *center;		/* destination address; rounded down to 16 bytes before use */
+	uint32_t out_line_stride;	/* added to the destination address after each line */
+	uint32_t in_line_stride;	/* added to the source address after each line */
+	uint32_t bounded_input_height;	/* number of lines; copied in groups of four */
+	uint32_t bounded_input_width;	/* multiplied by fb_pixel_size to get the bytes per line */
+	uint32_t fb_pixel_size;		/* presumably bytes per framebuffer pixel - confirm on the PPE side */
+
+	/* This padding is to fulfill the need for 16 byte alignment. On parm change, update! */
+	char padding[4];
+} __attribute__((aligned(128)));
+
+/* yuv2rgb ppu/spu exchange parms; fetched by DMA into parms_converter in the yuv2rgb_converter main() */
+struct yuv2rgb_parms_t {
+	uint8_t* y_plane;	/* source address of the Y plane */
+	uint8_t* v_plane;	/* source address of the V plane */
+	uint8_t* u_plane;	/* source address of the U plane */
+
+	uint8_t* dstBuffer;	/* address the converted BGRA lines are written to */
+
+	unsigned int src_pixel_width;	/* picture width in pixels; also used as the Y line stride */
+	unsigned int src_pixel_height;	/* picture height in lines */
+
+	/* This padding fills the fields up to the next 128 byte boundary (not just 16 bytes). On parm change, update! */
+	char padding[128 - ((4 * sizeof(uint8_t *) + 2 * sizeof(unsigned int)) & 0x7F)];
+} __attribute__((aligned(128)));
+
+/* bilin_scaler ppu/spu exchange parms. NOTE(review): field meanings below are inferred from the names - confirm against bilin_scaler.c */
+struct scale_parms_t {
+	uint8_t* y_plane;	/* presumably the address of the source Y plane */
+	uint8_t* v_plane;	/* presumably the address of the source V plane */
+	uint8_t* u_plane;	/* presumably the address of the source U plane */
+
+	uint8_t* dstBuffer;	/* presumably the address the scaled picture is written to */
+
+	unsigned int src_pixel_width;	/* presumably the source width in pixels */
+	unsigned int src_pixel_height;	/* presumably the source height in lines */
+
+	unsigned int dst_pixel_width;	/* presumably the scaled width in pixels */
+	unsigned int dst_pixel_height;	/* presumably the scaled height in lines */
+
+	/* This padding fills the fields up to the next 128 byte boundary (not just 16 bytes). On parm change, update! */
+	char padding[128 - ((4 * sizeof(uint8_t *) + 4 * sizeof(unsigned int)) & 0x7F)];
+} __attribute__((aligned(128)));
+
+#endif /* _SPU_COMMON_H */
+
+
diff --git a/src/video/ps3/spulibs/yuv2rgb_converter.c b/src/video/ps3/spulibs/yuv2rgb_converter.c
new file mode 100644
index 000000000..5e166914c
--- /dev/null
+++ b/src/video/ps3/spulibs/yuv2rgb_converter.c
@@ -0,0 +1,629 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+
+#include "spu_common.h"
+
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+
+// Debugging (uncomment to trace SPU activity on stdout)
+//#define DEBUG
+/* deprintf(): printf-style trace followed by a flush; expands to nothing without DEBUG.
+ * Wrapped in do/while(0) so it stays a single statement, e.g. under an unbraced if. */
+#ifdef DEBUG
+#define deprintf(fmt, args... ) \
+	do { fprintf( stdout, fmt, ##args ); fflush( stdout ); } while (0)
+#else
+#define deprintf( fmt, args... )
+#endif
+
+struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128))); /* filled by DMA in main() */
+
+/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored,
+ * split over two buffer sets that the converters select with buf_idx (0/1).
+ * There might be the need to retrieve misaligned data, so every line has 128 spare bytes.
+ */
+unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128)));
+unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
+unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
+
+/* A maximum of 4 lines BGRA are stored, 4 byte per pixel; single buffer, source of the DMA PUTs */
+unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128)));
+
+/* Bounds used by the saturating float to int conversion: values are clamped to [0.1, 255] */
+static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
+static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
+
+void yuv_to_rgb_w16();
+void yuv_to_rgb_w32();
+
+void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr, unsigned int width);
+void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);
+
+
+/* SPU entry point of yuv2rgb_converter: converts one picture per SPU_START until SPU_EXIT arrives. */
+int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused)))
+{
+	deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id);
+	uint32_t ea_mfc, mbox;
+	// send ready message
+	spu_write_out_mbox(SPU_READY);
+
+	while (1) {
+		/* Check mailbox */
+		mbox = spu_read_in_mbox();
+		deprintf("[SPU] Message is %u\n", mbox);
+		switch (mbox) {
+			case SPU_EXIT:
+				deprintf("[SPU] yuv2rgb_converter goes down...\n");
+				return 0;
+			case SPU_START:
+				break;
+			default:
+				deprintf("[SPU] Cannot handle message\n");
+				continue;
+		}
+
+		/* Tag Manager setup: this tag is used for the parameter fetch below */
+		unsigned int tag_id = mfc_multi_tag_reserve(1);
+		if (tag_id == MFC_TAG_INVALID) {
+			deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n");
+			return 0;
+		}
+
+		/* DMA transfer for the input parameters; the next mailbox word is used as their address */
+		ea_mfc = spu_read_in_mbox();
+		deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc);
+		spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD);
+		DMA_WAIT_TAG(tag_id);
+
+		/* There are alignment issues that involve handling of special cases
+		 * a width of 32 results in a width of 16 in the chrominance
+		 * --> choose the proper handling to optimize the performance
+		 */
+		deprintf("[SPU] Convert %ux%u from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height);
+		if (parms_converter.src_pixel_width & 0x1f) {
+			deprintf("[SPU] Using yuv_to_rgb_w16\n");
+			yuv_to_rgb_w16();
+		} else {
+			deprintf("[SPU] Using yuv_to_rgb_w32\n");
+			yuv_to_rgb_w32();
+		}
+
+		mfc_multi_tag_release(tag_id, 1);
+		deprintf("[SPU] yuv2rgb_spu... done!\n");
+		/* Send FIN message */
+		spu_write_out_mbox(SPU_FIN);
+	}
+
+	return 0;
+}
+
+
+/*
+ * float_to_char()
+ *
+ * Clamps a float to the range [0.1, 255.0] and truncates it to an
+ * unsigned char, i.e. a saturating conversion (values below 0.1 become 0).
+ *
+ * @param s float for conversion
+ * @returns the saturated value as unsigned char
+ */
+inline static unsigned char float_to_char(float s) {
+	const vector float value = spu_splats(s);
+	/* lanes below the lower bound are replaced by the lower bound */
+	const vector unsigned int too_small = spu_cmpgt(vec_0_1, value);
+	const vector float floored = spu_sel(value, vec_0_1, too_small);
+	/* lanes above 255 are replaced by 255 */
+	const vector unsigned int too_big = spu_cmpgt(floored, vec_255);
+	return (unsigned char) spu_extract(spu_sel(floored, vec_255, too_big), 0);
+}
+
+
+/*
+ * vfloat_to_vuint()
+ *
+ * Clamps each lane of a float vector to [0.1, 255.0] and converts it to an
+ * unsigned int, i.e. a saturating per-lane conversion.
+ *
+ * @param vec_s float vector for conversion
+ * @returns converted unsigned int vector
+ */
+inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
+	/* raise lanes that are below the lower bound */
+	const vector float floored = spu_sel(vec_s, vec_0_1, spu_cmpgt(vec_0_1, vec_s));
+	/* cap lanes that exceed 255 */
+	const vector float clamped = spu_sel(floored, vec_255, spu_cmpgt(floored, vec_255));
+
+	return spu_convtu(clamped, 0);
+}
+
+
+/*
+ * yuv_to_rgb_w16()
+ *
+ * Converts the picture described by parms_converter from planar YUV to
+ * BGRA, four lines per step, using yuv_to_rgb_w16_line(). The planes are
+ * fetched into two alternating sets of local store buffers and the result
+ * is written back from the single bgra buffer. Lines beyond the last full
+ * group of four are not converted.
+ */
+void yuv_to_rgb_w16() {
+	// Pixel dimensions of the picture
+	uint32_t width = parms_converter.src_pixel_width;
+	uint32_t height = parms_converter.src_pixel_height;
+
+	// Less than one group of four lines: (height>>2)-1 below would wrap around
+	if ((height>>2) == 0)
+		return;
+
+	// Plane data management (Y, V, U source and BGRA destination addresses)
+	unsigned char* ram_addr_y = parms_converter.y_plane;
+	unsigned char* ram_addr_v = parms_converter.v_plane;
+	unsigned char* ram_addr_u = parms_converter.u_plane;
+	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
+
+	// Strides
+	unsigned int stride_y = width;
+	unsigned int stride_vu = width>>1;
+
+	// Buffer management
+	unsigned int buf_idx = 0;
+	unsigned int size_4lines_y = stride_y<<2;
+	unsigned int size_2lines_y = stride_y<<1;
+	unsigned int size_2lines_vu = stride_vu<<1;
+
+	// 2*width*4byte_per_pixel
+	unsigned int size_2lines_bgra = width<<3;
+
+	// start double-buffered processing: 4 lines y, 2 lines v, 2 lines u
+	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
+	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+	// Wait for these transfers to be completed
+	DMA_WAIT_TAG((RETR_BUF + buf_idx));
+
+	unsigned int i;
+	for(i=0; i<(height>>2)-1; i++) {
+		// Fetch the following four lines into the other buffer set
+		buf_idx^=1;
+
+		// 4 lines y
+		spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+		// 2 lines v
+		spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+		// 2 lines u
+		spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+		DMA_WAIT_TAG((RETR_BUF + buf_idx));
+
+		// Back to the buffer set that is converted in this step
+		buf_idx^=1;
+
+		// bgra may still be read by the PUTs of the previous step:
+		// wait for them before the conversion overwrites it
+		DMA_WAIT_TAG(STR_BUF);
+
+		// Convert YUV to BGRA, store it back (first two lines)
+		yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+		// Next two lines
+		yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
+				v_plane[buf_idx] + stride_vu,
+				u_plane[buf_idx] + stride_vu,
+				bgra + size_2lines_bgra,
+				width);
+
+		// Store converted lines in two steps->max transfer size 16384
+		spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+		ram_addr_bgra += size_2lines_bgra;
+		spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+		ram_addr_bgra += size_2lines_bgra;
+
+		// Move 4 lines
+		ram_addr_y += size_4lines_y;
+		ram_addr_v += size_2lines_vu;
+		ram_addr_u += size_2lines_vu;
+
+		buf_idx^=1;
+	}
+
+	// Last group: the previous PUTs must be done before bgra is rewritten
+	DMA_WAIT_TAG(STR_BUF);
+
+	// Convert YUV to BGRA, store it back (first two lines)
+	yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+	// Next two lines
+	yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
+			v_plane[buf_idx] + stride_vu,
+			u_plane[buf_idx] + stride_vu,
+			bgra + size_2lines_bgra,
+			width);
+
+	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+	ram_addr_bgra += size_2lines_bgra;
+	spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+
+	// wait for the final storing transfers to be completed
+	DMA_WAIT_TAG(STR_BUF);
+}
+
+
+/*
+ * yuv_to_rgb_w32()
+ *
+ * Converts the picture described by parms_converter from planar YUV to
+ * BGRA, four lines per step, using yuv_to_rgb_w32_line(). The planes are
+ * fetched into two alternating sets of local store buffers and the result
+ * is written back from the single bgra buffer. Lines beyond the last full
+ * group of four are not converted.
+ */
+void yuv_to_rgb_w32() {
+	// Pixel dimensions of the picture
+	uint32_t width = parms_converter.src_pixel_width;
+	uint32_t height = parms_converter.src_pixel_height;
+
+	// Less than one group of four lines: (height>>2)-1 below would wrap around
+	if ((height>>2) == 0)
+		return;
+
+	// Plane data management (Y, V, U source and BGRA destination addresses)
+	unsigned char* ram_addr_y = parms_converter.y_plane;
+	unsigned char* ram_addr_v = parms_converter.v_plane;
+	unsigned char* ram_addr_u = parms_converter.u_plane;
+	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
+
+	// Strides
+	unsigned int stride_y = width;
+	unsigned int stride_vu = width>>1;
+
+	// Buffer management
+	unsigned int buf_idx = 0;
+	unsigned int size_4lines_y = stride_y<<2;
+	unsigned int size_2lines_y = stride_y<<1;
+	unsigned int size_2lines_vu = stride_vu<<1;
+
+	// 2*width*4byte_per_pixel
+	unsigned int size_2lines_bgra = width<<3;
+
+	// start double-buffered processing: 4 lines y, 2 lines v, 2 lines u
+	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
+	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+	DMA_WAIT_TAG((RETR_BUF + buf_idx));
+
+	unsigned int i;
+	for(i=0; i < (height>>2)-1; i++) {
+		// Fetch the following four lines into the other buffer set
+		buf_idx^=1;
+		// 4 lines y
+		spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
+		deprintf("4lines = %d\n", size_4lines_y);
+		// 2 lines v
+		spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+		deprintf("2lines = %d\n", size_2lines_vu);
+		// 2 lines u
+		spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+		deprintf("2lines = %d\n", size_2lines_vu);
+		DMA_WAIT_TAG((RETR_BUF + buf_idx));
+
+		// Back to the buffer set that is converted in this step
+		buf_idx^=1;
+
+		// bgra may still be read by the PUTs of the previous step:
+		// wait for them before the conversion overwrites it
+		DMA_WAIT_TAG(STR_BUF);
+
+		// Convert YUV to BGRA, store it back (first two lines)
+		yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+		// Next two lines
+		yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
+				v_plane[buf_idx] + stride_vu,
+				u_plane[buf_idx] + stride_vu,
+				bgra + size_2lines_bgra,
+				width);
+
+		// Store converted lines in two steps->max transfer size 16384
+		spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+		ram_addr_bgra += size_2lines_bgra;
+		spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+		ram_addr_bgra += size_2lines_bgra;
+
+		// Move 4 lines
+		ram_addr_y += size_4lines_y;
+		ram_addr_v += size_2lines_vu;
+		ram_addr_u += size_2lines_vu;
+		buf_idx^=1;
+	}
+
+	// Last group: the previous PUTs must be done before bgra is rewritten
+	DMA_WAIT_TAG(STR_BUF);
+
+	// Convert YUV to BGRA, store it back (first two lines)
+	yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+	// Next two lines
+	yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
+			v_plane[buf_idx] + stride_vu,
+			u_plane[buf_idx] + stride_vu,
+			bgra + size_2lines_bgra,
+			width);
+
+	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+	ram_addr_bgra += size_2lines_bgra;
+	spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+
+	// Wait for the final storing transfers to be completed
+	DMA_WAIT_TAG(STR_BUF);
+}
+
+
+/* Some vectors needed by the yuv 2 rgb conversion algorithm. NOTE(review): the vec_char2int_* byte patterns look like spu_shuffle masks that widen four bytes of the second operand (0x10+n) to four words - confirm at the use site */
+const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f };
+const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 };
+const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 };
+const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B };
+const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F };
+/* Same coefficients as the scalar conversion in yuv_to_rgb_w16_line() */
+const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f};
+const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f};
+const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f};
+const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f};
+/* Opaque alpha in the top byte; unsigned literal so that the shift into bit 31 is well defined */
+const vector unsigned int vec_alpha =  { 255u << 24, 255u << 24, 255u << 24, 255u << 24 };
+/* As shuffle patterns these repeat words 0,0,1,1 (upper) and 2,2,3,3 (lower) of the first operand */
+const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 };
+const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F };
+
+
+/*
+ * yuv_to_rgb_w16_line()
+ *
+ * processes two lines of yuv-input, width has to be a multiple of 16
+ * (scalar path; every u/v sample is shared by a block of 2x2 pixels)
+ *
+ * @param y_addr address of the y plane in local store
+ * @param v_addr address of the v plane in local store
+ * @param u_addr address of the u plane in local store
+ * @param bgra_addr_ address of the bgra output buffer
+ * @param width the width in pixel
+ */
+void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
+	// each pixel is stored as an integer
+	unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
+	// opaque alpha; unsigned so that the shift into bit 31 is well defined
+	const unsigned int alpha = 255u << 24;
+
+	unsigned int x;
+	for(x = 0; x < width; x+=2) {
+		// Step through the line two pixels at a time, since every u and v value applies to 4 pixels (two high, two wide)
+		const unsigned char Y_1 = *(y_addr + x);
+		const unsigned char Y_2 = *(y_addr + x + 1);
+		const unsigned char Y_3 = *(y_addr + x + width);
+		const unsigned char Y_4 = *(y_addr + x + width + 1);
+		const unsigned char U = *(u_addr + (x >> 1));
+		const unsigned char V = *(v_addr + (x >> 1));
+		float V_minus_128 = (float)((float)V - 128.0f);
+		float U_minus_128 = (float)((float)U - 128.0f);
+		float R_precalculate = 1.403f * V_minus_128;
+		float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128);
+		float B_precalculate = 1.773f * U_minus_128;
+
+		const unsigned char R_1 = float_to_char((Y_1 + R_precalculate));
+		const unsigned char R_2 = float_to_char((Y_2 + R_precalculate));
+		const unsigned char R_3 = float_to_char((Y_3 + R_precalculate));
+		const unsigned char R_4 = float_to_char((Y_4 + R_precalculate));
+		const unsigned char G_1 = float_to_char((Y_1 + G_precalculate));
+		const unsigned char G_2 = float_to_char((Y_2 + G_precalculate));
+		const unsigned char G_3 = float_to_char((Y_3 + G_precalculate));
+		const unsigned char G_4 = float_to_char((Y_4 + G_precalculate));
+		const unsigned char B_1 = float_to_char((Y_1 + B_precalculate));
+		const unsigned char B_2 = float_to_char((Y_2 + B_precalculate));
+		const unsigned char B_3 = float_to_char((Y_3 + B_precalculate));
+		const unsigned char B_4 = float_to_char((Y_4 + B_precalculate));
+
+		*(bgra_addr + x) = alpha | ((unsigned int)R_1 << 16) | ((unsigned int)G_1 << 8) | B_1;
+		*(bgra_addr + x + 1) = alpha | ((unsigned int)R_2 << 16) | ((unsigned int)G_2 << 8) | B_2;
+		*(bgra_addr + x + width) = alpha | ((unsigned int)R_3 << 16) | ((unsigned int)G_3 << 8) | B_3;
+		*(bgra_addr + x + width + 1) = alpha | ((unsigned int)R_4 << 16) | ((unsigned int)G_4 << 8) | B_4;
+	}
+}
+
+
+/*
+ * yuv_to_rgb_w32_line()
+ *
+ * processes two lines of yuv-input using SPU vector operations, width has to be a multiple of 32
+ * the second y line and the second output line are addressed 'width' elements after the first
+ *
+ * @param y_addr address of the y plane in local store
+ * @param v_addr address of the v plane in local store
+ * @param u_addr address of the u plane in local store
+ * @param bgra_addr_ address of the bgra output buffer
+ * @param width the width in pixel
+ */
+void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
+	// each pixel is stored as an integer
+	unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
+
+	unsigned int x;
+	for(x = 0; x < width; x+=32) {
+		// 32 pixels of both lines per iteration; each u and v value covers 4 pixels (two high, two wide)
+		// NOTE(review): the vector loads and stores presumably need 16-byte aligned plane/output addresses - confirm
+		const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x));
+		const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16));
+		const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width));
+		const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16));
+		const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1)));
+		const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1)));
+		// widen groups of four u bytes to one float per lane (shuffle against vec_null, presumably zero), then add vec_minus_128
+		const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128);
+		const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128);
+		const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128);
+		const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128);
+		// same widening for v
+		const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128);
+		const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128);
+		const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128);
+		const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128);
+		// widen the y bytes to float, four pixels per vector (Y_1..Y_8 from the first line, Y_9..Y_16 from the second)
+		vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0);
+		vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0);
+		vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0);
+		vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0);
+		vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0);
+		vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0);
+		vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0);
+		vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0);
+		vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0);
+		vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0);
+		vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0);
+		vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0);
+		vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0);
+		vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0);
+		vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0);
+		vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0);
+		// red chroma term: 1.403 * v, still one value per chroma sample
+		const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1);
+		const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2);
+		const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3);
+		const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4);
+		// duplicate every chroma lane for its two horizontal pixels: upper -> lanes {0,0,1,1}, lower -> lanes {2,2,3,3}
+		const vector float R1_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_upper);
+		const vector float R2_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_lower);
+		const vector float R3_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_upper);
+		const vector float R4_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_lower);
+		const vector float R5_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_upper);
+		const vector float R6_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_lower);
+		const vector float R7_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_upper);
+		const vector float R8_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_lower);
+
+		// green chroma term: -0.344 * u - 0.714 * v
+		const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff));
+		const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff));
+		const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff));
+		const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff));
+
+		const vector float G1_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_upper);
+		const vector float G2_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_lower);
+		const vector float G3_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_upper);
+		const vector float G4_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_lower);
+		const vector float G5_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_upper);
+		const vector float G6_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_lower);
+		const vector float G7_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_upper);
+		const vector float G8_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_lower);
+
+		// blue chroma term: 1.773 * u
+		const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1);
+		const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2);
+		const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3);
+		const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4);
+
+		const vector float B1_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_upper);
+		const vector float B2_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_lower);
+		const vector float B3_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_upper);
+		const vector float B4_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_lower);
+		const vector float B5_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_upper);
+		const vector float B6_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_lower);
+		const vector float B7_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_upper);
+		const vector float B8_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_lower);
+
+		// add luma; both lines share the same eight chroma terms (x_1..x_8: first line, x_9..x_16: second line)
+		const vector unsigned int  R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate));
+		const vector unsigned int  R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate));
+		const vector unsigned int  R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate));
+		const vector unsigned int  R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate));
+		const vector unsigned int  R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate));
+		const vector unsigned int  R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate));
+		const vector unsigned int  R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate));
+		const vector unsigned int  R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate));
+		const vector unsigned int  R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate));
+		const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate));
+		const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate));
+		const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate));
+		const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate));
+		const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate));
+		const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate));
+		const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate));
+
+		const vector unsigned int  G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate));
+		const vector unsigned int  G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate));
+		const vector unsigned int  G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate));
+		const vector unsigned int  G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate));
+		const vector unsigned int  G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate));
+		const vector unsigned int  G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate));
+		const vector unsigned int  G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate));
+		const vector unsigned int  G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate));
+		const vector unsigned int  G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate));
+		const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate));
+		const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate));
+		const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate));
+		const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate));
+		const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate));
+		const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate));
+		const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate));
+
+		const vector unsigned int  B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate));
+		const vector unsigned int  B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate));
+		const vector unsigned int  B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate));
+		const vector unsigned int  B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate));
+		const vector unsigned int  B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate));
+		const vector unsigned int  B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate));
+		const vector unsigned int  B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate));
+		const vector unsigned int  B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate));
+		const vector unsigned int  B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate));
+		const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate));
+		const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate));
+		const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate));
+		const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate));
+		const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate));
+		const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate));
+		const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate));
+		// pack alpha | R | G | B per pixel; NOTE(review): the quadword byte shifts presumably rely on vfloat_to_vuint leaving only the low byte of each word set - confirm
+		*((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha,  B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha,  B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha,  B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha,  B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha,  B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha,  B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha,  B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha,  B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha,  B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1)));
+	}
+}
+