From d3117b5d8cda961ccceff975a16e4e4f81ad4172 Mon Sep 17 00:00:00 2001 From: "Ryan C. Gordon" Date: Sun, 17 Apr 2005 10:19:22 +0000 Subject: [PATCH] Altivec-optimized blitters! Vast majority of this work is compliments of Bob Ippolito. http://www.devolution.com/pipermail/sdl/2005-February/067466.html and many other posts. --- configure.in | 14 +- src/video/SDL_blit.h | 14 + src/video/SDL_blit_A.c | 792 ++++++++++++++++++++++++++++++++++++++++- src/video/SDL_blit_N.c | 707 +++++++++++++++++++++++++++++++++++- 4 files changed, 1501 insertions(+), 26 deletions(-) diff --git a/configure.in b/configure.in index ff6ca5b4a..e75683697 100644 --- a/configure.in +++ b/configure.in @@ -1839,17 +1839,18 @@ CheckAltivec() { AC_MSG_CHECKING(for GCC Altivec instruction support) have_gcc_altivec=no + save_CFLAGS="${CFLAGS}" + CFLAGS="${CFLAGS} -DGCC_ALTIVEC -DUSE_ALTIVEC_BLITTERS -faltivec" AC_TRY_COMPILE([ + vector unsigned int vzero() { + return vec_splat_u32(0); + } ],[ - asm volatile ("mtspr 256, %0\n\t" - "vand %%v0, %%v0, %%v0" - : - : "r" (-1)); ],[ have_gcc_altivec=yes ]) - if test x$have_gcc_altivec = xyes; then - CFLAGS="$CFLAGS -DGCC_ALTIVEC" + if test x$have_gcc_altivec = xno; then + CFLAGS="${save_CFLAGS}" fi AC_MSG_RESULT($have_gcc_altivec) } @@ -2564,6 +2565,7 @@ case "$target" in CheckMacGL CheckPTHREAD CheckSIGACTION + CheckAltivec # If either the audio or CD driver is used, add the AudioUnit framework if test x$enable_audio = xyes -o x$enable_cdrom = xyes; then SYSTEM_LIBS="$SYSTEM_LIBS -framework AudioToolbox -framework AudioUnit" diff --git a/src/video/SDL_blit.h b/src/video/SDL_blit.h index 8337e7d13..41b6e0e2f 100644 --- a/src/video/SDL_blit.h +++ b/src/video/SDL_blit.h @@ -374,6 +374,20 @@ do { \ dB = (((sB-dB)*(A))>>8)+dB; \ } while(0) +/* Blend the RGB values of two pixels based on a source alpha value */ +#define ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB) \ +do { \ + unsigned tR, tG, tB, tA; \ + tA = 255 - sA; \ + tR = 1 + (sR * sA) + (dR * tA); \ + dR = (tR + (tR >> 8)) >> 8; \ + tG = 1 + (sG * sA) + (dG * tA); \ + dG = (tG + (tG >> 8)) >> 8; \ + tB = 1 + (sB * sA) + (dB * tA); \ + dB = (tB + (tB >> 8)) >> 8; \ +} while(0) + + /* This is a very useful loop for optimizing blitters */ #if defined(_MSC_VER) && (_MSC_VER == 1300) /* There's a bug in the Visual C++ 7 optimizer when compiling this code */ diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index 35384155d..bae6fd528 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -35,9 +35,9 @@ static char rcsid = #define MMX_ASMBLIT #endif -#ifdef MMX_ASMBLIT /* Function to check the CPU flags */ #include "SDL_cpuinfo.h" +#ifdef MMX_ASMBLIT #include "mmx.h" #endif @@ -421,6 +421,762 @@ static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info) } #endif +#ifdef USE_ALTIVEC_BLITTERS +#include +#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F) +#define VECPRINT(msg, v) do { \ + vector unsigned int tmpvec = (vector unsigned int)(v); \ + unsigned int *vp = (unsigned int *)&tmpvec; \ + printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \ +} while (0) + +/* the permuation vector that takes the high bytes out of all the appropriate shorts + (vector unsigned char)( + 0x00, 0x10, 0x02, 0x12, + 0x04, 0x14, 0x06, 0x16, + 0x08, 0x18, 0x0A, 0x1A, + 0x0C, 0x1C, 0x0E, 0x1E ); +*/ +#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F))) +#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12))) +#define VEC_ALPHA_MASK() ((vector 
unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24())) +#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \ + ? vec_lvsl(0, src) \ + : vec_add(vec_lvsl(8, src), vec_splat_u8(8))) + + +#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \ + /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \ + vector unsigned short vtemp1 = vec_mule(vs, valpha); \ + /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \ + vector unsigned short vtemp2 = vec_mulo(vs, valpha); \ + /* valpha2 is 255-alpha */ \ + vector unsigned char valpha2 = vec_nor(valpha, valpha); \ + /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \ + vector unsigned short vtemp3 = vec_mule(vd, valpha2); \ + /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \ + vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \ + /* add source and dest */ \ + vtemp1 = vec_add(vtemp1, vtemp3); \ + vtemp2 = vec_add(vtemp2, vtemp4); \ + /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \ + vtemp1 = vec_add(vtemp1, v1_16); \ + vtemp3 = vec_sr(vtemp1, v8_16); \ + vtemp1 = vec_add(vtemp1, vtemp3); \ + /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \ + vtemp2 = vec_add(vtemp2, v1_16); \ + vtemp4 = vec_sr(vtemp2, v8_16); \ + vtemp2 = vec_add(vtemp2, vtemp4); \ + /* (>>8) and get ARGBARGBARGBARGB */ \ + vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \ +} while (0) + +/* Calculate the permute vector used for 32->32 swizzling */ +static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt, + const SDL_PixelFormat *dstfmt) +{ + /* + * We have to assume that the bits that aren't used by other + * colors is alpha, and it's one complete byte, since some formats + * leave alpha with a zero mask, but we should still swizzle the bits. + */ + /* ARGB */ + const static struct SDL_PixelFormat default_pixel_format = { + NULL, 0, 0, + 0, 0, 0, 0, + 16, 8, 0, 24, + 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000, + 0, 0}; + if (!srcfmt) { + srcfmt = &default_pixel_format; + } + if (!dstfmt) { + dstfmt = &default_pixel_format; + } + vector unsigned char plus = (vector unsigned char) + ( 0x00, 0x00, 0x00, 0x00, + 0x04, 0x04, 0x04, 0x04, + 0x08, 0x08, 0x08, 0x08, + 0x0C, 0x0C, 0x0C, 0x0C ); + vector unsigned char vswiz; + vector unsigned int srcvec; +#define RESHIFT(X) (3 - ((X) >> 3)) + Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift); + Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift); + Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift); + Uint32 amask; + /* Use zero for alpha if either surface doesn't have alpha */ + if (dstfmt->Amask) { + amask = ((srcfmt->Amask) ? 
RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift); + } else { + amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF); + } +#undef RESHIFT + ((unsigned int *)&srcvec)[0] = (rmask | gmask | bmask | amask); + vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0)); + return(vswiz); +} + +static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info) +{ + int height = info->d_height; + Uint8 *src = (Uint8 *)info->s_pixels; + int srcskip = info->s_skip; + Uint8 *dst = (Uint8 *)info->d_pixels; + int dstskip = info->d_skip; + SDL_PixelFormat *srcfmt = info->src; + + vector unsigned char v0 = vec_splat_u8(0); + vector unsigned short v8_16 = vec_splat_u16(8); + vector unsigned short v1_16 = vec_splat_u16(1); + vector unsigned short v2_16 = vec_splat_u16(2); + vector unsigned short v3_16 = vec_splat_u16(3); + vector unsigned int v8_32 = vec_splat_u32(8); + vector unsigned int v16_32 = vec_add(v8_32, v8_32); + vector unsigned short v3f = (vector unsigned short)( + 0x003f, 0x003f, 0x003f, 0x003f, + 0x003f, 0x003f, 0x003f, 0x003f); + vector unsigned short vfc = (vector unsigned short)( + 0x00fc, 0x00fc, 0x00fc, 0x00fc, + 0x00fc, 0x00fc, 0x00fc, 0x00fc); + + /* + 0x10 - 0x1f is the alpha + 0x00 - 0x0e evens are the red + 0x01 - 0x0f odds are zero + */ + vector unsigned char vredalpha1 = (vector unsigned char)( + 0x10, 0x00, 0x01, 0x01, + 0x10, 0x02, 0x01, 0x01, + 0x10, 0x04, 0x01, 0x01, + 0x10, 0x06, 0x01, 0x01 + ); + vector unsigned char vredalpha2 = (vector unsigned char)( + vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32)) + ); + /* + 0x00 - 0x0f is ARxx ARxx ARxx ARxx + 0x11 - 0x0f odds are blue + */ + vector unsigned char vblue1 = (vector unsigned char)( + 0x00, 0x01, 0x02, 0x11, + 0x04, 0x05, 0x06, 0x13, + 0x08, 0x09, 0x0a, 0x15, + 0x0c, 0x0d, 0x0e, 0x17 + ); + vector unsigned char vblue2 = (vector unsigned char)( + vec_add((vector unsigned int)vblue1, v8_32) + ); + /* + 0x00 - 0x0f is ARxB ARxB ARxB ARxB + 0x10 - 0x0e evens are green + */ + vector unsigned char vgreen1 = (vector unsigned char)( + 0x00, 0x01, 0x10, 0x03, + 0x04, 0x05, 0x12, 0x07, + 0x08, 0x09, 0x14, 0x0b, + 0x0c, 0x0d, 0x16, 0x0f + ); + vector unsigned char vgreen2 = (vector unsigned char)( + vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32)) + ); + vector unsigned char vgmerge = (vector unsigned char)( + 0x00, 0x02, 0x00, 0x06, + 0x00, 0x0a, 0x00, 0x0e, + 0x00, 0x12, 0x00, 0x16, + 0x00, 0x1a, 0x00, 0x1e); + vector unsigned char mergePermute = VEC_MERGE_PERMUTE(); + vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL); + vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); + + vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7); + vf800 = vec_sl(vf800, vec_splat_u16(8)); + + while(height--) { + int extrawidth; + vector unsigned char valigner; + vector unsigned char vsrc; + vector unsigned char voverflow; + int width = info->d_width; + +#define ONE_PIXEL_BLEND(condition, widthvar) \ + while (condition) { \ + Uint32 pixel; \ + unsigned sR, sG, sB, dR, dG, dB, sA; \ + DISEMBLE_RGBA(src, 4, srcfmt, pixel, sR, sG, sB, sA); \ + if(sA) { \ + unsigned short dstpixel = *((unsigned short *)dst); \ + dR = (dstpixel >> 8) & 0xf8; \ + dG = (dstpixel >> 3) & 0xfc; \ + dB = (dstpixel << 3) & 0xf8; \ + ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ + *((unsigned short *)dst) = ( \ + ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \ + ); \ + } \ + src += 4; \ + dst += 2; \ + widthvar--; \ + } + 
ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width); + extrawidth = (width % 8); + valigner = VEC_ALIGNER(src); + vsrc = (vector unsigned char)vec_ld(0, src); + width -= extrawidth; + while (width) { + vector unsigned char valpha; + vector unsigned char vsrc1, vsrc2; + vector unsigned char vdst1, vdst2; + vector unsigned short vR, vG, vB; + vector unsigned short vpixel, vrpixel, vgpixel, vbpixel; + + /* Load 8 pixels from src as ARGB */ + voverflow = (vector unsigned char)vec_ld(15, src); + vsrc = vec_perm(vsrc, voverflow, valigner); + vsrc1 = vec_perm(vsrc, vsrc, vpermute); + src += 16; + vsrc = (vector unsigned char)vec_ld(15, src); + voverflow = vec_perm(voverflow, vsrc, valigner); + vsrc2 = vec_perm(voverflow, voverflow, vpermute); + src += 16; + + /* Load 8 pixels from dst as XRGB */ + voverflow = vec_ld(0, dst); + vR = vec_and((vector unsigned short)voverflow, vf800); + vB = vec_sl((vector unsigned short)voverflow, v3_16); + vG = vec_sl(vB, v2_16); + vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1); + vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1); + vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1); + vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2); + vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2); + vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2); + + /* Alpha blend 8 pixels as ARGB */ + valpha = vec_perm(vsrc1, v0, valphaPermute); + VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16); + valpha = vec_perm(vsrc2, v0, valphaPermute); + VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16); + + /* Convert 8 pixels to 565 */ + vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2); + vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge); + vgpixel = vec_and(vgpixel, vfc); + vgpixel = vec_sl(vgpixel, v3_16); + vrpixel = vec_sl(vpixel, v1_16); + vrpixel = vec_and(vrpixel, vf800); + vbpixel = vec_and(vpixel, v3f); + vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel); + vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel); + + /* Store 8 pixels */ + vec_st(vdst1, 0, dst); + + width -= 8; + dst += 16; + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); +#undef ONE_PIXEL_BLEND + src += srcskip; + dst += dstskip; + } +} + +static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info) +{ + unsigned alpha = info->src->alpha; + int height = info->d_height; + Uint32 *srcp = (Uint32 *)info->s_pixels; + int srcskip = info->s_skip >> 2; + Uint32 *dstp = (Uint32 *)info->d_pixels; + int dstskip = info->d_skip >> 2; + SDL_PixelFormat *srcfmt = info->src; + SDL_PixelFormat *dstfmt = info->dst; + unsigned sA = srcfmt->alpha; + unsigned dA = dstfmt->Amask ? 
SDL_ALPHA_OPAQUE : 0; + Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask; + Uint32 ckey = info->src->colorkey; + vector unsigned char mergePermute; + vector unsigned char vsrcPermute; + vector unsigned char vdstPermute; + vector unsigned char vsdstPermute; + vector unsigned char valpha; + vector unsigned char valphamask; + vector unsigned char vbits; + vector unsigned char v0; + vector unsigned short v1; + vector unsigned short v8; + vector unsigned int vckey; + vector unsigned int vrgbmask; + + mergePermute = VEC_MERGE_PERMUTE(); + v0 = vec_splat_u8(0); + v1 = vec_splat_u16(1); + v8 = vec_splat_u16(8); + + /* set the alpha to 255 on the destination surf */ + valphamask = VEC_ALPHA_MASK(); + + vsrcPermute = calc_swizzle32(srcfmt, NULL); + vdstPermute = calc_swizzle32(NULL, dstfmt); + vsdstPermute = calc_swizzle32(dstfmt, NULL); + + /* set a vector full of alpha and 255-alpha */ + ((unsigned char *)&valpha)[0] = alpha; + valpha = vec_splat(valpha, 0); + vbits = (vector unsigned char)vec_splat_s8(-1); + + ckey &= rgbmask; + ((unsigned int *)&vckey)[0] = ckey; + vckey = vec_splat(vckey, 0); + ((unsigned int *)&vrgbmask)[0] = rgbmask; + vrgbmask = vec_splat(vrgbmask, 0); + + while(height--) { + int width = info->d_width; +#define ONE_PIXEL_BLEND(condition, widthvar) \ + while (condition) { \ + Uint32 pixel; \ + unsigned sR, sG, sB, dR, dG, dB; \ + RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, pixel); \ + if(sA && pixel != ckey) { \ + RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); \ + DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, pixel, dR, dG, dB); \ + ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ + ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \ + } \ + ((Uint8 *)dstp) += 4; \ + ((Uint8 *)srcp) += 4; \ + widthvar--; \ + } + ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); + if (width > 0) { + int extrawidth = (width % 4); + vector unsigned char valigner = VEC_ALIGNER(srcp); + vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); + width -= extrawidth; + while (width) { + vector unsigned char vsel; + vector unsigned char voverflow; + vector unsigned char vd; + vector unsigned char vd_orig; + + /* s = *srcp */ + voverflow = (vector unsigned char)vec_ld(15, srcp); + vs = vec_perm(vs, voverflow, valigner); + + /* vsel is set for items that match the key */ + vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask); + vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey); + + /* permute to source format */ + vs = vec_perm(vs, valpha, vsrcPermute); + + /* d = *dstp */ + vd = (vector unsigned char)vec_ld(0, dstp); + vd_orig = vd = vec_perm(vd, v0, vsdstPermute); + + VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); + + /* set the alpha channel to full on */ + vd = vec_or(vd, valphamask); + + /* mask out color key */ + vd = vec_sel(vd, vd_orig, vsel); + + /* permute to dest format */ + vd = vec_perm(vd, vbits, vdstPermute); + + /* *dstp = res */ + vec_st((vector unsigned int)vd, 0, dstp); + + srcp += 4; + dstp += 4; + width -= 4; + vs = voverflow; + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); + } +#undef ONE_PIXEL_BLEND + + srcp += srcskip; + dstp += dstskip; + } +} + + +static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info) +{ + int width = info->d_width; + int height = info->d_height; + Uint32 *srcp = (Uint32 *)info->s_pixels; + int srcskip = info->s_skip >> 2; + Uint32 *dstp = (Uint32 *)info->d_pixels; + int dstskip = info->d_skip >> 2; + SDL_PixelFormat *srcfmt = info->src; + SDL_PixelFormat *dstfmt = 
info->dst; + vector unsigned char mergePermute; + vector unsigned char valphaPermute; + vector unsigned char vsrcPermute; + vector unsigned char vdstPermute; + vector unsigned char vsdstPermute; + vector unsigned char valphamask; + vector unsigned char vpixelmask; + vector unsigned char v0; + vector unsigned short v1; + vector unsigned short v8; + + v0 = vec_splat_u8(0); + v1 = vec_splat_u16(1); + v8 = vec_splat_u16(8); + mergePermute = VEC_MERGE_PERMUTE(); + valphamask = VEC_ALPHA_MASK(); + valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); + vpixelmask = vec_nor(valphamask, v0); + vsrcPermute = calc_swizzle32(srcfmt, NULL); + vdstPermute = calc_swizzle32(NULL, dstfmt); + vsdstPermute = calc_swizzle32(dstfmt, NULL); + + while ( height-- ) { + width = info->d_width; +#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ + Uint32 pixel; \ + unsigned sR, sG, sB, dR, dG, dB, sA, dA; \ + DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, pixel, sR, sG, sB, sA); \ + if(sA) { \ + DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, pixel, dR, dG, dB, dA); \ + ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ + ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \ + } \ + ++srcp; \ + ++dstp; \ + widthvar--; \ + } + ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); + if (width > 0) { + // vsrcPermute + // vdstPermute + int extrawidth = (width % 4); + vector unsigned char valigner = VEC_ALIGNER(srcp); + vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); + width -= extrawidth; + while (width) { + vector unsigned char voverflow; + vector unsigned char vd; + vector unsigned char valpha; + vector unsigned char vdstalpha; + /* s = *srcp */ + voverflow = (vector unsigned char)vec_ld(15, srcp); + vs = vec_perm(vs, voverflow, valigner); + vs = vec_perm(vs, v0, vsrcPermute); + + valpha = vec_perm(vs, v0, valphaPermute); + + /* d = *dstp */ + vd = (vector unsigned char)vec_ld(0, dstp); + vd = vec_perm(vd, v0, vsdstPermute); + vdstalpha = vec_and(vd, valphamask); + + VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); + + /* set the alpha to the dest alpha */ + vd = vec_and(vd, vpixelmask); + vd = vec_or(vd, vdstalpha); + vd = vec_perm(vd, v0, vdstPermute); + + /* *dstp = res */ + vec_st((vector unsigned int)vd, 0, dstp); + + srcp += 4; + dstp += 4; + width -= 4; + vs = voverflow; + + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); + } + srcp += srcskip; + dstp += dstskip; +#undef ONE_PIXEL_BLEND + } +} + +/* fast ARGB888->(A)RGB888 blending with pixel alpha */ +static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info) +{ + int width = info->d_width; + int height = info->d_height; + Uint32 *srcp = (Uint32 *)info->s_pixels; + int srcskip = info->s_skip >> 2; + Uint32 *dstp = (Uint32 *)info->d_pixels; + int dstskip = info->d_skip >> 2; + vector unsigned char mergePermute; + vector unsigned char valphaPermute; + vector unsigned char valphamask; + vector unsigned char vpixelmask; + vector unsigned char v0; + vector unsigned short v1; + vector unsigned short v8; + v0 = vec_splat_u8(0); + v1 = vec_splat_u16(1); + v8 = vec_splat_u16(8); + mergePermute = VEC_MERGE_PERMUTE(); + valphamask = VEC_ALPHA_MASK(); + valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); + + + vpixelmask = vec_nor(valphamask, v0); + while(height--) { + width = info->d_width; +#define ONE_PIXEL_BLEND(condition, widthvar) \ + while ((condition)) { \ + Uint32 dalpha; \ + Uint32 d; \ + Uint32 s1; \ + Uint32 d1; \ + Uint32 s = *srcp; \ + Uint32 alpha = s >> 24; \ + if(alpha) { \ + 
if(alpha == SDL_ALPHA_OPAQUE) { \ + *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \ + } else { \ + d = *dstp; \ + dalpha = d & 0xff000000; \ + s1 = s & 0xff00ff; \ + d1 = d & 0xff00ff; \ + d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \ + s &= 0xff00; \ + d &= 0xff00; \ + d = (d + ((s - d) * alpha >> 8)) & 0xff00; \ + *dstp = d1 | d | dalpha; \ + } \ + } \ + ++srcp; \ + ++dstp; \ + widthvar--; \ + } + ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); + if (width > 0) { + int extrawidth = (width % 4); + vector unsigned char valigner = VEC_ALIGNER(srcp); + vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); + width -= extrawidth; + while (width) { + vector unsigned char voverflow; + vector unsigned char vd; + vector unsigned char valpha; + vector unsigned char vdstalpha; + /* s = *srcp */ + voverflow = (vector unsigned char)vec_ld(15, srcp); + vs = vec_perm(vs, voverflow, valigner); + + valpha = vec_perm(vs, v0, valphaPermute); + + /* d = *dstp */ + vd = (vector unsigned char)vec_ld(0, dstp); + vdstalpha = vec_and(vd, valphamask); + + VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); + + /* set the alpha to the dest alpha */ + vd = vec_and(vd, vpixelmask); + vd = vec_or(vd, vdstalpha); + + /* *dstp = res */ + vec_st((vector unsigned int)vd, 0, dstp); + + srcp += 4; + dstp += 4; + width -= 4; + vs = voverflow; + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); + } + srcp += srcskip; + dstp += dstskip; + } +#undef ONE_PIXEL_BLEND +} + +static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info) +{ + /* XXX : 6 */ + unsigned alpha = info->src->alpha; + int height = info->d_height; + Uint32 *srcp = (Uint32 *)info->s_pixels; + int srcskip = info->s_skip >> 2; + Uint32 *dstp = (Uint32 *)info->d_pixels; + int dstskip = info->d_skip >> 2; + SDL_PixelFormat *srcfmt = info->src; + SDL_PixelFormat *dstfmt = info->dst; + unsigned sA = srcfmt->alpha; + unsigned dA = dstfmt->Amask ? 
SDL_ALPHA_OPAQUE : 0; + vector unsigned char mergePermute; + vector unsigned char vsrcPermute; + vector unsigned char vdstPermute; + vector unsigned char vsdstPermute; + vector unsigned char valpha; + vector unsigned char valphamask; + vector unsigned char vbits; + vector unsigned short v1; + vector unsigned short v8; + + mergePermute = VEC_MERGE_PERMUTE(); + v1 = vec_splat_u16(1); + v8 = vec_splat_u16(8); + + /* set the alpha to 255 on the destination surf */ + valphamask = VEC_ALPHA_MASK(); + + vsrcPermute = calc_swizzle32(srcfmt, NULL); + vdstPermute = calc_swizzle32(NULL, dstfmt); + vsdstPermute = calc_swizzle32(dstfmt, NULL); + + /* set a vector full of alpha and 255-alpha */ + ((unsigned char *)&valpha)[0] = alpha; + valpha = vec_splat(valpha, 0); + vbits = (vector unsigned char)vec_splat_s8(-1); + + while(height--) { + int width = info->d_width; +#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ + Uint32 pixel; \ + unsigned sR, sG, sB, dR, dG, dB; \ + DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, pixel, sR, sG, sB); \ + DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, pixel, dR, dG, dB); \ + ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ + ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \ + ++srcp; \ + ++dstp; \ + widthvar--; \ + } + ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); + if (width > 0) { + int extrawidth = (width % 4); + vector unsigned char valigner = vec_lvsl(0, srcp); + vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); + width -= extrawidth; + while (width) { + vector unsigned char voverflow; + vector unsigned char vd; + + /* s = *srcp */ + voverflow = (vector unsigned char)vec_ld(15, srcp); + vs = vec_perm(vs, voverflow, valigner); + vs = vec_perm(vs, valpha, vsrcPermute); + + /* d = *dstp */ + vd = (vector unsigned char)vec_ld(0, dstp); + vd = vec_perm(vd, vd, vsdstPermute); + + VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); + + /* set the alpha channel to full on */ + vd = vec_or(vd, valphamask); + vd = vec_perm(vd, vbits, vdstPermute); + + /* *dstp = res */ + vec_st((vector unsigned int)vd, 0, dstp); + + srcp += 4; + dstp += 4; + width -= 4; + vs = voverflow; + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); + } +#undef ONE_PIXEL_BLEND + + srcp += srcskip; + dstp += dstskip; + } + +} + + +/* fast RGB888->(A)RGB888 blending */ +static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info) +{ + unsigned alpha = info->src->alpha; + int height = info->d_height; + Uint32 *srcp = (Uint32 *)info->s_pixels; + int srcskip = info->s_skip >> 2; + Uint32 *dstp = (Uint32 *)info->d_pixels; + int dstskip = info->d_skip >> 2; + vector unsigned char mergePermute; + vector unsigned char valpha; + vector unsigned char valphamask; + vector unsigned short v1; + vector unsigned short v8; + + mergePermute = VEC_MERGE_PERMUTE(); + v1 = vec_splat_u16(1); + v8 = vec_splat_u16(8); + + /* set the alpha to 255 on the destination surf */ + valphamask = VEC_ALPHA_MASK(); + + /* set a vector full of alpha and 255-alpha */ + ((unsigned char *)&valpha)[0] = alpha; + valpha = vec_splat(valpha, 0); + + while(height--) { + int width = info->d_width; +#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ + Uint32 s = *srcp; \ + Uint32 d = *dstp; \ + Uint32 s1 = s & 0xff00ff; \ + Uint32 d1 = d & 0xff00ff; \ + d1 = (d1 + ((s1 - d1) * alpha >> 8)) \ + & 0xff00ff; \ + s &= 0xff00; \ + d &= 0xff00; \ + d = (d + ((s - d) * alpha >> 8)) & 0xff00; \ + *dstp = d1 | d | 0xff000000; \ + ++srcp; \ + ++dstp; \ + widthvar--; \ + } + 
ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); + if (width > 0) { + int extrawidth = (width % 4); + vector unsigned char valigner = VEC_ALIGNER(srcp); + vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); + width -= extrawidth; + while (width) { + vector unsigned char voverflow; + vector unsigned char vd; + + /* s = *srcp */ + voverflow = (vector unsigned char)vec_ld(15, srcp); + vs = vec_perm(vs, voverflow, valigner); + + /* d = *dstp */ + vd = (vector unsigned char)vec_ld(0, dstp); + + VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); + + /* set the alpha channel to full on */ + vd = vec_or(vd, valphamask); + + /* *dstp = res */ + vec_st((vector unsigned int)vd, 0, dstp); + + srcp += 4; + dstp += 4; + width -= 4; + vs = voverflow; + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); + } +#undef ONE_PIXEL_BLEND + + srcp += srcskip; + dstp += dstskip; + } +} +#endif /* USE_ALTIVEC_BLITTERS */ + /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) { @@ -1372,7 +2128,12 @@ SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index) if(df->BytesPerPixel == 1) return BlitNto1SurfaceAlphaKey; else - return BlitNtoNSurfaceAlphaKey; +#ifdef USE_ALTIVEC_BLITTERS + if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 && SDL_HasAltiVec()) + return Blit32to32SurfaceAlphaKeyAltivec; + else +#endif + return BlitNtoNSurfaceAlphaKey; } else { /* Per-surface alpha blits */ switch(df->BytesPerPixel) { @@ -1413,10 +2174,20 @@ SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index) if(SDL_HasMMX()) return BlitRGBtoRGBSurfaceAlphaMMX; else +#endif +#ifdef USE_ALTIVEC_BLITTERS + if(SDL_HasAltiVec()) + return BlitRGBtoRGBSurfaceAlphaAltivec; + else #endif return BlitRGBtoRGBSurfaceAlpha; } else +#ifdef USE_ALTIVEC_BLITTERS + if((sf->BytesPerPixel == 4) && SDL_HasAltiVec()) + return Blit32to32SurfaceAlphaAltivec; + else +#endif return BlitNtoNSurfaceAlpha; case 3: @@ -1431,6 +2202,13 @@ SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index) return BlitNto1PixelAlpha; case 2: +#ifdef USE_ALTIVEC_BLITTERS + if(sf->BytesPerPixel == 4 && + df->Gmask == 0x7e0 && + df->Bmask == 0x1f) + return Blit32to565PixelAlphaAltivec; + else +#endif if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) @@ -1456,9 +2234,19 @@ SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int blit_index) if(SDL_HasMMX()) return BlitRGBtoRGBPixelAlphaMMX; else +#endif +#ifdef USE_ALTIVEC_BLITTERS + if(SDL_HasAltiVec()) + return BlitRGBtoRGBPixelAlphaAltivec; + else #endif return BlitRGBtoRGBPixelAlpha; } +#ifdef USE_ALTIVEC_BLITTERS + if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec()) + return Blit32to32PixelAlphaAltivec; + else +#endif return BlitNtoNPixelAlpha; case 3: diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c index 72d5559f2..0c935d214 100644 --- a/src/video/SDL_blit_N.c +++ b/src/video/SDL_blit_N.c @@ -35,6 +35,656 @@ static char rcsid = /* Functions to blit from N-bit surfaces to other surfaces */ +#ifdef USE_ALTIVEC_BLITTERS +#include +#ifdef MACOSX +#include +#include +static size_t GetL3CacheSize( void ) +{ + const char key[] = "hw.l3cachesize"; + u_int64_t result = 0; + size_t typeSize = sizeof( result ); + + + int err = sysctlbyname( key, &result, &typeSize, NULL, 0 ); + if( 0 != err ) return 0; + + return result; +} +#else +static size_t GetL3CacheSize( void ) +{ + /* XXX: Just guess 
G4 */ + return 2097152; +} +#endif /* MACOSX */ + +#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F) +#define VSWIZZLE32(a,b,c,d) (vector unsigned char) \ + ( 0x00+a, 0x00+b, 0x00+c, 0x00+d, \ + 0x04+a, 0x04+b, 0x04+c, 0x04+d, \ + 0x08+a, 0x08+b, 0x08+c, 0x08+d, \ + 0x0C+a, 0x0C+b, 0x0C+c, 0x0C+d ) + +#define MAKE8888(dstfmt, r, g, b, a) \ + ( ((r<Rshift)&dstfmt->Rmask) | \ + ((g<Gshift)&dstfmt->Gmask) | \ + ((b<Bshift)&dstfmt->Bmask) | \ + ((a<Ashift)&dstfmt->Amask) ) + +/* + * Data Stream Touch...Altivec cache prefetching. + * + * Don't use this on a G5...however, the speed boost is very significant + * on a G4. + */ +#define DST_CHAN_SRC 1 +#define DST_CHAN_DEST 2 + +/* macro to set DST control word value... */ +#define DST_CTRL(size, count, stride) \ + (((size) << 24) | ((count) << 16) | (stride)) + +#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \ + ? vec_lvsl(0, src) \ + : vec_add(vec_lvsl(8, src), vec_splat_u8(8))) + +/* Calculate the permute vector used for 32->32 swizzling */ +static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt, + const SDL_PixelFormat *dstfmt) +{ + /* + * We have to assume that the bits that aren't used by other + * colors is alpha, and it's one complete byte, since some formats + * leave alpha with a zero mask, but we should still swizzle the bits. + */ + /* ARGB */ + const static struct SDL_PixelFormat default_pixel_format = { + NULL, 0, 0, + 0, 0, 0, 0, + 16, 8, 0, 24, + 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000, + 0, 0}; + if (!srcfmt) { + srcfmt = &default_pixel_format; + } + if (!dstfmt) { + dstfmt = &default_pixel_format; + } + vector unsigned char plus = (vector unsigned char)( 0x00, 0x00, 0x00, 0x00, + 0x04, 0x04, 0x04, 0x04, + 0x08, 0x08, 0x08, 0x08, + 0x0C, 0x0C, 0x0C, 0x0C ); + vector unsigned char vswiz; + vector unsigned int srcvec; +#define RESHIFT(X) (3 - ((X) >> 3)) + Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift); + Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift); + Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift); + Uint32 amask; + /* Use zero for alpha if either surface doesn't have alpha */ + if (dstfmt->Amask) { + amask = ((srcfmt->Amask) ? 
RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift); + } else { + amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF); + } +#undef RESHIFT + ((unsigned int *)&srcvec)[0] = (rmask | gmask | bmask | amask); + vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0)); + return(vswiz); +} + +static void Blit_RGB888_RGB565(SDL_BlitInfo *info); +static void Blit_RGB888_RGB565Altivec(SDL_BlitInfo *info) { + int height = info->d_height; + Uint8 *src = (Uint8 *) info->s_pixels; + int srcskip = info->s_skip; + Uint8 *dst = (Uint8 *) info->d_pixels; + int dstskip = info->d_skip; + SDL_PixelFormat *srcfmt = info->src; + vector unsigned char valpha = vec_splat_u8(0); + vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL); + vector unsigned char vgmerge = (vector unsigned char)( + 0x00, 0x02, 0x00, 0x06, + 0x00, 0x0a, 0x00, 0x0e, + 0x00, 0x12, 0x00, 0x16, + 0x00, 0x1a, 0x00, 0x1e); + vector unsigned short v1 = vec_splat_u16(1); + vector unsigned short v3 = vec_splat_u16(3); + vector unsigned short v3f = (vector unsigned short)( + 0x003f, 0x003f, 0x003f, 0x003f, + 0x003f, 0x003f, 0x003f, 0x003f); + vector unsigned short vfc = (vector unsigned short)( + 0x00fc, 0x00fc, 0x00fc, 0x00fc, + 0x00fc, 0x00fc, 0x00fc, 0x00fc); + vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7); + vf800 = vec_sl(vf800, vec_splat_u16(8)); + + while (height--) { + vector unsigned char valigner; + vector unsigned char voverflow; + vector unsigned char vsrc; + + int width = info->d_width; + int extrawidth; + + /* do scalar until we can align... */ +#define ONE_PIXEL_BLEND(condition, widthvar) \ + while (condition) { \ + Uint32 pixel; \ + unsigned sR, sG, sB, sA; \ + DISEMBLE_RGBA((Uint8 *)src, 4, srcfmt, pixel, \ + sR, sG, sB, sA); \ + *(Uint16 *)(dst) = (((sR << 8) & 0x0000F800) | \ + ((sG << 3) & 0x000007E0) | \ + ((sB >> 3) & 0x0000001F)); \ + dst += 2; \ + src += 4; \ + widthvar--; \ + } + + ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width); + + /* After all that work, here's the vector part! */ + extrawidth = (width % 8); /* trailing unaligned stores */ + width -= extrawidth; + vsrc = vec_ld(0, src); + valigner = VEC_ALIGNER(src); + + while (width) { + vector unsigned short vpixel, vrpixel, vgpixel, vbpixel; + vector unsigned int vsrc1, vsrc2; + vector unsigned char vdst; + + voverflow = vec_ld(15, src); + vsrc = vec_perm(vsrc, voverflow, valigner); + vsrc1 = (vector unsigned int)vec_perm(vsrc, valpha, vpermute); + src += 16; + vsrc = voverflow; + voverflow = vec_ld(15, src); + vsrc = vec_perm(vsrc, voverflow, valigner); + vsrc2 = (vector unsigned int)vec_perm(vsrc, valpha, vpermute); + /* 1555 */ + vpixel = (vector unsigned short)vec_packpx(vsrc1, vsrc2); + vgpixel = (vector unsigned short)vec_perm(vsrc1, vsrc2, vgmerge); + vgpixel = vec_and(vgpixel, vfc); + vgpixel = vec_sl(vgpixel, v3); + vrpixel = vec_sl(vpixel, v1); + vrpixel = vec_and(vrpixel, vf800); + vbpixel = vec_and(vpixel, v3f); + vdst = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel); + /* 565 */ + vdst = vec_or(vdst, (vector unsigned char)vbpixel); + vec_st(vdst, 0, dst); + + width -= 8; + src += 16; + dst += 16; + vsrc = voverflow; + } + + assert(width == 0); + + + /* do scalar until we can align... */ + ONE_PIXEL_BLEND((extrawidth), extrawidth); +#undef ONE_PIXEL_BLEND + + src += srcskip; /* move to next row, accounting for pitch. 
*/ + dst += dstskip; + } + + +} + +static void Blit_RGB565_32Altivec(SDL_BlitInfo *info) { + int height = info->d_height; + Uint8 *src = (Uint8 *) info->s_pixels; + int srcskip = info->s_skip; + Uint8 *dst = (Uint8 *) info->d_pixels; + int dstskip = info->d_skip; + SDL_PixelFormat *srcfmt = info->src; + SDL_PixelFormat *dstfmt = info->dst; + unsigned alpha; + vector unsigned char valpha; + vector unsigned char vpermute; + vector unsigned short vf800; + vector unsigned int v8 = vec_splat_u32(8); + vector unsigned int v16 = vec_add(v8, v8); + vector unsigned short v2 = vec_splat_u16(2); + vector unsigned short v3 = vec_splat_u16(3); + /* + 0x10 - 0x1f is the alpha + 0x00 - 0x0e evens are the red + 0x01 - 0x0f odds are zero + */ + vector unsigned char vredalpha1 = (vector unsigned char)( + 0x10, 0x00, 0x01, 0x01, + 0x10, 0x02, 0x01, 0x01, + 0x10, 0x04, 0x01, 0x01, + 0x10, 0x06, 0x01, 0x01 + ); + vector unsigned char vredalpha2 = (vector unsigned char)( + vec_add((vector unsigned int)vredalpha1, vec_sl(v8, v16)) + ); + /* + 0x00 - 0x0f is ARxx ARxx ARxx ARxx + 0x11 - 0x0f odds are blue + */ + vector unsigned char vblue1 = (vector unsigned char)( + 0x00, 0x01, 0x02, 0x11, + 0x04, 0x05, 0x06, 0x13, + 0x08, 0x09, 0x0a, 0x15, + 0x0c, 0x0d, 0x0e, 0x17 + ); + vector unsigned char vblue2 = (vector unsigned char)( + vec_add((vector unsigned int)vblue1, v8) + ); + /* + 0x00 - 0x0f is ARxB ARxB ARxB ARxB + 0x10 - 0x0e evens are green + */ + vector unsigned char vgreen1 = (vector unsigned char)( + 0x00, 0x01, 0x10, 0x03, + 0x04, 0x05, 0x12, 0x07, + 0x08, 0x09, 0x14, 0x0b, + 0x0c, 0x0d, 0x16, 0x0f + ); + vector unsigned char vgreen2 = (vector unsigned char)( + vec_add((vector unsigned int)vgreen1, vec_sl(v8, v8)) + ); + + + assert(srcfmt->BytesPerPixel == 2); + assert(dstfmt->BytesPerPixel == 4); + + vf800 = (vector unsigned short)vec_splat_u8(-7); + vf800 = vec_sl(vf800, vec_splat_u16(8)); + + if (dstfmt->Amask && srcfmt->alpha) { + ((unsigned char *)&valpha)[0] = alpha = srcfmt->alpha; + valpha = vec_splat(valpha, 0); + } else { + alpha = 0; + valpha = vec_splat_u8(0); + } + + vpermute = calc_swizzle32(NULL, dstfmt); + while (height--) { + vector unsigned char valigner; + vector unsigned char voverflow; + vector unsigned char vsrc; + + int width = info->d_width; + int extrawidth; + + /* do scalar until we can align... */ +#define ONE_PIXEL_BLEND(condition, widthvar) \ + while (condition) { \ + unsigned sR, sG, sB; \ + unsigned short pixel = *((unsigned short *)src); \ + sR = (pixel >> 8) & 0xf8; \ + sG = (pixel >> 3) & 0xfc; \ + sB = (pixel << 3) & 0xf8; \ + ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \ + src += 2; \ + dst += 4; \ + widthvar--; \ + } + ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width); + + /* After all that work, here's the vector part! 
*/ + extrawidth = (width % 8); /* trailing unaligned stores */ + width -= extrawidth; + vsrc = vec_ld(0, src); + valigner = VEC_ALIGNER(src); + + while (width) { + vector unsigned short vR, vG, vB; + vector unsigned char vdst1, vdst2; + + voverflow = vec_ld(15, src); + vsrc = vec_perm(vsrc, voverflow, valigner); + + vR = vec_and((vector unsigned short)vsrc, vf800); + vB = vec_sl((vector unsigned short)vsrc, v3); + vG = vec_sl(vB, v2); + + vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, valpha, vredalpha1); + vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1); + vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1); + vdst1 = vec_perm(vdst1, valpha, vpermute); + vec_st(vdst1, 0, dst); + + vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, valpha, vredalpha2); + vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2); + vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2); + vdst2 = vec_perm(vdst2, valpha, vpermute); + vec_st(vdst2, 16, dst); + + width -= 8; + dst += 32; + src += 16; + vsrc = voverflow; + } + + assert(width == 0); + + + /* do scalar until we can align... */ + ONE_PIXEL_BLEND((extrawidth), extrawidth); +#undef ONE_PIXEL_BLEND + + src += srcskip; /* move to next row, accounting for pitch. */ + dst += dstskip; + } + +} + +static void BlitNtoNKey(SDL_BlitInfo *info); +static void BlitNtoNKeyCopyAlpha(SDL_BlitInfo *info); +static void Blit32to32KeyAltivec(SDL_BlitInfo *info) +{ + int height = info->d_height; + Uint32 *srcp = (Uint32 *) info->s_pixels; + int srcskip = info->s_skip; + Uint32 *dstp = (Uint32 *) info->d_pixels; + int dstskip = info->d_skip; + SDL_PixelFormat *srcfmt = info->src; + int srcbpp = srcfmt->BytesPerPixel; + SDL_PixelFormat *dstfmt = info->dst; + int dstbpp = dstfmt->BytesPerPixel; + int copy_alpha = (srcfmt->Amask && dstfmt->Amask); + unsigned alpha = dstfmt->Amask ? 
srcfmt->alpha : 0; + Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask; + Uint32 ckey = info->src->colorkey; + vector unsigned int valpha; + vector unsigned char vpermute; + vector unsigned char vzero; + vector unsigned int vckey; + vector unsigned int vrgbmask; + vpermute = calc_swizzle32(srcfmt, dstfmt); + if (info->d_width < 16) { + if(copy_alpha) { + return BlitNtoNKeyCopyAlpha(info); + } else { + return BlitNtoNKey(info); + } + } + vzero = vec_splat_u8(0); + if (alpha) { + ((unsigned char *)&valpha)[0] = (unsigned char)alpha; + valpha = (vector unsigned int)vec_splat((vector unsigned char)valpha, 0); + } else { + valpha = (vector unsigned int)vzero; + } + ckey &= rgbmask; + ((unsigned int *)&vckey)[0] = ckey; + vckey = vec_splat(vckey, 0); + ((unsigned int *)&vrgbmask)[0] = rgbmask; + vrgbmask = vec_splat(vrgbmask, 0); + + while (height--) { +#define ONE_PIXEL_BLEND(condition, widthvar) \ + if (copy_alpha) { \ + while (condition) { \ + Uint32 pixel; \ + unsigned sR, sG, sB, sA; \ + DISEMBLE_RGBA((Uint8 *)srcp, srcbpp, srcfmt, pixel, \ + sR, sG, sB, sA); \ + if ( (pixel & rgbmask) != ckey ) { \ + ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \ + sR, sG, sB, sA); \ + } \ + ((Uint8 *)dstp) += dstbpp; \ + ((Uint8 *)srcp) += srcbpp; \ + widthvar--; \ + } \ + } else { \ + while (condition) { \ + Uint32 pixel; \ + unsigned sR, sG, sB; \ + RETRIEVE_RGB_PIXEL((Uint8 *)srcp, srcbpp, pixel); \ + if ( pixel != ckey ) { \ + RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); \ + ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \ + sR, sG, sB, alpha); \ + } \ + ((Uint8 *)dstp) += dstbpp; \ + ((Uint8 *)srcp) += srcbpp; \ + widthvar--; \ + } \ + } + int width = info->d_width; + ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); + assert(width > 0); + if (width > 0) { + int extrawidth = (width % 4); + vector unsigned char valigner = VEC_ALIGNER(srcp); + vector unsigned int vs = vec_ld(0, srcp); + width -= extrawidth; + assert(width >= 4); + while (width) { + vector unsigned char vsel; + vector unsigned int vd; + vector unsigned int voverflow = vec_ld(15, srcp); + /* load the source vec */ + vs = vec_perm(vs, voverflow, valigner); + /* vsel is set for items that match the key */ + vsel = (vector unsigned char)vec_and(vs, vrgbmask); + vsel = (vector unsigned char)vec_cmpeq(vs, vckey); + /* permute the src vec to the dest format */ + vs = vec_perm(vs, valpha, vpermute); + /* load the destination vec */ + vd = vec_ld(0, dstp); + /* select the source and dest into vs */ + vd = (vector unsigned int)vec_sel((vector unsigned char)vs, (vector unsigned char)vd, vsel); + + vec_st(vd, 0, dstp); + srcp += 4; + width -= 4; + dstp += 4; + vs = voverflow; + } + ONE_PIXEL_BLEND((extrawidth), extrawidth); +#undef ONE_PIXEL_BLEND + srcp += srcskip >> 2; + dstp += dstskip >> 2; + } + } +} + +/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. 
*/ +/* Use this on a G5 */ +static void ConvertAltivec32to32_noprefetch(SDL_BlitInfo *info) +{ + int height = info->d_height; + Uint32 *src = (Uint32 *) info->s_pixels; + int srcskip = info->s_skip; + Uint32 *dst = (Uint32 *) info->d_pixels; + int dstskip = info->d_skip; + SDL_PixelFormat *srcfmt = info->src; + int srcbpp = srcfmt->BytesPerPixel; + SDL_PixelFormat *dstfmt = info->dst; + int dstbpp = dstfmt->BytesPerPixel; + vector unsigned int vzero = vec_splat_u32(0); + vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt); + if (dstfmt->Amask && !srcfmt->Amask) { + if (srcfmt->alpha) { + vector unsigned char valpha; + ((unsigned char *)&valpha)[0] = srcfmt->alpha; + vzero = (vector unsigned int)vec_splat(valpha, 0); + } + } + + assert(srcbpp == 4); + assert(dstbpp == 4); + + while (height--) { + vector unsigned char valigner; + vector unsigned int vbits; + vector unsigned int voverflow; + Uint32 bits; + Uint8 r, g, b, a; + + int width = info->d_width; + int extrawidth; + + /* do scalar until we can align... */ + while ((UNALIGNED_PTR(dst)) && (width)) { + bits = *(src++); + RGBA_FROM_8888(bits, srcfmt, r, g, b, a); + *(dst++) = MAKE8888(dstfmt, r, g, b, a); + width--; + } + + /* After all that work, here's the vector part! */ + extrawidth = (width % 4); + width -= extrawidth; + valigner = VEC_ALIGNER(src); + vbits = vec_ld(0, src); + + while (width) { + voverflow = vec_ld(15, src); + src += 4; + width -= 4; + vbits = vec_perm(vbits, voverflow, valigner); /* src is ready. */ + vbits = vec_perm(vbits, vzero, vpermute); /* swizzle it. */ + vec_st(vbits, 0, dst); /* store it back out. */ + dst += 4; + vbits = voverflow; + } + + assert(width == 0); + + /* cover pixels at the end of the row that didn't fit in 16 bytes. */ + while (extrawidth) { + bits = *(src++); /* max 7 pixels, don't bother with prefetch. */ + RGBA_FROM_8888(bits, srcfmt, r, g, b, a); + *(dst++) = MAKE8888(dstfmt, r, g, b, a); + extrawidth--; + } + + src += srcskip >> 2; /* move to next row, accounting for pitch. */ + dst += dstskip >> 2; + } + +} + +/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */ +/* Use this on a G4 */ +static void ConvertAltivec32to32_prefetch(SDL_BlitInfo *info) +{ + const int scalar_dst_lead = sizeof (Uint32) * 4; + const int vector_dst_lead = sizeof (Uint32) * 16; + + int height = info->d_height; + Uint32 *src = (Uint32 *) info->s_pixels; + int srcskip = info->s_skip; + Uint32 *dst = (Uint32 *) info->d_pixels; + int dstskip = info->d_skip; + SDL_PixelFormat *srcfmt = info->src; + int srcbpp = srcfmt->BytesPerPixel; + SDL_PixelFormat *dstfmt = info->dst; + int dstbpp = dstfmt->BytesPerPixel; + vector unsigned int vzero = vec_splat_u32(0); + vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt); + if (dstfmt->Amask && !srcfmt->Amask) { + if (srcfmt->alpha) { + vector unsigned char valpha; + ((unsigned char *)&valpha)[0] = srcfmt->alpha; + vzero = (vector unsigned int)vec_splat(valpha, 0); + } + } + + assert(srcbpp == 4); + assert(dstbpp == 4); + + while (height--) { + vector unsigned char valigner; + vector unsigned int vbits; + vector unsigned int voverflow; + Uint32 bits; + Uint8 r, g, b, a; + + int width = info->d_width; + int extrawidth; + + /* do scalar until we can align... 
*/ + while ((UNALIGNED_PTR(dst)) && (width)) { + vec_dstt(src+scalar_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_SRC); + vec_dstst(dst+scalar_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_DEST); + bits = *(src++); + RGBA_FROM_8888(bits, srcfmt, r, g, b, a); + *(dst++) = MAKE8888(dstfmt, r, g, b, a); + width--; + } + + /* After all that work, here's the vector part! */ + extrawidth = (width % 4); + width -= extrawidth; + valigner = VEC_ALIGNER(src); + vbits = vec_ld(0, src); + + while (width) { + vec_dstt(src+vector_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_SRC); + vec_dstst(dst+vector_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_DEST); + voverflow = vec_ld(15, src); + src += 4; + width -= 4; + vbits = vec_perm(vbits, voverflow, valigner); /* src is ready. */ + vbits = vec_perm(vbits, vzero, vpermute); /* swizzle it. */ + vec_st(vbits, 0, dst); /* store it back out. */ + dst += 4; + vbits = voverflow; + } + + assert(width == 0); + + /* cover pixels at the end of the row that didn't fit in 16 bytes. */ + while (extrawidth) { + bits = *(src++); /* max 7 pixels, don't bother with prefetch. */ + RGBA_FROM_8888(bits, srcfmt, r, g, b, a); + *(dst++) = MAKE8888(dstfmt, r, g, b, a); + extrawidth--; + } + + src += srcskip >> 2; /* move to next row, accounting for pitch. */ + dst += dstskip >> 2; + } + + vec_dss(DST_CHAN_SRC); + vec_dss(DST_CHAN_DEST); +} + +static Uint32 GetBlitFeatures( void ) +{ + static Uint32 features = 0xffffffff; + if (features == 0xffffffff) { + /* Provide an override for testing .. */ + char *override = getenv("SDL_ALTIVEC_BLIT_FEATURES"); + if (override) { + features = 0; + sscanf(override, "%u", &features); + } else { + features = ( 0 + /* Feature 1 is has-MMX */ + | ((SDL_HasMMX()) ? 1 : 0) + /* Feature 2 is has-AltiVec */ + | ((SDL_HasAltiVec()) ? 2 : 0) + /* Feature 4 is dont-use-prefetch */ + | ((GetL3CacheSize() == 0) ? 4 : 0) + ); + } + } + return features; +} +#else +/* Feature 1 is has-MMX */ +#define GetBlitFeatures() ((Uint32)(SDL_HasMMX() ? 
1 : 0)) +#endif + #ifdef USE_ASMBLIT /* Heheheh, we coerce Hermes into using SDL blit information */ @@ -406,11 +1056,7 @@ static void Blit_RGB888_RGB565(SDL_BlitInfo *info) /* Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces */ -#if ( SDL_BYTEORDER == SDL_LIL_ENDIAN ) -#define RGB565_32(dst, src, map) (map[src[0]*2] + map[src[1]*2+1]) -#else /* ( SDL_BYTEORDER == SDL_BIG_ENDIAN ) */ -#define RGB565_32(dst, src, map) (map[src[1]*2] + map[src[0]*2+1]) -#endif +#define RGB565_32(dst, src, map) (map[src[LO]*2] + map[src[HI]*2+1]) static void Blit_RGB565_32(SDL_BlitInfo *info, const Uint32 *map) { #ifndef USE_DUFFS_LOOP @@ -1422,10 +2068,10 @@ struct blit_table { Uint32 srcR, srcG, srcB; int dstbpp; Uint32 dstR, dstG, dstB; - SDL_bool cpu_mmx; + Uint32 blit_features; void *aux_data; SDL_loblit blitfunc; - enum { NO_ALPHA, SET_ALPHA, COPY_ALPHA } alpha; + enum { NO_ALPHA=1, SET_ALPHA=2, COPY_ALPHA=4 } alpha; }; static const struct blit_table normal_blit_1[] = { /* Default for 8-bit RGB source, an invalid combination */ @@ -1439,6 +2085,11 @@ static const struct blit_table normal_blit_2[] = { 0, ConvertX86p16_16RGB555, ConvertX86, NO_ALPHA }, { 0x0000F800,0x000007E0,0x0000001F, 2, 0x0000001F,0x000003E0,0x00007C00, 0, ConvertX86p16_16BGR555, ConvertX86, NO_ALPHA }, +#endif +#ifdef USE_ALTIVEC_BLITTERS + /* has-altivec */ + { 0x0000F800,0x000007E0,0x0000001F, 4, 0x00000000,0x00000000,0x00000000, + 2, NULL, Blit_RGB565_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA }, #endif { 0x0000F800,0x000007E0,0x0000001F, 4, 0x00FF0000,0x0000FF00,0x000000FF, 0, NULL, Blit_RGB565_ARGB8888, SET_ALPHA }, @@ -1485,6 +2136,17 @@ static const struct blit_table normal_blit_4[] = { { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0x0000FF00,0x00FF0000,0xFF000000, 0, ConvertX86p32_32BGRA888, ConvertX86, NO_ALPHA }, #else +#ifdef USE_ALTIVEC_BLITTERS + /* has-altivec | dont-use-prefetch */ + { 0x00000000,0x00000000,0x00000000, 4, 0x00000000,0x00000000,0x00000000, + 6, NULL, ConvertAltivec32to32_noprefetch, NO_ALPHA | COPY_ALPHA | SET_ALPHA }, + /* has-altivec */ + { 0x00000000,0x00000000,0x00000000, 4, 0x00000000,0x00000000,0x00000000, + 2, NULL, ConvertAltivec32to32_prefetch, NO_ALPHA | COPY_ALPHA | SET_ALPHA }, + /* has-altivec */ + { 0x00000000,0x00000000,0x00000000, 2, 0x0000F800,0x000007E0,0x0000001F, + 2, NULL, Blit_RGB888_RGB565Altivec, NO_ALPHA }, +#endif { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x0000F800,0x000007E0,0x0000001F, 0, NULL, Blit_RGB888_RGB565, NO_ALPHA }, { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x00007C00,0x000003E0,0x0000001F, @@ -1497,6 +2159,9 @@ static const struct blit_table *normal_blit[] = { normal_blit_1, normal_blit_2, normal_blit_3, normal_blit_4 }; +/* Mask matches table, or table entry is zero */ +#define MASKOK(x, y) (((x) == (y)) || ((y) == 0x00000000)) + SDL_loblit SDL_CalculateBlitN(SDL_Surface *surface, int blit_index) { struct private_swaccel *sdata; @@ -1532,6 +2197,12 @@ SDL_loblit SDL_CalculateBlitN(SDL_Surface *surface, int blit_index) else if(dstfmt->BytesPerPixel == 1) return BlitNto1Key; else { +#ifdef USE_ALTIVEC_BLITTERS + if((srcfmt->BytesPerPixel == 4) && (dstfmt->BytesPerPixel == 4) && SDL_HasAltiVec()) { + return Blit32to32KeyAltivec; + } else +#endif + if(srcfmt->Amask && dstfmt->Amask) return BlitNtoNKeyCopyAlpha; else @@ -1561,20 +2232,20 @@ SDL_loblit SDL_CalculateBlitN(SDL_Surface *surface, int blit_index) } } else { /* Now the meat, choose the blitter we want */ - int a_need = 0; + int a_need = 0; if(dstfmt->Amask) a_need = srcfmt->Amask ? 
COPY_ALPHA : SET_ALPHA; table = normal_blit[srcfmt->BytesPerPixel-1]; - for ( which=0; table[which].srcR; ++which ) { - if ( srcfmt->Rmask == table[which].srcR && - srcfmt->Gmask == table[which].srcG && - srcfmt->Bmask == table[which].srcB && - dstfmt->BytesPerPixel == table[which].dstbpp && - dstfmt->Rmask == table[which].dstR && - dstfmt->Gmask == table[which].dstG && - dstfmt->Bmask == table[which].dstB && - (a_need & table[which].alpha) == a_need && - (table[which].cpu_mmx == SDL_HasMMX())) + for ( which=0; table[which].dstbpp; ++which ) { + if ( MASKOK(srcfmt->Rmask, table[which].srcR) && + MASKOK(srcfmt->Gmask, table[which].srcG) && + MASKOK(srcfmt->Bmask, table[which].srcB) && + MASKOK(dstfmt->Rmask, table[which].dstR) && + MASKOK(dstfmt->Gmask, table[which].dstG) && + MASKOK(dstfmt->Bmask, table[which].dstB) && + dstfmt->BytesPerPixel == table[which].dstbpp && + (a_need & table[which].alpha) == a_need && + ((table[which].blit_features & GetBlitFeatures()) == table[which].blit_features) ) break; } sdata->aux_data = table[which].aux_data;
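
A note on the arithmetic the patch leans on: ACCURATE_ALPHA_BLEND (SDL_blit.h) and the vector VEC_MULTIPLY_ALPHA macro both replace the divide-by-255 in s*a + d*(255-a) with t = 1 + s*a + d*(255-a); result = (t + (t >> 8)) >> 8, which needs only adds and shifts and gives the exact result whenever the weighted sum is a multiple of 255 (for instance when sA is 0 or 255). The scalar sketch below is not part of the patch and the file name is only illustrative; it just brute-forces every (s, d, a) combination and reports how far the shift form strays from true division by 255, in case anyone wants to sanity-check the approximation.

/*
 * Standalone sketch, not part of the patch: the scalar form of the
 * blend used by ACCURATE_ALPHA_BLEND and VEC_MULTIPLY_ALPHA, compared
 * against real division by 255 over every (s, d, a) combination.
 * Build with something like: cc -O2 -o blendcheck blendcheck.c
 */
#include <stdio.h>

/* (t + (t >> 8)) >> 8 with t = 1 + s*a + d*(255-a), as in the macros */
static unsigned blend_shift(unsigned s, unsigned d, unsigned a)
{
    unsigned t = 1 + (s * a) + (d * (255 - a));
    return (t + (t >> 8)) >> 8;
}

/* the "ideal" result the shift form is standing in for */
static unsigned blend_exact(unsigned s, unsigned d, unsigned a)
{
    return ((s * a) + (d * (255 - a))) / 255;
}

int main(void)
{
    unsigned s, d, a, maxdiff = 0;
    for (a = 0; a < 256; ++a) {
        for (s = 0; s < 256; ++s) {
            for (d = 0; d < 256; ++d) {
                unsigned x = blend_shift(s, d, a);
                unsigned y = blend_exact(s, d, a);
                unsigned diff = (x > y) ? (x - y) : (y - x);
                if (diff > maxdiff)
                    maxdiff = diff;
            }
        }
    }
    printf("largest difference from exact /255 blending: %u\n", maxdiff);
    return 0;
}

As a usage note on the SDL_blit_N.c side: the SDL_ALTIVEC_BLIT_FEATURES override is read with sscanf("%u") and takes the same decimal bitmask GetBlitFeatures() builds (1 = has-MMX, 2 = has-AltiVec, 4 = don't-use-prefetch), so setting it to 6 on an AltiVec machine should route 32->32 conversions through ConvertAltivec32to32_noprefetch even when an L3 cache is present.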