Altivec-optimized blitters!
author Ryan C. Gordon <icculus@icculus.org>
Sun, 17 Apr 2005 10:19:22 +0000
changeset 1047 ffaaf7ecf685
parent 1046 f09d5edfc7a3
child 1048 a78acdd4967e
Altivec-optimized blitters!

Vast majority of this work is compliments of Bob Ippolito.

http://www.devolution.com/pipermail/sdl/2005-February/067466.html and many
other posts.
configure.in
src/video/SDL_blit.h
src/video/SDL_blit_A.c
src/video/SDL_blit_N.c
     1.1 --- a/configure.in	Sun Apr 17 10:16:30 2005 +0000
     1.2 +++ b/configure.in	Sun Apr 17 10:19:22 2005 +0000
     1.3 @@ -1839,17 +1839,18 @@
     1.4  {
     1.5      AC_MSG_CHECKING(for GCC Altivec instruction support)
     1.6      have_gcc_altivec=no
     1.7 +    save_CFLAGS="${CFLAGS}"
     1.8 +    CFLAGS="${CFLAGS} -DGCC_ALTIVEC -DUSE_ALTIVEC_BLITTERS -faltivec"
     1.9      AC_TRY_COMPILE([
    1.10 +    vector unsigned int vzero() {
    1.11 +        return vec_splat_u32(0);
    1.12 +    }
    1.13      ],[
    1.14 -        asm volatile ("mtspr 256, %0\n\t"
    1.15 -                      "vand %%v0, %%v0, %%v0"
    1.16 -                      :
    1.17 -                      : "r" (-1));
    1.18      ],[
    1.19      have_gcc_altivec=yes
    1.20      ])
    1.21 -    if test x$have_gcc_altivec = xyes; then
    1.22 -        CFLAGS="$CFLAGS -DGCC_ALTIVEC"
    1.23 +    if test x$have_gcc_altivec = xno; then
    1.24 +        CFLAGS="${save_CFLAGS}"
    1.25      fi
    1.26      AC_MSG_RESULT($have_gcc_altivec)
    1.27  }
    1.28 @@ -2564,6 +2565,7 @@
    1.29          CheckMacGL
    1.30          CheckPTHREAD
    1.31          CheckSIGACTION
    1.32 +        CheckAltivec
    1.33          # If either the audio or CD driver is used, add the AudioUnit framework
    1.34          if test x$enable_audio = xyes -o x$enable_cdrom = xyes; then
    1.35              SYSTEM_LIBS="$SYSTEM_LIBS -framework AudioToolbox -framework AudioUnit"
     2.1 --- a/src/video/SDL_blit.h	Sun Apr 17 10:16:30 2005 +0000
     2.2 +++ b/src/video/SDL_blit.h	Sun Apr 17 10:19:22 2005 +0000
     2.3 @@ -374,6 +374,20 @@
     2.4  	dB = (((sB-dB)*(A))>>8)+dB;		\
     2.5  } while(0)
     2.6  
     2.7 +/* Blend the RGB values of two pixels based on a source alpha value.
      *
      * For each channel this computes t = 1 + s*sA + d*(255 - sA) and stores
      * (t + (t >> 8)) >> 8 back into the destination channel; the shift/add
      * pair stands in for a division by 255.
      *
      * sR, sG, sB  source channel values (read once each)
      * sA          source alpha (expanded four times, without parentheses)
      * dR, dG, dB  destination channels; read, then overwritten in place,
      *             so they must be assignable variables
      *
      * The body declares temporaries named tR, tG, tB and tA, so arguments
      * using those names would be shadowed.
      */
     2.8 +#define ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB)	\
     2.9 +do {						\
    2.10 +    unsigned tR, tG, tB, tA; \
    2.11 +    tA = 255 - sA; \
    2.12 +    tR = 1 + (sR * sA) + (dR * tA); \
    2.13 +    dR = (tR + (tR >> 8)) >> 8; \
    2.14 +    tG = 1 + (sG * sA) + (dG * tA); \
    2.15 +    dG = (tG + (tG >> 8)) >> 8; \
    2.16 +    tB = 1 + (sB * sA) + (dB * tA); \
    2.17 +    dB = (tB + (tB >> 8)) >> 8; \
    2.18 +} while(0)
    2.19 +
    2.20 +
    2.21  /* This is a very useful loop for optimizing blitters */
    2.22  #if defined(_MSC_VER) && (_MSC_VER == 1300)
    2.23  /* There's a bug in the Visual C++ 7 optimizer when compiling this code */
     3.1 --- a/src/video/SDL_blit_A.c	Sun Apr 17 10:16:30 2005 +0000
     3.2 +++ b/src/video/SDL_blit_A.c	Sun Apr 17 10:19:22 2005 +0000
     3.3 @@ -35,9 +35,9 @@
     3.4  #define MMX_ASMBLIT
     3.5  #endif
     3.6  
     3.7 -#ifdef MMX_ASMBLIT
     3.8  /* Function to check the CPU flags */
     3.9  #include "SDL_cpuinfo.h"
    3.10 +#ifdef MMX_ASMBLIT
    3.11  #include "mmx.h"
    3.12  #endif
    3.13  
    3.14 @@ -421,6 +421,762 @@
    3.15  }
    3.16  #endif
    3.17  
    3.18 +#ifdef USE_ALTIVEC_BLITTERS
    3.19 +#include <assert.h>
    /* UNALIGNED_PTR(x): the low four bits of address x, i.e. non-zero when x
     * is not on a 16-byte boundary.  Note that x is cast without being
     * parenthesized. */
    3.20 +#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
          /* VECPRINT(msg, v): debugging aid.  Copies v into a temporary
           * vector, views it as four unsigned ints and prints them in hex
           * after msg via printf. */
    3.21 +#define VECPRINT(msg, v) do { \
    3.22 +    vector unsigned int tmpvec = (vector unsigned int)(v); \
    3.23 +    unsigned int *vp = (unsigned int *)&tmpvec; \
    3.24 +    printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
    3.25 +} while (0)
    3.26 +
    3.27 +/* the permutation vector that takes the high bytes out of all the appropriate shorts 
    3.28 +    (vector unsigned char)(
    3.29 +        0x00, 0x10, 0x02, 0x12,
    3.30 +        0x04, 0x14, 0x06, 0x16,
    3.31 +        0x08, 0x18, 0x0A, 0x1A,
    3.32 +        0x0C, 0x1C, 0x0E, 0x1E );
    3.33 +*/
          /* VEC_MERGE_PERMUTE(): builds the vector described above at run
           * time: vec_lvsl on a NULL address plus the bytes of a 16-bit
           * splat of 0x0F (bytes 0x00, 0x0F repeating). */
    3.34 +#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
          /* VEC_U32_24(): 12 + 12 in every 32-bit element, i.e. a vector of
           * 24s (presumably because a splat immediate cannot encode 24
           * directly -- confirm against the AltiVec PIM). */
    3.35 +#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
          /* VEC_ALPHA_MASK(): all-ones shifted left by 24 in each 32-bit
           * element, leaving only the top byte of every word set. */
    3.36 +#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
          /* VEC_ALIGNER(src): permute control used by the blitters as
           * vec_perm(previous, next, aligner) to realign source data.  For a
           * pointer that is not 16-byte aligned it is vec_lvsl(0, src); for
           * an aligned pointer it is vec_lvsl(8, src) + 8, which appears to
           * select the second vec_perm operand unchanged -- confirm.
           * src is evaluated more than once. */
    3.37 +#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    3.38 +    ? vec_lvsl(0, src) \
    3.39 +    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
    3.40 +
    3.41 +   
    /* VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16)
     *
     * Blends the sixteen bytes of vs over vd using the per-byte weights in
     * valpha and writes the result back into vd.  For every byte it forms
     * t = s*a + d*(~a) + 1 as a 16-bit value, adds t >> 8, and keeps the high
     * byte -- the same arithmetic as the scalar ACCURATE_ALPHA_BLEND macro.
     *
     * vs, vd        vector unsigned char operands; vd is overwritten
     * valpha        vector unsigned char of weights (used twice)
     * mergePermute  permute control that gathers the high bytes of the even
     *               and odd products (callers pass VEC_MERGE_PERMUTE())
     * v1_16, v8_16  vector unsigned short operands added to and used as the
     *               shift count for the products (callers pass splats of 1
     *               and 8)
     */
    3.42 +#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
    3.43 +    /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
    3.44 +    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
    3.45 +    /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
    3.46 +    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
    3.47 +    /* valpha2 is 255-alpha */ \
    3.48 +    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
    3.49 +    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
    3.50 +    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
    3.51 +    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
    3.52 +    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
    3.53 +    /* add source and dest */ \
    3.54 +    vtemp1 = vec_add(vtemp1, vtemp3); \
    3.55 +    vtemp2 = vec_add(vtemp2, vtemp4); \
    3.56 +    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
    3.57 +    vtemp1 = vec_add(vtemp1, v1_16); \
    3.58 +    vtemp3 = vec_sr(vtemp1, v8_16); \
    3.59 +    vtemp1 = vec_add(vtemp1, vtemp3); \
    3.60 +    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
    3.61 +    vtemp2 = vec_add(vtemp2, v1_16); \
    3.62 +    vtemp4 = vec_sr(vtemp2, v8_16); \
    3.63 +    vtemp2 = vec_add(vtemp2, vtemp4); \
    3.64 +    /* (>>8) and get ARGBARGBARGBARGB */ \
    3.65 +    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
    3.66 +} while (0)
    3.67 + 
    3.68 +/* Calculate the permute vector used for 32->32 swizzling.
     *
     * srcfmt, dstfmt: pixel formats to convert between.  A NULL pointer is
     * replaced by the built-in ARGB format defined below (R/G/B/A shifts
     * 16/8/0/24).
     *
     * Returns a vec_perm control vector.  For every colour channel the byte
     * index of that channel inside a source pixel (3 - shift/8) is placed at
     * the channel's position in a destination-layout 32-bit word; that word
     * is replicated to all four elements and `plus` adds 0/4/8/12 so each
     * element addresses its own pixel.  Alpha bytes are set to 0x10 (an index
     * into the second vec_perm operand) when the source has no alpha mask or
     * the destination has none.
     *
     * NOTE(review): the word is written through an unsigned int pointer and
     * then read as bytes, so the result presumably relies on big-endian byte
     * order -- confirm.
     */
    3.69 +static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
    3.70 +                                  const SDL_PixelFormat *dstfmt)
    3.71 +{
    3.72 +    /*
    3.73 +     * We have to assume that the bits that aren't used by other
    3.74 +     *  colors is alpha, and it's one complete byte, since some formats
    3.75 +     *  leave alpha with a zero mask, but we should still swizzle the bits.
    3.76 +     */
    3.77 +    /* ARGB */
    3.78 +    const static struct SDL_PixelFormat default_pixel_format = {
    3.79 +        NULL, 0, 0,
    3.80 +        0, 0, 0, 0,
    3.81 +        16, 8, 0, 24,
    3.82 +        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
    3.83 +        0, 0};
    3.84 +    if (!srcfmt) {
    3.85 +        srcfmt = &default_pixel_format;
    3.86 +    }
    3.87 +    if (!dstfmt) {
    3.88 +        dstfmt = &default_pixel_format;
    3.89 +    }
          /* Per-pixel base offsets: element k of the result indexes into
           * bytes 4k..4k+3 of the vec_perm input. */
    3.90 +    vector unsigned char plus = (vector unsigned char)
    3.91 +                                            ( 0x00, 0x00, 0x00, 0x00,
    3.92 +                                              0x04, 0x04, 0x04, 0x04,
    3.93 +                                              0x08, 0x08, 0x08, 0x08,
    3.94 +                                              0x0C, 0x0C, 0x0C, 0x0C );
    3.95 +    vector unsigned char vswiz;
    3.96 +    vector unsigned int srcvec;
          /* RESHIFT maps a bit shift (0/8/16/24) to the byte index 3..0. */
    3.97 +#define RESHIFT(X) (3 - ((X) >> 3))
    3.98 +    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
    3.99 +    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   3.100 +    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   3.101 +    Uint32 amask;
   3.102 +    /* Use zero for alpha if either surface doesn't have alpha */
   3.103 +    if (dstfmt->Amask) {
   3.104 +        amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
   3.105 +    } else {
   3.106 +        amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
   3.107 +    }
   3.108 +#undef RESHIFT  
          /* Only element 0 of srcvec is written; vec_splat then copies it to
           * all four elements. */
   3.109 +    ((unsigned int *)&srcvec)[0] = (rmask | gmask | bmask | amask);
   3.110 +    vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
   3.111 +    return(vswiz);
   3.112 +}
   3.113 +
   /* Blit32to565PixelAlphaAltivec: blend a 32-bit source with per-pixel
    * alpha onto a 16-bit 5-6-5 destination.
    *
    * info supplies the row count (d_height), pixels per row (d_width), the
    * source/destination pixel pointers, the per-row skips (s_skip, d_skip,
    * applied as byte offsets here) and the source pixel format (src).
    *
    * Each row is handled in three phases:
    *   1. scalar ONE_PIXEL_BLEND until dst reaches a 16-byte boundary (or the
    *      row is exhausted);
    *   2. a vector loop that processes 8 pixels per iteration: two source
    *      vectors are realigned and swizzled to ARGB, the 8 destination
    *      pixels are expanded to two ARGB-layout vectors, blended with
    *      VEC_MULTIPLY_ALPHA, repacked to 5-6-5 and stored;
    *   3. scalar ONE_PIXEL_BLEND for the remaining width % 8 pixels.
    *
    * The scalar path leaves the destination untouched when source alpha is 0.
    */
   3.114 +static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
   3.115 +{
   3.116 +    int height = info->d_height;
   3.117 +    Uint8 *src = (Uint8 *)info->s_pixels;
   3.118 +    int srcskip = info->s_skip;
   3.119 +    Uint8 *dst = (Uint8 *)info->d_pixels;
   3.120 +    int dstskip = info->d_skip;
   3.121 +    SDL_PixelFormat *srcfmt = info->src;
   3.122 +
   3.123 +    vector unsigned char v0 = vec_splat_u8(0);
   3.124 +    vector unsigned short v8_16 = vec_splat_u16(8);
   3.125 +    vector unsigned short v1_16 = vec_splat_u16(1);
   3.126 +    vector unsigned short v2_16 = vec_splat_u16(2);
   3.127 +    vector unsigned short v3_16 = vec_splat_u16(3);
   3.128 +    vector unsigned int v8_32 = vec_splat_u32(8);
   3.129 +    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
   3.130 +    vector unsigned short v3f = (vector unsigned short)(
   3.131 +        0x003f, 0x003f, 0x003f, 0x003f,
   3.132 +        0x003f, 0x003f, 0x003f, 0x003f);
   3.133 +    vector unsigned short vfc = (vector unsigned short)(
   3.134 +        0x00fc, 0x00fc, 0x00fc, 0x00fc,
   3.135 +        0x00fc, 0x00fc, 0x00fc, 0x00fc);
   3.136 +
   3.137 +    /* 
   3.138 +        0x10 - 0x1f is the alpha
   3.139 +        0x00 - 0x0e evens are the red
   3.140 +        0x01 - 0x0f odds are zero
   3.141 +    */
   3.142 +    vector unsigned char vredalpha1 = (vector unsigned char)(
   3.143 +        0x10, 0x00, 0x01, 0x01,
   3.144 +        0x10, 0x02, 0x01, 0x01,
   3.145 +        0x10, 0x04, 0x01, 0x01,
   3.146 +        0x10, 0x06, 0x01, 0x01
   3.147 +    );
          /* The *2 permutes are the *1 permutes with 8 added to the byte that
           * selects the colour, so they address destination pixels 4..7. */
   3.148 +    vector unsigned char vredalpha2 = (vector unsigned char)(
   3.149 +        vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
   3.150 +    );
   3.151 +    /*
   3.152 +        0x00 - 0x0f is ARxx ARxx ARxx ARxx
   3.153 +        0x11 - 0x0f odds are blue
   3.154 +    */
   3.155 +    vector unsigned char vblue1 = (vector unsigned char)(
   3.156 +        0x00, 0x01, 0x02, 0x11,
   3.157 +        0x04, 0x05, 0x06, 0x13,
   3.158 +        0x08, 0x09, 0x0a, 0x15,
   3.159 +        0x0c, 0x0d, 0x0e, 0x17
   3.160 +    );
   3.161 +    vector unsigned char vblue2 = (vector unsigned char)(
   3.162 +        vec_add((vector unsigned int)vblue1, v8_32)
   3.163 +    );
   3.164 +    /*
   3.165 +        0x00 - 0x0f is ARxB ARxB ARxB ARxB
   3.166 +        0x10 - 0x0e evens are green
   3.167 +    */
   3.168 +    vector unsigned char vgreen1 = (vector unsigned char)(
   3.169 +        0x00, 0x01, 0x10, 0x03,
   3.170 +        0x04, 0x05, 0x12, 0x07,
   3.171 +        0x08, 0x09, 0x14, 0x0b,
   3.172 +        0x0c, 0x0d, 0x16, 0x0f
   3.173 +    );
   3.174 +    vector unsigned char vgreen2 = (vector unsigned char)(
   3.175 +        vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
   3.176 +    );
   3.177 +    vector unsigned char vgmerge = (vector unsigned char)(
   3.178 +        0x00, 0x02, 0x00, 0x06,
   3.179 +        0x00, 0x0a, 0x00, 0x0e,
   3.180 +        0x00, 0x12, 0x00, 0x16,
   3.181 +        0x00, 0x1a, 0x00, 0x1e);
   3.182 +    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
   3.183 +    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
   3.184 +    vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
   3.185 +
          /* NOTE(review): a byte splat of -7 is 0xF9, so after the 8-bit left
           * shift each halfword appears to be 0xF900 rather than the 0xF800
           * the name suggests -- confirm the extra bit 8 is intended. */
   3.186 +    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
   3.187 +    vf800 = vec_sl(vf800, vec_splat_u16(8));
   3.188 +
   3.189 +    while(height--) {
   3.190 +        int extrawidth;
   3.191 +        vector unsigned char valigner;
   3.192 +        vector unsigned char vsrc;
   3.193 +        vector unsigned char voverflow;
   3.194 +        int width = info->d_width;
   3.195 +
   3.196 +#define ONE_PIXEL_BLEND(condition, widthvar) \
   3.197 +        while (condition) { \
   3.198 +            Uint32 pixel; \
   3.199 +            unsigned sR, sG, sB, dR, dG, dB, sA; \
   3.200 +            DISEMBLE_RGBA(src, 4, srcfmt, pixel, sR, sG, sB, sA); \
   3.201 +            if(sA) { \
   3.202 +                unsigned short dstpixel = *((unsigned short *)dst); \
   3.203 +                dR = (dstpixel >> 8) & 0xf8; \
   3.204 +                dG = (dstpixel >> 3) & 0xfc; \
   3.205 +                dB = (dstpixel << 3) & 0xf8; \
   3.206 +                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   3.207 +                *((unsigned short *)dst) = ( \
   3.208 +                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
   3.209 +                ); \
   3.210 +            } \
   3.211 +            src += 4; \
   3.212 +            dst += 2; \
   3.213 +            widthvar--; \
   3.214 +        }
          /* Phase 1: scalar pixels until dst is 16-byte aligned. */
   3.215 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
   3.216 +        extrawidth = (width % 8);
   3.217 +        valigner = VEC_ALIGNER(src);
          /* NOTE(review): this load executes even when width is already 0;
           * the 32->32 blitters in this file wrap the equivalent code in
           * `if (width > 0)`. */
   3.218 +        vsrc = (vector unsigned char)vec_ld(0, src);
   3.219 +        width -= extrawidth;
          /* Phase 2: 8 pixels (32 source bytes, 16 destination bytes) per
           * iteration. */
   3.220 +        while (width) {
   3.221 +            vector unsigned char valpha;
   3.222 +            vector unsigned char vsrc1, vsrc2;
   3.223 +            vector unsigned char vdst1, vdst2;
   3.224 +            vector unsigned short vR, vG, vB;
   3.225 +            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   3.226 +
   3.227 +            /* Load 8 pixels from src as ARGB */
   3.228 +            voverflow = (vector unsigned char)vec_ld(15, src);
   3.229 +            vsrc = vec_perm(vsrc, voverflow, valigner);
   3.230 +            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
   3.231 +            src += 16;
   3.232 +            vsrc = (vector unsigned char)vec_ld(15, src);
   3.233 +            voverflow = vec_perm(voverflow, vsrc, valigner);
   3.234 +            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
   3.235 +            src += 16;
   3.236 +
   3.237 +            /* Load 8 pixels from dst as XRGB */
   3.238 +            voverflow = vec_ld(0, dst);
   3.239 +            vR = vec_and((vector unsigned short)voverflow, vf800);
   3.240 +            vB = vec_sl((vector unsigned short)voverflow, v3_16);
   3.241 +            vG = vec_sl(vB, v2_16);
   3.242 +            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
   3.243 +            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
   3.244 +            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
   3.245 +            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
   3.246 +            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
   3.247 +            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
   3.248 +
   3.249 +            /* Alpha blend 8 pixels as ARGB */
   3.250 +            valpha = vec_perm(vsrc1, v0, valphaPermute);
   3.251 +            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
   3.252 +            valpha = vec_perm(vsrc2, v0, valphaPermute);
   3.253 +            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
   3.254 +
   3.255 +            /* Convert 8 pixels to 565 */
   3.256 +            vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
   3.257 +            vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
   3.258 +            vgpixel = vec_and(vgpixel, vfc);
   3.259 +            vgpixel = vec_sl(vgpixel, v3_16);
   3.260 +            vrpixel = vec_sl(vpixel, v1_16);
   3.261 +            vrpixel = vec_and(vrpixel, vf800);
   3.262 +            vbpixel = vec_and(vpixel, v3f);
   3.263 +            vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
   3.264 +            vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
   3.265 +            
   3.266 +            /* Store 8 pixels */
   3.267 +            vec_st(vdst1, 0, dst);
   3.268 +
   3.269 +            width -= 8;
   3.270 +            dst += 16;
   3.271 +        }
          /* Phase 3: leftover pixels that did not fill a vector. */
   3.272 +        ONE_PIXEL_BLEND((extrawidth), extrawidth);
   3.273 +#undef ONE_PIXEL_BLEND
   3.274 +        src += srcskip;
   3.275 +        dst += dstskip;
   3.276 +    }
   3.277 +}
   3.278 +
   /* Blit32to32SurfaceAlphaKeyAltivec: 32-bit to 32-bit blit using the
    * source surface's single alpha value and its colour key.
    *
    * info supplies the row count (d_height), pixels per row (d_width), the
    * pixel pointers, the per-row skips (s_skip / d_skip, converted here from
    * bytes to 32-bit pixels) and both pixel formats (src, dst).  The alpha
    * value and colour key are read from info->src.
    *
    * Each row is handled in three phases:
    *   1. scalar ONE_PIXEL_BLEND until dstp reaches a 16-byte boundary;
    *   2. a vector loop, 4 pixels per iteration: the realigned source words
    *      are masked with the RGB mask and compared with the key (vsel),
    *      source and destination are swizzled to ARGB, blended with
    *      VEC_MULTIPLY_ALPHA, given a full alpha byte, and vec_sel then
    *      restores the original destination wherever the key matched before
    *      the result is swizzled back to the destination format and stored;
    *   3. scalar ONE_PIXEL_BLEND for the remaining width % 4 pixels.
    *
    * The scalar path skips a pixel when the surface alpha is 0 or the pixel
    * equals the (RGB-masked) colour key.
    */
   3.279 +static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
   3.280 +{
   3.281 +    unsigned alpha = info->src->alpha;
   3.282 +    int height = info->d_height;
   3.283 +    Uint32 *srcp = (Uint32 *)info->s_pixels;
   3.284 +    int srcskip = info->s_skip >> 2;
   3.285 +    Uint32 *dstp = (Uint32 *)info->d_pixels;
   3.286 +    int dstskip = info->d_skip >> 2;
   3.287 +    SDL_PixelFormat *srcfmt = info->src;
   3.288 +    SDL_PixelFormat *dstfmt = info->dst;
   3.289 +    unsigned sA = srcfmt->alpha;
   3.290 +    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
   3.291 +    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   3.292 +    Uint32 ckey = info->src->colorkey;
   3.293 +    vector unsigned char mergePermute;
   3.294 +    vector unsigned char vsrcPermute;
   3.295 +    vector unsigned char vdstPermute;
   3.296 +    vector unsigned char vsdstPermute;
   3.297 +    vector unsigned char valpha;
   3.298 +    vector unsigned char valphamask;
   3.299 +    vector unsigned char vbits;
   3.300 +    vector unsigned char v0;
   3.301 +    vector unsigned short v1;
   3.302 +    vector unsigned short v8;
   3.303 +    vector unsigned int vckey;
   3.304 +    vector unsigned int vrgbmask;
   3.305 +
   3.306 +    mergePermute = VEC_MERGE_PERMUTE();
   3.307 +    v0 = vec_splat_u8(0);
   3.308 +    v1 = vec_splat_u16(1);
   3.309 +    v8 = vec_splat_u16(8);
   3.310 +
   3.311 +    /* set the alpha to 255 on the destination surf */
   3.312 +    valphamask = VEC_ALPHA_MASK();
   3.313 +
          /* source -> ARGB, ARGB -> destination, destination -> ARGB */
   3.314 +    vsrcPermute = calc_swizzle32(srcfmt, NULL);
   3.315 +    vdstPermute = calc_swizzle32(NULL, dstfmt);
   3.316 +    vsdstPermute = calc_swizzle32(dstfmt, NULL);
   3.317 +
   3.318 +    /* set a vector full of alpha and 255-alpha */
   3.319 +    ((unsigned char *)&valpha)[0] = alpha;
   3.320 +    valpha = vec_splat(valpha, 0);
   3.321 +    vbits = (vector unsigned char)vec_splat_s8(-1);
   3.322 +
          /* Replicate the masked key and the RGB mask into all four words. */
   3.323 +    ckey &= rgbmask;
   3.324 +    ((unsigned int *)&vckey)[0] = ckey;
   3.325 +    vckey = vec_splat(vckey, 0);
   3.326 +    ((unsigned int *)&vrgbmask)[0] = rgbmask;
   3.327 +    vrgbmask = vec_splat(vrgbmask, 0);
   3.328 +
   3.329 +    while(height--) {
   3.330 +        int width = info->d_width;
   3.331 +#define ONE_PIXEL_BLEND(condition, widthvar) \
   3.332 +        while (condition) { \
   3.333 +            Uint32 pixel; \
   3.334 +            unsigned sR, sG, sB, dR, dG, dB; \
   3.335 +            RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, pixel); \
   3.336 +            if(sA && pixel != ckey) { \
   3.337 +                RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); \
   3.338 +                DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, pixel, dR, dG, dB); \
   3.339 +                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   3.340 +                ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
   3.341 +            } \
   3.342 +            ++dstp; \
   3.343 +            ++srcp; \
   3.344 +            widthvar--; \
   3.345 +        }
          /* Phase 1: scalar pixels until dstp is 16-byte aligned. */
   3.346 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   3.347 +        if (width > 0) {
   3.348 +            int extrawidth = (width % 4);
   3.349 +            vector unsigned char valigner = VEC_ALIGNER(srcp);
   3.350 +            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   3.351 +            width -= extrawidth;
          /* Phase 2: four pixels per iteration. */
   3.352 +            while (width) {
   3.353 +                vector unsigned char vsel;
   3.354 +                vector unsigned char voverflow;
   3.355 +                vector unsigned char vd;
   3.356 +                vector unsigned char vd_orig;
   3.357 +
   3.358 +                /* s = *srcp */
   3.359 +                voverflow = (vector unsigned char)vec_ld(15, srcp);
   3.360 +                vs = vec_perm(vs, voverflow, valigner);
   3.361 +                
   3.362 +                /* vsel is set for items that match the key */
   3.363 +                vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
   3.364 +                vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
   3.365 +
   3.366 +                /* permute to source format */
   3.367 +                vs = vec_perm(vs, valpha, vsrcPermute);
   3.368 +
   3.369 +                /* d = *dstp */
   3.370 +                vd = (vector unsigned char)vec_ld(0, dstp);
   3.371 +                vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
   3.372 +
   3.373 +                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   3.374 +
   3.375 +                /* set the alpha channel to full on */
   3.376 +                vd = vec_or(vd, valphamask);
   3.377 +
   3.378 +                /* mask out color key */
   3.379 +                vd = vec_sel(vd, vd_orig, vsel);
   3.380 +                
   3.381 +                /* permute to dest format */
   3.382 +                vd = vec_perm(vd, vbits, vdstPermute);
   3.383 +
   3.384 +                /* *dstp = res */
   3.385 +                vec_st((vector unsigned int)vd, 0, dstp);
   3.386 +                
   3.387 +                srcp += 4;
   3.388 +                dstp += 4;
   3.389 +                width -= 4;
   3.390 +                vs = voverflow;
   3.391 +            }
          /* Phase 3: leftover pixels that did not fill a vector. */
   3.392 +            ONE_PIXEL_BLEND((extrawidth), extrawidth);
   3.393 +        }
   3.394 +#undef ONE_PIXEL_BLEND
   3.395 + 
   3.396 +        srcp += srcskip;
   3.397 +        dstp += dstskip;
   3.398 +    }
   3.399 +}
   3.400 +
   3.401 +
   /* Blit32to32PixelAlphaAltivec: 32-bit to 32-bit blit with per-pixel
    * source alpha between arbitrary 32-bit pixel layouts.
    *
    * info supplies the row count (d_height), pixels per row (d_width), the
    * pixel pointers, the per-row skips (s_skip / d_skip, converted here from
    * bytes to 32-bit pixels) and both pixel formats (src, dst).
    *
    * Each row is handled in three phases:
    *   1. scalar ONE_PIXEL_BLEND until dstp reaches a 16-byte boundary;
    *   2. a vector loop, 4 pixels per iteration: source and destination are
    *      swizzled to ARGB, the source alpha byte of each pixel is broadcast
    *      across that pixel (valphaPermute), the pixels are blended with
    *      VEC_MULTIPLY_ALPHA, the destination's original alpha byte is put
    *      back, and the result is swizzled to the destination format;
    *   3. scalar ONE_PIXEL_BLEND for the remaining width % 4 pixels.
    *
    * The scalar path leaves the destination untouched when source alpha is
    * 0 and otherwise writes back the destination's own alpha.
    */
   3.402 +static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
   3.403 +{
   3.404 +    int width = info->d_width;
   3.405 +    int height = info->d_height;
   3.406 +    Uint32 *srcp = (Uint32 *)info->s_pixels;
   3.407 +    int srcskip = info->s_skip >> 2;
   3.408 +    Uint32 *dstp = (Uint32 *)info->d_pixels;
   3.409 +    int dstskip = info->d_skip >> 2;
   3.410 +    SDL_PixelFormat *srcfmt = info->src;
   3.411 +    SDL_PixelFormat *dstfmt = info->dst;
   3.412 +    vector unsigned char mergePermute;
   3.413 +    vector unsigned char valphaPermute;
   3.414 +    vector unsigned char vsrcPermute;
   3.415 +    vector unsigned char vdstPermute;
   3.416 +    vector unsigned char vsdstPermute;
   3.417 +    vector unsigned char valphamask;
   3.418 +    vector unsigned char vpixelmask;
   3.419 +    vector unsigned char v0;
   3.420 +    vector unsigned short v1;
   3.421 +    vector unsigned short v8;
   3.422 +
   3.423 +    v0 = vec_splat_u8(0);
   3.424 +    v1 = vec_splat_u16(1);
   3.425 +    v8 = vec_splat_u16(8);
   3.426 +    mergePermute = VEC_MERGE_PERMUTE();
   3.427 +    valphamask = VEC_ALPHA_MASK();
   3.428 +    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
          /* vpixelmask is the complement of the alpha mask (colour bytes). */
   3.429 +    vpixelmask = vec_nor(valphamask, v0);
          /* source -> ARGB, ARGB -> destination, destination -> ARGB */
   3.430 +    vsrcPermute = calc_swizzle32(srcfmt, NULL);
   3.431 +    vdstPermute = calc_swizzle32(NULL, dstfmt);
   3.432 +    vsdstPermute = calc_swizzle32(dstfmt, NULL);
   3.433 +
   3.434 +	while ( height-- ) {
   3.435 +        width = info->d_width;
   3.436 +#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   3.437 +            Uint32 pixel; \
   3.438 +            unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
   3.439 +            DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, pixel, sR, sG, sB, sA); \
   3.440 +            if(sA) { \
   3.441 +              DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, pixel, dR, dG, dB, dA); \
   3.442 +              ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   3.443 +              ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
   3.444 +            } \
   3.445 +            ++srcp; \
   3.446 +            ++dstp; \
   3.447 +            widthvar--; \
   3.448 +        }
          /* Phase 1: scalar pixels until dstp is 16-byte aligned. */
   3.449 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   3.450 +        if (width > 0) {
   3.451 +            // vsrcPermute
   3.452 +            // vdstPermute
   3.453 +            int extrawidth = (width % 4);
   3.454 +            vector unsigned char valigner = VEC_ALIGNER(srcp);
   3.455 +            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   3.456 +            width -= extrawidth;
          /* Phase 2: four pixels per iteration. */
   3.457 +            while (width) {
   3.458 +                vector unsigned char voverflow;
   3.459 +                vector unsigned char vd;
   3.460 +                vector unsigned char valpha;
   3.461 +                vector unsigned char vdstalpha;
   3.462 +                /* s = *srcp */
   3.463 +                voverflow = (vector unsigned char)vec_ld(15, srcp);
   3.464 +                vs = vec_perm(vs, voverflow, valigner);
   3.465 +                vs = vec_perm(vs, v0, vsrcPermute);
   3.466 +
   3.467 +                valpha = vec_perm(vs, v0, valphaPermute);
   3.468 +                
   3.469 +                /* d = *dstp */
   3.470 +                vd = (vector unsigned char)vec_ld(0, dstp);
   3.471 +                vd = vec_perm(vd, v0, vsdstPermute);
   3.472 +                vdstalpha = vec_and(vd, valphamask);
   3.473 +
   3.474 +                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   3.475 +
   3.476 +                /* set the alpha to the dest alpha */
   3.477 +                vd = vec_and(vd, vpixelmask);
   3.478 +                vd = vec_or(vd, vdstalpha);
   3.479 +                vd = vec_perm(vd, v0, vdstPermute);
   3.480 +
   3.481 +                /* *dstp = res */
   3.482 +                vec_st((vector unsigned int)vd, 0, dstp);
   3.483 +                
   3.484 +                srcp += 4;
   3.485 +                dstp += 4;
   3.486 +                width -= 4;
   3.487 +                vs = voverflow;
   3.488 +
   3.489 +            }
          /* Phase 3: leftover pixels that did not fill a vector. */
   3.490 +            ONE_PIXEL_BLEND((extrawidth), extrawidth);
   3.491 +        }
   3.492 +	    srcp += srcskip;
   3.493 +	    dstp += dstskip;
   3.494 +#undef ONE_PIXEL_BLEND
   3.495 +	}
   3.496 +}
   3.497 +
   3.498 +/* fast ARGB888->(A)RGB888 blending with pixel alpha
    *
    * No swizzling is done: alpha is taken from the top byte of each source
    * word (s >> 24) and the colour channels from the low three bytes, for
    * source and destination alike.
    *
    * info supplies the row count (d_height), pixels per row (d_width), the
    * pixel pointers and the per-row skips (s_skip / d_skip, converted here
    * from bytes to 32-bit pixels).
    *
    * Each row is handled in three phases:
    *   1. scalar ONE_PIXEL_BLEND until dstp reaches a 16-byte boundary;
    *   2. a vector loop, 4 pixels per iteration, blending with
    *      VEC_MULTIPLY_ALPHA and restoring the destination's alpha byte;
    *   3. scalar ONE_PIXEL_BLEND for the remaining width % 4 pixels.
    *
    * The scalar path skips pixels with alpha 0, copies RGB directly when
    * alpha is SDL_ALPHA_OPAQUE, and otherwise blends two channels at a time
    * with d + ((s - d) * alpha >> 8); the destination alpha byte is kept in
    * every case.
    *
    * NOTE(review): that scalar formula is not the one VEC_MULTIPLY_ALPHA
    * uses, so edge pixels and vector-path pixels may round differently --
    * confirm whether this is acceptable.
    */
   3.499 +static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
   3.500 +{
   3.501 +	int width = info->d_width;
   3.502 +	int height = info->d_height;
   3.503 +	Uint32 *srcp = (Uint32 *)info->s_pixels;
   3.504 +	int srcskip = info->s_skip >> 2;
   3.505 +	Uint32 *dstp = (Uint32 *)info->d_pixels;
   3.506 +	int dstskip = info->d_skip >> 2;
   3.507 +    vector unsigned char mergePermute;
   3.508 +    vector unsigned char valphaPermute;
   3.509 +    vector unsigned char valphamask;
   3.510 +    vector unsigned char vpixelmask;
   3.511 +    vector unsigned char v0;
   3.512 +    vector unsigned short v1;
   3.513 +    vector unsigned short v8;
   3.514 +    v0 = vec_splat_u8(0);
   3.515 +    v1 = vec_splat_u16(1);
   3.516 +    v8 = vec_splat_u16(8);
   3.517 +    mergePermute = VEC_MERGE_PERMUTE();
   3.518 +    valphamask = VEC_ALPHA_MASK();
   3.519 +    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
   3.520 +    
   3.521 + 
          /* vpixelmask is the complement of the alpha mask (colour bytes). */
   3.522 +    vpixelmask = vec_nor(valphamask, v0);
   3.523 +	while(height--) {
   3.524 +        width = info->d_width;
   3.525 +#define ONE_PIXEL_BLEND(condition, widthvar) \
   3.526 +        while ((condition)) { \
   3.527 +            Uint32 dalpha; \
   3.528 +            Uint32 d; \
   3.529 +            Uint32 s1; \
   3.530 +            Uint32 d1; \
   3.531 +            Uint32 s = *srcp; \
   3.532 +            Uint32 alpha = s >> 24; \
   3.533 +            if(alpha) { \
   3.534 +              if(alpha == SDL_ALPHA_OPAQUE) { \
   3.535 +                *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
   3.536 +              } else { \
   3.537 +                d = *dstp; \
   3.538 +                dalpha = d & 0xff000000; \
   3.539 +                s1 = s & 0xff00ff; \
   3.540 +                d1 = d & 0xff00ff; \
   3.541 +                d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
   3.542 +                s &= 0xff00; \
   3.543 +                d &= 0xff00; \
   3.544 +                d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
   3.545 +                *dstp = d1 | d | dalpha; \
   3.546 +              } \
   3.547 +            } \
   3.548 +            ++srcp; \
   3.549 +            ++dstp; \
   3.550 +            widthvar--; \
   3.551 +	    }
          /* Phase 1: scalar pixels until dstp is 16-byte aligned. */
   3.552 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   3.553 +        if (width > 0) {
   3.554 +            int extrawidth = (width % 4);
   3.555 +            vector unsigned char valigner = VEC_ALIGNER(srcp);
   3.556 +            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   3.557 +            width -= extrawidth;
          /* Phase 2: four pixels per iteration. */
   3.558 +            while (width) {
   3.559 +                vector unsigned char voverflow;
   3.560 +                vector unsigned char vd;
   3.561 +                vector unsigned char valpha;
   3.562 +                vector unsigned char vdstalpha;
   3.563 +                /* s = *srcp */
   3.564 +                voverflow = (vector unsigned char)vec_ld(15, srcp);
   3.565 +                vs = vec_perm(vs, voverflow, valigner);
   3.566 +
   3.567 +                valpha = vec_perm(vs, v0, valphaPermute);
   3.568 +                
   3.569 +                /* d = *dstp */
   3.570 +                vd = (vector unsigned char)vec_ld(0, dstp);
   3.571 +                vdstalpha = vec_and(vd, valphamask);
   3.572 +
   3.573 +                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   3.574 +
   3.575 +                /* set the alpha to the dest alpha */
   3.576 +                vd = vec_and(vd, vpixelmask);
   3.577 +                vd = vec_or(vd, vdstalpha);
   3.578 +
   3.579 +                /* *dstp = res */
   3.580 +                vec_st((vector unsigned int)vd, 0, dstp);
   3.581 +                
   3.582 +                srcp += 4;
   3.583 +                dstp += 4;
   3.584 +                width -= 4;
   3.585 +                vs = voverflow;
   3.586 +            }
          /* Phase 3: leftover pixels that did not fill a vector. */
   3.587 +            ONE_PIXEL_BLEND((extrawidth), extrawidth);
   3.588 +        }
   3.589 +	    srcp += srcskip;
   3.590 +	    dstp += dstskip;
   3.591 +	}
   3.592 +#undef ONE_PIXEL_BLEND
   3.593 +}
   3.594 +/* Per-surface-alpha blend of a 32-bit source onto a 32-bit destination: scalar pixels until dstp is 16-byte aligned, then 4 pixels per AltiVec pass, then scalar leftovers. */
   3.595 +static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
   3.596 +{
   3.597 +    /* XXX : 6 */
   3.598 +    unsigned alpha = info->src->alpha;   /* blend factor, splatted into valpha below */
   3.599 +    int height = info->d_height;
   3.600 +    Uint32 *srcp = (Uint32 *)info->s_pixels;
   3.601 +    int srcskip = info->s_skip >> 2;     /* >> 2: the pointers step in 32-bit pixels */
   3.602 +    Uint32 *dstp = (Uint32 *)info->d_pixels;
   3.603 +    int dstskip = info->d_skip >> 2;
   3.604 +    SDL_PixelFormat *srcfmt = info->src;
   3.605 +    SDL_PixelFormat *dstfmt = info->dst;
   3.606 +    unsigned sA = srcfmt->alpha;         /* same value as alpha; used by the scalar blend */
   3.607 +    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;  /* alpha stored by the scalar path */
   3.608 +    vector unsigned char mergePermute;
   3.609 +    vector unsigned char vsrcPermute;    /* source layout -> calc_swizzle32's default (ARGB) */
   3.610 +    vector unsigned char vdstPermute;    /* default (ARGB) -> destination layout */
   3.611 +    vector unsigned char vsdstPermute;   /* destination layout -> default (ARGB) */
   3.612 +    vector unsigned char valpha;
   3.613 +    vector unsigned char valphamask;
   3.614 +    vector unsigned char vbits;
   3.615 +    vector unsigned short v1;
   3.616 +    vector unsigned short v8;
   3.617 +
   3.618 +    mergePermute = VEC_MERGE_PERMUTE();
   3.619 +    v1 = vec_splat_u16(1);
   3.620 +    v8 = vec_splat_u16(8);
   3.621 +
   3.622 +    /* set the alpha to 255 on the destination surf */
   3.623 +    valphamask = VEC_ALPHA_MASK();
   3.624 +
   3.625 +    vsrcPermute = calc_swizzle32(srcfmt, NULL);
   3.626 +    vdstPermute = calc_swizzle32(NULL, dstfmt);
   3.627 +    vsdstPermute = calc_swizzle32(dstfmt, NULL);
   3.628 +
   3.629 +    /* splat the surface alpha into every byte of valpha; vbits is all 0xFF bytes */
   3.630 +    ((unsigned char *)&valpha)[0] = alpha;
   3.631 +    valpha = vec_splat(valpha, 0);
   3.632 +    vbits = (vector unsigned char)vec_splat_s8(-1);
   3.633 +
   3.634 +    while(height--) {
   3.635 +        int width = info->d_width;
   3.636 +#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   3.637 +            Uint32 pixel; \
   3.638 +            unsigned sR, sG, sB, dR, dG, dB; \
   3.639 +            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, pixel, sR, sG, sB); \
   3.640 +            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, pixel, dR, dG, dB); \
   3.641 +            ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
   3.642 +            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
   3.643 +            ++srcp; \
   3.644 +            ++dstp; \
   3.645 +            widthvar--; \
   3.646 +        }
   3.647 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);  /* scalar until dstp is 16-byte aligned */
   3.648 +        if (width > 0) {
   3.649 +            int extrawidth = (width % 4);  /* pixels left over after the whole 4-pixel vectors */
   3.650 +            vector unsigned char valigner = VEC_ALIGNER(srcp);  /* not a bare vec_lvsl(0, srcp): for a 16-byte-aligned srcp that selects the stale previous vector on every pass after the first */
   3.651 +            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   3.652 +            width -= extrawidth;
   3.653 +            while (width) {
   3.654 +                vector unsigned char voverflow;
   3.655 +                vector unsigned char vd;
   3.656 +
   3.657 +                /* s = *srcp: join the two aligned loads that straddle srcp, then swizzle */
   3.658 +                voverflow = (vector unsigned char)vec_ld(15, srcp);
   3.659 +                vs = vec_perm(vs, voverflow, valigner);
   3.660 +                vs = vec_perm(vs, valpha, vsrcPermute);
   3.661 +
   3.662 +                /* d = *dstp (dstp is aligned here), swizzled the same way */
   3.663 +                vd = (vector unsigned char)vec_ld(0, dstp);
   3.664 +                vd = vec_perm(vd, vd, vsdstPermute);
   3.665 +
   3.666 +                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
   3.667 +
   3.668 +                /* set the alpha channel to full on, then swizzle back to the destination layout */
   3.669 +                vd = vec_or(vd, valphamask);
   3.670 +                vd = vec_perm(vd, vbits, vdstPermute);
   3.671 +
   3.672 +                /* *dstp = res */
   3.673 +                vec_st((vector unsigned int)vd, 0, dstp);
   3.674 +
   3.675 +                srcp += 4;
   3.676 +                dstp += 4;
   3.677 +                width -= 4;
   3.678 +                vs = voverflow;  /* the second load becomes the first load of the next pass */
   3.679 +            }
   3.680 +            ONE_PIXEL_BLEND((extrawidth), extrawidth);  /* leftover pixels of the row */
   3.681 +        }
   3.682 +#undef ONE_PIXEL_BLEND
   3.683 +
   3.684 +        srcp += srcskip;
   3.685 +        dstp += dstskip;
   3.686 +    }
   3.687 +
   3.688 +}
   3.689 +
   3.690 +
   3.691 +/* fast RGB888->(A)RGB888 blending with the per-surface alpha (info->src->alpha): scalar pixels until dstp is 16-byte aligned, 4 pixels per AltiVec pass, scalar leftovers */
   3.692 +static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
   3.693 +{
   3.694 +	unsigned alpha = info->src->alpha;
   3.695 +    int height = info->d_height;
   3.696 +    Uint32 *srcp = (Uint32 *)info->s_pixels;
   3.697 +    int srcskip = info->s_skip >> 2;  /* >> 2: the pointers step in 32-bit pixels */
   3.698 +    Uint32 *dstp = (Uint32 *)info->d_pixels;
   3.699 +    int dstskip = info->d_skip >> 2;
   3.700 +    vector unsigned char mergePermute;
   3.701 +    vector unsigned char valpha;
   3.702 +    vector unsigned char valphamask;
   3.703 +    vector unsigned short v1;
   3.704 +    vector unsigned short v8;
   3.705 +
   3.706 +    mergePermute = VEC_MERGE_PERMUTE();
   3.707 +    v1 = vec_splat_u16(1);
   3.708 +    v8 = vec_splat_u16(8);
   3.709 +
   3.710 +    /* set the alpha to 255 on the destination surf */
   3.711 +    valphamask = VEC_ALPHA_MASK();
   3.712 +
   3.713 +    /* splat the surface alpha into every byte of valpha */
   3.714 +    ((unsigned char *)&valpha)[0] = alpha;
   3.715 +    valpha = vec_splat(valpha, 0);
   3.716 +
   3.717 +    while(height--) {
   3.718 +        int width = info->d_width;
   3.719 +#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
   3.720 +            Uint32 s = *srcp; \
   3.721 +            Uint32 d = *dstp; \
   3.722 +            Uint32 s1 = s & 0xff00ff; /* bytes 0 and 2 are blended together in one multiply */ \
   3.723 +            Uint32 d1 = d & 0xff00ff; \
   3.724 +            d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
   3.725 +                 & 0xff00ff; \
   3.726 +            s &= 0xff00; /* byte 1 is blended on its own */ \
   3.727 +            d &= 0xff00; \
   3.728 +            d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
   3.729 +            *dstp = d1 | d | 0xff000000; /* top byte is forced to 0xff */ \
   3.730 +            ++srcp; \
   3.731 +            ++dstp; \
   3.732 +            widthvar--; \
   3.733 +        }
   3.734 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);  /* scalar until dstp is 16-byte aligned */
   3.735 +        if (width > 0) {
   3.736 +            int extrawidth = (width % 4);  /* pixels left over after the whole 4-pixel vectors */
   3.737 +            vector unsigned char valigner = VEC_ALIGNER(srcp);
   3.738 +            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
   3.739 +            width -= extrawidth;
   3.740 +            while (width) {
   3.741 +                vector unsigned char voverflow;
   3.742 +                vector unsigned char vd;
   3.743 +
   3.744 +                /* s = *srcp: join the two aligned loads that straddle srcp */
   3.745 +                voverflow = (vector unsigned char)vec_ld(15, srcp);
   3.746 +                vs = vec_perm(vs, voverflow, valigner);
   3.747 +                
   3.748 +                /* d = *dstp */
   3.749 +                vd = (vector unsigned char)vec_ld(0, dstp);
   3.750 +
   3.751 +                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);  /* NOTE(review): macro not visible here; presumably leaves the blended pixels in vd - confirm */
   3.752 +
   3.753 +                /* set the alpha channel to full on */
   3.754 +                vd = vec_or(vd, valphamask);
   3.755 +
   3.756 +                /* *dstp = res */
   3.757 +                vec_st((vector unsigned int)vd, 0, dstp);
   3.758 +                
   3.759 +                srcp += 4;
   3.760 +                dstp += 4;
   3.761 +                width -= 4;
   3.762 +                vs = voverflow;  /* the second load becomes the first load of the next pass */
   3.763 +            }
   3.764 +            ONE_PIXEL_BLEND((extrawidth), extrawidth);  /* leftover pixels of the row */
   3.765 +        }
   3.766 +#undef ONE_PIXEL_BLEND
   3.767 + 
   3.768 +        srcp += srcskip;
   3.769 +        dstp += dstskip;
   3.770 +    }
   3.771 +}
   3.772 +#endif /* USE_ALTIVEC_BLITTERS */
   3.773 +
   3.774  /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   3.775  static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
   3.776  {
   3.777 @@ -1372,7 +2128,12 @@
   3.778  	    if(df->BytesPerPixel == 1)
   3.779  		return BlitNto1SurfaceAlphaKey;
   3.780  	    else
   3.781 -		return BlitNtoNSurfaceAlphaKey;
   3.782 +#ifdef USE_ALTIVEC_BLITTERS
   3.783 +        if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 && SDL_HasAltiVec())
   3.784 +            return Blit32to32SurfaceAlphaKeyAltivec;
   3.785 +        else
   3.786 +#endif
   3.787 +            return BlitNtoNSurfaceAlphaKey;
   3.788  	} else {
   3.789  	    /* Per-surface alpha blits */
   3.790  	    switch(df->BytesPerPixel) {
   3.791 @@ -1414,9 +2175,19 @@
   3.792  		    return BlitRGBtoRGBSurfaceAlphaMMX;
   3.793  		else
   3.794  #endif
   3.795 +#ifdef USE_ALTIVEC_BLITTERS
   3.796 +        if(SDL_HasAltiVec())
   3.797 +            return BlitRGBtoRGBSurfaceAlphaAltivec;
   3.798 +        else
   3.799 +#endif
   3.800  		    return BlitRGBtoRGBSurfaceAlpha;
   3.801  		}
   3.802  		else
   3.803 +#ifdef USE_ALTIVEC_BLITTERS
   3.804 +        if((sf->BytesPerPixel == 4) && SDL_HasAltiVec())
   3.805 +            return Blit32to32SurfaceAlphaAltivec;
   3.806 +        else
   3.807 +#endif
   3.808  		    return BlitNtoNSurfaceAlpha;
   3.809  
   3.810  	    case 3:
   3.811 @@ -1431,6 +2202,13 @@
   3.812  	    return BlitNto1PixelAlpha;
   3.813  
   3.814  	case 2:
   3.815 +#ifdef USE_ALTIVEC_BLITTERS
   3.816 +        /* 32-bit source onto a 565 destination; as with the other AltiVec branches, only when the CPU reports AltiVec at run time */
   3.817 +        if(sf->BytesPerPixel == 4 && df->Gmask == 0x7e0 &&
   3.818 +           df->Bmask == 0x1f && SDL_HasAltiVec())
   3.819 +            return Blit32to565PixelAlphaAltivec;
   3.820 +        else
   3.821 +#endif
   3.822  	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
   3.823  	       && sf->Gmask == 0xff00
   3.824  	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
   3.825 @@ -1457,8 +2235,18 @@
   3.826  		    return BlitRGBtoRGBPixelAlphaMMX;
   3.827  		else
   3.828  #endif
   3.829 +#ifdef USE_ALTIVEC_BLITTERS
   3.830 +        if(SDL_HasAltiVec())
   3.831 +            return BlitRGBtoRGBPixelAlphaAltivec;
   3.832 +        else
   3.833 +#endif
   3.834  		    return BlitRGBtoRGBPixelAlpha;
   3.835  	    }
   3.836 +#ifdef USE_ALTIVEC_BLITTERS
   3.837 +        if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec())
   3.838 +            return Blit32to32PixelAlphaAltivec;
   3.839 +        else
   3.840 +#endif
   3.841  	    return BlitNtoNPixelAlpha;
   3.842  
   3.843  	case 3:
     4.1 --- a/src/video/SDL_blit_N.c	Sun Apr 17 10:16:30 2005 +0000
     4.2 +++ b/src/video/SDL_blit_N.c	Sun Apr 17 10:19:22 2005 +0000
     4.3 @@ -35,6 +35,656 @@
     4.4  
     4.5  /* Functions to blit from N-bit surfaces to other surfaces */
     4.6  
     4.7 +#ifdef USE_ALTIVEC_BLITTERS
     4.8 +#include <assert.h>
     4.9 +#ifdef MACOSX
    4.10 +#include <sys/sysctl.h>
    4.11 +#include <stdlib.h>
    4.12 +/* Size reported by the "hw.l3cachesize" sysctl, or 0 if the query fails. */
    4.13 +static size_t GetL3CacheSize( void )
    4.14 +{
    4.15 +    u_int64_t cachesize = 0;
    4.16 +    size_t len = sizeof( cachesize );
    4.17 +
    4.18 +    if( sysctlbyname( "hw.l3cachesize", &cachesize, &len, NULL, 0 ) != 0 ) {
    4.19 +        return 0;
    4.20 +    }
    4.21 +
    4.22 +    return cachesize;
    4.23 +}
    4.24 +#else
    4.25 +/* No query on this platform. XXX: Just guess G4 (2 MiB). */
    4.26 +static size_t GetL3CacheSize( void )
    4.27 +{
    4.28 +    return 2 * 1024 * 1024;
    4.29 +}
    4.30 +#endif /* MACOSX */
    4.31 +/* Non-zero when pointer expression x is not on a 16-byte boundary; the value is the offset into the 16-byte block. */
    4.32 +#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)
    4.33 +#define VSWIZZLE32(a,b,c,d) (vector unsigned char) \
    4.34 +                               ( 0x00+(a), 0x00+(b), 0x00+(c), 0x00+(d), \
    4.35 +                                 0x04+(a), 0x04+(b), 0x04+(c), 0x04+(d), \
    4.36 +                                 0x08+(a), 0x08+(b), 0x08+(c), 0x08+(d), \
    4.37 +                                 0x0C+(a), 0x0C+(b), 0x0C+(c), 0x0C+(d) ) /* byte order a,b,c,d repeated for each of the 4 pixels */
    4.38 +/* Pack r/g/b/a into one 32-bit pixel with dstfmt's shifts and masks; components are widened to Uint32 before shifting so a << 24 cannot overflow int. */
    4.39 +#define MAKE8888(dstfmt, r, g, b, a)  \
    4.40 +    ( (((Uint32)(r)<<(dstfmt)->Rshift)&(dstfmt)->Rmask) | \
    4.41 +      (((Uint32)(g)<<(dstfmt)->Gshift)&(dstfmt)->Gmask) | \
    4.42 +      (((Uint32)(b)<<(dstfmt)->Bshift)&(dstfmt)->Bmask) | \
    4.43 +      (((Uint32)(a)<<(dstfmt)->Ashift)&(dstfmt)->Amask) )
    4.44 +
    4.45 +/*
    4.46 + * Data Stream Touch...Altivec cache prefetching.
    4.47 + *
    4.48 + *  Don't use this on a G5...however, the speed boost is very significant
    4.49 + *   on a G4.
    4.50 + */
    4.51 +#define DST_CHAN_SRC 1
    4.52 +#define DST_CHAN_DEST 2
    4.53 +
    4.54 +/* macro to set DST control word value: size in bits 24 and up, count in bits 16 and up, stride in the low bits */
    4.55 +#define DST_CTRL(size, count, stride) \
    4.56 +    (((size) << 24) | ((count) << 16) | (stride))
    4.57 +/* vec_perm control that joins vec_ld(0, src) and vec_ld(15, src): the usual vec_lvsl for an unaligned src, indices 16..31 (second load only) for an aligned one. */
    4.58 +#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    4.59 +    ? vec_lvsl(0, (src)) \
    4.60 +    : vec_add(vec_lvsl(8, (src)), vec_splat_u8(8)))
    4.61 +
    4.62 +/* Calculate the permute vector used for 32->32 swizzling: a vec_perm control that moves each pixel's bytes from srcfmt's channel positions to dstfmt's. A NULL format means the ARGB default below. */
    4.63 +static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
    4.64 +                                  const SDL_PixelFormat *dstfmt)
    4.65 +{
    4.66 +    /*
    4.67 +     * We have to assume that the bits that aren't used by the other
    4.68 +     *  colors are alpha, and that alpha is one complete byte, since some
    4.69 +     *  formats leave alpha with a zero mask, but we should still swizzle the bits.
    4.70 +     */
    4.71 +    /* ARGB */
    4.72 +    const static struct SDL_PixelFormat default_pixel_format = {
    4.73 +        NULL, 0, 0,
    4.74 +        0, 0, 0, 0,
    4.75 +        16, 8, 0, 24,  /* presumably the R, G, B, A shifts, matching the masks on the next line - TODO confirm against the struct's field order */
    4.76 +        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
    4.77 +        0, 0};
    4.78 +    if (!srcfmt) {
    4.79 +        srcfmt = &default_pixel_format;
    4.80 +    }
    4.81 +    if (!dstfmt) {
    4.82 +        dstfmt = &default_pixel_format;
    4.83 +    }
    4.84 +    vector unsigned char plus = (vector unsigned char)( 0x00, 0x00, 0x00, 0x00,  /* byte offset of each of the 4 pixels within the vector */
    4.85 +                                      0x04, 0x04, 0x04, 0x04,
    4.86 +                                      0x08, 0x08, 0x08, 0x08,
    4.87 +                                      0x0C, 0x0C, 0x0C, 0x0C );
    4.88 +    vector unsigned char vswiz;
    4.89 +    vector unsigned int srcvec;
    4.90 +#define RESHIFT(X) (3 - ((X) >> 3))  /* bit shift 24/16/8/0 -> byte index 0/1/2/3 */
    4.91 +    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);  /* source byte index of each channel, placed in the byte lane where the destination keeps that channel */
    4.92 +    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
    4.93 +    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
    4.94 +    Uint32 amask;
    4.95 +    /* Alpha lane: index 0x10 makes vec_perm take byte 0 of its second operand, which the caller supplies */
    4.96 +    if (dstfmt->Amask) {
    4.97 +        amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
    4.98 +    } else {    
    4.99 +        amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);  /* 0x10 in every byte lane not covered by a destination colour mask */
   4.100 +    }           
   4.101 +#undef RESHIFT  
   4.102 +    ((unsigned int *)&srcvec)[0] = (rmask | gmask | bmask | amask);  /* only element 0 is written; it is splatted on the next line */
   4.103 +    vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));  /* per-pixel byte indices plus each pixel's base offset */
   4.104 +    return(vswiz);
   4.105 +}
   4.106 +/* Blit_RGB888_RGB565Altivec: convert 32-bit pixels (layout from info->src) to RGB565; scalar until dst is 16-byte aligned, 8 pixels per AltiVec pass, scalar leftovers. */
   4.107 +static void Blit_RGB888_RGB565(SDL_BlitInfo *info);  /* forward declaration */
   4.108 +static void Blit_RGB888_RGB565Altivec(SDL_BlitInfo *info) {
   4.109 +    int height = info->d_height;
   4.110 +    Uint8 *src = (Uint8 *) info->s_pixels;
   4.111 +    int srcskip = info->s_skip;
   4.112 +    Uint8 *dst = (Uint8 *) info->d_pixels;
   4.113 +    int dstskip = info->d_skip;
   4.114 +    SDL_PixelFormat *srcfmt = info->src;
   4.115 +    vector unsigned char valpha = vec_splat_u8(0);
   4.116 +    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);  /* source layout -> ARGB */
   4.117 +    vector unsigned char vgmerge = (vector unsigned char)(  /* puts byte 2 (green in ARGB) of each of the 8 pixels in the low byte of a 16-bit lane */
   4.118 +        0x00, 0x02, 0x00, 0x06,
   4.119 +        0x00, 0x0a, 0x00, 0x0e,
   4.120 +        0x00, 0x12, 0x00, 0x16,
   4.121 +        0x00, 0x1a, 0x00, 0x1e);
   4.122 +    vector unsigned short v1 = vec_splat_u16(1);
   4.123 +    vector unsigned short v3 = vec_splat_u16(3);
   4.124 +    vector unsigned short v1f = (vector unsigned short)(  /* the 5 blue bits only; 0x003f also passed the lowest 1555 green bit into the 565 green LSB */
   4.125 +        0x001f, 0x001f, 0x001f, 0x001f,
   4.126 +        0x001f, 0x001f, 0x001f, 0x001f);
   4.127 +    vector unsigned short vfc = (vector unsigned short)(
   4.128 +        0x00fc, 0x00fc, 0x00fc, 0x00fc,
   4.129 +        0x00fc, 0x00fc, 0x00fc, 0x00fc);
   4.130 +    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-8);  /* 0xF8 in every byte (-7 gave 0xF9) */
   4.131 +    vf800 = vec_sl(vf800, vec_splat_u16(8));  /* 0xF800 in every 16-bit lane */
   4.132 +
   4.133 +    while (height--) {
   4.134 +        vector unsigned char valigner;
   4.135 +        vector unsigned char voverflow;
   4.136 +        vector unsigned char vsrc;
   4.137 +
   4.138 +        int width = info->d_width;
   4.139 +        int extrawidth;
   4.140 +
   4.141 +        /* do scalar until we can align... */
   4.142 +#define ONE_PIXEL_BLEND(condition, widthvar) \
   4.143 +        while (condition) { \
   4.144 +            Uint32 pixel; \
   4.145 +            unsigned sR, sG, sB, sA; \
   4.146 +            DISEMBLE_RGBA((Uint8 *)src, 4, srcfmt, pixel, \
   4.147 +                          sR, sG, sB, sA); \
   4.148 +            *(Uint16 *)(dst) = (((sR << 8) & 0x0000F800) | \
   4.149 +                                ((sG << 3) & 0x000007E0) | \
   4.150 +                                ((sB >> 3) & 0x0000001F)); \
   4.151 +            dst += 2; \
   4.152 +            src += 4; \
   4.153 +            widthvar--; \
   4.154 +        }
   4.155 +
   4.156 +        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
   4.157 +
   4.158 +        /* After all that work, here's the vector part! */
   4.159 +        extrawidth = (width % 8);  /* trailing unaligned stores */
   4.160 +        width -= extrawidth;
   4.161 +        vsrc = vec_ld(0, src);
   4.162 +        valigner = VEC_ALIGNER(src);
   4.163 +
   4.164 +        while (width) {
   4.165 +            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   4.166 +            vector unsigned int vsrc1, vsrc2;
   4.167 +            vector unsigned char vdst;
   4.168 +
   4.169 +            voverflow = vec_ld(15, src);
   4.170 +            vsrc = vec_perm(vsrc, voverflow, valigner);
   4.171 +            vsrc1 = (vector unsigned int)vec_perm(vsrc, valpha, vpermute);  /* pixels 0-3 as ARGB */
   4.172 +            src += 16;
   4.173 +            vsrc = voverflow;
   4.174 +            voverflow = vec_ld(15, src);
   4.175 +            vsrc = vec_perm(vsrc, voverflow, valigner);
   4.176 +            vsrc2 = (vector unsigned int)vec_perm(vsrc, valpha, vpermute);  /* pixels 4-7 as ARGB */
   4.177 +            /* 1555: vec_packpx keeps the top 5 bits of each colour byte */
   4.178 +            vpixel = (vector unsigned short)vec_packpx(vsrc1, vsrc2);
   4.179 +            vgpixel = (vector unsigned short)vec_perm(vsrc1, vsrc2, vgmerge);
   4.180 +            vgpixel = vec_and(vgpixel, vfc);      /* top 6 green bits ... */
   4.181 +            vgpixel = vec_sl(vgpixel, v3);        /* ... moved to bits 10..5 */
   4.182 +            vrpixel = vec_sl(vpixel, v1);         /* red moves from bits 14..10 to 15..11 */
   4.183 +            vrpixel = vec_and(vrpixel, vf800);
   4.184 +            vbpixel = vec_and(vpixel, v1f);       /* blue is already in bits 4..0 */
   4.185 +            vdst = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
   4.186 +            /* 565 */
   4.187 +            vdst = vec_or(vdst, (vector unsigned char)vbpixel);
   4.188 +            vec_st(vdst, 0, dst);
   4.189 +
   4.190 +            width -= 8;
   4.191 +            src += 16;
   4.192 +            dst += 16;
   4.193 +            vsrc = voverflow;
   4.194 +        }
   4.195 +
   4.196 +        assert(width == 0);
   4.197 +
   4.198 +
   4.199 +        /* finish the pixels of the row that did not fill a whole vector */
   4.200 +        ONE_PIXEL_BLEND((extrawidth), extrawidth);
   4.201 +#undef ONE_PIXEL_BLEND
   4.202 +
   4.203 +        src += srcskip;  /* move to next row, accounting for pitch. */
   4.204 +        dst += dstskip;
   4.205 +    }
   4.206 +
   4.207 +
   4.208 +}
   4.209 +/* Expand RGB565 pixels to a 32-bit destination format; scalar until dst is 16-byte aligned, 8 pixels per AltiVec pass, scalar leftovers. */
   4.210 +static void Blit_RGB565_32Altivec(SDL_BlitInfo *info) {
   4.211 +    int height = info->d_height;
   4.212 +    Uint8 *src = (Uint8 *) info->s_pixels;
   4.213 +    int srcskip = info->s_skip;
   4.214 +    Uint8 *dst = (Uint8 *) info->d_pixels;
   4.215 +    int dstskip = info->d_skip;
   4.216 +    SDL_PixelFormat *srcfmt = info->src;
   4.217 +    SDL_PixelFormat *dstfmt = info->dst;
   4.218 +    unsigned alpha;
   4.219 +    vector unsigned char valpha;
   4.220 +    vector unsigned char vpermute;
   4.221 +    vector unsigned short vf800, vfc00;  /* masks for the red and green bytes */
   4.222 +    vector unsigned int v8 = vec_splat_u32(8);
   4.223 +    vector unsigned int v16 = vec_add(v8, v8);
   4.224 +    vector unsigned short v2 = vec_splat_u16(2);
   4.225 +    vector unsigned short v3 = vec_splat_u16(3);
   4.226 +    /* 
   4.227 +        0x10 - 0x1f is the alpha
   4.228 +        0x00 - 0x0e evens are the red
   4.229 +        0x01 - 0x0f odds are zero
   4.230 +    */
   4.231 +    vector unsigned char vredalpha1 = (vector unsigned char)(
   4.232 +        0x10, 0x00, 0x01, 0x01,
   4.233 +        0x10, 0x02, 0x01, 0x01,
   4.234 +        0x10, 0x04, 0x01, 0x01,
   4.235 +        0x10, 0x06, 0x01, 0x01
   4.236 +    );
   4.237 +    vector unsigned char vredalpha2 = (vector unsigned char)(  /* same for pixels 4-7: every red index + 8 */
   4.238 +        vec_add((vector unsigned int)vredalpha1, vec_sl(v8, v16))
   4.239 +    );
   4.240 +    /*
   4.241 +        0x00 - 0x0f is ARxx ARxx ARxx ARxx
   4.242 +        0x11 - 0x1f odds are blue
   4.243 +    */
   4.244 +    vector unsigned char vblue1 = (vector unsigned char)(
   4.245 +        0x00, 0x01, 0x02, 0x11,
   4.246 +        0x04, 0x05, 0x06, 0x13,
   4.247 +        0x08, 0x09, 0x0a, 0x15,
   4.248 +        0x0c, 0x0d, 0x0e, 0x17
   4.249 +    );
   4.250 +    vector unsigned char vblue2 = (vector unsigned char)(  /* pixels 4-7: every blue index + 8 */
   4.251 +        vec_add((vector unsigned int)vblue1, v8)
   4.252 +    );
   4.253 +    /*
   4.254 +        0x00 - 0x0f is ARxB ARxB ARxB ARxB
   4.255 +        0x10 - 0x1e evens are green
   4.256 +    */
   4.257 +    vector unsigned char vgreen1 = (vector unsigned char)(
   4.258 +        0x00, 0x01, 0x10, 0x03,
   4.259 +        0x04, 0x05, 0x12, 0x07,
   4.260 +        0x08, 0x09, 0x14, 0x0b,
   4.261 +        0x0c, 0x0d, 0x16, 0x0f
   4.262 +    );
   4.263 +    vector unsigned char vgreen2 = (vector unsigned char)(  /* pixels 4-7: every green index + 8 */
   4.264 +        vec_add((vector unsigned int)vgreen1, vec_sl(v8, v8))
   4.265 +    );
   4.266 +
   4.267 +
   4.268 +    assert(srcfmt->BytesPerPixel == 2);
   4.269 +    assert(dstfmt->BytesPerPixel == 4);
   4.270 +
   4.271 +    vf800 = vec_sl((vector unsigned short)vec_splat_u8(-8), vec_splat_u16(8));  /* 0xF800: the 5 red bits (-7 gave 0xF900, leaking a green bit into red) */
   4.272 +    vfc00 = vec_sl((vector unsigned short)vec_splat_u8(-4), vec_splat_u16(8));  /* 0xFC00: the 6 green bits once shifted into the high byte */
   4.273 +
   4.274 +    if (dstfmt->Amask && srcfmt->alpha) {
   4.275 +        ((unsigned char *)&valpha)[0] = alpha = srcfmt->alpha;
   4.276 +        valpha = vec_splat(valpha, 0);
   4.277 +    } else {
   4.278 +        alpha = 0;
   4.279 +        valpha = vec_splat_u8(0);
   4.280 +    }
   4.281 +
   4.282 +    vpermute = calc_swizzle32(NULL, dstfmt);  /* ARGB -> destination layout */
   4.283 +    while (height--) {
   4.284 +        vector unsigned char valigner;
   4.285 +        vector unsigned char voverflow;
   4.286 +        vector unsigned char vsrc;
   4.287 +
   4.288 +        int width = info->d_width;
   4.289 +        int extrawidth;
   4.290 +
   4.291 +        /* do scalar until we can align... */
   4.292 +#define ONE_PIXEL_BLEND(condition, widthvar) \
   4.293 +        while (condition) { \
   4.294 +            unsigned sR, sG, sB; \
   4.295 +            unsigned short pixel = *((unsigned short *)src); \
   4.296 +            sR = (pixel >> 8) & 0xf8; \
   4.297 +            sG = (pixel >> 3) & 0xfc; \
   4.298 +            sB = (pixel << 3) & 0xf8; \
   4.299 +            ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
   4.300 +            src += 2; \
   4.301 +            dst += 4; \
   4.302 +            widthvar--; \
   4.303 +        }
   4.304 +        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
   4.305 +
   4.306 +        /* After all that work, here's the vector part! */
   4.307 +        extrawidth = (width % 8);  /* trailing unaligned stores */
   4.308 +        width -= extrawidth;
   4.309 +        vsrc = vec_ld(0, src);
   4.310 +        valigner = VEC_ALIGNER(src);
   4.311 +
   4.312 +        while (width) {
   4.313 +            vector unsigned short vR, vG, vB;
   4.314 +            vector unsigned char vdst1, vdst2;
   4.315 +
   4.316 +            voverflow = vec_ld(15, src);
   4.317 +            vsrc = vec_perm(vsrc, voverflow, valigner);
   4.318 +
   4.319 +            vR = vec_and((vector unsigned short)vsrc, vf800);  /* high byte = (pixel >> 8) & 0xf8, low byte 0 */
   4.320 +            vB = vec_sl((vector unsigned short)vsrc, v3);      /* low byte = (pixel << 3) & 0xf8 */
   4.321 +            vG = vec_and(vec_sl(vB, v2), vfc00);               /* high byte = (pixel >> 3) & 0xfc; the mask drops the two blue bits below it */
   4.322 +
   4.323 +            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, valpha, vredalpha1);
   4.324 +            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
   4.325 +            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
   4.326 +            vdst1 = vec_perm(vdst1, valpha, vpermute);
   4.327 +            vec_st(vdst1, 0, dst);
   4.328 +
   4.329 +            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, valpha, vredalpha2);
   4.330 +            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
   4.331 +            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
   4.332 +            vdst2 = vec_perm(vdst2, valpha, vpermute);
   4.333 +            vec_st(vdst2, 16, dst);
   4.334 +
   4.335 +            width -= 8;
   4.336 +            dst += 32;
   4.337 +            src += 16;
   4.338 +            vsrc = voverflow;
   4.339 +        }
   4.340 +
   4.341 +        assert(width == 0);
   4.342 +
   4.343 +
   4.344 +        /* finish the pixels of the row that did not fill a whole vector */
   4.345 +        ONE_PIXEL_BLEND((extrawidth), extrawidth);
   4.346 +#undef ONE_PIXEL_BLEND
   4.347 +
   4.348 +        src += srcskip;  /* move to next row, accounting for pitch. */
   4.349 +        dst += dstskip;
   4.350 +    }
   4.351 +
   4.352 +}
   4.353 +/* Blit32to32KeyAltivec: colorkeyed 32-bit -> 32-bit blit, 4 pixels per AltiVec pass; rows narrower than 16 pixels are handed to the scalar blitters declared below. */
   4.354 +static void BlitNtoNKey(SDL_BlitInfo *info);
   4.355 +static void BlitNtoNKeyCopyAlpha(SDL_BlitInfo *info);
   4.356 +static void Blit32to32KeyAltivec(SDL_BlitInfo *info)
   4.357 +{
   4.358 +    int height = info->d_height;
   4.359 +    Uint32 *srcp = (Uint32 *) info->s_pixels;
   4.360 +    int srcskip = info->s_skip;
   4.361 +    Uint32 *dstp = (Uint32 *) info->d_pixels;
   4.362 +    int dstskip = info->d_skip;
   4.363 +    SDL_PixelFormat *srcfmt = info->src;
   4.364 +    int srcbpp = srcfmt->BytesPerPixel;
   4.365 +    SDL_PixelFormat *dstfmt = info->dst;
   4.366 +    int dstbpp = dstfmt->BytesPerPixel;
   4.367 +    int copy_alpha = (srcfmt->Amask && dstfmt->Amask);
   4.368 +    unsigned alpha = dstfmt->Amask ? srcfmt->alpha : 0;
   4.369 +    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   4.370 +    Uint32 ckey = info->src->colorkey;
   4.371 +    vector unsigned int valpha;
   4.372 +    vector unsigned char vpermute;
   4.373 +    vector unsigned char vzero;
   4.374 +    vector unsigned int vckey;
   4.375 +    vector unsigned int vrgbmask;
   4.376 +    if (info->d_width < 16) {  /* narrow rows go to the scalar blitters; a void function cannot "return f();" */
   4.377 +        if (copy_alpha) {
   4.378 +            BlitNtoNKeyCopyAlpha(info);
   4.379 +        } else {
   4.380 +            BlitNtoNKey(info);
   4.381 +        }
   4.382 +        return;
   4.383 +    }
   4.384 +    vpermute = calc_swizzle32(srcfmt, dstfmt);
   4.385 +    vzero = vec_splat_u8(0);
   4.386 +    valpha = (vector unsigned int)vzero;
   4.387 +    if (alpha) {  /* splat the alpha byte across the whole vector */
   4.388 +        ((unsigned char *)&valpha)[0] = (unsigned char)alpha;
   4.389 +        valpha = (vector unsigned int)vec_splat((vector unsigned char)valpha, 0);
   4.390 +    }
   4.391 +    ckey &= rgbmask;  /* the key is compared on the colour bits only */
   4.392 +    ((unsigned int *)&vckey)[0] = ckey;
   4.393 +    vckey = vec_splat(vckey, 0);
   4.394 +    ((unsigned int *)&vrgbmask)[0] = rgbmask;
   4.395 +    vrgbmask = vec_splat(vrgbmask, 0);
   4.396 +
   4.397 +    while (height--) {
   4.398 +#define ONE_PIXEL_BLEND(condition, widthvar) \
   4.399 +        if (copy_alpha) { \
   4.400 +            while (condition) { \
   4.401 +                Uint32 pixel; \
   4.402 +                unsigned sR, sG, sB, sA; \
   4.403 +                DISEMBLE_RGBA((Uint8 *)srcp, srcbpp, srcfmt, pixel, \
   4.404 +                          sR, sG, sB, sA); \
   4.405 +                if ( (pixel & rgbmask) != ckey ) { \
   4.406 +                      ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
   4.407 +                            sR, sG, sB, sA); \
   4.408 +                } \
   4.409 +                dstp = (Uint32 *)(((Uint8 *)dstp) + dstbpp); /* a cast is not an lvalue in C */ \
   4.410 +                srcp = (Uint32 *)(((Uint8 *)srcp) + srcbpp); \
   4.411 +                widthvar--; \
   4.412 +            } \
   4.413 +        } else { \
   4.414 +            while (condition) { \
   4.415 +                Uint32 pixel; \
   4.416 +                unsigned sR, sG, sB; \
   4.417 +                RETRIEVE_RGB_PIXEL((Uint8 *)srcp, srcbpp, pixel); \
   4.418 +                if ( (pixel & rgbmask) != ckey ) { /* ckey was masked above, so mask the pixel too */ \
   4.419 +                    RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); \
   4.420 +                    ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
   4.421 +                              sR, sG, sB, alpha); \
   4.422 +                } \
   4.423 +                dstp = (Uint32 *)(((Uint8 *)dstp) + dstbpp); \
   4.424 +                srcp = (Uint32 *)(((Uint8 *)srcp) + srcbpp); \
   4.425 +                widthvar--; \
   4.426 +            } \
   4.427 +        }
   4.428 +        int width = info->d_width;
   4.429 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);  /* scalar until dstp is 16-byte aligned */
   4.430 +        assert(width > 0);
   4.431 +        if (width > 0) {
   4.432 +            int extrawidth = (width % 4);
   4.433 +            vector unsigned char valigner = VEC_ALIGNER(srcp);
   4.434 +            vector unsigned int vs = vec_ld(0, srcp);
   4.435 +            width -= extrawidth;
   4.436 +            assert(width >= 4);
   4.437 +            while (width) {
   4.438 +                vector unsigned char vsel;
   4.439 +                vector unsigned int vd;
   4.440 +                vector unsigned int voverflow = vec_ld(15, srcp);
   4.441 +                /* load the source vec */
   4.442 +                vs = vec_perm(vs, voverflow, valigner);
   4.443 +                /* vsel is set for items whose colour bits match the key */
   4.444 +                vsel = (vector unsigned char)vec_and(vs, vrgbmask);
   4.445 +                vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);  /* compare the masked pixels; comparing vs discarded the mask */
   4.446 +                /* permute the src vec to the dest format */
   4.447 +                vs = vec_perm(vs, valpha, vpermute);
   4.448 +                /* load the destination vec */
   4.449 +                vd = vec_ld(0, dstp);
   4.450 +                /* keep the destination where the key matched, take the source elsewhere */
   4.451 +                vd = (vector unsigned int)vec_sel((vector unsigned char)vs, (vector unsigned char)vd, vsel);
   4.452 +
   4.453 +                vec_st(vd, 0, dstp);
   4.454 +                srcp += 4;
   4.455 +                width -= 4;
   4.456 +                dstp += 4;
   4.457 +                vs = voverflow;
   4.458 +            }
   4.459 +            ONE_PIXEL_BLEND((extrawidth), extrawidth);  /* leftover pixels of the row */
   4.460 +        }
   4.461 +#undef ONE_PIXEL_BLEND
   4.462 +        srcp += srcskip >> 2;  /* next row, stepping in 32-bit pixels; no longer tied to the width test above */
   4.463 +        dstp += dstskip >> 2;
   4.464 +    }
   4.465 +}
   4.466 +
   4.467 +/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
   4.468 +/* Use this on a G5 */
   4.469 +static void ConvertAltivec32to32_noprefetch(SDL_BlitInfo *info)
   4.470 +{
   4.471 +    int height = info->d_height;
   4.472 +    Uint32 *src = (Uint32 *) info->s_pixels;
   4.473 +    int srcskip = info->s_skip;
   4.474 +    Uint32 *dst = (Uint32 *) info->d_pixels;
   4.475 +    int dstskip = info->d_skip;
   4.476 +    SDL_PixelFormat *srcfmt = info->src;
   4.477 +    int srcbpp = srcfmt->BytesPerPixel;
   4.478 +    SDL_PixelFormat *dstfmt = info->dst;
   4.479 +    int dstbpp = dstfmt->BytesPerPixel;
   4.480 +    vector unsigned int vzero = vec_splat_u32(0);
   4.481 +    vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
   4.482 +    if (dstfmt->Amask && !srcfmt->Amask) {
   4.483 +        if (srcfmt->alpha) {
   4.484 +            vector unsigned char valpha;
   4.485 +            ((unsigned char *)&valpha)[0] = srcfmt->alpha;
   4.486 +            vzero = (vector unsigned int)vec_splat(valpha, 0);
   4.487 +        }
   4.488 +    }
   4.489 +
   4.490 +    assert(srcbpp == 4);
   4.491 +    assert(dstbpp == 4);
   4.492 +
   4.493 +    while (height--) {
   4.494 +        vector unsigned char valigner;
   4.495 +        vector unsigned int vbits;
   4.496 +        vector unsigned int voverflow;
   4.497 +        Uint32 bits;
   4.498 +        Uint8 r, g, b, a;
   4.499 +
   4.500 +        int width = info->d_width;
   4.501 +        int extrawidth;
   4.502 +
   4.503 +        /* do scalar until we can align... */
   4.504 +        while ((UNALIGNED_PTR(dst)) && (width)) {
   4.505 +            bits = *(src++);
   4.506 +            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
   4.507 +            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
   4.508 +            width--;
   4.509 +        }
   4.510 +
   4.511 +        /* After all that work, here's the vector part! */
   4.512 +        extrawidth = (width % 4);
   4.513 +        width -= extrawidth;
   4.514 +        valigner = VEC_ALIGNER(src);
   4.515 +        vbits = vec_ld(0, src);
   4.516 +
   4.517 +       while (width) {
   4.518 +            voverflow = vec_ld(15, src);
   4.519 +            src += 4;
   4.520 +            width -= 4;
   4.521 +            vbits = vec_perm(vbits, voverflow, valigner);  /* src is ready. */
   4.522 +            vbits = vec_perm(vbits, vzero, vpermute);  /* swizzle it. */
   4.523 +            vec_st(vbits, 0, dst);  /* store it back out. */
   4.524 +            dst += 4;
   4.525 +            vbits = voverflow;
   4.526 +        }
   4.527 +
   4.528 +        assert(width == 0);
   4.529 +
   4.530 +        /* cover pixels at the end of the row that didn't fit in 16 bytes. */
   4.531 +        while (extrawidth) {
   4.532 +            bits = *(src++);  /* max 3 pixels (width % 4), don't bother with prefetch. */
   4.533 +            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
   4.534 +            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
   4.535 +            extrawidth--;
   4.536 +        }
   4.537 +
   4.538 +        src += srcskip >> 2;  /* move to next row, accounting for pitch. */
   4.539 +        dst += dstskip >> 2;
   4.540 +    }
   4.541 +
   4.542 +}
   4.543 +
   4.544 +/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
   4.545 +/* Use this on a G4: this variant issues vec_dstt/vec_dstst prefetch hints ahead of the loads and stores. */
   4.546 +static void ConvertAltivec32to32_prefetch(SDL_BlitInfo *info)
   4.547 +{
   4.548 +    const int scalar_dst_lead = sizeof (Uint32) * 4;
   4.549 +    const int vector_dst_lead = sizeof (Uint32) * 16;
   4.550 +
   4.551 +    int height = info->d_height;
   4.552 +    Uint32 *src = (Uint32 *) info->s_pixels;
   4.553 +    int srcskip = info->s_skip;
   4.554 +    Uint32 *dst = (Uint32 *) info->d_pixels;
   4.555 +    int dstskip = info->d_skip;
   4.556 +    SDL_PixelFormat *srcfmt = info->src;
   4.557 +    int srcbpp = srcfmt->BytesPerPixel;
   4.558 +    SDL_PixelFormat *dstfmt = info->dst;
   4.559 +    int dstbpp = dstfmt->BytesPerPixel;
   4.560 +    vector unsigned int vzero = vec_splat_u32(0);
   4.561 +    vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
   4.562 +    if (dstfmt->Amask && !srcfmt->Amask) {
   4.563 +        if (srcfmt->alpha) {
   4.564 +            vector unsigned char valpha;
   4.565 +            ((unsigned char *)&valpha)[0] = srcfmt->alpha;
   4.566 +            vzero = (vector unsigned int)vec_splat(valpha, 0);
   4.567 +        }
   4.568 +    }
   4.569 +
   4.570 +    assert(srcbpp == 4);
   4.571 +    assert(dstbpp == 4);
   4.572 +
   4.573 +    while (height--) {
   4.574 +        vector unsigned char valigner;
   4.575 +        vector unsigned int vbits;
   4.576 +        vector unsigned int voverflow;
   4.577 +        Uint32 bits;
   4.578 +        Uint8 r, g, b, a;
   4.579 +
   4.580 +        int width = info->d_width;
   4.581 +        int extrawidth;
   4.582 +
   4.583 +        /* do scalar until we can align... */
   4.584 +        while ((UNALIGNED_PTR(dst)) && (width)) {
   4.585 +            vec_dstt(src+scalar_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_SRC);
   4.586 +            vec_dstst(dst+scalar_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_DEST);
   4.587 +            bits = *(src++);
   4.588 +            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
   4.589 +            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
   4.590 +            width--;
   4.591 +        }
   4.592 +
   4.593 +        /* After all that work, here's the vector part! */
   4.594 +        extrawidth = (width % 4);
   4.595 +        width -= extrawidth;
   4.596 +        valigner = VEC_ALIGNER(src);
   4.597 +        vbits = vec_ld(0, src);
   4.598 +
   4.599 +        while (width) {
   4.600 +            vec_dstt(src+vector_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_SRC);
   4.601 +            vec_dstst(dst+vector_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_DEST);
   4.602 +            voverflow = vec_ld(15, src);
   4.603 +            src += 4;
   4.604 +            width -= 4;
   4.605 +            vbits = vec_perm(vbits, voverflow, valigner);  /* src is ready. */
   4.606 +            vbits = vec_perm(vbits, vzero, vpermute);  /* swizzle it. */
   4.607 +            vec_st(vbits, 0, dst);  /* store it back out. */
   4.608 +            dst += 4;
   4.609 +            vbits = voverflow;
   4.610 +        }
   4.611 +        
   4.612 +        assert(width == 0);
   4.613 +
   4.614 +        /* cover pixels at the end of the row that didn't fit in 16 bytes. */
   4.615 +        while (extrawidth) {
   4.616 +            bits = *(src++);  /* max 3 pixels (width % 4), don't bother with prefetch. */
   4.617 +            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
   4.618 +            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
   4.619 +            extrawidth--;
   4.620 +        }
   4.621 +
   4.622 +        src += srcskip >> 2;  /* move to next row, accounting for pitch. */
   4.623 +        dst += dstskip >> 2;
   4.624 +    }
   4.625 +
   4.626 +    vec_dss(DST_CHAN_SRC);
   4.627 +    vec_dss(DST_CHAN_DEST);
   4.628 +}
   4.629 +
   4.630 +static Uint32 GetBlitFeatures( void )
   4.631 +{
   4.632 +    static Uint32 features = 0xffffffff;
   4.633 +    if (features == 0xffffffff) {
   4.634 +        /* Provide an override for testing .. */
   4.635 +        char *override = getenv("SDL_ALTIVEC_BLIT_FEATURES");
   4.636 +        if (override) {
   4.637 +            features = 0;
   4.638 +            sscanf(override, "%u", &features);
   4.639 +        } else {
   4.640 +            features = ( 0
   4.641 +                /* Feature 1 is has-MMX */
   4.642 +                | ((SDL_HasMMX()) ? 1 : 0)
   4.643 +                /* Feature 2 is has-AltiVec */
   4.644 +                | ((SDL_HasAltiVec()) ? 2 : 0)
   4.645 +                /* Feature 4 is dont-use-prefetch */
   4.646 +                | ((GetL3CacheSize() == 0) ? 4 : 0)
   4.647 +            );
   4.648 +        }
   4.649 +    }
   4.650 +    return features;
   4.651 +}
   4.652 +#else
   4.653 +/* Feature 1 is has-MMX */
   4.654 +#define GetBlitFeatures() ((Uint32)(SDL_HasMMX() ? 1 : 0))
   4.655 +#endif
   4.656 +
   4.657  #ifdef USE_ASMBLIT
   4.658  
   4.659  /* Heheheh, we coerce Hermes into using SDL blit information */
   4.660 @@ -406,11 +1056,7 @@
   4.661  
   4.662  
   4.663  /* Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces */
   4.664 -#if ( SDL_BYTEORDER == SDL_LIL_ENDIAN )
   4.665 -#define RGB565_32(dst, src, map) (map[src[0]*2] + map[src[1]*2+1])
   4.666 -#else /* ( SDL_BYTEORDER == SDL_BIG_ENDIAN ) */
   4.667 -#define RGB565_32(dst, src, map) (map[src[1]*2] + map[src[0]*2+1])
   4.668 -#endif
   4.669 +#define RGB565_32(dst, src, map) (map[src[LO]*2] + map[src[HI]*2+1])
   4.670  static void Blit_RGB565_32(SDL_BlitInfo *info, const Uint32 *map)
   4.671  {
   4.672  #ifndef USE_DUFFS_LOOP
   4.673 @@ -1422,10 +2068,10 @@
   4.674  	Uint32 srcR, srcG, srcB;
   4.675  	int dstbpp;
   4.676  	Uint32 dstR, dstG, dstB;
   4.677 -	SDL_bool cpu_mmx;
   4.678 +	Uint32 blit_features;
   4.679  	void *aux_data;
   4.680  	SDL_loblit blitfunc;
   4.681 -        enum { NO_ALPHA, SET_ALPHA, COPY_ALPHA } alpha;
   4.682 +	enum { NO_ALPHA=1, SET_ALPHA=2, COPY_ALPHA=4 } alpha;
   4.683  };
   4.684  static const struct blit_table normal_blit_1[] = {
   4.685  	/* Default for 8-bit RGB source, an invalid combination */
   4.686 @@ -1440,6 +2086,11 @@
   4.687      { 0x0000F800,0x000007E0,0x0000001F, 2, 0x0000001F,0x000003E0,0x00007C00,
   4.688        0, ConvertX86p16_16BGR555, ConvertX86, NO_ALPHA },
   4.689  #endif
   4.690 +#ifdef USE_ALTIVEC_BLITTERS
   4.691 +    /* has-altivec */
   4.692 +    { 0x0000F800,0x000007E0,0x0000001F, 4, 0x00000000,0x00000000,0x00000000,
   4.693 +      2, NULL, Blit_RGB565_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
   4.694 +#endif
   4.695      { 0x0000F800,0x000007E0,0x0000001F, 4, 0x00FF0000,0x0000FF00,0x000000FF,
   4.696        0, NULL, Blit_RGB565_ARGB8888, SET_ALPHA },
   4.697      { 0x0000F800,0x000007E0,0x0000001F, 4, 0x000000FF,0x0000FF00,0x00FF0000,
   4.698 @@ -1485,6 +2136,17 @@
   4.699      { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0x0000FF00,0x00FF0000,0xFF000000,
   4.700        0, ConvertX86p32_32BGRA888, ConvertX86, NO_ALPHA },
   4.701  #else
   4.702 +#ifdef USE_ALTIVEC_BLITTERS
   4.703 +    /* has-altivec | dont-use-prefetch */
   4.704 +    { 0x00000000,0x00000000,0x00000000, 4, 0x00000000,0x00000000,0x00000000,
   4.705 +      6, NULL, ConvertAltivec32to32_noprefetch, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
   4.706 +    /* has-altivec */
   4.707 +    { 0x00000000,0x00000000,0x00000000, 4, 0x00000000,0x00000000,0x00000000,
   4.708 +      2, NULL, ConvertAltivec32to32_prefetch, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
   4.709 +    /* has-altivec */
   4.710 +    { 0x00000000,0x00000000,0x00000000, 2, 0x0000F800,0x000007E0,0x0000001F,
   4.711 +      2, NULL, Blit_RGB888_RGB565Altivec, NO_ALPHA },
   4.712 +#endif
   4.713      { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x0000F800,0x000007E0,0x0000001F,
   4.714        0, NULL, Blit_RGB888_RGB565, NO_ALPHA },
   4.715      { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x00007C00,0x000003E0,0x0000001F,
   4.716 @@ -1497,6 +2159,9 @@
   4.717  	normal_blit_1, normal_blit_2, normal_blit_3, normal_blit_4
   4.718  };
   4.719  
   4.720 +/* Mask matches the table entry, or the table entry is zero (zero acts as a wildcard) */
   4.721 +#define MASKOK(x, y) (((x) == (y)) || ((y) == 0x00000000))
   4.722 +
   4.723  SDL_loblit SDL_CalculateBlitN(SDL_Surface *surface, int blit_index)
   4.724  {
   4.725  	struct private_swaccel *sdata;
   4.726 @@ -1532,6 +2197,12 @@
   4.727  	    else if(dstfmt->BytesPerPixel == 1)
   4.728  		return BlitNto1Key;
   4.729  	    else {
   4.730 +#ifdef USE_ALTIVEC_BLITTERS
   4.731 +        if((srcfmt->BytesPerPixel == 4) && (dstfmt->BytesPerPixel == 4) && SDL_HasAltiVec()) {
   4.732 +            return Blit32to32KeyAltivec;
   4.733 +        } else
   4.734 +#endif
   4.735 +
   4.736  		if(srcfmt->Amask && dstfmt->Amask)
   4.737  		    return BlitNtoNKeyCopyAlpha;
   4.738  		else
   4.739 @@ -1561,20 +2232,20 @@
   4.740  		}
   4.741  	} else {
   4.742  		/* Now the meat, choose the blitter we want */
   4.743 -	        int a_need = 0;
   4.744 +		int a_need = 0;
   4.745  		if(dstfmt->Amask)
   4.746  		    a_need = srcfmt->Amask ? COPY_ALPHA : SET_ALPHA;
   4.747  		table = normal_blit[srcfmt->BytesPerPixel-1];
   4.748 -		for ( which=0; table[which].srcR; ++which ) {
   4.749 -			if ( srcfmt->Rmask == table[which].srcR &&
   4.750 -			     srcfmt->Gmask == table[which].srcG &&
   4.751 -			     srcfmt->Bmask == table[which].srcB &&
   4.752 -			     dstfmt->BytesPerPixel == table[which].dstbpp &&
   4.753 -			     dstfmt->Rmask == table[which].dstR &&
   4.754 -			     dstfmt->Gmask == table[which].dstG &&
   4.755 -			     dstfmt->Bmask == table[which].dstB &&
   4.756 -			     (a_need & table[which].alpha) == a_need &&
   4.757 -			     (table[which].cpu_mmx == SDL_HasMMX())) 
   4.758 +		for ( which=0; table[which].dstbpp; ++which ) {
   4.759 +			if ( MASKOK(srcfmt->Rmask, table[which].srcR) &&
   4.760 +			    MASKOK(srcfmt->Gmask, table[which].srcG) &&
   4.761 +			    MASKOK(srcfmt->Bmask, table[which].srcB) &&
   4.762 +			    MASKOK(dstfmt->Rmask, table[which].dstR) &&
   4.763 +			    MASKOK(dstfmt->Gmask, table[which].dstG) &&
   4.764 +			    MASKOK(dstfmt->Bmask, table[which].dstB) &&
   4.765 +			    dstfmt->BytesPerPixel == table[which].dstbpp &&
   4.766 +			    (a_need & table[which].alpha) == a_need &&
   4.767 +			    ((table[which].blit_features & GetBlitFeatures()) == table[which].blit_features) )
   4.768  				break;
   4.769  		}
   4.770  		sdata->aux_data = table[which].aux_data;