src/video/SDL_blit_N.c
changeset 1047 ffaaf7ecf685
parent 769 b8d311d90021
child 1053 f596fa4f17a6
     1.1 --- a/src/video/SDL_blit_N.c	Sun Apr 17 10:16:30 2005 +0000
     1.2 +++ b/src/video/SDL_blit_N.c	Sun Apr 17 10:19:22 2005 +0000
     1.3 @@ -35,6 +35,656 @@
     1.4  
     1.5  /* Functions to blit from N-bit surfaces to other surfaces */
     1.6  
     1.7 +#ifdef USE_ALTIVEC_BLITTERS
     1.8 +#include <assert.h>
     1.9 +#ifdef MACOSX
    1.10 +#include <sys/sysctl.h>
    1.11 +#include <stdlib.h>
    1.12 +static size_t GetL3CacheSize( void )
    1.13 +{
    1.14 +    /* Ask sysctl for "hw.l3cachesize".  Returns 0 when the lookup fails
    1.15 +       (or when the sysctl itself reports 0). */
    1.16 +    u_int64_t cache_bytes = 0;
    1.17 +    size_t value_len = sizeof( cache_bytes );
    1.18 +
    1.19 +    if( sysctlbyname( "hw.l3cachesize", &cache_bytes, &value_len, NULL, 0 ) != 0 ) {
    1.20 +        return 0;
    1.21 +    }
    1.22 +    return cache_bytes;
    1.23 +}
    1.24 +#else
    1.25 +static size_t GetL3CacheSize( void )
    1.26 +{
    1.27 +    /* XXX: nothing to query on this platform; just guess a G4 (2 MiB). */
    1.28 +    return (size_t)2 * 1024 * 1024;
    1.29 +}
    1.30 +#endif /* MACOSX */
    1.31 +/* Non-zero when x is not on a 16-byte (AltiVec vector) boundary. */
    1.32 +#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)
         +
         +/* Permute-control literal that repeats the byte pattern (a,b,c,d) in each
         +   of the four 32-bit lanes of a vector. */
    1.33 +#define VSWIZZLE32(a,b,c,d) (vector unsigned char) \
    1.34 +                               ( 0x00+(a), 0x00+(b), 0x00+(c), 0x00+(d), \
    1.35 +                                 0x04+(a), 0x04+(b), 0x04+(c), 0x04+(d), \
    1.36 +                                 0x08+(a), 0x08+(b), 0x08+(c), 0x08+(d), \
    1.37 +                                 0x0C+(a), 0x0C+(b), 0x0C+(c), 0x0C+(d) )
    1.38 +
         +/*
         + * Assemble a 32-bit pixel from four channel values using the shifts and
         + * masks of dstfmt (a pointer to a pixel format).
         + *
         + * Each channel is widened to Uint32 before shifting: the callers pass
         + * Uint8 values, which would otherwise be promoted to (signed) int, and
         + * shifting e.g. 0xFF left by 24 overflows a 32-bit int.  All arguments
         + * are parenthesized so expressions can be passed safely.
         + */
    1.39 +#define MAKE8888(dstfmt, r, g, b, a)  \
    1.40 +    ( ((((Uint32)(r))<<(dstfmt)->Rshift)&(dstfmt)->Rmask) | \
    1.41 +      ((((Uint32)(g))<<(dstfmt)->Gshift)&(dstfmt)->Gmask) | \
    1.42 +      ((((Uint32)(b))<<(dstfmt)->Bshift)&(dstfmt)->Bmask) | \
    1.43 +      ((((Uint32)(a))<<(dstfmt)->Ashift)&(dstfmt)->Amask) )
    1.44 +
    1.45 +/*
    1.46 + * Data Stream Touch...Altivec cache prefetching.
    1.47 + *
    1.48 + *  Don't use this on a G5...however, the speed boost is very significant
    1.49 + *   on a G4.
         + *
         + *  DST_CHAN_SRC and DST_CHAN_DEST are the stream channel numbers this
         + *   file passes to vec_dstt()/vec_dstst() for source and destination
         + *   pixels, and to vec_dss() to stop those streams again.
    1.50 + */
    1.51 +#define DST_CHAN_SRC 1
    1.52 +#define DST_CHAN_DEST 2
    1.53 +
    1.54 +/* Macro to build a DST control word: size goes in bits 24 and up, count in
         +   bits 16 and up, stride in the low bits.  Arguments are not
         +   range-checked, so an oversized field spills into its neighbour. */
    1.55 +#define DST_CTRL(size, count, stride) \
    1.56 +    (((size) << 24) | ((count) << 16) | (stride))
    1.57 +/*
         + * Permute-control vector used with vec_perm() to realign data loaded
         + * from src.  For an unaligned src this is vec_lvsl(0, src); for an
         + * aligned src it is vec_lvsl(8, src) with 8 added to every byte.
         + *
         + * NOTE(review): by the AltiVec definition of lvsl the aligned case should
         + * come out as indices 16..31, i.e. "take everything from the second
         + * vec_perm operand".  That fits the load loops in this file, whose second
         + * operand is a vec_ld(15, ptr) load -- confirm against the AltiVec PIM.
         + */
    1.58 +#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    1.59 +    ? vec_lvsl(0, src) \
    1.60 +    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
    1.61 +
    1.62 +/*
         + * Calculate the permute vector used for 32->32 swizzling.
         + *
         + * srcfmt/dstfmt describe the source and destination pixel layouts; a NULL
         + * pointer for either one selects the built-in ARGB layout below.
         + *
         + * The returned vector is meant to be used as
         + *     vec_perm(pixels, fill, result)
         + * on four 32-bit pixels at a time.  For each destination byte it holds
         + * either the index of the source byte to copy (0x00..0x0F) or an index
         + * of 0x10 and up, which makes vec_perm take the byte from the second
         + * operand (the callers pass a zero or constant-alpha vector there).
         + */
    1.63 +static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
    1.64 +                                  const SDL_PixelFormat *dstfmt)
    1.65 +{
    1.66 +    /*
    1.67 +     * We have to assume that the bits that aren't used by the color
    1.68 +     *  channels are alpha, and that alpha is one complete byte, since some
    1.69 +     *  formats leave alpha with a zero mask, but we should still swizzle
         +     *  the bits.
    1.70 +     */
    1.71 +    /* Default layout: ARGB (shifts 16/8/0/24, masks as listed below) */
    1.72 +    const static struct SDL_PixelFormat default_pixel_format = {
    1.73 +        NULL, 0, 0,
    1.74 +        0, 0, 0, 0,
    1.75 +        16, 8, 0, 24,
    1.76 +        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
    1.77 +        0, 0};
    1.78 +    if (!srcfmt) {
    1.79 +        srcfmt = &default_pixel_format;
    1.80 +    }
    1.81 +    if (!dstfmt) {
    1.82 +        dstfmt = &default_pixel_format;
    1.83 +    }
    1.84 +    vector unsigned char plus = (vector unsigned char)( 0x00, 0x00, 0x00, 0x00,
    1.85 +                                      0x04, 0x04, 0x04, 0x04,
    1.86 +                                      0x08, 0x08, 0x08, 0x08,
    1.87 +                                      0x0C, 0x0C, 0x0C, 0x0C ); /* per-pixel byte offset of each lane */
    1.88 +    vector unsigned char vswiz;
    1.89 +    vector unsigned int srcvec;
    1.90 +#define RESHIFT(X) (3 - ((X) >> 3)) /* channel bit shift -> byte index inside the pixel (shift 24 -> byte 0, shift 0 -> byte 3) */
    1.91 +    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift); /* source byte index, placed in the destination byte of that channel */
    1.92 +    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
    1.93 +    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
    1.94 +    Uint32 amask;
    1.95 +    /* If either surface lacks an alpha mask, use index 0x10 for those
         +       bytes, i.e. take them from the second vec_perm operand instead of
         +       from the source pixel. */
    1.96 +    if (dstfmt->Amask) {
    1.97 +        amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
    1.98 +    } else {    
    1.99 +        amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF); /* 0x10 in every byte not covered by an RGB mask */
   1.100 +    }           
   1.101 +#undef RESHIFT  
   1.102 +    ((unsigned int *)&srcvec)[0] = (rmask | gmask | bmask | amask); /* one pixel's worth of byte indices... */
   1.103 +    vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0)); /* ...replicated to all four lanes and offset per lane */
   1.104 +    return(vswiz);
   1.105 +}
   1.106 +
   1.107 +static void Blit_RGB888_RGB565(SDL_BlitInfo *info);
   1.108 +static void Blit_RGB888_RGB565Altivec(SDL_BlitInfo *info) {
   1.109 +    int height = info->d_height;
   1.110 +    Uint8 *src = (Uint8 *) info->s_pixels;
   1.111 +    int srcskip = info->s_skip;
   1.112 +    Uint8 *dst = (Uint8 *) info->d_pixels;
   1.113 +    int dstskip = info->d_skip;
   1.114 +    SDL_PixelFormat *srcfmt = info->src;
   1.115 +    vector unsigned char valpha = vec_splat_u8(0);
   1.116 +    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
   1.117 +    vector unsigned char vgmerge = (vector unsigned char)(
   1.118 +        0x00, 0x02, 0x00, 0x06,
   1.119 +        0x00, 0x0a, 0x00, 0x0e,
   1.120 +        0x00, 0x12, 0x00, 0x16,
   1.121 +        0x00, 0x1a, 0x00, 0x1e);
   1.122 +    vector unsigned short v1 = vec_splat_u16(1);
   1.123 +    vector unsigned short v3 = vec_splat_u16(3);
   1.124 +    vector unsigned short v3f = (vector unsigned short)(
   1.125 +        0x003f, 0x003f, 0x003f, 0x003f,
   1.126 +        0x003f, 0x003f, 0x003f, 0x003f);
   1.127 +    vector unsigned short vfc = (vector unsigned short)(
   1.128 +        0x00fc, 0x00fc, 0x00fc, 0x00fc,
   1.129 +        0x00fc, 0x00fc, 0x00fc, 0x00fc);
   1.130 +    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
   1.131 +    vf800 = vec_sl(vf800, vec_splat_u16(8));
   1.132 +
   1.133 +    while (height--) {
   1.134 +        vector unsigned char valigner;
   1.135 +        vector unsigned char voverflow;
   1.136 +        vector unsigned char vsrc;
   1.137 +
   1.138 +        int width = info->d_width;
   1.139 +        int extrawidth;
   1.140 +
   1.141 +        /* do scalar until we can align... */
   1.142 +#define ONE_PIXEL_BLEND(condition, widthvar) \
   1.143 +        while (condition) { \
   1.144 +            Uint32 pixel; \
   1.145 +            unsigned sR, sG, sB, sA; \
   1.146 +            DISEMBLE_RGBA((Uint8 *)src, 4, srcfmt, pixel, \
   1.147 +                          sR, sG, sB, sA); \
   1.148 +            *(Uint16 *)(dst) = (((sR << 8) & 0x0000F800) | \
   1.149 +                                ((sG << 3) & 0x000007E0) | \
   1.150 +                                ((sB >> 3) & 0x0000001F)); \
   1.151 +            dst += 2; \
   1.152 +            src += 4; \
   1.153 +            widthvar--; \
   1.154 +        }
   1.155 +
   1.156 +        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
   1.157 +
   1.158 +        /* After all that work, here's the vector part! */
   1.159 +        extrawidth = (width % 8);  /* trailing unaligned stores */
   1.160 +        width -= extrawidth;
   1.161 +        vsrc = vec_ld(0, src);
   1.162 +        valigner = VEC_ALIGNER(src);
   1.163 +
   1.164 +        while (width) {
   1.165 +            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   1.166 +            vector unsigned int vsrc1, vsrc2;
   1.167 +            vector unsigned char vdst;
   1.168 +
   1.169 +            voverflow = vec_ld(15, src);
   1.170 +            vsrc = vec_perm(vsrc, voverflow, valigner);
   1.171 +            vsrc1 = (vector unsigned int)vec_perm(vsrc, valpha, vpermute);
   1.172 +            src += 16;
   1.173 +            vsrc = voverflow;
   1.174 +            voverflow = vec_ld(15, src);
   1.175 +            vsrc = vec_perm(vsrc, voverflow, valigner);
   1.176 +            vsrc2 = (vector unsigned int)vec_perm(vsrc, valpha, vpermute);
   1.177 +            /* 1555 */
   1.178 +            vpixel = (vector unsigned short)vec_packpx(vsrc1, vsrc2);
   1.179 +            vgpixel = (vector unsigned short)vec_perm(vsrc1, vsrc2, vgmerge);
   1.180 +            vgpixel = vec_and(vgpixel, vfc);
   1.181 +            vgpixel = vec_sl(vgpixel, v3);
   1.182 +            vrpixel = vec_sl(vpixel, v1);
   1.183 +            vrpixel = vec_and(vrpixel, vf800);
   1.184 +            vbpixel = vec_and(vpixel, v3f);
   1.185 +            vdst = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
   1.186 +            /* 565 */
   1.187 +            vdst = vec_or(vdst, (vector unsigned char)vbpixel);
   1.188 +            vec_st(vdst, 0, dst);
   1.189 +
   1.190 +            width -= 8;
   1.191 +            src += 16;
   1.192 +            dst += 16;
   1.193 +            vsrc = voverflow;
   1.194 +        }
   1.195 +
   1.196 +        assert(width == 0);
   1.197 +
   1.198 +
   1.199 +        /* do scalar until we can align... */
   1.200 +        ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1.201 +#undef ONE_PIXEL_BLEND
   1.202 +
   1.203 +        src += srcskip;  /* move to next row, accounting for pitch. */
   1.204 +        dst += dstskip;
   1.205 +    }
   1.206 +
   1.207 +
   1.208 +}
   1.209 +/*
         + * Blit a 16-bit RGB565 source surface to any 32-bit destination format.
         + *
         + * info supplies the pixel pointers, the per-row skips (s_skip/d_skip),
         + * the blit size (d_width/d_height) and both pixel formats.  When the
         + * destination has an alpha mask, srcfmt->alpha is written as alpha;
         + * otherwise 0 is used.
         + *
         + * Each row is handled in three steps: scalar pixels until dst is 16-byte
         + * aligned, then 8 pixels per iteration with AltiVec, then scalar pixels
         + * for the remaining (width % 8).
         + *
         + * The vector loop masks red to 0xF8 and green to 0xFC so that it yields
         + * exactly the same channel values as the scalar loop.
         + */
   1.210 +static void Blit_RGB565_32Altivec(SDL_BlitInfo *info) {
   1.211 +    int height = info->d_height;
   1.212 +    Uint8 *src = (Uint8 *) info->s_pixels;
   1.213 +    int srcskip = info->s_skip;
   1.214 +    Uint8 *dst = (Uint8 *) info->d_pixels;
   1.215 +    int dstskip = info->d_skip;
   1.216 +    SDL_PixelFormat *srcfmt = info->src;
   1.217 +    SDL_PixelFormat *dstfmt = info->dst;
   1.218 +    unsigned alpha;
   1.219 +    vector unsigned char valpha;
   1.220 +    vector unsigned char vpermute;
   1.221 +    vector unsigned short vf800;
         +    vector unsigned short vfc00;
   1.222 +    vector unsigned int v8 = vec_splat_u32(8);
   1.223 +    vector unsigned int v16 = vec_add(v8, v8);
   1.224 +    vector unsigned short v2 = vec_splat_u16(2);
   1.225 +    vector unsigned short v3 = vec_splat_u16(3);
   1.226 +    /* 
   1.227 +        0x10 - 0x1f is the alpha
   1.228 +        0x00 - 0x0e evens are the red
   1.229 +        0x01 - 0x0f odds are zero
   1.230 +    */
   1.231 +    vector unsigned char vredalpha1 = (vector unsigned char)(
   1.232 +        0x10, 0x00, 0x01, 0x01,
   1.233 +        0x10, 0x02, 0x01, 0x01,
   1.234 +        0x10, 0x04, 0x01, 0x01,
   1.235 +        0x10, 0x06, 0x01, 0x01
   1.236 +    );
         +    /* same pattern for pixels 4..7: adds 8 to the red index of each lane */
   1.237 +    vector unsigned char vredalpha2 = (vector unsigned char)(
   1.238 +        vec_add((vector unsigned int)vredalpha1, vec_sl(v8, v16))
   1.239 +    );
   1.240 +    /*
   1.241 +        0x00 - 0x0f is ARxx ARxx ARxx ARxx
   1.242 +        0x11 - 0x1f odds are blue
   1.243 +    */
   1.244 +    vector unsigned char vblue1 = (vector unsigned char)(
   1.245 +        0x00, 0x01, 0x02, 0x11,
   1.246 +        0x04, 0x05, 0x06, 0x13,
   1.247 +        0x08, 0x09, 0x0a, 0x15,
   1.248 +        0x0c, 0x0d, 0x0e, 0x17
   1.249 +    );
   1.250 +    vector unsigned char vblue2 = (vector unsigned char)(
   1.251 +        vec_add((vector unsigned int)vblue1, v8)
   1.252 +    );
   1.253 +    /*
   1.254 +        0x00 - 0x0f is ARxB ARxB ARxB ARxB
   1.255 +        0x10 - 0x1e evens are green
   1.256 +    */
   1.257 +    vector unsigned char vgreen1 = (vector unsigned char)(
   1.258 +        0x00, 0x01, 0x10, 0x03,
   1.259 +        0x04, 0x05, 0x12, 0x07,
   1.260 +        0x08, 0x09, 0x14, 0x0b,
   1.261 +        0x0c, 0x0d, 0x16, 0x0f
   1.262 +    );
   1.263 +    vector unsigned char vgreen2 = (vector unsigned char)(
   1.264 +        vec_add((vector unsigned int)vgreen1, vec_sl(v8, v8))
   1.265 +    );
   1.266 +    
   1.267 +
   1.268 +    assert(srcfmt->BytesPerPixel == 2);
   1.269 +    assert(dstfmt->BytesPerPixel == 4);
   1.270 +
         +    /* 0xF8 in every byte, shifted up by 8 -> 0xF800: the red field of a
         +       565 pixel (vec_splat_u8(-7) would give 0xF900 and let a green bit
         +       through into the lowest red bit) */
   1.271 +    vf800 = (vector unsigned short)vec_splat_u8(-8);
   1.272 +    vf800 = vec_sl(vf800, vec_splat_u16(8));
         +    /* 0xFC00: keeps the 6 green bits once green has been shifted into the
         +       high byte, and drops the two blue bits that follow them */
         +    vfc00 = (vector unsigned short)vec_splat_u8(-4);
         +    vfc00 = vec_sl(vfc00, vec_splat_u16(8));
   1.273 +
   1.274 +    if (dstfmt->Amask && srcfmt->alpha) {
   1.275 +        ((unsigned char *)&valpha)[0] = alpha = srcfmt->alpha;
   1.276 +        valpha = vec_splat(valpha, 0);
   1.277 +    } else {
   1.278 +        alpha = 0;
   1.279 +        valpha = vec_splat_u8(0);
   1.280 +    }
   1.281 +
         +    /* the pixels are built in ARGB order and then swizzled to dstfmt */
   1.282 +    vpermute = calc_swizzle32(NULL, dstfmt);
   1.283 +    while (height--) {
   1.284 +        vector unsigned char valigner;
   1.285 +        vector unsigned char voverflow;
   1.286 +        vector unsigned char vsrc;
   1.287 +
   1.288 +        int width = info->d_width;
   1.289 +        int extrawidth;
   1.290 +
   1.291 +        /* do scalar until we can align... */
   1.292 +#define ONE_PIXEL_BLEND(condition, widthvar) \
   1.293 +        while (condition) { \
   1.294 +            unsigned sR, sG, sB; \
   1.295 +            unsigned short pixel = *((unsigned short *)src); \
   1.296 +            sR = (pixel >> 8) & 0xf8; \
   1.297 +            sG = (pixel >> 3) & 0xfc; \
   1.298 +            sB = (pixel << 3) & 0xf8; \
   1.299 +            ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
   1.300 +            src += 2; \
   1.301 +            dst += 4; \
   1.302 +            widthvar--; \
   1.303 +        }
   1.304 +        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
   1.305 +
   1.306 +        /* After all that work, here's the vector part! */
   1.307 +        extrawidth = (width % 8);  /* trailing pixels that don't fill a vector */
   1.308 +        width -= extrawidth;
   1.309 +        vsrc = vec_ld(0, src);
   1.310 +        valigner = VEC_ALIGNER(src);
   1.311 +
   1.312 +        while (width) {
   1.313 +            vector unsigned short vR, vG, vB;
   1.314 +            vector unsigned char vdst1, vdst2;
   1.315 +
   1.316 +            voverflow = vec_ld(15, src);
   1.317 +            vsrc = vec_perm(vsrc, voverflow, valigner);
   1.318 +
         +            /* red ends up in the high byte, blue in the low byte and
         +               green in the high byte of its own vector */
   1.319 +            vR = vec_and((vector unsigned short)vsrc, vf800);
   1.320 +            vB = vec_sl((vector unsigned short)vsrc, v3);
   1.321 +            vG = vec_and(vec_sl(vB, v2), vfc00);
   1.322 +
         +            /* pixels 0..3 */
   1.323 +            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, valpha, vredalpha1);
   1.324 +            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
   1.325 +            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
   1.326 +            vdst1 = vec_perm(vdst1, valpha, vpermute);
   1.327 +            vec_st(vdst1, 0, dst);
   1.328 +
         +            /* pixels 4..7 */
   1.329 +            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, valpha, vredalpha2);
   1.330 +            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
   1.331 +            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
   1.332 +            vdst2 = vec_perm(vdst2, valpha, vpermute);
   1.333 +            vec_st(vdst2, 16, dst);
   1.334 +            
   1.335 +            width -= 8;
   1.336 +            dst += 32;
   1.337 +            src += 16;
   1.338 +            vsrc = voverflow;
   1.339 +        }
   1.340 +
   1.341 +        assert(width == 0);
   1.342 +
   1.343 +
   1.344 +        /* finish the row with scalar code */
   1.345 +        ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1.346 +#undef ONE_PIXEL_BLEND
   1.347 +
   1.348 +        src += srcskip;  /* move to next row, accounting for pitch. */
   1.349 +        dst += dstskip;
   1.350 +    }
   1.351 +
   1.352 +}
   1.353 +
   1.354 +static void BlitNtoNKey(SDL_BlitInfo *info);
   1.355 +static void BlitNtoNKeyCopyAlpha(SDL_BlitInfo *info);
   1.356 +static void Blit32to32KeyAltivec(SDL_BlitInfo *info)
   1.357 +{
   1.358 +    int height = info->d_height;
   1.359 +    Uint32 *srcp = (Uint32 *) info->s_pixels;
   1.360 +    int srcskip = info->s_skip;
   1.361 +    Uint32 *dstp = (Uint32 *) info->d_pixels;
   1.362 +    int dstskip = info->d_skip;
   1.363 +    SDL_PixelFormat *srcfmt = info->src;
   1.364 +    int srcbpp = srcfmt->BytesPerPixel;
   1.365 +    SDL_PixelFormat *dstfmt = info->dst;
   1.366 +    int dstbpp = dstfmt->BytesPerPixel;
   1.367 +    int copy_alpha = (srcfmt->Amask && dstfmt->Amask);
   1.368 +	unsigned alpha = dstfmt->Amask ? srcfmt->alpha : 0;
   1.369 +    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
   1.370 +	Uint32 ckey = info->src->colorkey;
   1.371 +    vector unsigned int valpha;
   1.372 +    vector unsigned char vpermute;
   1.373 +    vector unsigned char vzero;
   1.374 +    vector unsigned int vckey;
   1.375 +    vector unsigned int vrgbmask;
   1.376 +    vpermute = calc_swizzle32(srcfmt, dstfmt);
   1.377 +    if (info->d_width < 16) {
   1.378 +        if(copy_alpha) {
   1.379 +            return BlitNtoNKeyCopyAlpha(info);
   1.380 +        } else {
   1.381 +            return BlitNtoNKey(info);
   1.382 +        }
   1.383 +    }
   1.384 +    vzero = vec_splat_u8(0);
   1.385 +    if (alpha) {
   1.386 +        ((unsigned char *)&valpha)[0] = (unsigned char)alpha;
   1.387 +        valpha = (vector unsigned int)vec_splat((vector unsigned char)valpha, 0);
   1.388 +    } else {
   1.389 +        valpha = (vector unsigned int)vzero;
   1.390 +    }
   1.391 +    ckey &= rgbmask;
   1.392 +    ((unsigned int *)&vckey)[0] = ckey;
   1.393 +    vckey = vec_splat(vckey, 0);
   1.394 +    ((unsigned int *)&vrgbmask)[0] = rgbmask;
   1.395 +    vrgbmask = vec_splat(vrgbmask, 0);
   1.396 +
   1.397 +    while (height--) {
   1.398 +#define ONE_PIXEL_BLEND(condition, widthvar) \
   1.399 +        if (copy_alpha) { \
   1.400 +            while (condition) { \
   1.401 +                Uint32 pixel; \
   1.402 +                unsigned sR, sG, sB, sA; \
   1.403 +                DISEMBLE_RGBA((Uint8 *)srcp, srcbpp, srcfmt, pixel, \
   1.404 +                          sR, sG, sB, sA); \
   1.405 +                if ( (pixel & rgbmask) != ckey ) { \
   1.406 +                      ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
   1.407 +                            sR, sG, sB, sA); \
   1.408 +                } \
   1.409 +                ((Uint8 *)dstp) += dstbpp; \
   1.410 +                ((Uint8 *)srcp) += srcbpp; \
   1.411 +                widthvar--; \
   1.412 +            } \
   1.413 +        } else { \
   1.414 +            while (condition) { \
   1.415 +                Uint32 pixel; \
   1.416 +                unsigned sR, sG, sB; \
   1.417 +                RETRIEVE_RGB_PIXEL((Uint8 *)srcp, srcbpp, pixel); \
   1.418 +                if ( pixel != ckey ) { \
   1.419 +                    RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); \
   1.420 +                    ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
   1.421 +                              sR, sG, sB, alpha); \
   1.422 +                } \
   1.423 +                ((Uint8 *)dstp) += dstbpp; \
   1.424 +                ((Uint8 *)srcp) += srcbpp; \
   1.425 +                widthvar--; \
   1.426 +            } \
   1.427 +        }
   1.428 +        int width = info->d_width;
   1.429 +        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
   1.430 +        assert(width > 0);
   1.431 +        if (width > 0) {
   1.432 +            int extrawidth = (width % 4);
   1.433 +            vector unsigned char valigner = VEC_ALIGNER(srcp);
   1.434 +            vector unsigned int vs = vec_ld(0, srcp);
   1.435 +            width -= extrawidth;
   1.436 +            assert(width >= 4);
   1.437 +            while (width) {
   1.438 +                vector unsigned char vsel;
   1.439 +                vector unsigned int vd;
   1.440 +                vector unsigned int voverflow = vec_ld(15, srcp);
   1.441 +                /* load the source vec */
   1.442 +                vs = vec_perm(vs, voverflow, valigner);
   1.443 +                /* vsel is set for items that match the key */
   1.444 +                vsel = (vector unsigned char)vec_and(vs, vrgbmask);
   1.445 +                vsel = (vector unsigned char)vec_cmpeq(vs, vckey);
   1.446 +                /* permute the src vec to the dest format */
   1.447 +                vs = vec_perm(vs, valpha, vpermute);
   1.448 +                /* load the destination vec */
   1.449 +                vd = vec_ld(0, dstp);
   1.450 +                /* select the source and dest into vs */
   1.451 +                vd = (vector unsigned int)vec_sel((vector unsigned char)vs, (vector unsigned char)vd, vsel);
   1.452 +                
   1.453 +                vec_st(vd, 0, dstp);
   1.454 +                srcp += 4;
   1.455 +                width -= 4;
   1.456 +                dstp += 4;
   1.457 +                vs = voverflow;
   1.458 +            }
   1.459 +            ONE_PIXEL_BLEND((extrawidth), extrawidth);
   1.460 +#undef ONE_PIXEL_BLEND
   1.461 +            srcp += srcskip >> 2;
   1.462 +            dstp += dstskip >> 2;
   1.463 +        }
   1.464 +    }
   1.465 +}
   1.466 +
   1.467 +/*
         + * Altivec code to swizzle one 32-bit surface to a different 32-bit format.
         + *
         + * Each row is handled with scalar code until dst is 16-byte aligned, then
         + * four pixels per iteration with vec_perm, then scalar code for the
         + * remaining (width % 4) pixels.  This variant issues no cache prefetch
         + * (data stream) instructions.
         + */
   1.468 +/* Use this on a G5 */
   1.469 +static void ConvertAltivec32to32_noprefetch(SDL_BlitInfo *info)
   1.470 +{
   1.471 +    int height = info->d_height;
   1.472 +    Uint32 *src = (Uint32 *) info->s_pixels;
   1.473 +    int srcskip = info->s_skip;
   1.474 +    Uint32 *dst = (Uint32 *) info->d_pixels;
   1.475 +    int dstskip = info->d_skip;
   1.476 +    SDL_PixelFormat *srcfmt = info->src;
   1.477 +    int srcbpp = srcfmt->BytesPerPixel;
   1.478 +    SDL_PixelFormat *dstfmt = info->dst;
   1.479 +    int dstbpp = dstfmt->BytesPerPixel;
   1.480 +    vector unsigned int vzero = vec_splat_u32(0); /* fill for destination bytes not taken from the source */
   1.481 +    vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
   1.482 +    if (dstfmt->Amask && !srcfmt->Amask) {
   1.483 +        if (srcfmt->alpha) {
   1.484 +            vector unsigned char valpha;
   1.485 +            ((unsigned char *)&valpha)[0] = srcfmt->alpha;
   1.486 +            vzero = (vector unsigned int)vec_splat(valpha, 0); /* despite its name, vzero now holds srcfmt->alpha in every byte */
   1.487 +        }
   1.488 +    }
   1.489 +
   1.490 +    assert(srcbpp == 4);
   1.491 +    assert(dstbpp == 4);
   1.492 +
   1.493 +    while (height--) {
   1.494 +        vector unsigned char valigner;
   1.495 +        vector unsigned int vbits;
   1.496 +        vector unsigned int voverflow;
   1.497 +        Uint32 bits;
   1.498 +        Uint8 r, g, b, a;
   1.499 +
   1.500 +        int width = info->d_width;
   1.501 +        int extrawidth;
   1.502 +
   1.503 +        /* do scalar until we can align... */
         +        /* NOTE(review): the scalar loops take alpha from RGBA_FROM_8888,
         +           the vector loop takes it from vzero -- confirm both agree when
         +           the source has no alpha mask but srcfmt->alpha is set. */
   1.504 +        while ((UNALIGNED_PTR(dst)) && (width)) {
   1.505 +            bits = *(src++);
   1.506 +            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
   1.507 +            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
   1.508 +            width--;
   1.509 +        }
   1.510 +
   1.511 +        /* After all that work, here's the vector part! */
   1.512 +        extrawidth = (width % 4); /* pixels left over after the last full vector */
   1.513 +        width -= extrawidth;
   1.514 +        valigner = VEC_ALIGNER(src);
   1.515 +        vbits = vec_ld(0, src);
   1.516 +
   1.517 +       while (width) {
   1.518 +            voverflow = vec_ld(15, src);
   1.519 +            src += 4;
   1.520 +            width -= 4;
   1.521 +            vbits = vec_perm(vbits, voverflow, valigner);  /* src is ready. */
   1.522 +            vbits = vec_perm(vbits, vzero, vpermute);  /* swizzle it. */
   1.523 +            vec_st(vbits, 0, dst);  /* store it back out. */
   1.524 +            dst += 4;
   1.525 +            vbits = voverflow;
   1.526 +        }
   1.527 +
   1.528 +        assert(width == 0);
   1.529 +
   1.530 +        /* cover pixels at the end of the row that didn't fit in 16 bytes. */
   1.531 +        while (extrawidth) {
   1.532 +            bits = *(src++);  /* at most 3 pixels (width % 4). */
   1.533 +            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
   1.534 +            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
   1.535 +            extrawidth--;
   1.536 +        }
   1.537 +
   1.538 +        src += srcskip >> 2;  /* move to next row, accounting for pitch. */
   1.539 +        dst += dstskip >> 2;
   1.540 +    }
   1.541 +
   1.542 +}
   1.543 +
   1.544 +/*
         + * Altivec code to swizzle one 32-bit surface to a different 32-bit format.
         + *
         + * Same row handling as the no-prefetch variant (scalar until dst is
         + * 16-byte aligned, four pixels per vector iteration, scalar tail), but
         + * every iteration of the leading scalar loop and of the vector loop also
         + * issues vec_dstt()/vec_dstst() prefetches ahead of src and dst.  Both
         + * data streams are stopped with vec_dss() before returning.
         + */
   1.545 +/* Use this on a G4 */
   1.546 +static void ConvertAltivec32to32_prefetch(SDL_BlitInfo *info)
   1.547 +{
         +    /* NOTE(review): these leads are computed with sizeof (Uint32), like
         +       byte counts, but they are added to Uint32 pointers below, so the
         +       prefetch address is that many pixels ahead -- confirm which was
         +       intended. */
   1.548 +    const int scalar_dst_lead = sizeof (Uint32) * 4;
   1.549 +    const int vector_dst_lead = sizeof (Uint32) * 16;
   1.550 +
   1.551 +    int height = info->d_height;
   1.552 +    Uint32 *src = (Uint32 *) info->s_pixels;
   1.553 +    int srcskip = info->s_skip;
   1.554 +    Uint32 *dst = (Uint32 *) info->d_pixels;
   1.555 +    int dstskip = info->d_skip;
   1.556 +    SDL_PixelFormat *srcfmt = info->src;
   1.557 +    int srcbpp = srcfmt->BytesPerPixel;
   1.558 +    SDL_PixelFormat *dstfmt = info->dst;
   1.559 +    int dstbpp = dstfmt->BytesPerPixel;
   1.560 +    vector unsigned int vzero = vec_splat_u32(0); /* fill for destination bytes not taken from the source */
   1.561 +    vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
   1.562 +    if (dstfmt->Amask && !srcfmt->Amask) {
   1.563 +        if (srcfmt->alpha) {
   1.564 +            vector unsigned char valpha;
   1.565 +            ((unsigned char *)&valpha)[0] = srcfmt->alpha;
   1.566 +            vzero = (vector unsigned int)vec_splat(valpha, 0); /* despite its name, vzero now holds srcfmt->alpha in every byte */
   1.567 +        }
   1.568 +    }
   1.569 +
   1.570 +    assert(srcbpp == 4);
   1.571 +    assert(dstbpp == 4);
   1.572 +
   1.573 +    while (height--) {
   1.574 +        vector unsigned char valigner;
   1.575 +        vector unsigned int vbits;
   1.576 +        vector unsigned int voverflow;
   1.577 +        Uint32 bits;
   1.578 +        Uint8 r, g, b, a;
   1.579 +
   1.580 +        int width = info->d_width;
   1.581 +        int extrawidth;
   1.582 +
   1.583 +        /* do scalar until we can align... */
   1.584 +        while ((UNALIGNED_PTR(dst)) && (width)) {
   1.585 +            vec_dstt(src+scalar_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_SRC);
   1.586 +            vec_dstst(dst+scalar_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_DEST);
   1.587 +            bits = *(src++);
   1.588 +            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
   1.589 +            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
   1.590 +            width--;
   1.591 +        }
   1.592 +
   1.593 +        /* After all that work, here's the vector part! */
   1.594 +        extrawidth = (width % 4); /* pixels left over after the last full vector */
   1.595 +        width -= extrawidth;
   1.596 +        valigner = VEC_ALIGNER(src);
   1.597 +        vbits = vec_ld(0, src);
   1.598 +
   1.599 +        while (width) {
   1.600 +            vec_dstt(src+vector_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_SRC);
   1.601 +            vec_dstst(dst+vector_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_DEST);
   1.602 +            voverflow = vec_ld(15, src);
   1.603 +            src += 4;
   1.604 +            width -= 4;
   1.605 +            vbits = vec_perm(vbits, voverflow, valigner);  /* src is ready. */
   1.606 +            vbits = vec_perm(vbits, vzero, vpermute);  /* swizzle it. */
   1.607 +            vec_st(vbits, 0, dst);  /* store it back out. */
   1.608 +            dst += 4;
   1.609 +            vbits = voverflow;
   1.610 +        }
   1.611 +        
   1.612 +        assert(width == 0);
   1.613 +
   1.614 +        /* cover pixels at the end of the row that didn't fit in 16 bytes. */
   1.615 +        while (extrawidth) {
   1.616 +            bits = *(src++);  /* at most 3 pixels (width % 4), don't bother with prefetch. */
   1.617 +            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
   1.618 +            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
   1.619 +            extrawidth--;
   1.620 +        }
   1.621 +
   1.622 +        src += srcskip >> 2;  /* move to next row, accounting for pitch. */
   1.623 +        dst += dstskip >> 2;
   1.624 +    }
   1.625 +
   1.626 +    vec_dss(DST_CHAN_SRC);
   1.627 +    vec_dss(DST_CHAN_DEST);
   1.628 +}
   1.629 +
   1.630 +static Uint32 GetBlitFeatures( void )
   1.631 +{
   1.632 +    /* 0xffffffff means "not determined yet"; the result is cached here. */
   1.633 +    static Uint32 features = 0xffffffff;
   1.634 +    char *forced;
   1.635 +    Uint32 detected = 0;
   1.636 +
   1.637 +    if (features != 0xffffffff) {
   1.638 +        return features;
   1.639 +    }
   1.640 +
   1.641 +    /* Provide an override for testing .. */
   1.642 +    forced = getenv("SDL_ALTIVEC_BLIT_FEATURES");
   1.643 +    if (forced) {
   1.644 +        features = 0;
   1.645 +        sscanf(forced, "%u", &features);
   1.646 +        return features;
   1.647 +    }
   1.648 +
   1.649 +    if (SDL_HasMMX()) detected |= 1;            /* Feature 1 is has-MMX */
   1.650 +    if (SDL_HasAltiVec()) detected |= 2;        /* Feature 2 is has-AltiVec */
   1.651 +    if (GetL3CacheSize() == 0) detected |= 4;   /* Feature 4 is dont-use-prefetch */
         +    features = detected;
         +    return features;
         +}
   1.652 +#else
   1.653 +/* Without the AltiVec blitters the only feature bit is 1 (has-MMX). */
   1.654 +#define GetBlitFeatures() ((Uint32)(SDL_HasMMX() ? 1 : 0))
   1.655 +#endif
   1.656 +
   1.657  #ifdef USE_ASMBLIT
   1.658  
   1.659  /* Heheheh, we coerce Hermes into using SDL blit information */
   1.660 @@ -406,11 +1056,7 @@
   1.661  
   1.662  
   1.663  /* Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces */
   1.664 -#if ( SDL_BYTEORDER == SDL_LIL_ENDIAN )
   1.665 -#define RGB565_32(dst, src, map) (map[src[0]*2] + map[src[1]*2+1])
   1.666 -#else /* ( SDL_BYTEORDER == SDL_BIG_ENDIAN ) */
   1.667 -#define RGB565_32(dst, src, map) (map[src[1]*2] + map[src[0]*2+1])
   1.668 -#endif
   1.669 +#define RGB565_32(dst, src, map) (map[src[LO]*2] + map[src[HI]*2+1])
   1.670  static void Blit_RGB565_32(SDL_BlitInfo *info, const Uint32 *map)
   1.671  {
   1.672  #ifndef USE_DUFFS_LOOP
   1.673 @@ -1422,10 +2068,10 @@
   1.674  	Uint32 srcR, srcG, srcB;
   1.675  	int dstbpp;
   1.676  	Uint32 dstR, dstG, dstB;
   1.677 -	SDL_bool cpu_mmx;
   1.678 +	Uint32 blit_features;
   1.679  	void *aux_data;
   1.680  	SDL_loblit blitfunc;
   1.681 -        enum { NO_ALPHA, SET_ALPHA, COPY_ALPHA } alpha;
   1.682 +	enum { NO_ALPHA=1, SET_ALPHA=2, COPY_ALPHA=4 } alpha;
   1.683  };
   1.684  static const struct blit_table normal_blit_1[] = {
   1.685  	/* Default for 8-bit RGB source, an invalid combination */
   1.686 @@ -1440,6 +2086,11 @@
   1.687      { 0x0000F800,0x000007E0,0x0000001F, 2, 0x0000001F,0x000003E0,0x00007C00,
   1.688        0, ConvertX86p16_16BGR555, ConvertX86, NO_ALPHA },
   1.689  #endif
   1.690 +#ifdef USE_ALTIVEC_BLITTERS
   1.691 +    /* has-altivec */
   1.692 +    { 0x0000F800,0x000007E0,0x0000001F, 4, 0x00000000,0x00000000,0x00000000,
   1.693 +      2, NULL, Blit_RGB565_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
   1.694 +#endif
   1.695      { 0x0000F800,0x000007E0,0x0000001F, 4, 0x00FF0000,0x0000FF00,0x000000FF,
   1.696        0, NULL, Blit_RGB565_ARGB8888, SET_ALPHA },
   1.697      { 0x0000F800,0x000007E0,0x0000001F, 4, 0x000000FF,0x0000FF00,0x00FF0000,
   1.698 @@ -1485,6 +2136,17 @@
   1.699      { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0x0000FF00,0x00FF0000,0xFF000000,
   1.700        0, ConvertX86p32_32BGRA888, ConvertX86, NO_ALPHA },
   1.701  #else
   1.702 +#ifdef USE_ALTIVEC_BLITTERS
   1.703 +    /* has-altivec | dont-use-prefetch */
   1.704 +    { 0x00000000,0x00000000,0x00000000, 4, 0x00000000,0x00000000,0x00000000,
   1.705 +      6, NULL, ConvertAltivec32to32_noprefetch, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
   1.706 +    /* has-altivec */
   1.707 +    { 0x00000000,0x00000000,0x00000000, 4, 0x00000000,0x00000000,0x00000000,
   1.708 +      2, NULL, ConvertAltivec32to32_prefetch, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
   1.709 +    /* has-altivec */
   1.710 +    { 0x00000000,0x00000000,0x00000000, 2, 0x0000F800,0x000007E0,0x0000001F,
   1.711 +      2, NULL, Blit_RGB888_RGB565Altivec, NO_ALPHA },
   1.712 +#endif
   1.713      { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x0000F800,0x000007E0,0x0000001F,
   1.714        0, NULL, Blit_RGB888_RGB565, NO_ALPHA },
   1.715      { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x00007C00,0x000003E0,0x0000001F,
   1.716 @@ -1497,6 +2159,9 @@
   1.717  	normal_blit_1, normal_blit_2, normal_blit_3, normal_blit_4
   1.718  };
   1.719  
   1.720 +/* True when mask x equals the table mask y, or when the table entry y is
         +   zero (a zero table mask acts as a wildcard that accepts any mask). */
   1.721 +#define MASKOK(x, y) (((x) == (y)) || ((y) == 0x00000000))
   1.722 +
   1.723  SDL_loblit SDL_CalculateBlitN(SDL_Surface *surface, int blit_index)
   1.724  {
   1.725  	struct private_swaccel *sdata;
   1.726 @@ -1532,6 +2197,12 @@
   1.727  	    else if(dstfmt->BytesPerPixel == 1)
   1.728  		return BlitNto1Key;
   1.729  	    else {
   1.730 +#ifdef USE_ALTIVEC_BLITTERS
   1.731 +        if((srcfmt->BytesPerPixel == 4) && (dstfmt->BytesPerPixel == 4) && SDL_HasAltiVec()) {
   1.732 +            return Blit32to32KeyAltivec;
   1.733 +        } else
   1.734 +#endif
   1.735 +
   1.736  		if(srcfmt->Amask && dstfmt->Amask)
   1.737  		    return BlitNtoNKeyCopyAlpha;
   1.738  		else
   1.739 @@ -1561,20 +2232,20 @@
   1.740  		}
   1.741  	} else {
   1.742  		/* Now the meat, choose the blitter we want */
   1.743 -	        int a_need = 0;
   1.744 +		int a_need = 0;
   1.745  		if(dstfmt->Amask)
   1.746  		    a_need = srcfmt->Amask ? COPY_ALPHA : SET_ALPHA;
   1.747  		table = normal_blit[srcfmt->BytesPerPixel-1];
   1.748 -		for ( which=0; table[which].srcR; ++which ) {
   1.749 -			if ( srcfmt->Rmask == table[which].srcR &&
   1.750 -			     srcfmt->Gmask == table[which].srcG &&
   1.751 -			     srcfmt->Bmask == table[which].srcB &&
   1.752 -			     dstfmt->BytesPerPixel == table[which].dstbpp &&
   1.753 -			     dstfmt->Rmask == table[which].dstR &&
   1.754 -			     dstfmt->Gmask == table[which].dstG &&
   1.755 -			     dstfmt->Bmask == table[which].dstB &&
   1.756 -			     (a_need & table[which].alpha) == a_need &&
   1.757 -			     (table[which].cpu_mmx == SDL_HasMMX())) 
   1.758 +		for ( which=0; table[which].dstbpp; ++which ) {
   1.759 +			if ( MASKOK(srcfmt->Rmask, table[which].srcR) &&
   1.760 +			    MASKOK(srcfmt->Gmask, table[which].srcG) &&
   1.761 +			    MASKOK(srcfmt->Bmask, table[which].srcB) &&
   1.762 +			    MASKOK(dstfmt->Rmask, table[which].dstR) &&
   1.763 +			    MASKOK(dstfmt->Gmask, table[which].dstG) &&
   1.764 +			    MASKOK(dstfmt->Bmask, table[which].dstB) &&
   1.765 +			    dstfmt->BytesPerPixel == table[which].dstbpp &&
   1.766 +			    (a_need & table[which].alpha) == a_need &&
   1.767 +			    ((table[which].blit_features & GetBlitFeatures()) == table[which].blit_features) )
   1.768  				break;
   1.769  		}
   1.770  		sdata->aux_data = table[which].aux_data;