src/video/SDL_blit_N.c
changeset 5259 6a65c1fc07af
parent 3697 f7b03b6838cb
child 5262 b530ef003506
equal deleted inserted replaced
5258:f26314c20071 5259:6a65c1fc07af
    26 #include "SDL_cpuinfo.h"
    26 #include "SDL_cpuinfo.h"
    27 #include "SDL_blit.h"
    27 #include "SDL_blit.h"
    28 
    28 
    29 /* Functions to blit from N-bit surfaces to other surfaces */
    29 /* Functions to blit from N-bit surfaces to other surfaces */
    30 
    30 
    31 #if SDL_ALTIVEC_BLITTERS
       
    32 #if __MWERKS__
       
    33 #pragma altivec_model on
       
    34 #endif
       
    35 #ifdef HAVE_ALTIVEC_H
       
    36 #include <altivec.h>
       
    37 #endif
       
    38 #define assert(X)
       
    39 #ifdef __MACOSX__
       
    40 #include <sys/sysctl.h>
       
    41 static size_t
       
    42 GetL3CacheSize(void)
       
    43 {
       
    44     const char key[] = "hw.l3cachesize";
       
    45     u_int64_t result = 0;
       
    46     size_t typeSize = sizeof(result);
       
    47 
       
    48 
       
    49     int err = sysctlbyname(key, &result, &typeSize, NULL, 0);
       
    50     if (0 != err)
       
    51         return 0;
       
    52 
       
    53     return result;
       
    54 }
       
    55 #else
       
    56 static size_t
       
    57 GetL3CacheSize(void)
       
    58 {
       
    59     /* XXX: Just guess G4 */
       
    60     return 2097152;
       
    61 }
       
    62 #endif /* __MACOSX__ */
       
    63 
       
/*
 * Vector literal helpers.
 *
 * VECUINT8_LITERAL takes 16 byte values and VECUINT16_LITERAL takes 8
 * 16-bit values; each expands to a cast vector literal.  When building
 * for __MACOSX__ with __GNUC__ < 4 the element list is wrapped in
 * parentheses, otherwise it is wrapped in braces.
 */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
        (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
        (vector unsigned short) { a,b,c,d,e,f,g,h }
#endif
       
    75 
       
    76 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
       
    77 #define VSWIZZLE32(a,b,c,d) (vector unsigned char) \
       
    78                                ( 0x00+a, 0x00+b, 0x00+c, 0x00+d, \
       
    79                                  0x04+a, 0x04+b, 0x04+c, 0x04+d, \
       
    80                                  0x08+a, 0x08+b, 0x08+c, 0x08+d, \
       
    81                                  0x0C+a, 0x0C+b, 0x0C+c, 0x0C+d )
       
    82 
       
    83 #define MAKE8888(dstfmt, r, g, b, a)  \
       
    84     ( ((r<<dstfmt->Rshift)&dstfmt->Rmask) | \
       
    85       ((g<<dstfmt->Gshift)&dstfmt->Gmask) | \
       
    86       ((b<<dstfmt->Bshift)&dstfmt->Bmask) | \
       
    87       ((a<<dstfmt->Ashift)&dstfmt->Amask) )
       
    88 
       
/*
 * Data Stream Touch...Altivec cache prefetching.
 *
 *  Don't use this on a G5...however, the speed boost is very significant
 *   on a G4.
 */
/* Channel identifiers used for the source and destination streams. */
#define DST_CHAN_SRC 1
#define DST_CHAN_DEST 2

/* macro to set DST control word value... */
/* size is shifted left by 24, count by 16, and stride fills the low bits. */
#define DST_CTRL(size, count, stride) \
    (((size) << 24) | ((count) << 16) | (stride))

/*
 * Permute mask used together with vec_perm() to realign source data.
 *
 * When src is not 16-byte aligned the mask is vec_lvsl(0, src).  When src
 * is aligned the mask is vec_lvsl(8, src) with 8 added to every byte.
 * NOTE(review): the aligned form appears intended to pick every byte from
 * the second vec_perm operand, since the blit loops in this file load that
 * operand with vec_ld(15, src) -- confirm against the AltiVec PIM.
 */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, src) \
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
       
   105 
       
   106 /* Calculate the permute vector used for 32->32 swizzling */
       
   107 static vector unsigned char
       
   108 calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
       
   109 {
       
   110     /*
       
   111      * We have to assume that the bits that aren't used by other
       
   112      *  colors is alpha, and it's one complete byte, since some formats
       
   113      *  leave alpha with a zero mask, but we should still swizzle the bits.
       
   114      */
       
   115     /* ARGB */
       
   116     const static const struct SDL_PixelFormat default_pixel_format = {
       
   117         NULL, 32, 4,
       
   118         0, 0, 0, 0,
       
   119         16, 8, 0, 24,
       
   120         0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000
       
   121     };
       
   122     if (!srcfmt) {
       
   123         srcfmt = &default_pixel_format;
       
   124     }
       
   125     if (!dstfmt) {
       
   126         dstfmt = &default_pixel_format;
       
   127     }
       
   128     const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
       
   129                                                        0x04, 0x04, 0x04, 0x04,
       
   130                                                        0x08, 0x08, 0x08, 0x08,
       
   131                                                        0x0C, 0x0C, 0x0C,
       
   132                                                        0x0C);
       
   133     vector unsigned char vswiz;
       
   134     vector unsigned int srcvec;
       
   135 #define RESHIFT(X) (3 - ((X) >> 3))
       
   136     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
       
   137     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
       
   138     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
       
   139     Uint32 amask;
       
   140     /* Use zero for alpha if either surface doesn't have alpha */
       
   141     if (dstfmt->Amask) {
       
   142         amask =
       
   143             ((srcfmt->Amask) ? RESHIFT(srcfmt->
       
   144                                        Ashift) : 0x10) << (dstfmt->Ashift);
       
   145     } else {
       
   146         amask =
       
   147             0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
       
   148                           0xFFFFFFFF);
       
   149     }
       
   150 #undef RESHIFT
       
   151     ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
       
   152     vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
       
   153     return (vswiz);
       
   154 }
       
   155 
       
   156 static void Blit_RGB888_RGB565(SDL_BlitInfo * info);
       
   157 static void
       
   158 Blit_RGB888_RGB565Altivec(SDL_BlitInfo * info)
       
   159 {
       
   160     int height = info->dst_h;
       
   161     Uint8 *src = (Uint8 *) info->src;
       
   162     int srcskip = info->src_skip;
       
   163     Uint8 *dst = (Uint8 *) info->dst;
       
   164     int dstskip = info->dst_skip;
       
   165     SDL_PixelFormat *srcfmt = info->src_fmt;
       
   166     vector unsigned char valpha = vec_splat_u8(0);
       
   167     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
       
   168     vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
       
   169                                                     0x00, 0x0a, 0x00, 0x0e,
       
   170                                                     0x00, 0x12, 0x00, 0x16,
       
   171                                                     0x00, 0x1a, 0x00, 0x1e);
       
   172     vector unsigned short v1 = vec_splat_u16(1);
       
   173     vector unsigned short v3 = vec_splat_u16(3);
       
   174     vector unsigned short v3f =
       
   175         VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
       
   176                           0x003f, 0x003f, 0x003f, 0x003f);
       
   177     vector unsigned short vfc =
       
   178         VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
       
   179                           0x00fc, 0x00fc, 0x00fc, 0x00fc);
       
   180     vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
       
   181     vf800 = vec_sl(vf800, vec_splat_u16(8));
       
   182 
       
   183     while (height--) {
       
   184         vector unsigned char valigner;
       
   185         vector unsigned char voverflow;
       
   186         vector unsigned char vsrc;
       
   187 
       
   188         int width = info->dst_w;
       
   189         int extrawidth;
       
   190 
       
   191         /* do scalar until we can align... */
       
   192 #define ONE_PIXEL_BLEND(condition, widthvar) \
       
   193         while (condition) { \
       
   194             Uint32 Pixel; \
       
   195             unsigned sR, sG, sB, sA; \
       
   196             DISEMBLE_RGBA((Uint8 *)src, 4, srcfmt, Pixel, \
       
   197                           sR, sG, sB, sA); \
       
   198             *(Uint16 *)(dst) = (((sR << 8) & 0x0000F800) | \
       
   199                                 ((sG << 3) & 0x000007E0) | \
       
   200                                 ((sB >> 3) & 0x0000001F)); \
       
   201             dst += 2; \
       
   202             src += 4; \
       
   203             widthvar--; \
       
   204         }
       
   205 
       
   206         ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
       
   207 
       
   208         /* After all that work, here's the vector part! */
       
   209         extrawidth = (width % 8);       /* trailing unaligned stores */
       
   210         width -= extrawidth;
       
   211         vsrc = vec_ld(0, src);
       
   212         valigner = VEC_ALIGNER(src);
       
   213 
       
   214         while (width) {
       
   215             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
       
   216             vector unsigned int vsrc1, vsrc2;
       
   217             vector unsigned char vdst;
       
   218 
       
   219             voverflow = vec_ld(15, src);
       
   220             vsrc = vec_perm(vsrc, voverflow, valigner);
       
   221             vsrc1 = (vector unsigned int) vec_perm(vsrc, valpha, vpermute);
       
   222             src += 16;
       
   223             vsrc = voverflow;
       
   224             voverflow = vec_ld(15, src);
       
   225             vsrc = vec_perm(vsrc, voverflow, valigner);
       
   226             vsrc2 = (vector unsigned int) vec_perm(vsrc, valpha, vpermute);
       
   227             /* 1555 */
       
   228             vpixel = (vector unsigned short) vec_packpx(vsrc1, vsrc2);
       
   229             vgpixel = (vector unsigned short) vec_perm(vsrc1, vsrc2, vgmerge);
       
   230             vgpixel = vec_and(vgpixel, vfc);
       
   231             vgpixel = vec_sl(vgpixel, v3);
       
   232             vrpixel = vec_sl(vpixel, v1);
       
   233             vrpixel = vec_and(vrpixel, vf800);
       
   234             vbpixel = vec_and(vpixel, v3f);
       
   235             vdst =
       
   236                 vec_or((vector unsigned char) vrpixel,
       
   237                        (vector unsigned char) vgpixel);
       
   238             /* 565 */
       
   239             vdst = vec_or(vdst, (vector unsigned char) vbpixel);
       
   240             vec_st(vdst, 0, dst);
       
   241 
       
   242             width -= 8;
       
   243             src += 16;
       
   244             dst += 16;
       
   245             vsrc = voverflow;
       
   246         }
       
   247 
       
   248         assert(width == 0);
       
   249 
       
   250         /* do scalar until we can align... */
       
   251         ONE_PIXEL_BLEND((extrawidth), extrawidth);
       
   252 #undef ONE_PIXEL_BLEND
       
   253 
       
   254         src += srcskip;         /* move to next row, accounting for pitch. */
       
   255         dst += dstskip;
       
   256     }
       
   257 
       
   258 
       
   259 }
       
   260 
       
   261 static void
       
   262 Blit_RGB565_32Altivec(SDL_BlitInfo * info)
       
   263 {
       
   264     int height = info->dst_h;
       
   265     Uint8 *src = (Uint8 *) info->src;
       
   266     int srcskip = info->src_skip;
       
   267     Uint8 *dst = (Uint8 *) info->dst;
       
   268     int dstskip = info->dst_skip;
       
   269     SDL_PixelFormat *srcfmt = info->src_fmt;
       
   270     SDL_PixelFormat *dstfmt = info->dst_fmt;
       
   271     unsigned alpha;
       
   272     vector unsigned char valpha;
       
   273     vector unsigned char vpermute;
       
   274     vector unsigned short vf800;
       
   275     vector unsigned int v8 = vec_splat_u32(8);
       
   276     vector unsigned int v16 = vec_add(v8, v8);
       
   277     vector unsigned short v2 = vec_splat_u16(2);
       
   278     vector unsigned short v3 = vec_splat_u16(3);
       
   279     /* 
       
   280        0x10 - 0x1f is the alpha
       
   281        0x00 - 0x0e evens are the red
       
   282        0x01 - 0x0f odds are zero
       
   283      */
       
   284     vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
       
   285                                                        0x10, 0x02, 0x01, 0x01,
       
   286                                                        0x10, 0x04, 0x01, 0x01,
       
   287                                                        0x10, 0x06, 0x01,
       
   288                                                        0x01);
       
   289     vector unsigned char vredalpha2 =
       
   290         (vector unsigned
       
   291          char) (vec_add((vector unsigned int) vredalpha1, vec_sl(v8, v16))
       
   292         );
       
   293     /*
       
   294        0x00 - 0x0f is ARxx ARxx ARxx ARxx
       
   295        0x11 - 0x0f odds are blue
       
   296      */
       
   297     vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
       
   298                                                    0x04, 0x05, 0x06, 0x13,
       
   299                                                    0x08, 0x09, 0x0a, 0x15,
       
   300                                                    0x0c, 0x0d, 0x0e, 0x17);
       
   301     vector unsigned char vblue2 =
       
   302         (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8)
       
   303         );
       
   304     /*
       
   305        0x00 - 0x0f is ARxB ARxB ARxB ARxB
       
   306        0x10 - 0x0e evens are green
       
   307      */
       
   308     vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
       
   309                                                     0x04, 0x05, 0x12, 0x07,
       
   310                                                     0x08, 0x09, 0x14, 0x0b,
       
   311                                                     0x0c, 0x0d, 0x16, 0x0f);
       
   312     vector unsigned char vgreen2 =
       
   313         (vector unsigned
       
   314          char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8, v8))
       
   315         );
       
   316 
       
   317 
       
   318     assert(srcfmt->BytesPerPixel == 2);
       
   319     assert(dstfmt->BytesPerPixel == 4);
       
   320 
       
   321     vf800 = (vector unsigned short) vec_splat_u8(-7);
       
   322     vf800 = vec_sl(vf800, vec_splat_u16(8));
       
   323 
       
   324     if (dstfmt->Amask && info->a) {
       
   325         ((unsigned char *) &valpha)[0] = alpha = info->a;
       
   326         valpha = vec_splat(valpha, 0);
       
   327     } else {
       
   328         alpha = 0;
       
   329         valpha = vec_splat_u8(0);
       
   330     }
       
   331 
       
   332     vpermute = calc_swizzle32(NULL, dstfmt);
       
   333     while (height--) {
       
   334         vector unsigned char valigner;
       
   335         vector unsigned char voverflow;
       
   336         vector unsigned char vsrc;
       
   337 
       
   338         int width = info->dst_w;
       
   339         int extrawidth;
       
   340 
       
   341         /* do scalar until we can align... */
       
   342 #define ONE_PIXEL_BLEND(condition, widthvar) \
       
   343         while (condition) { \
       
   344             unsigned sR, sG, sB; \
       
   345             unsigned short Pixel = *((unsigned short *)src); \
       
   346             sR = (Pixel >> 8) & 0xf8; \
       
   347             sG = (Pixel >> 3) & 0xfc; \
       
   348             sB = (Pixel << 3) & 0xf8; \
       
   349             ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
       
   350             src += 2; \
       
   351             dst += 4; \
       
   352             widthvar--; \
       
   353         }
       
   354         ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
       
   355 
       
   356         /* After all that work, here's the vector part! */
       
   357         extrawidth = (width % 8);       /* trailing unaligned stores */
       
   358         width -= extrawidth;
       
   359         vsrc = vec_ld(0, src);
       
   360         valigner = VEC_ALIGNER(src);
       
   361 
       
   362         while (width) {
       
   363             vector unsigned short vR, vG, vB;
       
   364             vector unsigned char vdst1, vdst2;
       
   365 
       
   366             voverflow = vec_ld(15, src);
       
   367             vsrc = vec_perm(vsrc, voverflow, valigner);
       
   368 
       
   369             vR = vec_and((vector unsigned short) vsrc, vf800);
       
   370             vB = vec_sl((vector unsigned short) vsrc, v3);
       
   371             vG = vec_sl(vB, v2);
       
   372 
       
   373             vdst1 =
       
   374                 (vector unsigned char) vec_perm((vector unsigned char) vR,
       
   375                                                 valpha, vredalpha1);
       
   376             vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
       
   377             vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
       
   378             vdst1 = vec_perm(vdst1, valpha, vpermute);
       
   379             vec_st(vdst1, 0, dst);
       
   380 
       
   381             vdst2 =
       
   382                 (vector unsigned char) vec_perm((vector unsigned char) vR,
       
   383                                                 valpha, vredalpha2);
       
   384             vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
       
   385             vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
       
   386             vdst2 = vec_perm(vdst2, valpha, vpermute);
       
   387             vec_st(vdst2, 16, dst);
       
   388 
       
   389             width -= 8;
       
   390             dst += 32;
       
   391             src += 16;
       
   392             vsrc = voverflow;
       
   393         }
       
   394 
       
   395         assert(width == 0);
       
   396 
       
   397 
       
   398         /* do scalar until we can align... */
       
   399         ONE_PIXEL_BLEND((extrawidth), extrawidth);
       
   400 #undef ONE_PIXEL_BLEND
       
   401 
       
   402         src += srcskip;         /* move to next row, accounting for pitch. */
       
   403         dst += dstskip;
       
   404     }
       
   405 
       
   406 }
       
   407 
       
   408 
       
   409 static void
       
   410 Blit_RGB555_32Altivec(SDL_BlitInfo * info)
       
   411 {
       
   412     int height = info->dst_h;
       
   413     Uint8 *src = (Uint8 *) info->src;
       
   414     int srcskip = info->src_skip;
       
   415     Uint8 *dst = (Uint8 *) info->dst;
       
   416     int dstskip = info->dst_skip;
       
   417     SDL_PixelFormat *srcfmt = info->src_fmt;
       
   418     SDL_PixelFormat *dstfmt = info->dst_fmt;
       
   419     unsigned alpha;
       
   420     vector unsigned char valpha;
       
   421     vector unsigned char vpermute;
       
   422     vector unsigned short vf800;
       
   423     vector unsigned int v8 = vec_splat_u32(8);
       
   424     vector unsigned int v16 = vec_add(v8, v8);
       
   425     vector unsigned short v1 = vec_splat_u16(1);
       
   426     vector unsigned short v3 = vec_splat_u16(3);
       
   427     /* 
       
   428        0x10 - 0x1f is the alpha
       
   429        0x00 - 0x0e evens are the red
       
   430        0x01 - 0x0f odds are zero
       
   431      */
       
   432     vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
       
   433                                                        0x10, 0x02, 0x01, 0x01,
       
   434                                                        0x10, 0x04, 0x01, 0x01,
       
   435                                                        0x10, 0x06, 0x01,
       
   436                                                        0x01);
       
   437     vector unsigned char vredalpha2 =
       
   438         (vector unsigned
       
   439          char) (vec_add((vector unsigned int) vredalpha1, vec_sl(v8, v16))
       
   440         );
       
   441     /*
       
   442        0x00 - 0x0f is ARxx ARxx ARxx ARxx
       
   443        0x11 - 0x0f odds are blue
       
   444      */
       
   445     vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
       
   446                                                    0x04, 0x05, 0x06, 0x13,
       
   447                                                    0x08, 0x09, 0x0a, 0x15,
       
   448                                                    0x0c, 0x0d, 0x0e, 0x17);
       
   449     vector unsigned char vblue2 =
       
   450         (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8)
       
   451         );
       
   452     /*
       
   453        0x00 - 0x0f is ARxB ARxB ARxB ARxB
       
   454        0x10 - 0x0e evens are green
       
   455      */
       
   456     vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
       
   457                                                     0x04, 0x05, 0x12, 0x07,
       
   458                                                     0x08, 0x09, 0x14, 0x0b,
       
   459                                                     0x0c, 0x0d, 0x16, 0x0f);
       
   460     vector unsigned char vgreen2 =
       
   461         (vector unsigned
       
   462          char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8, v8))
       
   463         );
       
   464 
       
   465 
       
   466     assert(srcfmt->BytesPerPixel == 2);
       
   467     assert(dstfmt->BytesPerPixel == 4);
       
   468 
       
   469     vf800 = (vector unsigned short) vec_splat_u8(-7);
       
   470     vf800 = vec_sl(vf800, vec_splat_u16(8));
       
   471 
       
   472     if (dstfmt->Amask && info->a) {
       
   473         ((unsigned char *) &valpha)[0] = alpha = info->a;
       
   474         valpha = vec_splat(valpha, 0);
       
   475     } else {
       
   476         alpha = 0;
       
   477         valpha = vec_splat_u8(0);
       
   478     }
       
   479 
       
   480     vpermute = calc_swizzle32(NULL, dstfmt);
       
   481     while (height--) {
       
   482         vector unsigned char valigner;
       
   483         vector unsigned char voverflow;
       
   484         vector unsigned char vsrc;
       
   485 
       
   486         int width = info->dst_w;
       
   487         int extrawidth;
       
   488 
       
   489         /* do scalar until we can align... */
       
   490 #define ONE_PIXEL_BLEND(condition, widthvar) \
       
   491         while (condition) { \
       
   492             unsigned sR, sG, sB; \
       
   493             unsigned short Pixel = *((unsigned short *)src); \
       
   494             sR = (Pixel >> 7) & 0xf8; \
       
   495             sG = (Pixel >> 2) & 0xf8; \
       
   496             sB = (Pixel << 3) & 0xf8; \
       
   497             ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
       
   498             src += 2; \
       
   499             dst += 4; \
       
   500             widthvar--; \
       
   501         }
       
   502         ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
       
   503 
       
   504         /* After all that work, here's the vector part! */
       
   505         extrawidth = (width % 8);       /* trailing unaligned stores */
       
   506         width -= extrawidth;
       
   507         vsrc = vec_ld(0, src);
       
   508         valigner = VEC_ALIGNER(src);
       
   509 
       
   510         while (width) {
       
   511             vector unsigned short vR, vG, vB;
       
   512             vector unsigned char vdst1, vdst2;
       
   513 
       
   514             voverflow = vec_ld(15, src);
       
   515             vsrc = vec_perm(vsrc, voverflow, valigner);
       
   516 
       
   517             vR = vec_and(vec_sl((vector unsigned short) vsrc, v1), vf800);
       
   518             vB = vec_sl((vector unsigned short) vsrc, v3);
       
   519             vG = vec_sl(vB, v3);
       
   520 
       
   521             vdst1 =
       
   522                 (vector unsigned char) vec_perm((vector unsigned char) vR,
       
   523                                                 valpha, vredalpha1);
       
   524             vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
       
   525             vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
       
   526             vdst1 = vec_perm(vdst1, valpha, vpermute);
       
   527             vec_st(vdst1, 0, dst);
       
   528 
       
   529             vdst2 =
       
   530                 (vector unsigned char) vec_perm((vector unsigned char) vR,
       
   531                                                 valpha, vredalpha2);
       
   532             vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
       
   533             vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
       
   534             vdst2 = vec_perm(vdst2, valpha, vpermute);
       
   535             vec_st(vdst2, 16, dst);
       
   536 
       
   537             width -= 8;
       
   538             dst += 32;
       
   539             src += 16;
       
   540             vsrc = voverflow;
       
   541         }
       
   542 
       
   543         assert(width == 0);
       
   544 
       
   545 
       
   546         /* do scalar until we can align... */
       
   547         ONE_PIXEL_BLEND((extrawidth), extrawidth);
       
   548 #undef ONE_PIXEL_BLEND
       
   549 
       
   550         src += srcskip;         /* move to next row, accounting for pitch. */
       
   551         dst += dstskip;
       
   552     }
       
   553 
       
   554 }
       
   555 
       
   556 static void BlitNtoNKey(SDL_BlitInfo * info);
       
   557 static void BlitNtoNKeyCopyAlpha(SDL_BlitInfo * info);
       
   558 static void
       
   559 Blit32to32KeyAltivec(SDL_BlitInfo * info)
       
   560 {
       
   561     int height = info->dst_h;
       
   562     Uint32 *srcp = (Uint32 *) info->src;
       
   563     int srcskip = info->src_skip / 4;
       
   564     Uint32 *dstp = (Uint32 *) info->dst;
       
   565     int dstskip = info->dst_skip / 4;
       
   566     SDL_PixelFormat *srcfmt = info->src_fmt;
       
   567     int srcbpp = srcfmt->BytesPerPixel;
       
   568     SDL_PixelFormat *dstfmt = info->dst_fmt;
       
   569     int dstbpp = dstfmt->BytesPerPixel;
       
   570     int copy_alpha = (srcfmt->Amask && dstfmt->Amask);
       
   571     unsigned alpha = dstfmt->Amask ? info->a : 0;
       
   572     Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
       
   573     Uint32 ckey = info->colorkey;
       
   574     vector unsigned int valpha;
       
   575     vector unsigned char vpermute;
       
   576     vector unsigned char vzero;
       
   577     vector unsigned int vckey;
       
   578     vector unsigned int vrgbmask;
       
   579     vpermute = calc_swizzle32(srcfmt, dstfmt);
       
   580     if (info->dst_w < 16) {
       
   581         if (copy_alpha) {
       
   582             BlitNtoNKeyCopyAlpha(info);
       
   583         } else {
       
   584             BlitNtoNKey(info);
       
   585         }
       
   586         return;
       
   587     }
       
   588     vzero = vec_splat_u8(0);
       
   589     if (alpha) {
       
   590         ((unsigned char *) &valpha)[0] = (unsigned char) alpha;
       
   591         valpha =
       
   592             (vector unsigned int) vec_splat((vector unsigned char) valpha, 0);
       
   593     } else {
       
   594         valpha = (vector unsigned int) vzero;
       
   595     }
       
   596     ckey &= rgbmask;
       
   597     ((unsigned int *) (char *) &vckey)[0] = ckey;
       
   598     vckey = vec_splat(vckey, 0);
       
   599     ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
       
   600     vrgbmask = vec_splat(vrgbmask, 0);
       
   601 
       
   602     while (height--) {
       
   603 #define ONE_PIXEL_BLEND(condition, widthvar) \
       
   604         if (copy_alpha) { \
       
   605             while (condition) { \
       
   606                 Uint32 Pixel; \
       
   607                 unsigned sR, sG, sB, sA; \
       
   608                 DISEMBLE_RGBA((Uint8 *)srcp, srcbpp, srcfmt, Pixel, \
       
   609                           sR, sG, sB, sA); \
       
   610                 if ( (Pixel & rgbmask) != ckey ) { \
       
   611                       ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
       
   612                             sR, sG, sB, sA); \
       
   613                 } \
       
   614                 dstp = (Uint32 *) (((Uint8 *) dstp) + dstbpp); \
       
   615                 srcp = (Uint32 *) (((Uint8 *) srcp) + srcbpp); \
       
   616                 widthvar--; \
       
   617             } \
       
   618         } else { \
       
   619             while (condition) { \
       
   620                 Uint32 Pixel; \
       
   621                 unsigned sR, sG, sB; \
       
   622                 RETRIEVE_RGB_PIXEL((Uint8 *)srcp, srcbpp, Pixel); \
       
   623                 if ( Pixel != ckey ) { \
       
   624                     RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
       
   625                     ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
       
   626                               sR, sG, sB, alpha); \
       
   627                 } \
       
   628                 dstp = (Uint32 *) (((Uint8 *)dstp) + dstbpp); \
       
   629                 srcp = (Uint32 *) (((Uint8 *)srcp) + srcbpp); \
       
   630                 widthvar--; \
       
   631             } \
       
   632         }
       
   633         int width = info->dst_w;
       
   634         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
       
   635         assert(width > 0);
       
   636         if (width > 0) {
       
   637             int extrawidth = (width % 4);
       
   638             vector unsigned char valigner = VEC_ALIGNER(srcp);
       
   639             vector unsigned int vs = vec_ld(0, srcp);
       
   640             width -= extrawidth;
       
   641             assert(width >= 4);
       
   642             while (width) {
       
   643                 vector unsigned char vsel;
       
   644                 vector unsigned int vd;
       
   645                 vector unsigned int voverflow = vec_ld(15, srcp);
       
   646                 /* load the source vec */
       
   647                 vs = vec_perm(vs, voverflow, valigner);
       
   648                 /* vsel is set for items that match the key */
       
   649                 vsel = (vector unsigned char) vec_and(vs, vrgbmask);
       
   650                 vsel = (vector unsigned char) vec_cmpeq(vs, vckey);
       
   651                 /* permute the src vec to the dest format */
       
   652                 vs = vec_perm(vs, valpha, vpermute);
       
   653                 /* load the destination vec */
       
   654                 vd = vec_ld(0, dstp);
       
   655                 /* select the source and dest into vs */
       
   656                 vd = (vector unsigned int) vec_sel((vector unsigned char) vs,
       
   657                                                    (vector unsigned char) vd,
       
   658                                                    vsel);
       
   659 
       
   660                 vec_st(vd, 0, dstp);
       
   661                 srcp += 4;
       
   662                 width -= 4;
       
   663                 dstp += 4;
       
   664                 vs = voverflow;
       
   665             }
       
   666             ONE_PIXEL_BLEND((extrawidth), extrawidth);
       
   667 #undef ONE_PIXEL_BLEND
       
   668             srcp += srcskip;
       
   669             dstp += dstskip;
       
   670         }
       
   671     }
       
   672 }
       
   673 
       
   674 /* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
       
   675 /* Use this on a G5 */
       
   676 static void
       
   677 ConvertAltivec32to32_noprefetch(SDL_BlitInfo * info)
       
   678 {
       
   679     int height = info->dst_h;
       
   680     Uint32 *src = (Uint32 *) info->src;
       
   681     int srcskip = info->src_skip / 4;
       
   682     Uint32 *dst = (Uint32 *) info->dst;
       
   683     int dstskip = info->dst_skip / 4;
       
   684     SDL_PixelFormat *srcfmt = info->src_fmt;
       
   685     SDL_PixelFormat *dstfmt = info->dst_fmt;
       
   686     vector unsigned int vzero = vec_splat_u32(0);
       
   687     vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
       
   688     if (dstfmt->Amask && !srcfmt->Amask) {
       
   689         if (info->a) {
       
   690             vector unsigned char valpha;
       
   691             ((unsigned char *) &valpha)[0] = info->a;
       
   692             vzero = (vector unsigned int) vec_splat(valpha, 0);
       
   693         }
       
   694     }
       
   695 
       
   696     assert(srcfmt->BytesPerPixel == 4);
       
   697     assert(dstfmt->BytesPerPixel == 4);
       
   698 
       
   699     while (height--) {
       
   700         vector unsigned char valigner;
       
   701         vector unsigned int vbits;
       
   702         vector unsigned int voverflow;
       
   703         Uint32 bits;
       
   704         Uint8 r, g, b, a;
       
   705 
       
   706         int width = info->dst_w;
       
   707         int extrawidth;
       
   708 
       
   709         /* do scalar until we can align... */
       
   710         while ((UNALIGNED_PTR(dst)) && (width)) {
       
   711             bits = *(src++);
       
   712             RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
       
   713             *(dst++) = MAKE8888(dstfmt, r, g, b, a);
       
   714             width--;
       
   715         }
       
   716 
       
   717         /* After all that work, here's the vector part! */
       
   718         extrawidth = (width % 4);
       
   719         width -= extrawidth;
       
   720         valigner = VEC_ALIGNER(src);
       
   721         vbits = vec_ld(0, src);
       
   722 
       
   723         while (width) {
       
   724             voverflow = vec_ld(15, src);
       
   725             src += 4;
       
   726             width -= 4;
       
   727             vbits = vec_perm(vbits, voverflow, valigner);       /* src is ready. */
       
   728             vbits = vec_perm(vbits, vzero, vpermute);   /* swizzle it. */
       
   729             vec_st(vbits, 0, dst);      /* store it back out. */
       
   730             dst += 4;
       
   731             vbits = voverflow;
       
   732         }
       
   733 
       
   734         assert(width == 0);
       
   735 
       
   736         /* cover pixels at the end of the row that didn't fit in 16 bytes. */
       
   737         while (extrawidth) {
       
   738             bits = *(src++);    /* max 7 pixels, don't bother with prefetch. */
       
   739             RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
       
   740             *(dst++) = MAKE8888(dstfmt, r, g, b, a);
       
   741             extrawidth--;
       
   742         }
       
   743 
       
   744         src += srcskip;
       
   745         dst += dstskip;
       
   746     }
       
   747 
       
   748 }
       
   749 
       
   750 /* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
       
   751 /* Use this on a G4 */
       
   752 static void
       
   753 ConvertAltivec32to32_prefetch(SDL_BlitInfo * info)
       
   754 {
       
   755     const int scalar_dst_lead = sizeof(Uint32) * 4;
       
   756     const int vector_dst_lead = sizeof(Uint32) * 16;
       
   757 
       
   758     int height = info->dst_h;
       
   759     Uint32 *src = (Uint32 *) info->src;
       
   760     int srcskip = info->src_skip / 4;
       
   761     Uint32 *dst = (Uint32 *) info->dst;
       
   762     int dstskip = info->dst_skip / 4;
       
   763     SDL_PixelFormat *srcfmt = info->src_fmt;
       
   764     SDL_PixelFormat *dstfmt = info->dst_fmt;
       
   765     vector unsigned int vzero = vec_splat_u32(0);
       
   766     vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
       
   767     if (dstfmt->Amask && !srcfmt->Amask) {
       
   768         if (info->a) {
       
   769             vector unsigned char valpha;
       
   770             ((unsigned char *) &valpha)[0] = info->a;
       
   771             vzero = (vector unsigned int) vec_splat(valpha, 0);
       
   772         }
       
   773     }
       
   774 
       
   775     assert(srcfmt->BytesPerPixel == 4);
       
   776     assert(dstfmt->BytesPerPixel == 4);
       
   777 
       
   778     while (height--) {
       
   779         vector unsigned char valigner;
       
   780         vector unsigned int vbits;
       
   781         vector unsigned int voverflow;
       
   782         Uint32 bits;
       
   783         Uint8 r, g, b, a;
       
   784 
       
   785         int width = info->dst_w;
       
   786         int extrawidth;
       
   787 
       
   788         /* do scalar until we can align... */
       
   789         while ((UNALIGNED_PTR(dst)) && (width)) {
       
   790             vec_dstt(src + scalar_dst_lead, DST_CTRL(2, 32, 1024),
       
   791                      DST_CHAN_SRC);
       
   792             vec_dstst(dst + scalar_dst_lead, DST_CTRL(2, 32, 1024),
       
   793                       DST_CHAN_DEST);
       
   794             bits = *(src++);
       
   795             RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
       
   796             *(dst++) = MAKE8888(dstfmt, r, g, b, a);
       
   797             width--;
       
   798         }
       
   799 
       
   800         /* After all that work, here's the vector part! */
       
   801         extrawidth = (width % 4);
       
   802         width -= extrawidth;
       
   803         valigner = VEC_ALIGNER(src);
       
   804         vbits = vec_ld(0, src);
       
   805 
       
   806         while (width) {
       
   807             vec_dstt(src + vector_dst_lead, DST_CTRL(2, 32, 1024),
       
   808                      DST_CHAN_SRC);
       
   809             vec_dstst(dst + vector_dst_lead, DST_CTRL(2, 32, 1024),
       
   810                       DST_CHAN_DEST);
       
   811             voverflow = vec_ld(15, src);
       
   812             src += 4;
       
   813             width -= 4;
       
   814             vbits = vec_perm(vbits, voverflow, valigner);       /* src is ready. */
       
   815             vbits = vec_perm(vbits, vzero, vpermute);   /* swizzle it. */
       
   816             vec_st(vbits, 0, dst);      /* store it back out. */
       
   817             dst += 4;
       
   818             vbits = voverflow;
       
   819         }
       
   820 
       
   821         assert(width == 0);
       
   822 
       
   823         /* cover pixels at the end of the row that didn't fit in 16 bytes. */
       
   824         while (extrawidth) {
       
   825             bits = *(src++);    /* max 7 pixels, don't bother with prefetch. */
       
   826             RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
       
   827             *(dst++) = MAKE8888(dstfmt, r, g, b, a);
       
   828             extrawidth--;
       
   829         }
       
   830 
       
   831         src += srcskip;
       
   832         dst += dstskip;
       
   833     }
       
   834 
       
   835     vec_dss(DST_CHAN_SRC);
       
   836     vec_dss(DST_CHAN_DEST);
       
   837 }
       
   838 
       
   839 static Uint32
       
   840 GetBlitFeatures(void)
       
   841 {
       
   842     static Uint32 features = 0xffffffff;
       
   843     if (features == 0xffffffff) {
       
   844         /* Provide an override for testing .. */
       
   845         char *override = SDL_getenv("SDL_ALTIVEC_BLIT_FEATURES");
       
   846         if (override) {
       
   847             features = 0;
       
   848             SDL_sscanf(override, "%u", &features);
       
   849         } else {
       
   850             features = (0
       
   851                         /* Feature 1 is has-MMX */
       
   852                         | ((SDL_HasMMX())? 1 : 0)
       
   853                         /* Feature 2 is has-AltiVec */
       
   854                         | ((SDL_HasAltiVec())? 2 : 0)
       
   855                         /* Feature 4 is dont-use-prefetch */
       
   856                         /* !!!! FIXME: Check for G5 or later, not the cache size! Always prefetch on a G4. */
       
   857                         | ((GetL3CacheSize() == 0) ? 4 : 0)
       
   858                 );
       
   859         }
       
   860     }
       
   861     return features;
       
   862 }
       
   863 
       
   864 #if __MWERKS__
       
   865 #pragma altivec_model off
       
   866 #endif
       
   867 #else
       
   868 /* Feature 1 is has-MMX */
    31 /* Feature 1 is has-MMX */
   869 #define GetBlitFeatures() ((Uint32)(SDL_HasMMX() ? 1 : 0))
    32 #define GetBlitFeatures() ((Uint32)(SDL_HasMMX() ? 1 : 0))
   870 #endif
       
   871 
    33 
   872 /* This is now endian dependent */
    34 /* This is now endian dependent */
   873 #if SDL_BYTEORDER == SDL_LIL_ENDIAN
    35 #if SDL_BYTEORDER == SDL_LIL_ENDIAN
   874 #define HI	1
    36 #define HI	1
   875 #define LO	0
    37 #define LO	0
  2344     /* Default for 8-bit RGB source, an invalid combination */
  1506     /* Default for 8-bit RGB source, an invalid combination */
  2345     {0, 0, 0, 0, 0, 0, 0, 0, NULL},
  1507     {0, 0, 0, 0, 0, 0, 0, 0, NULL},
  2346 };
  1508 };
  2347 
  1509 
  2348 static const struct blit_table normal_blit_2[] = {
  1510 static const struct blit_table normal_blit_2[] = {
  2349 #if SDL_ALTIVEC_BLITTERS
       
  2350     /* has-altivec */
       
  2351     {0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00000000, 0x00000000,
       
  2352      0x00000000,
       
  2353      2, Blit_RGB565_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA},
       
  2354     {0x00007C00, 0x000003E0, 0x0000001F, 4, 0x00000000, 0x00000000,
       
  2355      0x00000000,
       
  2356      2, Blit_RGB555_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA},
       
  2357 #endif
       
  2358     {0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00FF0000, 0x0000FF00,
  1511     {0x0000F800, 0x000007E0, 0x0000001F, 4, 0x00FF0000, 0x0000FF00,
  2359      0x000000FF,
  1512      0x000000FF,
  2360      0, Blit_RGB565_ARGB8888, SET_ALPHA},
  1513      0, Blit_RGB565_ARGB8888, SET_ALPHA},
  2361     {0x0000F800, 0x000007E0, 0x0000001F, 4, 0x000000FF, 0x0000FF00,
  1514     {0x0000F800, 0x000007E0, 0x0000001F, 4, 0x000000FF, 0x0000FF00,
  2362      0x00FF0000,
  1515      0x00FF0000,
  2376     /* Default for 24-bit RGB source, never optimized */
  1529     /* Default for 24-bit RGB source, never optimized */
  2377     {0, 0, 0, 0, 0, 0, 0, 0, BlitNtoN, 0}
  1530     {0, 0, 0, 0, 0, 0, 0, 0, BlitNtoN, 0}
  2378 };
  1531 };
  2379 
  1532 
  2380 static const struct blit_table normal_blit_4[] = {
  1533 static const struct blit_table normal_blit_4[] = {
  2381 #if SDL_ALTIVEC_BLITTERS
       
  2382     /* has-altivec | dont-use-prefetch */
       
  2383     {0x00000000, 0x00000000, 0x00000000, 4, 0x00000000, 0x00000000,
       
  2384      0x00000000,
       
  2385      6, ConvertAltivec32to32_noprefetch,
       
  2386      NO_ALPHA | COPY_ALPHA | SET_ALPHA},
       
  2387     /* has-altivec */
       
  2388     {0x00000000, 0x00000000, 0x00000000, 4, 0x00000000, 0x00000000,
       
  2389      0x00000000,
       
  2390      2, ConvertAltivec32to32_prefetch,
       
  2391      NO_ALPHA | COPY_ALPHA | SET_ALPHA},
       
  2392     /* has-altivec */
       
  2393     {0x00000000, 0x00000000, 0x00000000, 2, 0x0000F800, 0x000007E0,
       
  2394      0x0000001F,
       
  2395      2, Blit_RGB888_RGB565Altivec, NO_ALPHA},
       
  2396 #endif
       
  2397     {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000F800, 0x000007E0,
  1534     {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000F800, 0x000007E0,
  2398      0x0000001F,
  1535      0x0000001F,
  2399      0, Blit_RGB888_RGB565, NO_ALPHA},
  1536      0, Blit_RGB888_RGB565, NO_ALPHA},
  2400     {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x00007C00, 0x000003E0,
  1537     {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x00007C00, 0x000003E0,
  2401      0x0000001F,
  1538      0x0000001F,
  2489         if (srcfmt->BytesPerPixel == 2 && surface->map->identity)
  1626         if (srcfmt->BytesPerPixel == 2 && surface->map->identity)
  2490             return Blit2to2Key;
  1627             return Blit2to2Key;
  2491         else if (dstfmt->BytesPerPixel == 1)
  1628         else if (dstfmt->BytesPerPixel == 1)
  2492             return BlitNto1Key;
  1629             return BlitNto1Key;
  2493         else {
  1630         else {
  2494 #if SDL_ALTIVEC_BLITTERS
       
  2495             if ((srcfmt->BytesPerPixel == 4) && (dstfmt->BytesPerPixel == 4)
       
  2496                 && SDL_HasAltiVec()) {
       
  2497                 return Blit32to32KeyAltivec;
       
  2498             } else
       
  2499 #endif
       
  2500             if (srcfmt->Amask && dstfmt->Amask) {
  1631             if (srcfmt->Amask && dstfmt->Amask) {
  2501                 return BlitNtoNKeyCopyAlpha;
  1632                 return BlitNtoNKeyCopyAlpha;
  2502             } else {
  1633             } else {
  2503                 return BlitNtoNKey;
  1634                 return BlitNtoNKey;
  2504             }
  1635             }