Added SSE and MMX optimization for SDL_FillRect()
author Sam Lantinga <slouken@libsdl.org>
Thu, 16 Aug 2007 05:56:24 +0000
changeset 2249 5a58b57b6724
parent 2248 5cd2a2293cf0
child 2250 e1d228456537
Added SSE and MMX optimization for SDL_FillRect()
src/video/SDL_blit.c
src/video/SDL_blit.h
src/video/SDL_blit_copy.c
src/video/SDL_surface.c
     1.1 --- a/src/video/SDL_blit.c	Thu Aug 16 02:14:13 2007 +0000
     1.2 +++ b/src/video/SDL_blit.c	Thu Aug 16 05:56:24 2007 +0000
     1.3 @@ -110,7 +110,8 @@
     1.4  #ifdef __MACOSX__
     1.5  #include <sys/sysctl.h>
     1.6  
     1.7 -static SDL_bool SDL_UseAltivecPrefetch()
     1.8 +static SDL_bool
     1.9 +SDL_UseAltivecPrefetch()
    1.10  {
    1.11      const char key[] = "hw.l3cachesize";
    1.12      u_int64_t result = 0;
    1.13 @@ -123,14 +124,16 @@
    1.14      }
    1.15  }
    1.16  #else
    1.17 -static SDL_bool SDL_UseAltivecPrefetch()
    1.18 +static SDL_bool
    1.19 +SDL_UseAltivecPrefetch()
    1.20  {
    1.21      /* Just guess G4 */
    1.22      return SDL_TRUE;
    1.23  }
    1.24  #endif /* __MACOSX__ */
    1.25  
    1.26 -static SDL_loblit SDL_ChooseBlitFunc(SDL_BlitEntry *entries, int count)
    1.27 +static SDL_loblit
    1.28 +SDL_ChooseBlitFunc(SDL_BlitEntry * entries, int count)
    1.29  {
    1.30      int i;
    1.31      static Uint32 features = 0xffffffff;
     2.1 --- a/src/video/SDL_blit.h	Thu Aug 16 02:14:13 2007 +0000
     2.2 +++ b/src/video/SDL_blit.h	Thu Aug 16 05:56:24 2007 +0000
     2.3 @@ -24,6 +24,13 @@
     2.4  #ifndef _SDL_blit_h
     2.5  #define _SDL_blit_h
     2.6  
     2.7 +#ifdef __MMX__
     2.8 +#include <mmintrin.h>
     2.9 +#endif
    2.10 +#ifdef __SSE__
    2.11 +#include <xmmintrin.h>
    2.12 +#endif
    2.13 +
    2.14  #include "SDL_endian.h"
    2.15  
    2.16  /* The structure passed to the low level blit functions */
    2.17 @@ -92,6 +99,14 @@
    2.18   * Useful macros for blitting routines
    2.19   */
    2.20  
    2.21 +#if defined(__GNUC__)
    2.22 +#define DECLARE_ALIGNED(t,v,a)  t __attribute__((aligned(a))) v
    2.23 +#elif defined(_MSC_VER)
    2.24 +#define DECLARE_ALIGNED(t,v,a)  t __declspec(align(a)) v
    2.25 +#else
    2.26 +#define DECLARE_ALIGNED(t,v,a)  t v
    2.27 +#endif
    2.28 +
    2.29  #define FORMAT_EQUAL(A, B)						\
    2.30      ((A)->BitsPerPixel == (B)->BitsPerPixel				\
    2.31       && ((A)->Rmask == (B)->Rmask) && ((A)->Amask == (B)->Amask))
     3.1 --- a/src/video/SDL_blit_copy.c	Thu Aug 16 02:14:13 2007 +0000
     3.2 +++ b/src/video/SDL_blit_copy.c	Thu Aug 16 05:56:24 2007 +0000
     3.3 @@ -23,13 +23,8 @@
     3.4  
     3.5  #include "SDL_video.h"
     3.6  #include "SDL_blit.h"
     3.7 +#include "SDL_blit_copy.h"
     3.8  
     3.9 -#ifdef __MMX__
    3.10 -#include <mmintrin.h>
    3.11 -#endif
    3.12 -#ifdef __SSE__
    3.13 -#include <xmmintrin.h>
    3.14 -#endif
    3.15  
    3.16  #ifdef __MMX__
    3.17  static __inline__ void
     4.1 --- a/src/video/SDL_surface.c	Thu Aug 16 02:14:13 2007 +0000
     4.2 +++ b/src/video/SDL_surface.c	Thu Aug 16 05:56:24 2007 +0000
     4.3 @@ -509,20 +509,220 @@
     4.4      return 0;
     4.5  }
     4.6  
     4.7 -static int
     4.8 -SDL_FillRect1(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
     4.9 -{
    4.10 -    /* FIXME: We have to worry about packing order.. *sigh* */
    4.11 -    SDL_SetError("1-bpp rect fill not yet implemented");
    4.12 -    return -1;
    4.13 +#ifdef __SSE__
    4.14 +/* *INDENT-OFF* */
    4.15 +
    4.16 +#define SSE_BEGIN \
    4.17 +    DECLARE_ALIGNED(Uint32, cccc[4], 16); \
    4.18 +    cccc[0] = color; \
    4.19 +    cccc[1] = color; \
    4.20 +    cccc[2] = color; \
    4.21 +    cccc[3] = color; \
    4.22 +    __m128 c128 = *(__m128 *)cccc;
    4.23 +
    4.24 +#define SSE_WORK \
    4.25 +    for (i = n / 64; i--;) { \
    4.26 +        _mm_stream_ps((float *)(p+0), c128); \
    4.27 +        _mm_stream_ps((float *)(p+16), c128); \
    4.28 +        _mm_stream_ps((float *)(p+32), c128); \
    4.29 +        _mm_stream_ps((float *)(p+48), c128); \
    4.30 +        p += 64; \
    4.31 +    }
    4.32 +
    4.33 +#define SSE_END
    4.34 +
    4.35 +#define DEFINE_SSE_FILLRECT(bpp, type) \
    4.36 +static void \
    4.37 +SDL_FillRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
    4.38 +{ \
    4.39 +    SSE_BEGIN; \
    4.40 + \
    4.41 +    while (h--) { \
    4.42 +        int i, n = w * bpp; \
    4.43 +        Uint8 *p = pixels; \
    4.44 + \
    4.45 +        if (n > 15) { \
    4.46 +            int adjust = 16 - ((uintptr_t)p & 15); \
    4.47 +            if (adjust < 16) { \
    4.48 +                n -= adjust; \
    4.49 +                adjust /= bpp; \
    4.50 +                while(adjust--) { \
    4.51 +                    *((type *)p) = (type)color; \
    4.52 +                    p += bpp; \
    4.53 +                } \
    4.54 +            } \
    4.55 +            SSE_WORK; \
    4.56 +        } \
    4.57 +        if (n & 63) { \
    4.58 +            int remainder = (n & 63); \
    4.59 +            remainder /= bpp; \
    4.60 +            while(remainder--) { \
    4.61 +                *((type *)p) = (type)color; \
    4.62 +                p += bpp; \
    4.63 +            } \
    4.64 +        } \
    4.65 +        pixels += pitch; \
    4.66 +    } \
    4.67 + \
    4.68 +    SSE_END; \
    4.69  }
    4.70  
    4.71 -static int
    4.72 -SDL_FillRect4(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
    4.73 +DEFINE_SSE_FILLRECT(1, Uint8)
    4.74 +DEFINE_SSE_FILLRECT(2, Uint16)
    4.75 +DEFINE_SSE_FILLRECT(4, Uint32)
    4.76 +
    4.77 +/* *INDENT-ON* */
    4.78 +#endif /* __SSE__ */
    4.79 +
    4.80 +#ifdef __MMX__
    4.81 +/* *INDENT-OFF* */
    4.82 +
    4.83 +#define MMX_BEGIN \
    4.84 +    __m64 c64 = _mm_set_pi32(color, color)
    4.85 +
    4.86 +#define MMX_WORK \
    4.87 +    for (i = n / 64; i--;) { \
    4.88 +        _mm_stream_pi((__m64 *)(p+0), c64); \
    4.89 +        _mm_stream_pi((__m64 *)(p+8), c64); \
    4.90 +        _mm_stream_pi((__m64 *)(p+16), c64); \
    4.91 +        _mm_stream_pi((__m64 *)(p+24), c64); \
    4.92 +        _mm_stream_pi((__m64 *)(p+32), c64); \
    4.93 +        _mm_stream_pi((__m64 *)(p+40), c64); \
    4.94 +        _mm_stream_pi((__m64 *)(p+48), c64); \
    4.95 +        _mm_stream_pi((__m64 *)(p+56), c64); \
    4.96 +        p += 64; \
    4.97 +    }
    4.98 +
    4.99 +#define MMX_END \
   4.100 +    _mm_empty()
   4.101 +
   4.102 +#define DEFINE_MMX_FILLRECT(bpp, type) \
   4.103 +static void \
   4.104 +SDL_FillRect##bpp##MMX(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
   4.105 +{ \
   4.106 +    MMX_BEGIN; \
   4.107 + \
   4.108 +    while (h--) { \
   4.109 +        int i, n = w * bpp; \
   4.110 +        Uint8 *p = pixels; \
   4.111 + \
   4.112 +        if (n > 7) { \
   4.113 +            int adjust = 8 - ((uintptr_t)p & 7); \
   4.114 +            if (adjust < 8) { \
   4.115 +                n -= adjust; \
   4.116 +                adjust /= bpp; \
   4.117 +                while(adjust--) { \
   4.118 +                    *((type *)p) = (type)color; \
   4.119 +                    p += bpp; \
   4.120 +                } \
   4.121 +            } \
   4.122 +            MMX_WORK; \
   4.123 +        } \
   4.124 +        if (n & 63) { \
   4.125 +            int remainder = (n & 63); \
   4.126 +            remainder /= bpp; \
   4.127 +            while(remainder--) { \
   4.128 +                *((type *)p) = (type)color; \
   4.129 +                p += bpp; \
   4.130 +            } \
   4.131 +        } \
   4.132 +        pixels += pitch; \
   4.133 +    } \
   4.134 + \
   4.135 +    MMX_END; \
   4.136 +}
   4.137 +
   4.138 +DEFINE_MMX_FILLRECT(1, Uint8)
   4.139 +DEFINE_MMX_FILLRECT(2, Uint16)
   4.140 +DEFINE_MMX_FILLRECT(4, Uint32)
   4.141 +
   4.142 +/* *INDENT-ON* */
   4.143 +#endif /* __MMX__ */
   4.144 +
   4.145 +static void
   4.146 +SDL_FillRect1(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
   4.147  {
   4.148 -    /* FIXME: We have to worry about packing order.. *sigh* */
   4.149 -    SDL_SetError("4-bpp rect fill not yet implemented");
   4.150 -    return -1;
   4.151 +    while (h--) {
   4.152 +        int n = w;
   4.153 +        Uint8 *p = pixels;
   4.154 +
   4.155 +        if (n > 3) {
   4.156 +            switch ((uintptr_t) p & 3) {
   4.157 +            case 1:
   4.158 +                *p++ = (Uint8) color;
   4.159 +                --n;
   4.160 +            case 2:
   4.161 +                *p++ = (Uint8) color;
   4.162 +                --n;
   4.163 +            case 3:
   4.164 +                *p++ = (Uint8) color;
   4.165 +                --n;
   4.166 +            }
   4.167 +            SDL_memset4(p, color, (n >> 2));
   4.168 +        }
   4.169 +        if (n & 3) {
   4.170 +            p += (n & ~3);
   4.171 +            switch (n & 3) {
   4.172 +            case 3:
   4.173 +                *p++ = (Uint8) color;
   4.174 +            case 2:
   4.175 +                *p++ = (Uint8) color;
   4.176 +            case 1:
   4.177 +                *p++ = (Uint8) color;
   4.178 +            }
   4.179 +        }
   4.180 +        pixels += pitch;
   4.181 +    }
   4.182 +}
   4.183 +
   4.184 +static void
   4.185 +SDL_FillRect2(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
   4.186 +{
   4.187 +    while (h--) {
   4.188 +        int n = w;
   4.189 +        Uint16 *p = (Uint16 *) pixels;
   4.190 +
   4.191 +        if (n > 1) {
   4.192 +            if ((uintptr_t) p & 2) {
   4.193 +                *p++ = (Uint16) color;
   4.194 +                --n;
   4.195 +            }
   4.196 +            SDL_memset4(p, color, (n >> 1));
   4.197 +        }
   4.198 +        if (n & 1) {
   4.199 +            p[n - 1] = (Uint16) color;
   4.200 +        }
   4.201 +        pixels += pitch;
   4.202 +    }
   4.203 +}
   4.204 +
   4.205 +static void
   4.206 +SDL_FillRect3(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
   4.207 +{
   4.208 +    Uint8 r = (Uint8) (color & 0xFF);
   4.209 +    Uint8 g = (Uint8) ((color >> 8) & 0xFF);
   4.210 +    Uint8 b = (Uint8) ((color >> 16) & 0xFF);
   4.211 +
   4.212 +    while (h--) {
   4.213 +        int n = w;
   4.214 +        Uint8 *p = pixels;
   4.215 +
   4.216 +        while (n--) {
   4.217 +            *p++ = r;
   4.218 +            *p++ = g;
   4.219 +            *p++ = b;
   4.220 +        }
   4.221 +        pixels += pitch;
   4.222 +    }
   4.223 +}
   4.224 +
   4.225 +static void
   4.226 +SDL_FillRect4(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
   4.227 +{
   4.228 +    while (h--) {
   4.229 +        SDL_memset4(pixels, color, w);
   4.230 +        pixels += pitch;
   4.231 +    }
   4.232  }
   4.233  
   4.234  /* 
   4.235 @@ -531,23 +731,12 @@
   4.236  int
   4.237  SDL_FillRect(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
   4.238  {
   4.239 -    int x, y;
   4.240 -    Uint8 *row;
   4.241 +    Uint8 *pixels;
   4.242  
   4.243      /* This function doesn't work on surfaces < 8 bpp */
   4.244      if (dst->format->BitsPerPixel < 8) {
   4.245 -        switch (dst->format->BitsPerPixel) {
   4.246 -        case 1:
   4.247 -            return SDL_FillRect1(dst, dstrect, color);
   4.248 -            break;
   4.249 -        case 4:
   4.250 -            return SDL_FillRect4(dst, dstrect, color);
   4.251 -            break;
   4.252 -        default:
   4.253 -            SDL_SetError("Fill rect on unsupported surface format");
   4.254 -            return (-1);
   4.255 -            break;
   4.256 -        }
   4.257 +        SDL_SetError("Fill rect on unsupported surface format");
   4.258 +        return (-1);
   4.259      }
   4.260  
   4.261      /* If 'dstrect' == NULL, then fill the whole surface */
   4.262 @@ -564,97 +753,83 @@
   4.263      if (SDL_LockSurface(dst) != 0) {
   4.264          return (-1);
   4.265      }
   4.266 -    row = (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
   4.267 +
   4.268 +    pixels =
   4.269 +        (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
   4.270          dstrect->x * dst->format->BytesPerPixel;
   4.271 -    if (dst->format->palette || (color == 0)) {
   4.272 -        x = dstrect->w * dst->format->BytesPerPixel;
   4.273 -#ifndef __MACOSX__              /* memset() is optimized on Mac OS X */
   4.274 -        if (!color && !((uintptr_t) row & 3) && !(x & 3)
   4.275 -            && !(dst->pitch & 3)) {
   4.276 -            int n = x >> 2;
   4.277 -            for (y = dstrect->h; y; --y) {
   4.278 -                SDL_memset4(row, 0, n);
   4.279 -                row += dst->pitch;
   4.280 -            }
   4.281 -        } else
   4.282 -#endif /* !__MACOSX__ */
   4.283 +
   4.284 +    switch (dst->format->BytesPerPixel) {
   4.285 +    case 1:
   4.286          {
   4.287 -            for (y = dstrect->h; y; y--) {
   4.288 -                SDL_memset(row, color, x);
   4.289 -                row += dst->pitch;
   4.290 +            color |= (color << 8);
   4.291 +            color |= (color << 16);
   4.292 +#ifdef __SSE__
   4.293 +            if (SDL_HasSSE()) {
   4.294 +                SDL_FillRect1SSE(pixels, dst->pitch, color, dstrect->w,
   4.295 +                                 dstrect->h);
   4.296 +                break;
   4.297              }
   4.298 -        }
   4.299 -    } else {
   4.300 -        switch (dst->format->BytesPerPixel) {
   4.301 -        case 2:
   4.302 -            {
   4.303 -                Uint16 c = (Uint16) color;
   4.304 -                Uint32 cc = (Uint32) c << 16 | c;
   4.305 -                for (y = dstrect->h; y; --y) {
   4.306 -                    Uint16 *pixels = (Uint16 *) row;
   4.307 -                    int n = dstrect->w;
   4.308 -                    if ((uintptr_t) pixels & 3) {
   4.309 -                        *pixels++ = c;
   4.310 -                        n--;
   4.311 -                    }
   4.312 -                    if (n >> 1)
   4.313 -                        SDL_memset4(pixels, cc, n >> 1);
   4.314 -                    if (n & 1)
   4.315 -                        pixels[n - 1] = c;
   4.316 -                    row += dst->pitch;
   4.317 -                }
   4.318 +#endif
   4.319 +#ifdef __MMX__
   4.320 +            if (SDL_HasMMX()) {
   4.321 +                SDL_FillRect1MMX(pixels, dst->pitch, color, dstrect->w,
   4.322 +                                 dstrect->h);
   4.323 +                break;
   4.324              }
   4.325 +#endif
   4.326 +            SDL_FillRect1(pixels, dst->pitch, color, dstrect->w, dstrect->h);
   4.327              break;
   4.328 -
   4.329 -        case 3:
   4.330 -#if SDL_BYTEORDER == SDL_BIG_ENDIAN
   4.331 -            color <<= 8;
   4.332 -#endif
   4.333 -            for (y = dstrect->h; y; --y) {
   4.334 -                Uint8 *pixels = row;
   4.335 -                for (x = dstrect->w; x; --x) {
   4.336 -                    SDL_memcpy(pixels, &color, 3);
   4.337 -                    pixels += 3;
   4.338 -                }
   4.339 -                row += dst->pitch;
   4.340 -            }
   4.341 -            break;
   4.342 +        }
   4.343  
   4.344 -        case 4:
   4.345 -#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
   4.346 -            if (SDL_HasSSE() && !((uintptr_t) row & 15) && !(dstrect->w & 3)) {
   4.347 -                Uint32 cccc[4] __attribute__ ((aligned(16))) = {
   4.348 -                color, color, color, color};
   4.349 -                int i, n = dstrect->w / 4;
   4.350 -                __asm__ __volatile__("	movdqa (%0), %%xmm0\n"::
   4.351 -                                     "r"(cccc):"memory");
   4.352 -                for (y = dstrect->h; y; --y) {
   4.353 -                    Uint8 *pixels = row;
   4.354 -                    for (i = n / 2; i--;) {
   4.355 -                        /* *INDENT-OFF* */
   4.356 -                        __asm__ __volatile__("	prefetchnta 256(%0)\n"
   4.357 -                                             "	movdqa %%xmm0, (%0)\n"
   4.358 -                                             "	movdqa %%xmm0, 16(%0)\n"::"r"(pixels):"memory");
   4.359 -                        /* *INDENT-ON* */
   4.360 -                        pixels += 32;
   4.361 -                    }
   4.362 -                    if (n & 1) {
   4.363 -                        __asm__ __volatile__("	movdqa %%xmm0, (%0)\n"::
   4.364 -                                             "r"(pixels):"memory");
   4.365 -                    }
   4.366 -                    row += dst->pitch;
   4.367 -                }
   4.368 -                __asm__ __volatile__("	emms\n"::);
   4.369 +    case 2:
   4.370 +        {
   4.371 +            color |= (color << 16);
   4.372 +#ifdef __SSE__
   4.373 +            if (SDL_HasSSE()) {
   4.374 +                SDL_FillRect2SSE(pixels, dst->pitch, color, dstrect->w,
   4.375 +                                 dstrect->h);
   4.376                  break;
   4.377              }
   4.378  #endif
   4.379 -            for (y = dstrect->h; y; --y) {
   4.380 -                SDL_memset4(row, color, dstrect->w);
   4.381 -                row += dst->pitch;
   4.382 +#ifdef __MMX__
   4.383 +            if (SDL_HasMMX()) {
   4.384 +                SDL_FillRect2MMX(pixels, dst->pitch, color, dstrect->w,
   4.385 +                                 dstrect->h);
   4.386 +                break;
   4.387              }
   4.388 +#endif
   4.389 +            SDL_FillRect2(pixels, dst->pitch, color, dstrect->w, dstrect->h);
   4.390 +            break;
   4.391 +        }
   4.392 +
   4.393 +    case 3:
   4.394 +        /* 24-bit RGB is a slow path, at least for now. */
   4.395 +        {
   4.396 +            SDL_FillRect3(pixels, dst->pitch, color, dstrect->w, dstrect->h);
   4.397 +            break;
   4.398 +        }
   4.399 +
   4.400 +    case 4:
   4.401 +        {
   4.402 +#ifdef __SSE__
   4.403 +            if (SDL_HasSSE()) {
   4.404 +                SDL_FillRect4SSE(pixels, dst->pitch, color, dstrect->w,
   4.405 +                                 dstrect->h);
   4.406 +                break;
   4.407 +            }
   4.408 +#endif
   4.409 +#ifdef __MMX__
   4.410 +            if (SDL_HasMMX()) {
   4.411 +                SDL_FillRect4MMX(pixels, dst->pitch, color, dstrect->w,
   4.412 +                                 dstrect->h);
   4.413 +                break;
   4.414 +            }
   4.415 +#endif
   4.416 +            SDL_FillRect4(pixels, dst->pitch, color, dstrect->w, dstrect->h);
   4.417              break;
   4.418          }
   4.419      }
   4.420 +
   4.421      SDL_UnlockSurface(dst);
   4.422  
   4.423      /* We're done! */