Added SSE version of SDL_FillRect() for 32-bit ARGB surfaces
author Sam Lantinga <slouken@libsdl.org>
Mon, 13 Aug 2007 06:24:56 +0000
changeset 2239 31835fd24b2b
parent 2238 93ea84f77d83
child 2240 eebaa05f8bd9
Added SSE version of SDL_FillRect() for 32-bit ARGB surfaces
src/video/SDL_surface.c
     1.1 --- a/src/video/SDL_surface.c	Mon Aug 13 03:03:23 2007 +0000
     1.2 +++ b/src/video/SDL_surface.c	Mon Aug 13 06:24:56 2007 +0000
     1.3 @@ -587,20 +587,22 @@
     1.4      } else {
     1.5          switch (dst->format->BytesPerPixel) {
     1.6          case 2:
     1.7 -            for (y = dstrect->h; y; --y) {
     1.8 -                Uint16 *pixels = (Uint16 *) row;
     1.9 +            {
    1.10                  Uint16 c = (Uint16) color;
    1.11                  Uint32 cc = (Uint32) c << 16 | c;
    1.12 -                int n = dstrect->w;
    1.13 -                if ((uintptr_t) pixels & 3) {
    1.14 -                    *pixels++ = c;
    1.15 -                    n--;
    1.16 +                for (y = dstrect->h; y; --y) {
    1.17 +                    Uint16 *pixels = (Uint16 *) row;
    1.18 +                    int n = dstrect->w;
    1.19 +                    if ((uintptr_t) pixels & 3) {
    1.20 +                        *pixels++ = c;
    1.21 +                        n--;
    1.22 +                    }
    1.23 +                    if (n >> 1)
    1.24 +                        SDL_memset4(pixels, cc, n >> 1);
    1.25 +                    if (n & 1)
    1.26 +                        pixels[n - 1] = c;
    1.27 +                    row += dst->pitch;
    1.28                  }
    1.29 -                if (n >> 1)
    1.30 -                    SDL_memset4(pixels, cc, n >> 1);
    1.31 -                if (n & 1)
    1.32 -                    pixels[n - 1] = c;
    1.33 -                row += dst->pitch;
    1.34              }
    1.35              break;
    1.36  
    1.37 @@ -619,6 +621,33 @@
    1.38              break;
    1.39  
    1.40          case 4:
    1.41 +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
    1.42 +            if (SDL_HasSSE() && !((uintptr_t) row & 15) && !(dstrect->w & 3)) {
    1.43 +                Uint32 cccc[4] __attribute__ ((aligned(16))) = {
    1.44 +                color, color, color, color};
    1.45 +                int i, n = dstrect->w / 4;
    1.46 +                __asm__ __volatile__("	movdqa (%0), %%xmm0\n"::
    1.47 +                                     "r"(cccc):"memory");
    1.48 +                for (y = dstrect->h; y; --y) {
    1.49 +                    Uint8 *pixels = row;
    1.50 +                    for (i = n / 2; i--;) {
    1.51 +                        /* *INDENT-OFF* */
    1.52 +                        __asm__ __volatile__("	prefetchnta 256(%0)\n"
    1.53 +                                             "	movdqa %%xmm0, (%0)\n"
    1.54 +                                             "	movdqa %%xmm0, 16(%0)\n"::"r"(pixels):"memory");
    1.55 +                        /* *INDENT-ON* */
    1.56 +                        pixels += 32;
    1.57 +                    }
    1.58 +                    if (n & 1) {
    1.59 +                        __asm__ __volatile__("	movdqa %%xmm0, (%0)\n"::
    1.60 +                                             "r"(pixels):"memory");
    1.61 +                    }
    1.62 +                    row += dst->pitch;
    1.63 +                }
    1.64 +                __asm__ __volatile__("	emms\n"::);
    1.65 +                break;
    1.66 +            }
    1.67 +#endif
    1.68              for (y = dstrect->h; y; --y) {
    1.69                  SDL_memset4(row, color, dstrect->w);
    1.70                  row += dst->pitch;