src/video/SDL_surface.c
changeset 2249 5a58b57b6724
parent 2239 31835fd24b2b
child 2251 292bee385630
     1.1 --- a/src/video/SDL_surface.c	Thu Aug 16 02:14:13 2007 +0000
     1.2 +++ b/src/video/SDL_surface.c	Thu Aug 16 05:56:24 2007 +0000
     1.3 @@ -509,20 +509,220 @@
     1.4      return 0;
     1.5  }
     1.6  
     1.7 -static int
     1.8 -SDL_FillRect1(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
     1.9 -{
    1.10 -    /* FIXME: We have to worry about packing order.. *sigh* */
    1.11 -    SDL_SetError("1-bpp rect fill not yet implemented");
    1.12 -    return -1;
    1.13 +#ifdef __SSE__
    1.14 +/* *INDENT-OFF* */
    1.15 +/* SSE_BEGIN: copy `color` into four 32-bit lanes (cccc) and load them as the 128-bit value c128. */
    1.16 +#define SSE_BEGIN \
    1.17 +    DECLARE_ALIGNED(Uint32, cccc[4], 16); /* defined elsewhere; presumably a 16-byte-aligned array - confirm */ \
    1.18 +    cccc[0] = color; \
    1.19 +    cccc[1] = color; \
    1.20 +    cccc[2] = color; \
    1.21 +    cccc[3] = color; \
    1.22 +    __m128 c128 = *(__m128 *)cccc;
    1.23 +/* SSE_WORK: store c128 over n / 64 chunks of 64 bytes at p (four 16-byte streaming stores each); uses i, advances p, leaves n unchanged. */
    1.24 +#define SSE_WORK \
    1.25 +    for (i = n / 64; i--;) { \
    1.26 +        _mm_stream_ps((float *)(p+0), c128); \
    1.27 +        _mm_stream_ps((float *)(p+16), c128); \
    1.28 +        _mm_stream_ps((float *)(p+32), c128); \
    1.29 +        _mm_stream_ps((float *)(p+48), c128); \
    1.30 +        p += 64; \
    1.31 +    }
    1.32 +/* SSE_END: expands to nothing; it closes the SSE_BEGIN/SSE_WORK sequence in the generated functions. */
    1.33 +#define SSE_END
    1.34 +/* DEFINE_SSE_FILLRECT(bpp, type): define SDL_FillRect<bpp>SSE, filling h rows of w pixels (bpp bytes each) from `pixels`, `pitch` bytes per row. */
    1.35 +#define DEFINE_SSE_FILLRECT(bpp, type) \
    1.36 +static void \
    1.37 +SDL_FillRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
    1.38 +{ \
    1.39 +    SSE_BEGIN; \
    1.40 + \
    1.41 +    while (h--) { \
    1.42 +        int i, n = w * bpp; /* row length in bytes */ \
    1.43 +        Uint8 *p = pixels; \
    1.44 + \
    1.45 +        if (n > 15) { \
    1.46 +            int adjust = 16 - ((uintptr_t)p & 15); /* bytes to the next 16-byte boundary; 16 when p is already aligned */ \
    1.47 +            if (adjust < 16) { \
    1.48 +                n -= adjust; \
    1.49 +                adjust /= bpp; /* bytes -> pixels; NOTE(review): truncates unless p is bpp-aligned - confirm callers guarantee that */ \
    1.50 +                while(adjust--) { \
    1.51 +                    *((type *)p) = (type)color; /* one pixel at a time up to the boundary */ \
    1.52 +                    p += bpp; \
    1.53 +                } \
    1.54 +            } \
    1.55 +            SSE_WORK; /* 64-byte chunks; n still holds the byte count left after alignment */ \
    1.56 +        } \
    1.57 +        if (n & 63) { /* bytes not covered by the 64-byte chunks (all of n when n <= 15) */ \
    1.58 +            int remainder = (n & 63); \
    1.59 +            remainder /= bpp; /* bytes -> pixels */ \
    1.60 +            while(remainder--) { \
    1.61 +                *((type *)p) = (type)color; \
    1.62 +                p += bpp; \
    1.63 +            } \
    1.64 +        } \
    1.65 +        pixels += pitch; /* next row */ \
    1.66 +    } \
    1.67 + \
    1.68 +    SSE_END; \
    1.69  }
    1.70  
    1.71 -static int
    1.72 -SDL_FillRect4(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
    1.73 +DEFINE_SSE_FILLRECT(1, Uint8)   /* defines SDL_FillRect1SSE */
    1.74 +DEFINE_SSE_FILLRECT(2, Uint16)  /* defines SDL_FillRect2SSE */
    1.75 +DEFINE_SSE_FILLRECT(4, Uint32)  /* defines SDL_FillRect4SSE */
    1.76 +
    1.77 +/* *INDENT-ON* */
    1.78 +#endif /* __SSE__ */
    1.79 +
    1.80 +#ifdef __MMX__
    1.81 +/* *INDENT-OFF* */
    1.82 +/* MMX_BEGIN: declare c64, a 64-bit value built from two copies of the 32-bit `color`. */
    1.83 +#define MMX_BEGIN \
    1.84 +    __m64 c64 = _mm_set_pi32(color, color)
    1.85 +/* MMX_WORK: store c64 over n / 64 chunks of 64 bytes at p (eight 8-byte streaming stores each); uses i, advances p, leaves n unchanged. */
    1.86 +#define MMX_WORK \
    1.87 +    for (i = n / 64; i--;) { \
    1.88 +        _mm_stream_pi((__m64 *)(p+0), c64); \
    1.89 +        _mm_stream_pi((__m64 *)(p+8), c64); \
    1.90 +        _mm_stream_pi((__m64 *)(p+16), c64); \
    1.91 +        _mm_stream_pi((__m64 *)(p+24), c64); \
    1.92 +        _mm_stream_pi((__m64 *)(p+32), c64); \
    1.93 +        _mm_stream_pi((__m64 *)(p+40), c64); \
    1.94 +        _mm_stream_pi((__m64 *)(p+48), c64); \
    1.95 +        _mm_stream_pi((__m64 *)(p+56), c64); \
    1.96 +        p += 64; \
    1.97 +    }
    1.98 +/* MMX_END: call _mm_empty() (the EMMS intrinsic) after the MMX stores, leaving MMX state so later x87 floating-point code is safe. */
    1.99 +#define MMX_END \
   1.100 +    _mm_empty()
   1.101 +/* DEFINE_MMX_FILLRECT(bpp, type): define SDL_FillRect<bpp>MMX, filling h rows of w pixels (bpp bytes each) from `pixels`, `pitch` bytes per row. */
   1.102 +#define DEFINE_MMX_FILLRECT(bpp, type) \
   1.103 +static void \
   1.104 +SDL_FillRect##bpp##MMX(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
   1.105 +{ \
   1.106 +    MMX_BEGIN; \
   1.107 + \
   1.108 +    while (h--) { \
   1.109 +        int i, n = w * bpp; /* row length in bytes */ \
   1.110 +        Uint8 *p = pixels; \
   1.111 + \
   1.112 +        if (n > 7) { \
   1.113 +            int adjust = 8 - ((uintptr_t)p & 7); /* bytes to the next 8-byte boundary; 8 when p is already aligned */ \
   1.114 +            if (adjust < 8) { \
   1.115 +                n -= adjust; \
   1.116 +                adjust /= bpp; /* bytes -> pixels; NOTE(review): truncates unless p is bpp-aligned - confirm callers guarantee that */ \
   1.117 +                while(adjust--) { \
   1.118 +                    *((type *)p) = (type)color; /* one pixel at a time up to the boundary */ \
   1.119 +                    p += bpp; \
   1.120 +                } \
   1.121 +            } \
   1.122 +            MMX_WORK; /* 64-byte chunks; n still holds the byte count left after alignment */ \
   1.123 +        } \
   1.124 +        if (n & 63) { /* bytes not covered by the 64-byte chunks (all of n when n <= 7) */ \
   1.125 +            int remainder = (n & 63); \
   1.126 +            remainder /= bpp; /* bytes -> pixels */ \
   1.127 +            while(remainder--) { \
   1.128 +                *((type *)p) = (type)color; \
   1.129 +                p += bpp; \
   1.130 +            } \
   1.131 +        } \
   1.132 +        pixels += pitch; /* next row */ \
   1.133 +    } \
   1.134 + \
   1.135 +    MMX_END; \
   1.136 +}
   1.137 +
   1.138 +DEFINE_MMX_FILLRECT(1, Uint8)   /* defines SDL_FillRect1MMX */
   1.139 +DEFINE_MMX_FILLRECT(2, Uint16)  /* defines SDL_FillRect2MMX */
   1.140 +DEFINE_MMX_FILLRECT(4, Uint32)  /* defines SDL_FillRect4MMX */
   1.141 +
   1.142 +/* *INDENT-ON* */
   1.143 +#endif /* __MMX__ */
   1.144 +/* Fill h rows of w 8-bit pixels from `pixels` (`pitch` bytes per row); single stores use the low byte of color, SDL_memset4 gets all 32 bits. */
   1.145 +static void
   1.146 +SDL_FillRect1(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
   1.147  {
   1.148 -    /* FIXME: We have to worry about packing order.. *sigh* */
   1.149 -    SDL_SetError("4-bpp rect fill not yet implemented");
   1.150 -    return -1;
   1.151 +    while (h--) {
   1.152 +        int n = w;
   1.153 +        Uint8 *p = pixels;
   1.154 +
   1.155 +        if (n > 3) {
   1.156 +            switch ((uintptr_t) p & 3) {        /* byte stores until p sits on a 4-byte boundary */
   1.157 +            case 1:
   1.158 +                *p++ = (Uint8) color;
   1.159 +                --n;                            /* fall through */
   1.160 +            case 2:
   1.161 +                *p++ = (Uint8) color;
   1.162 +                --n;                            /* fall through */
   1.163 +            case 3:
   1.164 +                *p++ = (Uint8) color;
   1.165 +                --n;
   1.166 +            }
   1.167 +            SDL_memset4(p, color, (n >> 2));    /* presumably writes n >> 2 32-bit words without advancing p - confirm */
   1.168 +        }
   1.169 +        if (n & 3) {
   1.170 +            p += (n & ~3);                      /* skip the whole 4-byte groups; adds 0 when n <= 3 */
   1.171 +            switch (n & 3) {                    /* write the 1-3 trailing bytes */
   1.172 +            case 3:
   1.173 +                *p++ = (Uint8) color;           /* fall through */
   1.174 +            case 2:
   1.175 +                *p++ = (Uint8) color;           /* fall through */
   1.176 +            case 1:
   1.177 +                *p++ = (Uint8) color;
   1.178 +            }
   1.179 +        }
   1.180 +        pixels += pitch;
   1.181 +    }
   1.182 +}
   1.183 +/* Fill h rows of w 16-bit pixels from `pixels` (`pitch` bytes per row); single stores use the low 16 bits of color. */
   1.184 +static void
   1.185 +SDL_FillRect2(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
   1.186 +{
   1.187 +    for (; h--; pixels += pitch) {
   1.188 +        Uint16 *dst = (Uint16 *) pixels;
   1.189 +        int count = w;
   1.190 +        /* With two or more pixels: one store reaches a 4-byte boundary, then pairs go through SDL_memset4. */
   1.191 +        if (count >= 2) {
   1.192 +            if (((uintptr_t) dst & 2) != 0) {
   1.193 +                *dst++ = (Uint16) color;
   1.194 +                count--;
   1.195 +            }
   1.196 +            SDL_memset4(dst, color, (count / 2));
   1.197 +        }
   1.198 +        if ((count % 2) != 0) {
   1.199 +            dst[count - 1] = (Uint16) color;
   1.200 +        }
   1.201 +        /* The loop header steps `pixels` to the next row. */
   1.202 +    }
   1.203 +}
   1.204 +/* Fill h rows of w 24-bit pixels from `pixels` (`pitch` bytes per row), storing the low 24 bits of color in host byte order. */
   1.205 +static void
   1.206 +SDL_FillRect3(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
   1.207 +{
   1.208 +    const int be = (SDL_BYTEORDER == SDL_BIG_ENDIAN);   /* big-endian stores the most significant byte first */
   1.209 +    Uint8 b0 = (Uint8) ((color >> (be ? 16 : 0)) & 0xFF);
   1.210 +    Uint8 b1 = (Uint8) ((color >> 8) & 0xFF);
   1.211 +    Uint8 b2 = (Uint8) ((color >> (be ? 0 : 16)) & 0xFF);
   1.212 +    while (h--) {
   1.213 +        int n = w;
   1.214 +        Uint8 *p = pixels;
   1.215 +
   1.216 +        while (n--) {
   1.217 +            *p++ = b0;
   1.218 +            *p++ = b1;
   1.219 +            *p++ = b2;
   1.220 +        }
   1.221 +        pixels += pitch;
   1.222 +    }
   1.223 +}
   1.224 +/* Fill h rows of w 32-bit pixels from `pixels` (`pitch` bytes per row), one SDL_memset4 call per row. */
   1.225 +static void
   1.226 +SDL_FillRect4(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
   1.227 +{
   1.228 +    Uint8 *row = pixels;
   1.229 +    for (; h--; row += pitch) {
   1.230 +        SDL_memset4(row, color, w);
   1.231 +    }
   1.232  }
   1.233  
   1.234  /* 
   1.235 @@ -531,23 +731,12 @@
   1.236  int
   1.237  SDL_FillRect(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
   1.238  {
   1.239 -    int x, y;
   1.240 -    Uint8 *row;
   1.241 +    Uint8 *pixels;
   1.242  
   1.243      /* This function doesn't work on surfaces < 8 bpp */
   1.244      if (dst->format->BitsPerPixel < 8) {
   1.245 -        switch (dst->format->BitsPerPixel) {
   1.246 -        case 1:
   1.247 -            return SDL_FillRect1(dst, dstrect, color);
   1.248 -            break;
   1.249 -        case 4:
   1.250 -            return SDL_FillRect4(dst, dstrect, color);
   1.251 -            break;
   1.252 -        default:
   1.253 -            SDL_SetError("Fill rect on unsupported surface format");
   1.254 -            return (-1);
   1.255 -            break;
   1.256 -        }
   1.257 +        SDL_SetError("Fill rect on unsupported surface format");
   1.258 +        return (-1);
   1.259      }
   1.260  
   1.261      /* If 'dstrect' == NULL, then fill the whole surface */
   1.262 @@ -564,97 +753,83 @@
   1.263      if (SDL_LockSurface(dst) != 0) {
   1.264          return (-1);
   1.265      }
   1.266 -    row = (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
   1.267 +
   1.268 +    pixels =
   1.269 +        (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
   1.270          dstrect->x * dst->format->BytesPerPixel;
   1.271 -    if (dst->format->palette || (color == 0)) {
   1.272 -        x = dstrect->w * dst->format->BytesPerPixel;
   1.273 -#ifndef __MACOSX__              /* memset() is optimized on Mac OS X */
   1.274 -        if (!color && !((uintptr_t) row & 3) && !(x & 3)
   1.275 -            && !(dst->pitch & 3)) {
   1.276 -            int n = x >> 2;
   1.277 -            for (y = dstrect->h; y; --y) {
   1.278 -                SDL_memset4(row, 0, n);
   1.279 -                row += dst->pitch;
   1.280 -            }
   1.281 -        } else
   1.282 -#endif /* !__MACOSX__ */
   1.283 +
   1.284 +    switch (dst->format->BytesPerPixel) {
   1.285 +    case 1:
   1.286          {
   1.287 -            for (y = dstrect->h; y; y--) {
   1.288 -                SDL_memset(row, color, x);
   1.289 -                row += dst->pitch;
   1.290 -            }
   1.291 -        }
   1.292 -    } else {
   1.293 -        switch (dst->format->BytesPerPixel) {
   1.294 -        case 2:
   1.295 -            {
   1.296 -                Uint16 c = (Uint16) color;
   1.297 -                Uint32 cc = (Uint32) c << 16 | c;
   1.298 -                for (y = dstrect->h; y; --y) {
   1.299 -                    Uint16 *pixels = (Uint16 *) row;
   1.300 -                    int n = dstrect->w;
   1.301 -                    if ((uintptr_t) pixels & 3) {
   1.302 -                        *pixels++ = c;
   1.303 -                        n--;
   1.304 -                    }
   1.305 -                    if (n >> 1)
   1.306 -                        SDL_memset4(pixels, cc, n >> 1);
   1.307 -                    if (n & 1)
   1.308 -                        pixels[n - 1] = c;
   1.309 -                    row += dst->pitch;
   1.310 -                }
   1.311 -            }
   1.312 -            break;
   1.313 -
   1.314 -        case 3:
   1.315 -#if SDL_BYTEORDER == SDL_BIG_ENDIAN
   1.316 -            color <<= 8;
   1.317 -#endif
   1.318 -            for (y = dstrect->h; y; --y) {
   1.319 -                Uint8 *pixels = row;
   1.320 -                for (x = dstrect->w; x; --x) {
   1.321 -                    SDL_memcpy(pixels, &color, 3);
   1.322 -                    pixels += 3;
   1.323 -                }
   1.324 -                row += dst->pitch;
   1.325 -            }
   1.326 -            break;
   1.327 -
   1.328 -        case 4:
   1.329 -#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
   1.330 -            if (SDL_HasSSE() && !((uintptr_t) row & 15) && !(dstrect->w & 3)) {
   1.331 -                Uint32 cccc[4] __attribute__ ((aligned(16))) = {
   1.332 -                color, color, color, color};
   1.333 -                int i, n = dstrect->w / 4;
   1.334 -                __asm__ __volatile__("	movdqa (%0), %%xmm0\n"::
   1.335 -                                     "r"(cccc):"memory");
   1.336 -                for (y = dstrect->h; y; --y) {
   1.337 -                    Uint8 *pixels = row;
   1.338 -                    for (i = n / 2; i--;) {
   1.339 -                        /* *INDENT-OFF* */
   1.340 -                        __asm__ __volatile__("	prefetchnta 256(%0)\n"
   1.341 -                                             "	movdqa %%xmm0, (%0)\n"
   1.342 -                                             "	movdqa %%xmm0, 16(%0)\n"::"r"(pixels):"memory");
   1.343 -                        /* *INDENT-ON* */
   1.344 -                        pixels += 32;
   1.345 -                    }
   1.346 -                    if (n & 1) {
   1.347 -                        __asm__ __volatile__("	movdqa %%xmm0, (%0)\n"::
   1.348 -                                             "r"(pixels):"memory");
   1.349 -                    }
   1.350 -                    row += dst->pitch;
   1.351 -                }
   1.352 -                __asm__ __volatile__("	emms\n"::);
   1.353 +            color |= (color << 8);
   1.354 +            color |= (color << 16);
   1.355 +#ifdef __SSE__
   1.356 +            if (SDL_HasSSE()) {
   1.357 +                SDL_FillRect1SSE(pixels, dst->pitch, color, dstrect->w,
   1.358 +                                 dstrect->h);
   1.359                  break;
   1.360              }
   1.361  #endif
   1.362 -            for (y = dstrect->h; y; --y) {
   1.363 -                SDL_memset4(row, color, dstrect->w);
   1.364 -                row += dst->pitch;
   1.365 +#ifdef __MMX__
   1.366 +            if (SDL_HasMMX()) {
   1.367 +                SDL_FillRect1MMX(pixels, dst->pitch, color, dstrect->w,
   1.368 +                                 dstrect->h);
   1.369 +                break;
   1.370              }
   1.371 +#endif
   1.372 +            SDL_FillRect1(pixels, dst->pitch, color, dstrect->w, dstrect->h);
   1.373 +            break;
   1.374 +        }
   1.375 +
   1.376 +    case 2:
   1.377 +        {
   1.378 +            color |= (color << 16);
   1.379 +#ifdef __SSE__
   1.380 +            if (SDL_HasSSE()) {
   1.381 +                SDL_FillRect2SSE(pixels, dst->pitch, color, dstrect->w,
   1.382 +                                 dstrect->h);
   1.383 +                break;
   1.384 +            }
   1.385 +#endif
   1.386 +#ifdef __MMX__
   1.387 +            if (SDL_HasMMX()) {
   1.388 +                SDL_FillRect2MMX(pixels, dst->pitch, color, dstrect->w,
   1.389 +                                 dstrect->h);
   1.390 +                break;
   1.391 +            }
   1.392 +#endif
   1.393 +            SDL_FillRect2(pixels, dst->pitch, color, dstrect->w, dstrect->h);
   1.394 +            break;
   1.395 +        }
   1.396 +
   1.397 +    case 3:
   1.398 +        /* 24-bit RGB is a slow path, at least for now. */
   1.399 +        {
   1.400 +            SDL_FillRect3(pixels, dst->pitch, color, dstrect->w, dstrect->h);
   1.401 +            break;
   1.402 +        }
   1.403 +
   1.404 +    case 4:
   1.405 +        {
   1.406 +#ifdef __SSE__
   1.407 +            if (SDL_HasSSE()) {
   1.408 +                SDL_FillRect4SSE(pixels, dst->pitch, color, dstrect->w,
   1.409 +                                 dstrect->h);
   1.410 +                break;
   1.411 +            }
   1.412 +#endif
   1.413 +#ifdef __MMX__
   1.414 +            if (SDL_HasMMX()) {
   1.415 +                SDL_FillRect4MMX(pixels, dst->pitch, color, dstrect->w,
   1.416 +                                 dstrect->h);
   1.417 +                break;
   1.418 +            }
   1.419 +#endif
   1.420 +            SDL_FillRect4(pixels, dst->pitch, color, dstrect->w, dstrect->h);
   1.421              break;
   1.422          }
   1.423      }
   1.424 +
   1.425      SDL_UnlockSurface(dst);
   1.426  
   1.427      /* We're done! */