From 344ba2c0da5b32b7fbd37963ec251eda4afc3689 Mon Sep 17 00:00:00 2001
From: Sam Lantinga
Date: Thu, 16 Aug 2007 05:56:24 +0000
Subject: [PATCH] Added SSE and MMX optimization for SDL_FillRect()

---
 src/video/SDL_blit.c      |   9 +-
 src/video/SDL_blit.h      |  15 ++
 src/video/SDL_blit_copy.c |   7 +-
 src/video/SDL_surface.c   | 405 ++++++++++++++++++++++++++++----------
 4 files changed, 329 insertions(+), 107 deletions(-)

diff --git a/src/video/SDL_blit.c b/src/video/SDL_blit.c
index 13eaf0140..ffd74f04c 100644
--- a/src/video/SDL_blit.c
+++ b/src/video/SDL_blit.c
@@ -110,7 +110,8 @@ SDL_SoftBlit(SDL_Surface * src, SDL_Rect * srcrect,
 
 #ifdef __MACOSX__
 #include <sys/sysctl.h>
-static SDL_bool SDL_UseAltivecPrefetch()
+static SDL_bool
+SDL_UseAltivecPrefetch()
 {
     const char key[] = "hw.l3cachesize";
     u_int64_t result = 0;
@@ -123,14 +124,16 @@ static SDL_bool SDL_UseAltivecPrefetch()
     }
 }
 #else
-static SDL_bool SDL_UseAltivecPrefetch()
+static SDL_bool
+SDL_UseAltivecPrefetch()
 {
     /* Just guess G4 */
     return SDL_TRUE;
 }
 #endif /* __MACOSX__ */
 
-static SDL_loblit SDL_ChooseBlitFunc(SDL_BlitEntry *entries, int count)
+static SDL_loblit
+SDL_ChooseBlitFunc(SDL_BlitEntry * entries, int count)
 {
     int i;
     static Uint32 features = 0xffffffff;
diff --git a/src/video/SDL_blit.h b/src/video/SDL_blit.h
index 3595d6ff1..8795b2b08 100644
--- a/src/video/SDL_blit.h
+++ b/src/video/SDL_blit.h
@@ -24,6 +24,13 @@
 #ifndef _SDL_blit_h
 #define _SDL_blit_h
 
+#ifdef __MMX__
+#include <mmintrin.h>
+#endif
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
 #include "SDL_endian.h"
 
 /* The structure passed to the low level blit functions */
@@ -92,6 +99,14 @@ extern SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface * surface, int complex);
  * Useful macros for blitting routines
  */
 
+#if defined(__GNUC__)
+#define DECLARE_ALIGNED(t,v,a)  t __attribute__((aligned(a))) v
+#elif defined(_MSC_VER)
+#define DECLARE_ALIGNED(t,v,a)  t __declspec(align(a)) v
+#else
+#define DECLARE_ALIGNED(t,v,a)  t v
+#endif
+
 #define FORMAT_EQUAL(A, B)						\
     ((A)->BitsPerPixel == (B)->BitsPerPixel				\
      && ((A)->Rmask == (B)->Rmask) && ((A)->Amask == (B)->Amask))
diff --git a/src/video/SDL_blit_copy.c b/src/video/SDL_blit_copy.c
index 84831089d..01be9ecc5 100644
--- a/src/video/SDL_blit_copy.c
+++ b/src/video/SDL_blit_copy.c
@@ -23,13 +23,8 @@
 
 #include "SDL_video.h"
 #include "SDL_blit.h"
+#include "SDL_blit_copy.h"
 
-#ifdef __MMX__
-#include <mmintrin.h>
-#endif
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
 
 #ifdef __MMX__
 static __inline__ void
diff --git a/src/video/SDL_surface.c b/src/video/SDL_surface.c
index 64fdcf6f0..eabab7de5 100644
--- a/src/video/SDL_surface.c
+++ b/src/video/SDL_surface.c
@@ -509,20 +509,250 @@ SDL_UpperBlit(SDL_Surface * src, SDL_Rect * srcrect,
     return 0;
 }
 
-static int
-SDL_FillRect1(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
+#ifdef __SSE__
+/* *INDENT-OFF* */
+
+/*
+ * SSE fill helpers.  'color' must already be replicated across all 32 bits.
+ * The vector is declared before any statement so the macro stays valid C89.
+ */
+#define SSE_BEGIN \
+    __m128 c128; \
+    DECLARE_ALIGNED(Uint32, cccc[4], 16); \
+    cccc[0] = color; \
+    cccc[1] = color; \
+    cccc[2] = color; \
+    cccc[3] = color; \
+    c128 = *(__m128 *)cccc;
+
+/* Stream 64 bytes per iteration; 'p' must be 16-byte aligned here. */
+#define SSE_WORK \
+    for (i = n / 64; i--;) { \
+        _mm_stream_ps((float *)(p+0), c128); \
+        _mm_stream_ps((float *)(p+16), c128); \
+        _mm_stream_ps((float *)(p+32), c128); \
+        _mm_stream_ps((float *)(p+48), c128); \
+        p += 64; \
+    }
+
+#define SSE_END
+
+/*
+ * For each row: write single pixels until 'p' is 16-byte aligned, stream the
+ * 64-byte chunks, then write the leftover pixels one at a time.
+ * NOTE(review): this assumes each row starts on a multiple of 'bpp' bytes,
+ * otherwise 'p' is still misaligned when SSE_WORK runs -- confirm callers.
+ */
+#define DEFINE_SSE_FILLRECT(bpp, type) \
+static void \
+SDL_FillRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
+{ \
+    SSE_BEGIN; \
+ \
+    while (h--) { \
+        int i, n = w * bpp; \
+        Uint8 *p = pixels; \
+ \
+        if (n > 15) { \
+            int adjust = 16 - ((uintptr_t)p & 15); \
+            if (adjust < 16) { \
+                n -= adjust; \
+                adjust /= bpp; \
+                while(adjust--) { \
+                    *((type *)p) = (type)color; \
+                    p += bpp; \
+                } \
+            } \
+            SSE_WORK; \
+        } \
+        if (n & 63) { \
+            int remainder = (n & 63); \
+            remainder /= bpp; \
+            while(remainder--) { \
+                *((type *)p) = (type)color; \
+                p += bpp; \
+            } \
+        } \
+        pixels += pitch; \
+    } \
+ \
+    SSE_END; \
+}
+
+DEFINE_SSE_FILLRECT(1, Uint8)
+DEFINE_SSE_FILLRECT(2, Uint16)
+DEFINE_SSE_FILLRECT(4, Uint32)
+
+/* *INDENT-ON* */
+#endif /* __SSE__ */
+
+#ifdef __MMX__
+/* *INDENT-OFF* */
+
+/* MMX fill helpers; 'color' must already be replicated across all 32 bits. */
+#define MMX_BEGIN \
+    __m64 c64 = _mm_set_pi32(color, color)
+
+/*
+ * Plain 64-bit stores are used because _mm_stream_pi() (MOVNTQ) is an SSE
+ * instruction and faults on processors that only implement MMX.
+ */
+#define MMX_WORK \
+    for (i = n / 64; i--;) { \
+        *(__m64 *)(p+0) = c64; \
+        *(__m64 *)(p+8) = c64; \
+        *(__m64 *)(p+16) = c64; \
+        *(__m64 *)(p+24) = c64; \
+        *(__m64 *)(p+32) = c64; \
+        *(__m64 *)(p+40) = c64; \
+        *(__m64 *)(p+48) = c64; \
+        *(__m64 *)(p+56) = c64; \
+        p += 64; \
+    }
+
+#define MMX_END \
+    _mm_empty()
+
+/* Same row algorithm as the SSE variant, aligning 'p' to 8 bytes instead. */
+#define DEFINE_MMX_FILLRECT(bpp, type) \
+static void \
+SDL_FillRect##bpp##MMX(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
+{ \
+    MMX_BEGIN; \
+ \
+    while (h--) { \
+        int i, n = w * bpp; \
+        Uint8 *p = pixels; \
+ \
+        if (n > 7) { \
+            int adjust = 8 - ((uintptr_t)p & 7); \
+            if (adjust < 8) { \
+                n -= adjust; \
+                adjust /= bpp; \
+                while(adjust--) { \
+                    *((type *)p) = (type)color; \
+                    p += bpp; \
+                } \
+            } \
+            MMX_WORK; \
+        } \
+        if (n & 63) { \
+            int remainder = (n & 63); \
+            remainder /= bpp; \
+            while(remainder--) { \
+                *((type *)p) = (type)color; \
+                p += bpp; \
+            } \
+        } \
+        pixels += pitch; \
+    } \
+ \
+    MMX_END; \
+}
+
+DEFINE_MMX_FILLRECT(1, Uint8)
+DEFINE_MMX_FILLRECT(2, Uint16)
+DEFINE_MMX_FILLRECT(4, Uint32)
+
+/* *INDENT-ON* */
+#endif /* __MMX__ */
+
+/* Fill w x h 8-bit pixels; 'color' must hold the value in all four bytes. */
+static void
+SDL_FillRect1(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
 {
-    /* FIXME: We have to worry about packing order.. *sigh* */
-    SDL_SetError("1-bpp rect fill not yet implemented");
-    return -1;
+    while (h--) {
+        int n = w;
+        Uint8 *p = pixels;
+
+        if (n > 3) {
+            /* Align 'p' to 4 bytes; the cases fall through on purpose */
+            switch ((uintptr_t) p & 3) {
+            case 1:
+                *p++ = (Uint8) color;
+                --n;
+            case 2:
+                *p++ = (Uint8) color;
+                --n;
+            case 3:
+                *p++ = (Uint8) color;
+                --n;
+            }
+            SDL_memset4(p, color, (n >> 2));
+        }
+        if (n & 3) {
+            p += (n & ~3);
+            /* Trailing 1-3 bytes; the cases fall through on purpose */
+            switch (n & 3) {
+            case 3:
+                *p++ = (Uint8) color;
+            case 2:
+                *p++ = (Uint8) color;
+            case 1:
+                *p++ = (Uint8) color;
+            }
+        }
+        pixels += pitch;
+    }
 }
 
-static int
-SDL_FillRect4(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
+/* Fill w x h 16-bit pixels; 'color' must hold the value in both halves. */
+static void
+SDL_FillRect2(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
+{
+    while (h--) {
+        int n = w;
+        Uint16 *p = (Uint16 *) pixels;
+
+        if (n > 1) {
+            if ((uintptr_t) p & 2) {
+                *p++ = (Uint16) color;
+                --n;
+            }
+            SDL_memset4(p, color, (n >> 1));
+        }
+        if (n & 1) {
+            p[n - 1] = (Uint16) color;
+        }
+        pixels += pitch;
+    }
+}
+
+/* Fill w x h 24-bit pixels, one byte at a time in native pixel byte order. */
+static void
+SDL_FillRect3(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
+{
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+    Uint8 b1 = (Uint8) ((color >> 16) & 0xFF);
+    Uint8 b2 = (Uint8) ((color >> 8) & 0xFF);
+    Uint8 b3 = (Uint8) (color & 0xFF);
+#else
+    Uint8 b1 = (Uint8) (color & 0xFF);
+    Uint8 b2 = (Uint8) ((color >> 8) & 0xFF);
+    Uint8 b3 = (Uint8) ((color >> 16) & 0xFF);
+#endif
+
+    while (h--) {
+        int n = w;
+        Uint8 *p = pixels;
+
+        while (n--) {
+            *p++ = b1;
+            *p++ = b2;
+            *p++ = b3;
+        }
+        pixels += pitch;
+    }
+}
+
+/* Fill w x h 32-bit pixels with 'color'. */
+static void
+SDL_FillRect4(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
 {
-    /* FIXME: We have to worry about packing order.. *sigh* */
-    SDL_SetError("4-bpp rect fill not yet implemented");
-    return -1;
+    while (h--) {
+        SDL_memset4(pixels, color, w);
+        pixels += pitch;
+    }
 }
 
 /*
@@ -531,23 +761,12 @@ SDL_FillRect4(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
 int
 SDL_FillRect(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
 {
-    int x, y;
-    Uint8 *row;
+    Uint8 *pixels;
 
     /* This function doesn't work on surfaces < 8 bpp */
     if (dst->format->BitsPerPixel < 8) {
-        switch (dst->format->BitsPerPixel) {
-        case 1:
-            return SDL_FillRect1(dst, dstrect, color);
-            break;
-        case 4:
-            return SDL_FillRect4(dst, dstrect, color);
-            break;
-        default:
-            SDL_SetError("Fill rect on unsupported surface format");
-            return (-1);
-            break;
-        }
+        SDL_SetError("Fill rect on unsupported surface format");
+        return (-1);
     }
 
     /* If 'dstrect' == NULL, then fill the whole surface */
@@ -564,97 +783,87 @@ SDL_FillRect(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
     if (SDL_LockSurface(dst) != 0) {
         return (-1);
     }
-    row = (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
+
+    pixels =
+        (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
         dstrect->x * dst->format->BytesPerPixel;
-    if (dst->format->palette || (color == 0)) {
-        x = dstrect->w * dst->format->BytesPerPixel;
-#ifndef __MACOSX__ /* memset() is optimized on Mac OS X */
-        if (!color && !((uintptr_t) row & 3) && !(x & 3)
-            && !(dst->pitch & 3)) {
-            int n = x >> 2;
-            for (y = dstrect->h; y; --y) {
-                SDL_memset4(row, 0, n);
-                row += dst->pitch;
-            }
-        } else
-#endif /* !__MACOSX__ */
+
+    switch (dst->format->BytesPerPixel) {
+    case 1:
         {
-            for (y = dstrect->h; y; y--) {
-                SDL_memset(row, color, x);
-                row += dst->pitch;
+            /* Replicate the 8-bit value into every byte of 'color' */
+            color &= 0xFF;
+            color |= (color << 8);
+            color |= (color << 16);
+#ifdef __SSE__
+            if (SDL_HasSSE()) {
+                SDL_FillRect1SSE(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
             }
-        }
-    } else {
-        switch (dst->format->BytesPerPixel) {
-        case 2:
-            {
-                Uint16 c = (Uint16) color;
-                Uint32 cc = (Uint32) c << 16 | c;
-                for (y = dstrect->h; y; --y) {
-                    Uint16 *pixels = (Uint16 *) row;
-                    int n = dstrect->w;
-                    if ((uintptr_t) pixels & 3) {
-                        *pixels++ = c;
-                        n--;
-                    }
-                    if (n >> 1)
-                        SDL_memset4(pixels, cc, n >> 1);
-                    if (n & 1)
-                        pixels[n - 1] = c;
-                    row += dst->pitch;
-                }
+#endif
+#ifdef __MMX__
+            if (SDL_HasMMX()) {
+                SDL_FillRect1MMX(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
             }
+#endif
+            SDL_FillRect1(pixels, dst->pitch, color, dstrect->w, dstrect->h);
             break;
+        }
 
-        case 3:
-#if SDL_BYTEORDER == SDL_BIG_ENDIAN
-            color <<= 8;
+    case 2:
+        {
+            /* Replicate the 16-bit value into both halves of 'color' */
+            color &= 0xFFFF;
+            color |= (color << 16);
+#ifdef __SSE__
+            if (SDL_HasSSE()) {
+                SDL_FillRect2SSE(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
+            }
 #endif
-            for (y = dstrect->h; y; --y) {
-                Uint8 *pixels = row;
-                for (x = dstrect->w; x; --x) {
-                    SDL_memcpy(pixels, &color, 3);
-                    pixels += 3;
-                }
-                row += dst->pitch;
+#ifdef __MMX__
+            if (SDL_HasMMX()) {
+                SDL_FillRect2MMX(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
             }
+#endif
+            SDL_FillRect2(pixels, dst->pitch, color, dstrect->w, dstrect->h);
+            break;
+        }
+
+    case 3:
+        /* 24-bit RGB is a slow path, at least for now. */
+        {
+            SDL_FillRect3(pixels, dst->pitch, color, dstrect->w, dstrect->h);
             break;
+        }
 
-        case 4:
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
-            if (SDL_HasSSE() && !((uintptr_t) row & 15) && !(dstrect->w & 3)) {
-                Uint32 cccc[4] __attribute__ ((aligned(16))) = {
-                    color, color, color, color};
-                int i, n = dstrect->w / 4;
-                __asm__ __volatile__(" movdqa (%0), %%xmm0\n"::
-                                     "r"(cccc):"memory");
-                for (y = dstrect->h; y; --y) {
-                    Uint8 *pixels = row;
-                    for (i = n / 2; i--;) {
-                        /* *INDENT-OFF* */
-                        __asm__ __volatile__(" prefetchnta 256(%0)\n"
-                                             " movdqa %%xmm0, (%0)\n"
-                                             " movdqa %%xmm0, 16(%0)\n"::"r"(pixels):"memory");
-                        /* *INDENT-ON* */
-                        pixels += 32;
-                    }
-                    if (n & 1) {
-                        __asm__ __volatile__(" movdqa %%xmm0, (%0)\n"::
-                                             "r"(pixels):"memory");
-                    }
-                    row += dst->pitch;
-                }
-                __asm__ __volatile__(" emms\n"::);
+    case 4:
+        {
+#ifdef __SSE__
+            if (SDL_HasSSE()) {
+                SDL_FillRect4SSE(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
                 break;
             }
 #endif
-            for (y = dstrect->h; y; --y) {
-                SDL_memset4(row, color, dstrect->w);
-                row += dst->pitch;
+#ifdef __MMX__
+            if (SDL_HasMMX()) {
+                SDL_FillRect4MMX(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
             }
+#endif
+            SDL_FillRect4(pixels, dst->pitch, color, dstrect->w, dstrect->h);
             break;
         }
     }
+
     SDL_UnlockSurface(dst);
 
     /* We're done! */