src/video/SDL_blit.h
author Sam Lantinga
Thu, 16 Aug 2007 05:56:24 +0000
changeset 2249 5a58b57b6724
parent 2247 93994f65c74c
child 2250 e1d228456537
permissions -rw-r--r--
Added SSE and MMX optimization for SDL_FillRect()
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997-2006 Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Lesser General Public
     7     License as published by the Free Software Foundation; either
     8     version 2.1 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Lesser General Public License for more details.
    14 
    15     You should have received a copy of the GNU Lesser General Public
    16     License along with this library; if not, write to the Free Software
    17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 #include "SDL_config.h"
    23 
    24 #ifndef _SDL_blit_h
    25 #define _SDL_blit_h
    26 
    27 #ifdef __MMX__
    28 #include <mmintrin.h>
    29 #endif
    30 #ifdef __SSE__
    31 #include <xmmintrin.h>
    32 #endif
    33 
    34 #include "SDL_endian.h"
    35 
    36 /* The structure passed to the low level blit functions */
    37 typedef struct
    38 {
    39     Uint8 *s_pixels;
    40     int s_width;
    41     int s_height;
    42     int s_skip;
    43     Uint8 *d_pixels;
    44     int d_width;
    45     int d_height;
    46     int d_skip;
    47     void *aux_data;
    48     SDL_PixelFormat *src;
    49     Uint8 *table;
    50     SDL_PixelFormat *dst;
    51 } SDL_BlitInfo;
    52 
    53 /* The type definition for the low level blit functions */
    54 typedef void (*SDL_loblit) (SDL_BlitInfo * info);
    55 
    56 /* This is the private info structure for software accelerated blits */
    57 struct private_swaccel
    58 {
    59     SDL_loblit blit;
    60     void *aux_data;
    61 };
    62 
    63 /* Blit mapping definition */
    64 typedef struct SDL_BlitMap
    65 {
    66     SDL_Surface *dst;
    67     int identity;
    68     Uint8 *table;
    69     SDL_blit sw_blit;
    70     struct private_swaccel *sw_data;
    71 
    72     /* the version count matches the destination; mismatch indicates
    73        an invalid mapping */
    74     unsigned int format_version;
    75 } SDL_BlitMap;
    76 
    77 #define SDL_BLIT_ANY                0x00000000
    78 #define SDL_BLIT_MMX                0x00000001
    79 #define SDL_BLIT_SSE                0x00000002
    80 #define SDL_BLIT_ALTIVEC_PREFETCH   0x00000004
    81 #define SDL_BLIT_ALTIVEC_NOPREFETCH 0x00000008
    82 
    83 typedef struct SDL_BlitEntry
    84 {
    85     Uint32 features;
    86     SDL_loblit blit;
    87 } SDL_BlitEntry;
    88 
    89 /* Functions found in SDL_blit.c */
    90 extern int SDL_CalculateBlit(SDL_Surface * surface);
    91 
    92 /* Functions found in SDL_blit_{0,1,N,A}.c */
    93 extern SDL_loblit SDL_CalculateBlit0(SDL_Surface * surface, int complex);
    94 extern SDL_loblit SDL_CalculateBlit1(SDL_Surface * surface, int complex);
    95 extern SDL_loblit SDL_CalculateBlitN(SDL_Surface * surface, int complex);
    96 extern SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface * surface, int complex);
    97 
    98 /*
    99  * Useful macros for blitting routines
   100  */
   101 
   102 #if defined(__GNUC__)
   103 #define DECLARE_ALIGNED(t,v,a)  t __attribute__((aligned(a))) v
   104 #elif defined(_MSC_VER)
   105 #define DECLARE_ALIGNED(t,v,a)  t __declspec(align(a)) v
   106 #else
   107 #define DECLARE_ALIGNED(t,v,a)  t v
   108 #endif
   109 
   110 #define FORMAT_EQUAL(A, B)						\
   111     ((A)->BitsPerPixel == (B)->BitsPerPixel				\
   112      && ((A)->Rmask == (B)->Rmask) && ((A)->Amask == (B)->Amask))
   113 
   114 /* Load pixel of the specified format from a buffer and get its R-G-B values */
   115 /* FIXME: rescale values to 0..255 here? */
   116 #define RGB_FROM_PIXEL(Pixel, fmt, r, g, b)				\
   117 {									\
   118 	r = (((Pixel&fmt->Rmask)>>fmt->Rshift)<<fmt->Rloss); 		\
   119 	g = (((Pixel&fmt->Gmask)>>fmt->Gshift)<<fmt->Gloss); 		\
   120 	b = (((Pixel&fmt->Bmask)>>fmt->Bshift)<<fmt->Bloss); 		\
   121 }
   122 #define RGB_FROM_RGB565(Pixel, r, g, b)					\
   123 {									\
   124 	r = (((Pixel&0xF800)>>11)<<3);		 			\
   125 	g = (((Pixel&0x07E0)>>5)<<2); 					\
   126 	b = ((Pixel&0x001F)<<3); 					\
   127 }
   128 #define RGB_FROM_RGB555(Pixel, r, g, b)					\
   129 {									\
   130 	r = (((Pixel&0x7C00)>>10)<<3);		 			\
   131 	g = (((Pixel&0x03E0)>>5)<<3); 					\
   132 	b = ((Pixel&0x001F)<<3); 					\
   133 }
   134 #define RGB_FROM_RGB888(Pixel, r, g, b)					\
   135 {									\
   136 	r = ((Pixel&0xFF0000)>>16);		 			\
   137 	g = ((Pixel&0xFF00)>>8);		 			\
   138 	b = (Pixel&0xFF);			 			\
   139 }
   140 #define RETRIEVE_RGB_PIXEL(buf, bpp, Pixel)				   \
   141 do {									   \
   142 	switch (bpp) {							   \
   143 		case 2:							   \
   144 			Pixel = *((Uint16 *)(buf));			   \
   145 		break;							   \
   146 									   \
   147 		case 3: {						   \
   148 		        Uint8 *B = (Uint8 *)(buf);			   \
   149 			if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		   \
   150 			        Pixel = B[0] + (B[1] << 8) + (B[2] << 16); \
   151 			} else {					   \
   152 			        Pixel = (B[0] << 16) + (B[1] << 8) + B[2]; \
   153 			}						   \
   154 		}							   \
   155 		break;							   \
   156 									   \
   157 		case 4:							   \
   158 			Pixel = *((Uint32 *)(buf));			   \
   159 		break;							   \
   160 									   \
   161 		default:						   \
   162 			Pixel = 0; /* appease gcc */			   \
   163 		break;							   \
   164 	}								   \
   165 } while(0)
   166 
   167 #define DISEMBLE_RGB(buf, bpp, fmt, Pixel, r, g, b)			   \
   168 do {									   \
   169 	switch (bpp) {							   \
   170 		case 2:							   \
   171 			Pixel = *((Uint16 *)(buf));			   \
   172 		break;							   \
   173 									   \
   174 		case 3: {						   \
   175 		        Uint8 *B = (Uint8 *)buf;			   \
   176 			if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		   \
   177 			        Pixel = B[0] + (B[1] << 8) + (B[2] << 16); \
   178 			} else {					   \
   179 			        Pixel = (B[0] << 16) + (B[1] << 8) + B[2]; \
   180 			}						   \
   181 		}							   \
   182 		break;							   \
   183 									   \
   184 		case 4:							   \
   185 			Pixel = *((Uint32 *)(buf));			   \
   186 		break;							   \
   187 									   \
   188 	        default:						   \
   189 		        Pixel = 0;	/* prevent gcc from complaining */ \
   190 		break;							   \
   191 	}								   \
   192 	RGB_FROM_PIXEL(Pixel, fmt, r, g, b);				   \
   193 } while(0)
   194 
   195 /* Assemble R-G-B values into a specified pixel format and store them */
   196 #define PIXEL_FROM_RGB(Pixel, fmt, r, g, b)				\
   197 {									\
   198 	Pixel = ((r>>fmt->Rloss)<<fmt->Rshift)|				\
   199 		((g>>fmt->Gloss)<<fmt->Gshift)|				\
   200 		((b>>fmt->Bloss)<<fmt->Bshift);				\
   201 }
   202 #define RGB565_FROM_RGB(Pixel, r, g, b)					\
   203 {									\
   204 	Pixel = ((r>>3)<<11)|((g>>2)<<5)|(b>>3);			\
   205 }
   206 #define RGB555_FROM_RGB(Pixel, r, g, b)					\
   207 {									\
   208 	Pixel = ((r>>3)<<10)|((g>>3)<<5)|(b>>3);			\
   209 }
   210 #define RGB888_FROM_RGB(Pixel, r, g, b)					\
   211 {									\
   212 	Pixel = (r<<16)|(g<<8)|b;					\
   213 }
   214 #define ASSEMBLE_RGB(buf, bpp, fmt, r, g, b) 				\
   215 {									\
   216 	switch (bpp) {							\
   217 		case 2: {						\
   218 			Uint16 Pixel;					\
   219 									\
   220 			PIXEL_FROM_RGB(Pixel, fmt, r, g, b);		\
   221 			*((Uint16 *)(buf)) = Pixel;			\
   222 		}							\
   223 		break;							\
   224 									\
   225 		case 3: {						\
   226                         if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		\
   227 			        *((buf)+fmt->Rshift/8) = r;		\
   228 				*((buf)+fmt->Gshift/8) = g;		\
   229 				*((buf)+fmt->Bshift/8) = b;		\
   230 			} else {					\
   231 			        *((buf)+2-fmt->Rshift/8) = r;		\
   232 				*((buf)+2-fmt->Gshift/8) = g;		\
   233 				*((buf)+2-fmt->Bshift/8) = b;		\
   234 			}						\
   235 		}							\
   236 		break;							\
   237 									\
   238 		case 4: {						\
   239 			Uint32 Pixel;					\
   240 									\
   241 			PIXEL_FROM_RGB(Pixel, fmt, r, g, b);		\
   242 			*((Uint32 *)(buf)) = Pixel;			\
   243 		}							\
   244 		break;							\
   245 	}								\
   246 }
   247 #define ASSEMBLE_RGB_AMASK(buf, bpp, fmt, r, g, b, Amask)		\
   248 {									\
   249 	switch (bpp) {							\
   250 		case 2: {						\
   251 			Uint16 *bufp;					\
   252 			Uint16 Pixel;					\
   253 									\
   254 			bufp = (Uint16 *)buf;				\
   255 			PIXEL_FROM_RGB(Pixel, fmt, r, g, b);		\
   256 			*bufp = Pixel | (*bufp & Amask);		\
   257 		}							\
   258 		break;							\
   259 									\
   260 		case 3: {						\
   261                         if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		\
   262 			        *((buf)+fmt->Rshift/8) = r;		\
   263 				*((buf)+fmt->Gshift/8) = g;		\
   264 				*((buf)+fmt->Bshift/8) = b;		\
   265 			} else {					\
   266 			        *((buf)+2-fmt->Rshift/8) = r;		\
   267 				*((buf)+2-fmt->Gshift/8) = g;		\
   268 				*((buf)+2-fmt->Bshift/8) = b;		\
   269 			}						\
   270 		}							\
   271 		break;							\
   272 									\
   273 		case 4: {						\
   274 			Uint32 *bufp;					\
   275 			Uint32 Pixel;					\
   276 									\
   277 			bufp = (Uint32 *)buf;				\
   278 			PIXEL_FROM_RGB(Pixel, fmt, r, g, b);		\
   279 			*bufp = Pixel | (*bufp & Amask);		\
   280 		}							\
   281 		break;							\
   282 	}								\
   283 }
   284 
   285 /* FIXME: Should we rescale alpha into 0..255 here? */
   286 #define RGBA_FROM_PIXEL(Pixel, fmt, r, g, b, a)				\
   287 {									\
   288 	r = ((Pixel&fmt->Rmask)>>fmt->Rshift)<<fmt->Rloss; 		\
   289 	g = ((Pixel&fmt->Gmask)>>fmt->Gshift)<<fmt->Gloss; 		\
   290 	b = ((Pixel&fmt->Bmask)>>fmt->Bshift)<<fmt->Bloss; 		\
   291 	a = ((Pixel&fmt->Amask)>>fmt->Ashift)<<fmt->Aloss;	 	\
   292 }
   293 #define RGBA_FROM_8888(Pixel, fmt, r, g, b, a)	\
   294 {						\
   295 	r = (Pixel&fmt->Rmask)>>fmt->Rshift;	\
   296 	g = (Pixel&fmt->Gmask)>>fmt->Gshift;	\
   297 	b = (Pixel&fmt->Bmask)>>fmt->Bshift;	\
   298 	a = (Pixel&fmt->Amask)>>fmt->Ashift;	\
   299 }
   300 #define RGBA_FROM_RGBA8888(Pixel, r, g, b, a)				\
   301 {									\
   302 	r = (Pixel>>24);						\
   303 	g = ((Pixel>>16)&0xFF);						\
   304 	b = ((Pixel>>8)&0xFF);						\
   305 	a = (Pixel&0xFF);						\
   306 }
   307 #define RGBA_FROM_ARGB8888(Pixel, r, g, b, a)				\
   308 {									\
   309 	r = ((Pixel>>16)&0xFF);						\
   310 	g = ((Pixel>>8)&0xFF);						\
   311 	b = (Pixel&0xFF);						\
   312 	a = (Pixel>>24);						\
   313 }
   314 #define RGBA_FROM_ABGR8888(Pixel, r, g, b, a)				\
   315 {									\
   316 	r = (Pixel&0xFF);						\
   317 	g = ((Pixel>>8)&0xFF);						\
   318 	b = ((Pixel>>16)&0xFF);						\
   319 	a = (Pixel>>24);						\
   320 }
   321 #define DISEMBLE_RGBA(buf, bpp, fmt, Pixel, r, g, b, a)			   \
   322 do {									   \
   323 	switch (bpp) {							   \
   324 		case 2:							   \
   325 			Pixel = *((Uint16 *)(buf));			   \
   326 		break;							   \
   327 									   \
   328 		case 3:	{/* FIXME: broken code (no alpha) */		   \
   329 		        Uint8 *b = (Uint8 *)buf;			   \
   330 			if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		   \
   331 			        Pixel = b[0] + (b[1] << 8) + (b[2] << 16); \
   332 			} else {					   \
   333 			        Pixel = (b[0] << 16) + (b[1] << 8) + b[2]; \
   334 			}						   \
   335 		}							   \
   336 		break;							   \
   337 									   \
   338 		case 4:							   \
   339 			Pixel = *((Uint32 *)(buf));			   \
   340 		break;							   \
   341 									   \
   342 		default:						   \
   343 		        Pixel = 0; /* stop gcc complaints */		   \
   344 		break;							   \
   345 	}								   \
   346 	RGBA_FROM_PIXEL(Pixel, fmt, r, g, b, a);			   \
   347 	Pixel &= ~fmt->Amask;						   \
   348 } while(0)
   349 
   350 /* FIXME: this isn't correct, especially for Alpha (maximum != 255) */
   351 #define PIXEL_FROM_RGBA(Pixel, fmt, r, g, b, a)				\
   352 {									\
   353 	Pixel = ((r>>fmt->Rloss)<<fmt->Rshift)|				\
   354 		((g>>fmt->Gloss)<<fmt->Gshift)|				\
   355 		((b>>fmt->Bloss)<<fmt->Bshift)|				\
   356 		((a>>fmt->Aloss)<<fmt->Ashift);				\
   357 }
   358 #define ASSEMBLE_RGBA(buf, bpp, fmt, r, g, b, a)			\
   359 {									\
   360 	switch (bpp) {							\
   361 		case 2: {						\
   362 			Uint16 Pixel;					\
   363 									\
   364 			PIXEL_FROM_RGBA(Pixel, fmt, r, g, b, a);	\
   365 			*((Uint16 *)(buf)) = Pixel;			\
   366 		}							\
   367 		break;							\
   368 									\
   369 		case 3: { /* FIXME: broken code (no alpha) */		\
   370                         if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		\
   371 			        *((buf)+fmt->Rshift/8) = r;		\
   372 				*((buf)+fmt->Gshift/8) = g;		\
   373 				*((buf)+fmt->Bshift/8) = b;		\
   374 			} else {					\
   375 			        *((buf)+2-fmt->Rshift/8) = r;		\
   376 				*((buf)+2-fmt->Gshift/8) = g;		\
   377 				*((buf)+2-fmt->Bshift/8) = b;		\
   378 			}						\
   379 		}							\
   380 		break;							\
   381 									\
   382 		case 4: {						\
   383 			Uint32 Pixel;					\
   384 									\
   385 			PIXEL_FROM_RGBA(Pixel, fmt, r, g, b, a);	\
   386 			*((Uint32 *)(buf)) = Pixel;			\
   387 		}							\
   388 		break;							\
   389 	}								\
   390 }
   391 
   392 /* Blend the RGB values of two Pixels based on a source alpha value */
   393 #define ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB)	\
   394 do {						\
   395 	dR = (((sR-dR)*(A))>>8)+dR;		\
   396 	dG = (((sG-dG)*(A))>>8)+dG;		\
   397 	dB = (((sB-dB)*(A))>>8)+dB;		\
   398 } while(0)
   399 
   400 /* Blend the RGB values of two Pixels based on a source alpha value */
   401 #define ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB)	\
   402 do {						\
   403     unsigned tR, tG, tB, tA; \
   404     tA = 255 - sA; \
   405     tR = 1 + (sR * sA) + (dR * tA); \
   406     dR = (tR + (tR >> 8)) >> 8; \
   407     tG = 1 + (sG * sA) + (dG * tA); \
   408     dG = (tG + (tG >> 8)) >> 8; \
   409     tB = 1 + (sB * sA) + (dB * tA); \
   410     dB = (tB + (tB >> 8)) >> 8; \
   411 } while(0)
   412 
   413 
   414 /* This is a very useful loop for optimizing blitters */
   415 #if defined(_MSC_VER) && (_MSC_VER == 1300)
   416 /* There's a bug in the Visual C++ 7 optimizer when compiling this code */
   417 #else
   418 #define USE_DUFFS_LOOP
   419 #endif
   420 #ifdef USE_DUFFS_LOOP
   421 
   422 /* 8-times unrolled loop */
   423 #define DUFFS_LOOP8(pixel_copy_increment, width)			\
   424 { int n = (width+7)/8;							\
   425 	switch (width & 7) {						\
   426 	case 0: do {	pixel_copy_increment;				\
   427 	case 7:		pixel_copy_increment;				\
   428 	case 6:		pixel_copy_increment;				\
   429 	case 5:		pixel_copy_increment;				\
   430 	case 4:		pixel_copy_increment;				\
   431 	case 3:		pixel_copy_increment;				\
   432 	case 2:		pixel_copy_increment;				\
   433 	case 1:		pixel_copy_increment;				\
   434 		} while ( --n > 0 );					\
   435 	}								\
   436 }
   437 
   438 /* 4-times unrolled loop */
   439 #define DUFFS_LOOP4(pixel_copy_increment, width)			\
   440 { int n = (width+3)/4;							\
   441 	switch (width & 3) {						\
   442 	case 0: do {	pixel_copy_increment;				\
   443 	case 3:		pixel_copy_increment;				\
   444 	case 2:		pixel_copy_increment;				\
   445 	case 1:		pixel_copy_increment;				\
   446 		} while ( --n > 0 );					\
   447 	}								\
   448 }
   449 
   450 /* 2 - times unrolled loop.
        * Stage 1: when the count w (a copy of width) is odd, run
        *          pixel_copy_increment once and decrement w.
        * Stage 2: only when w is still positive, run
        *          double_pixel_copy_increment w / 2 times; the switch on
        *          (w & 2) jumps into the middle of the two-statement
        *          do/while body (Duff's device) to handle a leftover pair.
        * The locals n and w are visible to the statements passed in. */
   451 #define DUFFS_LOOP_DOUBLE2(pixel_copy_increment,			\
   452 				double_pixel_copy_increment, width)	\
   453 { int n, w = width;							\
   454 	if( w & 1 ) {							\
   455 	    pixel_copy_increment;					\
   456 	    w--;							\
   457 	}								\
   458 	if ( w > 0 )	{						\
   459 	    n = ( w + 2) / 4;						\
   460 	    switch( w & 2 ) {						\
   461 	    case 0: do {	double_pixel_copy_increment;		\
   462 	    case 2:		double_pixel_copy_increment;		\
   463 		    } while ( --n > 0 );					\
   464 	    }								\
   465 	}								\
   466 }
   467 
   468 /* 2 - times unrolled loop 4 pixels.
        * Stage 1: when bit 0 of the count w (a copy of width) is set, run
        *          pixel_copy_increment once and decrement w.
        * Stage 2: when bit 1 of w is set, run double_pixel_copy_increment
        *          once and subtract 2 from w.
        * Stage 3: only when w is still positive, run
        *          quatro_pixel_copy_increment w / 4 times; the switch on
        *          (w & 4) jumps into the middle of the two-statement
        *          do/while body (Duff's device) to handle a leftover group.
        * The locals n and w are visible to the statements passed in. */
   469 #define DUFFS_LOOP_QUATRO2(pixel_copy_increment,			\
   470 				double_pixel_copy_increment,		\
   471 				quatro_pixel_copy_increment, width)	\
   472 { int n, w = width;								\
   473         if(w & 1) {							\
   474 	  pixel_copy_increment;						\
   475 	  w--;								\
   476 	}								\
   477 	if(w & 2) {							\
   478 	  double_pixel_copy_increment;					\
   479 	  w -= 2;							\
   480 	}								\
   481 	if ( w > 0 ) {							\
   482 	    n = ( w + 7 ) / 8;						\
   483 	    switch( w & 4 ) {						\
   484 	    case 0: do {	quatro_pixel_copy_increment;		\
   485 	    case 4:		quatro_pixel_copy_increment;		\
   486 		    } while ( --n > 0 );					\
   487 	    }								\
   488 	}								\
   489 }
   490 
   491 /* Use the 8-times version of the loop by default */
   492 #define DUFFS_LOOP(pixel_copy_increment, width)				\
   493 	DUFFS_LOOP8(pixel_copy_increment, width)
   494 
   495 #else
   496 
   497 /* Don't use Duff's device to unroll loops */
   498 #define DUFFS_LOOP_DOUBLE2(pixel_copy_increment,			\
   499 			 double_pixel_copy_increment, width)		\
   500 { int n = width;								\
   501     if( n & 1 ) {							\
   502 	pixel_copy_increment;						\
   503 	n--;								\
   504     }									\
   505     n=n>>1;								\
   506     for(; n > 0; --n) {   						\
   507 	double_pixel_copy_increment;					\
   508     }									\
   509 }
   510 
   511 /* Don't use Duff's device to unroll loops */
   512 #define DUFFS_LOOP_QUATRO2(pixel_copy_increment,			\
   513 				double_pixel_copy_increment,		\
   514 				quatro_pixel_copy_increment, width)	\
   515 { int n = width;								\
   516         if(n & 1) {							\
   517 	  pixel_copy_increment;						\
   518 	  n--;								\
   519 	}								\
   520 	if(n & 2) {							\
   521 	  double_pixel_copy_increment;					\
   522 	  n -= 2;							\
   523 	}								\
   524 	n=n>>2;								\
   525 	for(; n > 0; --n) {   						\
   526 	  quatro_pixel_copy_increment;					\
   527         }								\
   528 }
   529 
   530 /* Don't use Duff's device to unroll loops */
   531 #define DUFFS_LOOP(pixel_copy_increment, width)				\
   532 { int n;								\
   533 	for ( n=width; n > 0; --n ) {					\
   534 		pixel_copy_increment;					\
   535 	}								\
   536 }
   537 #define DUFFS_LOOP8(pixel_copy_increment, width)			\
   538 	DUFFS_LOOP(pixel_copy_increment, width)
   539 #define DUFFS_LOOP4(pixel_copy_increment, width)			\
   540 	DUFFS_LOOP(pixel_copy_increment, width)
   541 
   542 #endif /* USE_DUFFS_LOOP */
   543 
   544 /* Silence Visual C++ warning C4550 (expression evaluates to a function which is missing an argument list) */
   545 #if defined(_MSC_VER) && (_MSC_VER >= 600)
   546 #pragma warning(disable: 4550)
   547 #endif
   548 
   549 #endif /* _SDL_blit_h */
   550 /* vi: set ts=4 sw=4 expandtab: */