src/video/SDL_blit.h
author Sam Lantinga <slouken@libsdl.org>
Fri, 22 Aug 2003 05:51:19 +0000
changeset 689 5bb080d35049
parent 553 417f8709e648
child 769 b8d311d90021
permissions -rw-r--r--
Date: Tue, 19 Aug 2003 17:57:00 +0200
From: Stephane Marchesin
Subject: Re: [SDL] [patch] MMX alpha blit patches with MMX detection

I think everything is correct now. I've done as much testing as I could,
but some real-world testing wouldn't hurt, I think.
The patch is here : http://icps.u-strasbg.fr/~marchesin/sdl_mmxblit.patch

If you do byte-by-byte comparison of the output between C and MMX
functions, you'll notice that the results for 555 and 565 RGB alpha
blits aren't exactly the same. This is because the MMX functions for 555 and
565 RGB have a higher accuracy. If you want the exact same behaviour
that's possible by masking the three lower alpha bits in the MMX
functions. Just ask !

I removed one MMX function because, after I fixed it to match its C
equivalent, it turned out to be slower than the C version on a PIII
(although a bit faster on an Athlon XP).

I've also added MMX and PIII replacements for SDL_memcpy. Those provide
some speed up in testvidinfo -benchmark (at least for me, under linux &
X11).
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002  Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Library General Public
     7     License as published by the Free Software Foundation; either
     8     version 2 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Library General Public License for more details.
    14 
    15     You should have received a copy of the GNU Library General Public
    16     License along with this library; if not, write to the Free
    17     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 
    23 #ifdef SAVE_RCSID
    24 static char rcsid =
    25  "@(#) $Id$";
    26 #endif
    27 
    28 #ifndef _SDL_blit_h
    29 #define _SDL_blit_h
    30 
    31 #include "SDL_endian.h"
    32 
    33 /* The structure passed to the low level blit functions */
    34 typedef struct {
    35 	Uint8 *s_pixels;
    36 	int s_width;
    37 	int s_height;
    38 	int s_skip;
    39 	Uint8 *d_pixels;
    40 	int d_width;
    41 	int d_height;
    42 	int d_skip;
    43 	void *aux_data;
    44 	SDL_PixelFormat *src;
    45 	Uint8 *table;
    46 	SDL_PixelFormat *dst;
    47 } SDL_BlitInfo;
    48 
/* The type definition for the low level blit functions: each one takes a
   pointer to a single SDL_BlitInfo and returns nothing. */
typedef void (*SDL_loblit)(SDL_BlitInfo *info);

/* This is the private info structure for software accelerated blits.
   It pairs a low level blit function with an untyped data pointer. */
struct private_swaccel {
	SDL_loblit blit;
	void *aux_data;
};
    57 
    58 /* Blit mapping definition */
    59 typedef struct SDL_BlitMap {
    60 	SDL_Surface *dst;
    61 	int identity;
    62 	Uint8 *table;
    63 	SDL_blit hw_blit;
    64 	SDL_blit sw_blit;
    65 	struct private_hwaccel *hw_data;
    66 	struct private_swaccel *sw_data;
    67 
    68 	/* the version count matches the destination; mismatch indicates
    69 	   an invalid mapping */
    70         unsigned int format_version;
    71 } SDL_BlitMap;
    72 
    73 
    74 /* Functions found in SDL_blit.c */
    75 extern int SDL_CalculateBlit(SDL_Surface *surface);
    76 
    77 /* Functions found in SDL_blit_{0,1,N,A}.c */
    78 extern SDL_loblit SDL_CalculateBlit0(SDL_Surface *surface, int complex);
    79 extern SDL_loblit SDL_CalculateBlit1(SDL_Surface *surface, int complex);
    80 extern SDL_loblit SDL_CalculateBlitN(SDL_Surface *surface, int complex);
    81 extern SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int complex);
    82 
    83 /*
    84  * Useful macros for blitting routines
    85  */
    86 
    87 #define FORMAT_EQUAL(A, B)						\
    88     ((A)->BitsPerPixel == (B)->BitsPerPixel				\
    89      && ((A)->Rmask == (B)->Rmask) && ((A)->Amask == (B)->Amask))
    90 
    91 /* Load pixel of the specified format from a buffer and get its R-G-B values */
    92 /* FIXME: rescale values to 0..255 here? */
    93 #define RGB_FROM_PIXEL(pixel, fmt, r, g, b)				\
    94 {									\
    95 	r = (((pixel&fmt->Rmask)>>fmt->Rshift)<<fmt->Rloss); 		\
    96 	g = (((pixel&fmt->Gmask)>>fmt->Gshift)<<fmt->Gloss); 		\
    97 	b = (((pixel&fmt->Bmask)>>fmt->Bshift)<<fmt->Bloss); 		\
    98 }
    99 #define RGB_FROM_RGB565(pixel, r, g, b)					\
   100 {									\
   101 	r = (((pixel&0xF800)>>11)<<3);		 			\
   102 	g = (((pixel&0x07E0)>>5)<<2); 					\
   103 	b = ((pixel&0x001F)<<3); 					\
   104 }
   105 #define RGB_FROM_RGB555(pixel, r, g, b)					\
   106 {									\
   107 	r = (((pixel&0x7C00)>>10)<<3);		 			\
   108 	g = (((pixel&0x03E0)>>5)<<3); 					\
   109 	b = ((pixel&0x001F)<<3); 					\
   110 }
   111 #define RGB_FROM_RGB888(pixel, r, g, b)					\
   112 {									\
   113 	r = ((pixel&0xFF0000)>>16);		 			\
   114 	g = ((pixel&0xFF00)>>8);		 			\
   115 	b = (pixel&0xFF);			 			\
   116 }
   117 #define RETRIEVE_RGB_PIXEL(buf, bpp, pixel)				   \
   118 do {									   \
   119 	switch (bpp) {							   \
   120 		case 2:							   \
   121 			pixel = *((Uint16 *)(buf));			   \
   122 		break;							   \
   123 									   \
   124 		case 3: {						   \
   125 		        Uint8 *B = (Uint8 *)(buf);			   \
   126 			if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		   \
   127 			        pixel = B[0] + (B[1] << 8) + (B[2] << 16); \
   128 			} else {					   \
   129 			        pixel = (B[0] << 16) + (B[1] << 8) + B[2]; \
   130 			}						   \
   131 		}							   \
   132 		break;							   \
   133 									   \
   134 		case 4:							   \
   135 			pixel = *((Uint32 *)(buf));			   \
   136 		break;							   \
   137 									   \
   138 		default:						   \
   139 			pixel = 0; /* appease gcc */			   \
   140 		break;							   \
   141 	}								   \
   142 } while(0)
   143 
   144 #define DISEMBLE_RGB(buf, bpp, fmt, pixel, r, g, b)			   \
   145 do {									   \
   146 	switch (bpp) {							   \
   147 		case 2:							   \
   148 			pixel = *((Uint16 *)(buf));			   \
   149 		break;							   \
   150 									   \
   151 		case 3: {						   \
   152 		        Uint8 *B = (Uint8 *)buf;			   \
   153 			if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		   \
   154 			        pixel = B[0] + (B[1] << 8) + (B[2] << 16); \
   155 			} else {					   \
   156 			        pixel = (B[0] << 16) + (B[1] << 8) + B[2]; \
   157 			}						   \
   158 		}							   \
   159 		break;							   \
   160 									   \
   161 		case 4:							   \
   162 			pixel = *((Uint32 *)(buf));			   \
   163 		break;							   \
   164 									   \
   165 	        default:						   \
   166 		        pixel = 0;	/* prevent gcc from complaining */ \
   167 		break;							   \
   168 	}								   \
   169 	RGB_FROM_PIXEL(pixel, fmt, r, g, b);				   \
   170 } while(0)
   171 
   172 /* Assemble R-G-B values into a specified pixel format and store them */
   173 #define PIXEL_FROM_RGB(pixel, fmt, r, g, b)				\
   174 {									\
   175 	pixel = ((r>>fmt->Rloss)<<fmt->Rshift)|				\
   176 		((g>>fmt->Gloss)<<fmt->Gshift)|				\
   177 		((b>>fmt->Bloss)<<fmt->Bshift);				\
   178 }
   179 #define RGB565_FROM_RGB(pixel, r, g, b)					\
   180 {									\
   181 	pixel = ((r>>3)<<11)|((g>>2)<<5)|(b>>3);			\
   182 }
   183 #define RGB555_FROM_RGB(pixel, r, g, b)					\
   184 {									\
   185 	pixel = ((r>>3)<<10)|((g>>3)<<5)|(b>>3);			\
   186 }
   187 #define RGB888_FROM_RGB(pixel, r, g, b)					\
   188 {									\
   189 	pixel = (r<<16)|(g<<8)|b;					\
   190 }
   191 #define ASSEMBLE_RGB(buf, bpp, fmt, r, g, b) 				\
   192 {									\
   193 	switch (bpp) {							\
   194 		case 2: {						\
   195 			Uint16 pixel;					\
   196 									\
   197 			PIXEL_FROM_RGB(pixel, fmt, r, g, b);		\
   198 			*((Uint16 *)(buf)) = pixel;			\
   199 		}							\
   200 		break;							\
   201 									\
   202 		case 3: {						\
   203                         if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		\
   204 			        *((buf)+fmt->Rshift/8) = r;		\
   205 				*((buf)+fmt->Gshift/8) = g;		\
   206 				*((buf)+fmt->Bshift/8) = b;		\
   207 			} else {					\
   208 			        *((buf)+2-fmt->Rshift/8) = r;		\
   209 				*((buf)+2-fmt->Gshift/8) = g;		\
   210 				*((buf)+2-fmt->Bshift/8) = b;		\
   211 			}						\
   212 		}							\
   213 		break;							\
   214 									\
   215 		case 4: {						\
   216 			Uint32 pixel;					\
   217 									\
   218 			PIXEL_FROM_RGB(pixel, fmt, r, g, b);		\
   219 			*((Uint32 *)(buf)) = pixel;			\
   220 		}							\
   221 		break;							\
   222 	}								\
   223 }
   224 #define ASSEMBLE_RGB_AMASK(buf, bpp, fmt, r, g, b, Amask)		\
   225 {									\
   226 	switch (bpp) {							\
   227 		case 2: {						\
   228 			Uint16 *bufp;					\
   229 			Uint16 pixel;					\
   230 									\
   231 			bufp = (Uint16 *)buf;				\
   232 			PIXEL_FROM_RGB(pixel, fmt, r, g, b);		\
   233 			*bufp = pixel | (*bufp & Amask);		\
   234 		}							\
   235 		break;							\
   236 									\
   237 		case 3: {						\
   238                         if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		\
   239 			        *((buf)+fmt->Rshift/8) = r;		\
   240 				*((buf)+fmt->Gshift/8) = g;		\
   241 				*((buf)+fmt->Bshift/8) = b;		\
   242 			} else {					\
   243 			        *((buf)+2-fmt->Rshift/8) = r;		\
   244 				*((buf)+2-fmt->Gshift/8) = g;		\
   245 				*((buf)+2-fmt->Bshift/8) = b;		\
   246 			}						\
   247 		}							\
   248 		break;							\
   249 									\
   250 		case 4: {						\
   251 			Uint32 *bufp;					\
   252 			Uint32 pixel;					\
   253 									\
   254 			bufp = (Uint32 *)buf;				\
   255 			PIXEL_FROM_RGB(pixel, fmt, r, g, b);		\
   256 			*bufp = pixel | (*bufp & Amask);		\
   257 		}							\
   258 		break;							\
   259 	}								\
   260 }
   261 
   262 /* FIXME: Should we rescale alpha into 0..255 here? */
   263 #define RGBA_FROM_PIXEL(pixel, fmt, r, g, b, a)				\
   264 {									\
   265 	r = ((pixel&fmt->Rmask)>>fmt->Rshift)<<fmt->Rloss; 		\
   266 	g = ((pixel&fmt->Gmask)>>fmt->Gshift)<<fmt->Gloss; 		\
   267 	b = ((pixel&fmt->Bmask)>>fmt->Bshift)<<fmt->Bloss; 		\
   268 	a = ((pixel&fmt->Amask)>>fmt->Ashift)<<fmt->Aloss;	 	\
   269 }
   270 #define RGBA_FROM_8888(pixel, fmt, r, g, b, a)	\
   271 {						\
   272 	r = (pixel&fmt->Rmask)>>fmt->Rshift;	\
   273 	g = (pixel&fmt->Gmask)>>fmt->Gshift;	\
   274 	b = (pixel&fmt->Bmask)>>fmt->Bshift;	\
   275 	a = (pixel&fmt->Amask)>>fmt->Ashift;	\
   276 }
   277 #define RGBA_FROM_RGBA8888(pixel, r, g, b, a)				\
   278 {									\
   279 	r = (pixel>>24);						\
   280 	g = ((pixel>>16)&0xFF);						\
   281 	b = ((pixel>>8)&0xFF);						\
   282 	a = (pixel&0xFF);						\
   283 }
   284 #define RGBA_FROM_ARGB8888(pixel, r, g, b, a)				\
   285 {									\
   286 	r = ((pixel>>16)&0xFF);						\
   287 	g = ((pixel>>8)&0xFF);						\
   288 	b = (pixel&0xFF);						\
   289 	a = (pixel>>24);						\
   290 }
   291 #define RGBA_FROM_ABGR8888(pixel, r, g, b, a)				\
   292 {									\
   293 	r = (pixel&0xFF);						\
   294 	g = ((pixel>>8)&0xFF);						\
   295 	b = ((pixel>>16)&0xFF);						\
   296 	a = (pixel>>24);						\
   297 }
   298 #define DISEMBLE_RGBA(buf, bpp, fmt, pixel, r, g, b, a)			   \
   299 do {									   \
   300 	switch (bpp) {							   \
   301 		case 2:							   \
   302 			pixel = *((Uint16 *)(buf));			   \
   303 		break;							   \
   304 									   \
   305 		case 3:	{/* FIXME: broken code (no alpha) */		   \
   306 		        Uint8 *b = (Uint8 *)buf;			   \
   307 			if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		   \
   308 			        pixel = b[0] + (b[1] << 8) + (b[2] << 16); \
   309 			} else {					   \
   310 			        pixel = (b[0] << 16) + (b[1] << 8) + b[2]; \
   311 			}						   \
   312 		}							   \
   313 		break;							   \
   314 									   \
   315 		case 4:							   \
   316 			pixel = *((Uint32 *)(buf));			   \
   317 		break;							   \
   318 									   \
   319 		default:						   \
   320 		        pixel = 0; /* stop gcc complaints */		   \
   321 		break;							   \
   322 	}								   \
   323 	RGBA_FROM_PIXEL(pixel, fmt, r, g, b, a);			   \
   324 	pixel &= ~fmt->Amask;						   \
   325 } while(0)
   326 
   327 /* FIXME: this isn't correct, especially for Alpha (maximum != 255) */
   328 #define PIXEL_FROM_RGBA(pixel, fmt, r, g, b, a)				\
   329 {									\
   330 	pixel = ((r>>fmt->Rloss)<<fmt->Rshift)|				\
   331 		((g>>fmt->Gloss)<<fmt->Gshift)|				\
   332 		((b>>fmt->Bloss)<<fmt->Bshift)|				\
   333 		((a>>fmt->Aloss)<<fmt->Ashift);				\
   334 }
   335 #define ASSEMBLE_RGBA(buf, bpp, fmt, r, g, b, a)			\
   336 {									\
   337 	switch (bpp) {							\
   338 		case 2: {						\
   339 			Uint16 pixel;					\
   340 									\
   341 			PIXEL_FROM_RGBA(pixel, fmt, r, g, b, a);	\
   342 			*((Uint16 *)(buf)) = pixel;			\
   343 		}							\
   344 		break;							\
   345 									\
   346 		case 3: { /* FIXME: broken code (no alpha) */		\
   347                         if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		\
   348 			        *((buf)+fmt->Rshift/8) = r;		\
   349 				*((buf)+fmt->Gshift/8) = g;		\
   350 				*((buf)+fmt->Bshift/8) = b;		\
   351 			} else {					\
   352 			        *((buf)+2-fmt->Rshift/8) = r;		\
   353 				*((buf)+2-fmt->Gshift/8) = g;		\
   354 				*((buf)+2-fmt->Bshift/8) = b;		\
   355 			}						\
   356 		}							\
   357 		break;							\
   358 									\
   359 		case 4: {						\
   360 			Uint32 pixel;					\
   361 									\
   362 			PIXEL_FROM_RGBA(pixel, fmt, r, g, b, a);	\
   363 			*((Uint32 *)(buf)) = pixel;			\
   364 		}							\
   365 		break;							\
   366 	}								\
   367 }
   368 
   369 /* Blend the RGB values of two pixels based on a source alpha value */
   370 #define ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB)	\
   371 do {						\
   372 	dR = (((sR-dR)*(A))>>8)+dR;		\
   373 	dG = (((sG-dG)*(A))>>8)+dG;		\
   374 	dB = (((sB-dB)*(A))>>8)+dB;		\
   375 } while(0)
   376 
   377 /* This is a very useful loop for optimizing blitters */
   378 #if defined(_MSC_VER) && (_MSC_VER == 1300)
   379 /* There's a bug in the Visual C++ 7 optimizer when compiling this code */
   380 #else
   381 #define USE_DUFFS_LOOP
   382 #endif
   383 #ifdef USE_DUFFS_LOOP
   384 
   385 /* 8-times unrolled loop */
   386 #define DUFFS_LOOP8(pixel_copy_increment, width)			\
   387 { int n = (width+7)/8;							\
   388 	switch (width & 7) {						\
   389 	case 0: do {	pixel_copy_increment;				\
   390 	case 7:		pixel_copy_increment;				\
   391 	case 6:		pixel_copy_increment;				\
   392 	case 5:		pixel_copy_increment;				\
   393 	case 4:		pixel_copy_increment;				\
   394 	case 3:		pixel_copy_increment;				\
   395 	case 2:		pixel_copy_increment;				\
   396 	case 1:		pixel_copy_increment;				\
   397 		} while ( --n > 0 );					\
   398 	}								\
   399 }
   400 
   401 /* 4-times unrolled loop */
   402 #define DUFFS_LOOP4(pixel_copy_increment, width)			\
   403 { int n = (width+3)/4;							\
   404 	switch (width & 3) {						\
   405 	case 0: do {	pixel_copy_increment;				\
   406 	case 3:		pixel_copy_increment;				\
   407 	case 2:		pixel_copy_increment;				\
   408 	case 1:		pixel_copy_increment;				\
   409 		} while ( --n > 0 );					\
   410 	}								\
   411 }
   412 
   413 /* 2 - times unrolled loop */
   414 #define DUFFS_LOOP_DOUBLE2(pixel_copy_increment,			\
   415 				double_pixel_copy_increment, width)	\
   416 { int n, w = width;							\
   417 	if( w & 1 ) {							\
   418 	    pixel_copy_increment;					\
   419 	    w--;							\
   420 	}								\
   421 	if ( w > 0 )	{						\
   422 	    n = ( w + 2) / 4;						\
   423 	    switch( w & 2 ) {						\
   424 	    case 0: do {	double_pixel_copy_increment;		\
   425 	    case 2:		double_pixel_copy_increment;		\
   426 		    } while ( --n > 0 );					\
   427 	    }								\
   428 	}								\
   429 }
   430 
   431 /* 2 - times unrolled loop 4 pixels */
   432 #define DUFFS_LOOP_QUATRO2(pixel_copy_increment,			\
   433 				double_pixel_copy_increment,		\
   434 				quatro_pixel_copy_increment, width)	\
   435 { int n, w = width;								\
   436         if(w & 1) {							\
   437 	  pixel_copy_increment;						\
   438 	  w--;								\
   439 	}								\
   440 	if(w & 2) {							\
   441 	  double_pixel_copy_increment;					\
   442 	  w -= 2;							\
   443 	}								\
   444 	if ( w > 0 ) {							\
   445 	    n = ( w + 7 ) / 8;						\
   446 	    switch( w & 4 ) {						\
   447 	    case 0: do {	quatro_pixel_copy_increment;		\
   448 	    case 4:		quatro_pixel_copy_increment;		\
   449 		    } while ( --n > 0 );					\
   450 	    }								\
   451 	}								\
   452 }
   453 
   454 /* Use the 8-times version of the loop by default */
   455 #define DUFFS_LOOP(pixel_copy_increment, width)				\
   456 	DUFFS_LOOP8(pixel_copy_increment, width)
   457 
   458 #else
   459 
   460 /* Don't use Duff's device to unroll loops */
   461 #define DUFFS_LOOP_DOUBLE2(pixel_copy_increment,			\
   462 			 double_pixel_copy_increment, width)		\
   463 { int n = width;								\
   464     if( n & 1 ) {							\
   465 	pixel_copy_increment;						\
   466 	n--;								\
   467     }									\
   468     n=n>>1;								\
   469     for(; n > 0; --n) {   						\
   470 	double_pixel_copy_increment;					\
   471     }									\
   472 }
   473 
   474 /* Don't use Duff's device to unroll loops */
   475 #define DUFFS_LOOP_QUATRO2(pixel_copy_increment,			\
   476 				double_pixel_copy_increment,		\
   477 				quatro_pixel_copy_increment, width)	\
   478 { int n = width;								\
   479         if(n & 1) {							\
   480 	  pixel_copy_increment;						\
   481 	  n--;								\
   482 	}								\
   483 	if(n & 2) {							\
   484 	  double_pixel_copy_increment;					\
   485 	  n -= 2;							\
   486 	}								\
   487 	n=n>>2;								\
   488 	for(; n > 0; --n) {   						\
   489 	  quatro_pixel_copy_increment;					\
   490         }								\
   491 }
   492 
   493 /* Don't use Duff's device to unroll loops */
   494 #define DUFFS_LOOP(pixel_copy_increment, width)				\
   495 { int n;								\
   496 	for ( n=width; n > 0; --n ) {					\
   497 		pixel_copy_increment;					\
   498 	}								\
   499 }
   500 #define DUFFS_LOOP8(pixel_copy_increment, width)			\
   501 	DUFFS_LOOP(pixel_copy_increment, width)
   502 #define DUFFS_LOOP4(pixel_copy_increment, width)			\
   503 	DUFFS_LOOP(pixel_copy_increment, width)
   504 
   505 #endif /* USE_DUFFS_LOOP */
   506 
   507 /* Prevent Visual C++ 6.0 from printing out stupid warnings */
   508 #if defined(_MSC_VER) && (_MSC_VER >= 600)
   509 #pragma warning(disable: 4550)
   510 #endif
   511 
   512 #endif /* _SDL_blit_h */