src/video/SDL_blit.h
author Sam Lantinga <slouken@libsdl.org>
Fri, 22 Aug 2003 05:51:19 +0000
changeset 689 5bb080d35049
parent 553 417f8709e648
child 769 b8d311d90021
permissions -rw-r--r--
Date: Tue, 19 Aug 2003 17:57:00 +0200
From: Stephane Marchesin
Subject: Re: [SDL] [patch] MMX alpha blit patches with MMX detection

I think everything is correct now. I've done as much testing as I could,
but some real-world testing wouldn't hurt, I think.
The patch is here : http://icps.u-strasbg.fr/~marchesin/sdl_mmxblit.patch

If you do byte-by-byte comparison of the output between C and MMX
functions, you'll notice that the results for 555 and 565 RGB alpha
blits aren't exactly the same. This is because the MMX functions for 555 and
565 RGB have a higher accuracy. If you want the exact same behaviour,
that's possible by masking the three lower alpha bits in the MMX
functions. Just ask !

I removed one MMX function because, after I fixed it to match its C
equivalent, it turned out to be slower than the C version on a PIII
(although a bit faster on an Athlon XP).

I've also added MMX and PIII replacements for SDL_memcpy. Those provide
some speed up in testvidinfo -benchmark (at least for me, under linux &
X11).
slouken@0
     1
/*
    SDL - Simple DirectMedia Layer
    Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002  Sam Lantinga

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public
    License along with this library; if not, write to the Free
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    Sam Lantinga
    slouken@libsdl.org
*/
slouken@0
    22
slouken@0
    23
#ifdef SAVE_RCSID
/* Revision identification string, compiled in only when SAVE_RCSID is
   defined.  Declared as an array: a lone `char` cannot be initialized
   from a string literal. */
static const char rcsid[] =
 "@(#) $Id$";
#endif
slouken@0
    27
slouken@0
    28
#ifndef _SDL_blit_h
slouken@0
    29
#define _SDL_blit_h
slouken@0
    30
slouken@0
    31
#include "SDL_endian.h"
slouken@0
    32
slouken@0
    33
/* The structure passed to the low level blit functions */
slouken@0
    34
typedef struct {
slouken@0
    35
	Uint8 *s_pixels;
slouken@0
    36
	int s_width;
slouken@0
    37
	int s_height;
slouken@0
    38
	int s_skip;
slouken@0
    39
	Uint8 *d_pixels;
slouken@0
    40
	int d_width;
slouken@0
    41
	int d_height;
slouken@0
    42
	int d_skip;
slouken@0
    43
	void *aux_data;
slouken@0
    44
	SDL_PixelFormat *src;
slouken@0
    45
	Uint8 *table;
slouken@0
    46
	SDL_PixelFormat *dst;
slouken@0
    47
} SDL_BlitInfo;
slouken@0
    48
slouken@0
    49
/* The type definition for the low level blit functions */
slouken@0
    50
typedef void (*SDL_loblit)(SDL_BlitInfo *info);
slouken@0
    51
slouken@0
    52
/* This is the private info structure for software accelerated blits */
slouken@0
    53
struct private_swaccel {
slouken@0
    54
	SDL_loblit blit;
slouken@0
    55
	void *aux_data;
slouken@0
    56
};
slouken@0
    57
slouken@0
    58
/* Blit mapping definition */
slouken@0
    59
typedef struct SDL_BlitMap {
slouken@0
    60
	SDL_Surface *dst;
slouken@0
    61
	int identity;
slouken@0
    62
	Uint8 *table;
slouken@0
    63
	SDL_blit hw_blit;
slouken@0
    64
	SDL_blit sw_blit;
slouken@0
    65
	struct private_hwaccel *hw_data;
slouken@0
    66
	struct private_swaccel *sw_data;
slouken@0
    67
slouken@0
    68
	/* the version count matches the destination; mismatch indicates
slouken@0
    69
	   an invalid mapping */
slouken@0
    70
        unsigned int format_version;
slouken@0
    71
} SDL_BlitMap;
slouken@0
    72
slouken@0
    73
slouken@0
    74
/* Functions found in SDL_blit.c */
slouken@0
    75
extern int SDL_CalculateBlit(SDL_Surface *surface);
slouken@0
    76
slouken@0
    77
/* Functions found in SDL_blit_{0,1,N,A}.c */
slouken@0
    78
extern SDL_loblit SDL_CalculateBlit0(SDL_Surface *surface, int complex);
slouken@0
    79
extern SDL_loblit SDL_CalculateBlit1(SDL_Surface *surface, int complex);
slouken@0
    80
extern SDL_loblit SDL_CalculateBlitN(SDL_Surface *surface, int complex);
slouken@0
    81
extern SDL_loblit SDL_CalculateAlphaBlit(SDL_Surface *surface, int complex);
slouken@0
    82
slouken@0
    83
/*
 * Useful macros for blitting routines
 */
slouken@0
    86
slouken@0
    87
/* Non-zero when two pixel formats (given as pointers) have the same
   BitsPerPixel, Rmask and Amask.  Only those three fields are compared. */
#define FORMAT_EQUAL(A, B)						\
    (((A)->BitsPerPixel == (B)->BitsPerPixel)				\
     && ((A)->Rmask == (B)->Rmask)					\
     && ((A)->Amask == (B)->Amask))
slouken@0
    90
slouken@0
    91
/* Load pixel of the specified format from a buffer and get its R-G-B values */
/* FIXME: rescale values to 0..255 here? */
/* Each channel is masked out of `pixel`, shifted down by the format's
   channel shift and back up by the channel loss.  `fmt` is a pointer to
   the format; r, g and b must be assignable.  All arguments are
   parenthesized so expression arguments (e.g. `hi | lo`, `&format`)
   expand correctly. */
#define RGB_FROM_PIXEL(pixel, fmt, r, g, b)				\
{									\
	(r) = ((((pixel)&(fmt)->Rmask)>>(fmt)->Rshift)<<(fmt)->Rloss);	\
	(g) = ((((pixel)&(fmt)->Gmask)>>(fmt)->Gshift)<<(fmt)->Gloss);	\
	(b) = ((((pixel)&(fmt)->Bmask)>>(fmt)->Bshift)<<(fmt)->Bloss);	\
}
slouken@0
    99
/* Split an RGB565 pixel (masks 0xF800 / 0x07E0 / 0x001F) into r, g, b.
   Channels are shifted up by 3, 2 and 3 bits, so the low bits of each
   result are zero. */
#define RGB_FROM_RGB565(pixel, r, g, b)					\
{									\
	(r) = ((((pixel)&0xF800)>>11)<<3);				\
	(g) = ((((pixel)&0x07E0)>>5)<<2);				\
	(b) = (((pixel)&0x001F)<<3);					\
}
slouken@0
   105
/* Split an RGB555 pixel (masks 0x7C00 / 0x03E0 / 0x001F) into r, g, b.
   Every channel is shifted up by 3 bits, so the low three bits of each
   result are zero. */
#define RGB_FROM_RGB555(pixel, r, g, b)					\
{									\
	(r) = ((((pixel)&0x7C00)>>10)<<3);				\
	(g) = ((((pixel)&0x03E0)>>5)<<3);				\
	(b) = (((pixel)&0x001F)<<3);					\
}
slouken@0
   111
/* Split a pixel laid out as 0x00RRGGBB into its three 8-bit channels.
   Bits above bit 23 are ignored. */
#define RGB_FROM_RGB888(pixel, r, g, b)					\
{									\
	(r) = (((pixel)&0xFF0000)>>16);					\
	(g) = (((pixel)&0xFF00)>>8);					\
	(b) = ((pixel)&0xFF);						\
}
slouken@0
   117
/* Read one pixel of `bpp` bytes from `buf` into `pixel`.
   2 and 4 byte pixels are read through a Uint16 / Uint32 pointer cast;
   3 byte pixels are assembled byte by byte according to SDL_BYTEORDER.
   Any other bpp stores 0. */
#define RETRIEVE_RGB_PIXEL(buf, bpp, pixel)				   \
do {									   \
	switch (bpp) {							   \
		case 2:							   \
			(pixel) = *((Uint16 *)(buf));			   \
		break;							   \
									   \
		case 3: {						   \
			Uint8 *B = (Uint8 *)(buf);			   \
			if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		   \
				(pixel) = B[0] + (B[1] << 8) + (B[2] << 16); \
			} else {					   \
				(pixel) = (B[0] << 16) + (B[1] << 8) + B[2]; \
			}						   \
		}							   \
		break;							   \
									   \
		case 4:							   \
			(pixel) = *((Uint32 *)(buf));			   \
		break;							   \
									   \
		default:						   \
			(pixel) = 0; /* appease gcc */			   \
		break;							   \
	}								   \
} while(0)
slouken@0
   143
slouken@0
   144
/* Read one pixel of `bpp` bytes from `buf` into `pixel`, then split it
   into r, g, b with RGB_FROM_PIXEL using the format pointer `fmt`.
   An unsupported bpp yields pixel 0.  `buf` is parenthesized before the
   cast so pointer expressions such as `p + 1` are cast as a whole. */
#define DISEMBLE_RGB(buf, bpp, fmt, pixel, r, g, b)			   \
do {									   \
	switch (bpp) {							   \
		case 2:							   \
			(pixel) = *((Uint16 *)(buf));			   \
		break;							   \
									   \
		case 3: {						   \
			Uint8 *B = (Uint8 *)(buf);			   \
			if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		   \
				(pixel) = B[0] + (B[1] << 8) + (B[2] << 16); \
			} else {					   \
				(pixel) = (B[0] << 16) + (B[1] << 8) + B[2]; \
			}						   \
		}							   \
		break;							   \
									   \
		case 4:							   \
			(pixel) = *((Uint32 *)(buf));			   \
		break;							   \
									   \
		default:						   \
			(pixel) = 0;	/* prevent gcc from complaining */ \
		break;							   \
	}								   \
	RGB_FROM_PIXEL((pixel), (fmt), r, g, b);			   \
} while(0)
slouken@0
   171
slouken@0
   172
/* Assemble R-G-B values into a specified pixel format and store them */
/* Each component is shifted down by the format's channel loss and up to
   the channel shift, then the three are OR-ed into `pixel`.  `fmt` is a
   pointer to the format.  Arguments are parenthesized so expression
   arguments expand correctly. */
#define PIXEL_FROM_RGB(pixel, fmt, r, g, b)				\
{									\
	(pixel) = (((r)>>(fmt)->Rloss)<<(fmt)->Rshift)|			\
		(((g)>>(fmt)->Gloss)<<(fmt)->Gshift)|			\
		(((b)>>(fmt)->Bloss)<<(fmt)->Bshift);			\
}
slouken@0
   179
/* Pack r, g, b into an RGB565 pixel, dropping the low 3, 2 and 3 bits
   of the respective components. */
#define RGB565_FROM_RGB(pixel, r, g, b)					\
{									\
	(pixel) = (((r)>>3)<<11)|(((g)>>2)<<5)|((b)>>3);		\
}
slouken@0
   183
/* Pack r, g, b into an RGB555 pixel, dropping the low 3 bits of every
   component. */
#define RGB555_FROM_RGB(pixel, r, g, b)					\
{									\
	(pixel) = (((r)>>3)<<10)|(((g)>>3)<<5)|((b)>>3);		\
}
slouken@0
   187
/* Pack r, g, b into a pixel laid out as 0x00RRGGBB.  Components are not
   masked, so callers must pass values that fit in 8 bits. */
#define RGB888_FROM_RGB(pixel, r, g, b)					\
{									\
	(pixel) = ((r)<<16)|((g)<<8)|(b);				\
}
slouken@0
   191
/* Build a pixel from r, g, b in the format `fmt` (a pointer) and store
   it at `buf` as a `bpp`-byte pixel.
   2 and 4 byte pixels are packed with PIXEL_FROM_RGB and written through
   a Uint16 / Uint32 pointer.  3 byte pixels are written one component
   per element of `buf`, at an index derived from the channel shift and
   SDL_BYTEORDER; `buf` is indexed without a cast, so it is presumably
   expected to be a byte pointer.  Other bpp values store nothing. */
#define ASSEMBLE_RGB(buf, bpp, fmt, r, g, b) 				\
{									\
	switch (bpp) {							\
		case 2: {						\
			Uint16 pixel;					\
									\
			PIXEL_FROM_RGB(pixel, (fmt), (r), (g), (b));	\
			*((Uint16 *)(buf)) = pixel;			\
		}							\
		break;							\
									\
		case 3: {						\
			if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		\
				*((buf)+(fmt)->Rshift/8) = (r);		\
				*((buf)+(fmt)->Gshift/8) = (g);		\
				*((buf)+(fmt)->Bshift/8) = (b);		\
			} else {					\
				*((buf)+2-(fmt)->Rshift/8) = (r);	\
				*((buf)+2-(fmt)->Gshift/8) = (g);	\
				*((buf)+2-(fmt)->Bshift/8) = (b);	\
			}						\
		}							\
		break;							\
									\
		case 4: {						\
			Uint32 pixel;					\
									\
			PIXEL_FROM_RGB(pixel, (fmt), (r), (g), (b));	\
			*((Uint32 *)(buf)) = pixel;			\
		}							\
		break;							\
	}								\
}
slouken@0
   224
/* Like ASSEMBLE_RGB, but for 2 and 4 byte pixels the bits of the
   existing destination pixel selected by `Amask` are kept and OR-ed
   into the newly packed value.  The 3 byte case writes r, g and b only.
   `buf`, `fmt` and `Amask` are parenthesized so that expression
   arguments (e.g. a mask written as `a | b`) expand correctly. */
#define ASSEMBLE_RGB_AMASK(buf, bpp, fmt, r, g, b, Amask)		\
{									\
	switch (bpp) {							\
		case 2: {						\
			Uint16 *bufp;					\
			Uint16 pixel;					\
									\
			bufp = (Uint16 *)(buf);				\
			PIXEL_FROM_RGB(pixel, (fmt), (r), (g), (b));	\
			*bufp = pixel | (*bufp & (Amask));		\
		}							\
		break;							\
									\
		case 3: {						\
			if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		\
				*((buf)+(fmt)->Rshift/8) = (r);		\
				*((buf)+(fmt)->Gshift/8) = (g);		\
				*((buf)+(fmt)->Bshift/8) = (b);		\
			} else {					\
				*((buf)+2-(fmt)->Rshift/8) = (r);	\
				*((buf)+2-(fmt)->Gshift/8) = (g);	\
				*((buf)+2-(fmt)->Bshift/8) = (b);	\
			}						\
		}							\
		break;							\
									\
		case 4: {						\
			Uint32 *bufp;					\
			Uint32 pixel;					\
									\
			bufp = (Uint32 *)(buf);				\
			PIXEL_FROM_RGB(pixel, (fmt), (r), (g), (b));	\
			*bufp = pixel | (*bufp & (Amask));		\
		}							\
		break;							\
	}								\
}
slouken@0
   261
slouken@0
   262
/* FIXME: Should we rescale alpha into 0..255 here? */
/* Split `pixel` into r, g, b and a using the masks, shifts and losses of
   the format pointed to by `fmt`.  Arguments are parenthesized so that
   expression arguments expand correctly. */
#define RGBA_FROM_PIXEL(pixel, fmt, r, g, b, a)				\
{									\
	(r) = (((pixel)&(fmt)->Rmask)>>(fmt)->Rshift)<<(fmt)->Rloss;	\
	(g) = (((pixel)&(fmt)->Gmask)>>(fmt)->Gshift)<<(fmt)->Gloss;	\
	(b) = (((pixel)&(fmt)->Bmask)>>(fmt)->Bshift)<<(fmt)->Bloss;	\
	(a) = (((pixel)&(fmt)->Amask)>>(fmt)->Ashift)<<(fmt)->Aloss;	\
}
slouken@0
   270
/* Split `pixel` into r, g, b and a using only the masks and shifts of
   the format pointed to by `fmt`; unlike RGBA_FROM_PIXEL no loss shift
   is applied. */
#define RGBA_FROM_8888(pixel, fmt, r, g, b, a)	\
{						\
	(r) = ((pixel)&(fmt)->Rmask)>>(fmt)->Rshift;	\
	(g) = ((pixel)&(fmt)->Gmask)>>(fmt)->Gshift;	\
	(b) = ((pixel)&(fmt)->Bmask)>>(fmt)->Bshift;	\
	(a) = ((pixel)&(fmt)->Amask)>>(fmt)->Ashift;	\
}
slouken@0
   277
/* Split a pixel laid out as 0xRRGGBBAA.  The red value is `pixel >> 24`
   without a mask, so bits above bit 31 (if any) are not removed. */
#define RGBA_FROM_RGBA8888(pixel, r, g, b, a)				\
{									\
	(r) = ((pixel)>>24);						\
	(g) = (((pixel)>>16)&0xFF);					\
	(b) = (((pixel)>>8)&0xFF);					\
	(a) = ((pixel)&0xFF);						\
}
slouken@0
   284
/* Split a pixel laid out as 0xAARRGGBB.  The alpha value is
   `pixel >> 24` without a mask. */
#define RGBA_FROM_ARGB8888(pixel, r, g, b, a)				\
{									\
	(r) = (((pixel)>>16)&0xFF);					\
	(g) = (((pixel)>>8)&0xFF);					\
	(b) = ((pixel)&0xFF);						\
	(a) = ((pixel)>>24);						\
}
slouken@0
   291
/* Split a pixel laid out as 0xAABBGGRR.  The alpha value is
   `pixel >> 24` without a mask. */
#define RGBA_FROM_ABGR8888(pixel, r, g, b, a)				\
{									\
	(r) = ((pixel)&0xFF);						\
	(g) = (((pixel)>>8)&0xFF);					\
	(b) = (((pixel)>>16)&0xFF);					\
	(a) = ((pixel)>>24);						\
}
slouken@0
   298
/* Read one pixel of `bpp` bytes from `buf` into `pixel`, split it into
   r, g, b, a with RGBA_FROM_PIXEL, then clear the format's alpha bits in
   `pixel`.  An unsupported bpp yields pixel 0.
   The byte pointer used for 3 byte pixels is named `B` (as in
   DISEMBLE_RGB): naming it `b` made the preprocessor substitute the
   caller's `b` argument into the declaration, which only compiled when
   that argument was a plain identifier. */
#define DISEMBLE_RGBA(buf, bpp, fmt, pixel, r, g, b, a)			   \
do {									   \
	switch (bpp) {							   \
		case 2:							   \
			(pixel) = *((Uint16 *)(buf));			   \
		break;							   \
									   \
		case 3:	{/* FIXME: broken code (no alpha) */		   \
			Uint8 *B = (Uint8 *)(buf);			   \
			if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		   \
				(pixel) = B[0] + (B[1] << 8) + (B[2] << 16); \
			} else {					   \
				(pixel) = (B[0] << 16) + (B[1] << 8) + B[2]; \
			}						   \
		}							   \
		break;							   \
									   \
		case 4:							   \
			(pixel) = *((Uint32 *)(buf));			   \
		break;							   \
									   \
		default:						   \
			(pixel) = 0; /* stop gcc complaints */		   \
		break;							   \
	}								   \
	RGBA_FROM_PIXEL((pixel), (fmt), r, g, b, a);			   \
	(pixel) &= ~(fmt)->Amask;					   \
} while(0)
slouken@0
   326
slouken@0
   327
/* FIXME: this isn't correct, especially for Alpha (maximum != 255) */
/* Pack r, g, b, a into `pixel`: each component is shifted down by the
   format's channel loss and up to the channel shift, then all four are
   OR-ed together.  `fmt` is a pointer to the format.  Arguments are
   parenthesized so that expression arguments expand correctly. */
#define PIXEL_FROM_RGBA(pixel, fmt, r, g, b, a)				\
{									\
	(pixel) = (((r)>>(fmt)->Rloss)<<(fmt)->Rshift)|			\
		(((g)>>(fmt)->Gloss)<<(fmt)->Gshift)|			\
		(((b)>>(fmt)->Bloss)<<(fmt)->Bshift)|			\
		(((a)>>(fmt)->Aloss)<<(fmt)->Ashift);			\
}
slouken@0
   335
/* Build a pixel from r, g, b, a in the format `fmt` (a pointer) and
   store it at `buf` as a `bpp`-byte pixel.
   2 and 4 byte pixels are packed with PIXEL_FROM_RGBA and written
   through a Uint16 / Uint32 pointer.  The 3 byte case writes r, g and b
   only (alpha is dropped, see the FIXME) and indexes `buf` without a
   cast.  Other bpp values store nothing. */
#define ASSEMBLE_RGBA(buf, bpp, fmt, r, g, b, a)			\
{									\
	switch (bpp) {							\
		case 2: {						\
			Uint16 pixel;					\
									\
			PIXEL_FROM_RGBA(pixel, (fmt), (r), (g), (b), (a)); \
			*((Uint16 *)(buf)) = pixel;			\
		}							\
		break;							\
									\
		case 3: { /* FIXME: broken code (no alpha) */		\
			if(SDL_BYTEORDER == SDL_LIL_ENDIAN) {		\
				*((buf)+(fmt)->Rshift/8) = (r);		\
				*((buf)+(fmt)->Gshift/8) = (g);		\
				*((buf)+(fmt)->Bshift/8) = (b);		\
			} else {					\
				*((buf)+2-(fmt)->Rshift/8) = (r);	\
				*((buf)+2-(fmt)->Gshift/8) = (g);	\
				*((buf)+2-(fmt)->Bshift/8) = (b);	\
			}						\
		}							\
		break;							\
									\
		case 4: {						\
			Uint32 pixel;					\
									\
			PIXEL_FROM_RGBA(pixel, (fmt), (r), (g), (b), (a)); \
			*((Uint32 *)(buf)) = pixel;			\
		}							\
		break;							\
	}								\
}
slouken@0
   368
slouken@0
   369
/* Blend the RGB values of two pixels based on a source alpha value */
/* Each destination component becomes d + (((s - d) * A) >> 8): A == 0
   leaves the destination unchanged and A == 256 copies the source.
   dR, dG and dB are both read and written, so they must be lvalues and
   are evaluated more than once.  Arguments are parenthesized so that
   expression arguments expand correctly. */
#define ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB)	\
do {						\
	(dR) = ((((sR)-(dR))*(A))>>8)+(dR);	\
	(dG) = ((((sG)-(dG))*(A))>>8)+(dG);	\
	(dB) = ((((sB)-(dB))*(A))>>8)+(dB);	\
} while(0)
slouken@0
   376
slouken@0
   377
/* This is a very useful loop for optimizing blitters */
/* There's a bug in the Visual C++ 7 optimizer when compiling this code,
   so the unrolled loops stay disabled when _MSC_VER is exactly 1300 */
#if !(defined(_MSC_VER) && (_MSC_VER == 1300))
#define USE_DUFFS_LOOP
#endif
slouken@0
   383
#ifdef USE_DUFFS_LOOP
slouken@0
   384
slouken@0
   385
/* 8-times unrolled loop */
/* Runs `pixel_copy_increment` exactly `width` times.  n is the number of
   passes through the do/while; the switch jumps into the middle of the
   first pass so that pass handles the `width & 7` remainder.
   Nothing runs when width <= 0 (previously a zero width executed a full
   pass of 8 copies).  `width` is parenthesized and evaluated twice. */
#define DUFFS_LOOP8(pixel_copy_increment, width)			\
{ int n = ((width)+7)/8;						\
	if ( n > 0 ) switch ((width) & 7) {				\
	case 0: do {	pixel_copy_increment;				\
	case 7:		pixel_copy_increment;				\
	case 6:		pixel_copy_increment;				\
	case 5:		pixel_copy_increment;				\
	case 4:		pixel_copy_increment;				\
	case 3:		pixel_copy_increment;				\
	case 2:		pixel_copy_increment;				\
	case 1:		pixel_copy_increment;				\
		} while ( --n > 0 );					\
	}								\
}
slouken@0
   400
slouken@0
   401
/* 4-times unrolled loop */
/* Runs `pixel_copy_increment` exactly `width` times.  n is the number of
   passes through the do/while; the switch jumps into the middle of the
   first pass so that pass handles the `width & 3` remainder.
   Nothing runs when width <= 0 (previously a zero width executed a full
   pass of 4 copies).  `width` is parenthesized and evaluated twice. */
#define DUFFS_LOOP4(pixel_copy_increment, width)			\
{ int n = ((width)+3)/4;						\
	if ( n > 0 ) switch ((width) & 3) {				\
	case 0: do {	pixel_copy_increment;				\
	case 3:		pixel_copy_increment;				\
	case 2:		pixel_copy_increment;				\
	case 1:		pixel_copy_increment;				\
		} while ( --n > 0 );					\
	}								\
}
slouken@0
   412
slouken@689
   413
/* 2 - times unrolled loop */
/* Covers `width` pixels with one optional `pixel_copy_increment` (when
   the width is odd) followed by width/2 runs of
   `double_pixel_copy_increment`.  After the odd pixel w is even; n is
   the number of do/while passes and the switch enters the unrolled pair
   at the right point for the `w & 2` remainder. */
#define DUFFS_LOOP_DOUBLE2(pixel_copy_increment,			\
				double_pixel_copy_increment, width)	\
{ int n, w = (width);							\
	if( w & 1 ) {							\
	    pixel_copy_increment;					\
	    w--;							\
	}								\
	if ( w > 0 )	{						\
	    n = ( w + 2 ) / 4;						\
	    switch( w & 2 ) {						\
	    case 0: do {	double_pixel_copy_increment;		\
	    case 2:		double_pixel_copy_increment;		\
		    } while ( --n > 0 );				\
	    }								\
	}								\
}
slouken@689
   430
slouken@689
   431
/* 2 - times unrolled loop 4 pixels */
/* Covers `width` pixels with at most one `pixel_copy_increment` (odd
   width), at most one `double_pixel_copy_increment` (bit 1 of the
   remaining width) and then width/4 runs of
   `quatro_pixel_copy_increment`.  After the first two steps w is a
   multiple of 4; n is the number of do/while passes and the switch
   enters the unrolled pair at the right point for the `w & 4`
   remainder. */
#define DUFFS_LOOP_QUATRO2(pixel_copy_increment,			\
				double_pixel_copy_increment,		\
				quatro_pixel_copy_increment, width)	\
{ int n, w = (width);							\
	if(w & 1) {							\
	  pixel_copy_increment;						\
	  w--;								\
	}								\
	if(w & 2) {							\
	  double_pixel_copy_increment;					\
	  w -= 2;							\
	}								\
	if ( w > 0 ) {							\
	    n = ( w + 7 ) / 8;						\
	    switch( w & 4 ) {						\
	    case 0: do {	quatro_pixel_copy_increment;		\
	    case 4:		quatro_pixel_copy_increment;		\
		    } while ( --n > 0 );				\
	    }								\
	}								\
}
slouken@689
   453
slouken@0
   454
/* Use the 8-times version of the loop by default */
#define DUFFS_LOOP(pixel_copy_increment, width)				\
	DUFFS_LOOP8(pixel_copy_increment, width)
slouken@0
   457
slouken@0
   458
#else
slouken@0
   459
slouken@0
   460
/* Don't use Duff's device to unroll loops */
/* Plain-loop version: one optional `pixel_copy_increment` when the width
   is odd, then width/2 runs of `double_pixel_copy_increment`. */
#define DUFFS_LOOP_DOUBLE2(pixel_copy_increment,			\
			 double_pixel_copy_increment, width)		\
{ int n = (width);							\
    if( n & 1 ) {							\
	pixel_copy_increment;						\
	n--;								\
    }									\
    n=n>>1;								\
    for(; n > 0; --n) {   						\
	double_pixel_copy_increment;					\
    }									\
}
slouken@689
   473
slouken@689
   474
/* Don't use Duff's device to unroll loops */
/* Plain-loop version: at most one `pixel_copy_increment` (odd width), at
   most one `double_pixel_copy_increment` (bit 1 of the remaining
   width), then width/4 runs of `quatro_pixel_copy_increment`. */
#define DUFFS_LOOP_QUATRO2(pixel_copy_increment,			\
				double_pixel_copy_increment,		\
				quatro_pixel_copy_increment, width)	\
{ int n = (width);							\
	if(n & 1) {							\
	  pixel_copy_increment;						\
	  n--;								\
	}								\
	if(n & 2) {							\
	  double_pixel_copy_increment;					\
	  n -= 2;							\
	}								\
	n=n>>2;								\
	for(; n > 0; --n) {   						\
	  quatro_pixel_copy_increment;					\
	}								\
}
slouken@689
   492
slouken@689
   493
/* Don't use Duff's device to unroll loops */
/* Plain-loop version: runs `pixel_copy_increment` once per pixel,
   `width` times in total, and not at all when width <= 0. */
#define DUFFS_LOOP(pixel_copy_increment, width)				\
{ int n;								\
	for ( n=(width); n > 0; --n ) {					\
		pixel_copy_increment;					\
	}								\
}
slouken@0
   500
/* Without Duff's device the "unrolled" variants are plain aliases of
   DUFFS_LOOP */
#define DUFFS_LOOP8(pixel_copy_increment, width)			\
	DUFFS_LOOP(pixel_copy_increment, width)
#define DUFFS_LOOP4(pixel_copy_increment, width)			\
	DUFFS_LOOP(pixel_copy_increment, width)
slouken@0
   504
slouken@0
   505
#endif /* USE_DUFFS_LOOP */
slouken@0
   506
slouken@0
   507
/* Prevent Visual C++ 6.0 from printing out stupid warnings */
#if defined(_MSC_VER) && (_MSC_VER >= 600)
#pragma warning(disable: 4550)
#endif
slouken@0
   511
slouken@0
   512
#endif /* _SDL_blit_h */