src/video/SDL_blit.c
author Sam Lantinga
Fri, 22 Aug 2003 05:51:19 +0000
changeset 689 5bb080d35049
parent 526 4314a501d7be
child 697 8468fc0504f3
permissions -rw-r--r--
Date: Tue, 19 Aug 2003 17:57:00 +0200
From: Stephane Marchesin
Subject: Re: [SDL] [patch] MMX alpha blit patches with MMX detection

I think everything is correct now. I've done as much testing as I could,
but some real-world testing wouldn't hurt, I think.
The patch is here : http://icps.u-strasbg.fr/~marchesin/sdl_mmxblit.patch

If you do byte-by-byte comparison of the output between C and MMX
functions, you'll notice that the results for 555 and 565 RGB alpha
blits aren't exactly the same. This is because the MMX functions for 555 and
565 RGB have a higher accuracy. If you want the exact same behaviour,
that's possible by masking the three lower alpha bits in the MMX
functions. Just ask !

I removed one MMX function because, after I fixed it to match its C
equivalent, it turned out to be slower than the C version on a PIII
(although a bit faster on an Athlon XP).

I've also added MMX and PIII replacements for SDL_memcpy. Those provide
some speed up in testvidinfo -benchmark (at least for me, under linux &
X11).
slouken@0
     1
/*
slouken@0
     2
    SDL - Simple DirectMedia Layer
slouken@297
     3
    Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002  Sam Lantinga
slouken@0
     4
slouken@0
     5
    This library is free software; you can redistribute it and/or
slouken@0
     6
    modify it under the terms of the GNU Library General Public
slouken@0
     7
    License as published by the Free Software Foundation; either
slouken@0
     8
    version 2 of the License, or (at your option) any later version.
slouken@0
     9
slouken@0
    10
    This library is distributed in the hope that it will be useful,
slouken@0
    11
    but WITHOUT ANY WARRANTY; without even the implied warranty of
slouken@0
    12
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
slouken@0
    13
    Library General Public License for more details.
slouken@0
    14
slouken@0
    15
    You should have received a copy of the GNU Library General Public
slouken@0
    16
    License along with this library; if not, write to the Free
slouken@0
    17
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
slouken@0
    18
slouken@0
    19
    Sam Lantinga
slouken@252
    20
    slouken@libsdl.org
slouken@0
    21
*/
slouken@0
    22
slouken@0
    23
#ifdef SAVE_RCSID
/* Revision-control identification string, only built when SAVE_RCSID is set.
   Declared as a char array: a plain `char` cannot be initialised from a
   string literal. */
static const char rcsid[] =
 "@(#) $Id$";
#endif
slouken@0
    27
slouken@0
    28
#include <stdio.h>
slouken@0
    29
#include <stdlib.h>
slouken@0
    30
#include <string.h>
slouken@0
    31
slouken@0
    32
#include "SDL_error.h"
slouken@0
    33
#include "SDL_video.h"
slouken@0
    34
#include "SDL_sysvideo.h"
slouken@0
    35
#include "SDL_blit.h"
slouken@0
    36
#include "SDL_RLEaccel_c.h"
slouken@0
    37
#include "SDL_pixels_c.h"
slouken@0
    38
#include "SDL_memops.h"
slouken@0
    39
slouken@689
    40
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
slouken@689
    41
#include "mmx.h"
slouken@689
    42
/* Function to check the CPU flags */
slouken@689
    43
#define MMX_CPU		0x800000
slouken@689
    44
#define SSE_CPU		0x2000000
slouken@689
    45
#define CPU_Flags()	Hermes_X86_CPU()
slouken@689
    46
#define X86_ASSEMBLER
slouken@689
    47
#define HermesConverterInterface	void
slouken@689
    48
#define HermesClearInterface		void
slouken@689
    49
#define STACKCALL
slouken@689
    50
#include "HeadX86.h"
slouken@689
    51
#endif
slouken@689
    52
slouken@0
    53
/* The general purpose software blit routine */
slouken@0
    54
static int SDL_SoftBlit(SDL_Surface *src, SDL_Rect *srcrect,
slouken@0
    55
			SDL_Surface *dst, SDL_Rect *dstrect)
slouken@0
    56
{
slouken@0
    57
	int okay;
slouken@0
    58
	int src_locked;
slouken@0
    59
	int dst_locked;
slouken@0
    60
slouken@0
    61
	/* Everything is okay at the beginning...  */
slouken@0
    62
	okay = 1;
slouken@0
    63
slouken@0
    64
	/* Lock the destination if it's in hardware */
slouken@0
    65
	dst_locked = 0;
slouken@526
    66
	if ( SDL_MUSTLOCK(dst) ) {
slouken@526
    67
		if ( SDL_LockSurface(dst) < 0 ) {
slouken@0
    68
			okay = 0;
slouken@0
    69
		} else {
slouken@0
    70
			dst_locked = 1;
slouken@0
    71
		}
slouken@0
    72
	}
slouken@0
    73
	/* Lock the source if it's in hardware */
slouken@0
    74
	src_locked = 0;
slouken@526
    75
	if ( SDL_MUSTLOCK(src) ) {
slouken@526
    76
		if ( SDL_LockSurface(src) < 0 ) {
slouken@0
    77
			okay = 0;
slouken@0
    78
		} else {
slouken@0
    79
			src_locked = 1;
slouken@0
    80
		}
slouken@0
    81
	}
slouken@0
    82
slouken@0
    83
	/* Set up source and destination buffer pointers, and BLIT! */
slouken@0
    84
	if ( okay  && srcrect->w && srcrect->h ) {
slouken@0
    85
		SDL_BlitInfo info;
slouken@0
    86
		SDL_loblit RunBlit;
slouken@0
    87
slouken@0
    88
		/* Set up the blit information */
slouken@526
    89
		info.s_pixels = (Uint8 *)src->pixels +
slouken@0
    90
				(Uint16)srcrect->y*src->pitch +
slouken@0
    91
				(Uint16)srcrect->x*src->format->BytesPerPixel;
slouken@0
    92
		info.s_width = srcrect->w;
slouken@0
    93
		info.s_height = srcrect->h;
slouken@0
    94
		info.s_skip=src->pitch-info.s_width*src->format->BytesPerPixel;
slouken@526
    95
		info.d_pixels = (Uint8 *)dst->pixels +
slouken@0
    96
				(Uint16)dstrect->y*dst->pitch +
slouken@0
    97
				(Uint16)dstrect->x*dst->format->BytesPerPixel;
slouken@0
    98
		info.d_width = dstrect->w;
slouken@0
    99
		info.d_height = dstrect->h;
slouken@0
   100
		info.d_skip=dst->pitch-info.d_width*dst->format->BytesPerPixel;
slouken@0
   101
		info.aux_data = src->map->sw_data->aux_data;
slouken@0
   102
		info.src = src->format;
slouken@0
   103
		info.table = src->map->table;
slouken@0
   104
		info.dst = dst->format;
slouken@0
   105
		RunBlit = src->map->sw_data->blit;
slouken@0
   106
slouken@0
   107
		/* Run the actual software blit */
slouken@0
   108
		RunBlit(&info);
slouken@0
   109
	}
slouken@0
   110
slouken@0
   111
	/* We need to unlock the surfaces if they're locked */
slouken@0
   112
	if ( dst_locked ) {
slouken@526
   113
		SDL_UnlockSurface(dst);
slouken@310
   114
	}
slouken@0
   115
	if ( src_locked ) {
slouken@526
   116
		SDL_UnlockSurface(src);
slouken@0
   117
	}
slouken@0
   118
	/* Blit is done! */
slouken@0
   119
	return(okay ? 0 : -1);
slouken@0
   120
}
slouken@0
   121
slouken@689
   122
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
slouken@689
   123
/*
 * Copy `len` bytes from `from` to `to`, moving 8 bytes at a time through
 * the MMX register mm0; the remaining (len & 7) bytes go through
 * SDL_memcpy().
 *
 * The MMX state is left active on return: the caller is expected to issue
 * `emms` once it has finished copying (SDL_BlitCopy does so after its
 * row loop).
 */
void SDL_memcpyMMX(char* to,char* from,int len)
{
	int qwords;
	int tail;

	/* Whole 8-byte blocks */
	for ( qwords = len/8; qwords > 0; --qwords ) {
		__asm__ __volatile__ (
		"	movq (%0), %%mm0\n"
		"	movq %%mm0, (%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from += 8;
		to += 8;
	}

	/* Leftover bytes that do not fill a quadword */
	tail = len&7;
	if ( tail ) {
		SDL_memcpy(to, from, tail);
	}
}
slouken@689
   138
slouken@689
   139
/*
 * Copy `len` bytes from `from` to `to` using prefetch hints and
 * non-temporal 8-byte stores (movntq); the remaining (len & 7) bytes go
 * through SDL_memcpy().
 *
 * The first 256 bytes of the source are prefetched up front, and each
 * loop iteration prefetches 256 bytes ahead of the current read position.
 *
 * movntq stores are weakly ordered, so an `sfence` is issued after the
 * streaming loop to order them before any later stores.
 *
 * The MMX state is left active on return: the caller is expected to issue
 * `emms` once it has finished copying (SDL_BlitCopy does so after its
 * row loop).
 */
void SDL_memcpySSE(char* to,char* from,int len)
{
	int i;

	/* Warm up the prefetcher with the start of the source buffer */
	__asm__ __volatile__ (
	"	prefetchnta (%0)\n"
	"	prefetchnta 64(%0)\n"
	"	prefetchnta 128(%0)\n"
	"	prefetchnta 192(%0)\n"
	: : "r" (from) );

	/* Stream whole 8-byte blocks */
	for(i=0; i<len/8; i++) {
		__asm__ __volatile__ (
		"	prefetchnta 256(%0)\n"
		"	movq (%0), %%mm0\n"
		"	movntq %%mm0, (%1)\n"
		: : "r" (from), "r" (to) : "memory");
		from+=8;
		to+=8;
	}

	/* Order the non-temporal stores ahead of any later stores */
	__asm__ __volatile__ (
	"	sfence\n"
	: : : "memory");

	/* Leftover bytes that do not fill a quadword */
	if (len&7) {
		SDL_memcpy(to, from, len&7);
	}
}
slouken@689
   162
#endif
slouken@689
   163
slouken@0
   164
/*
 * Straight row-by-row copy blit.
 *
 * Copies info->d_height rows of (d_width * destination BytesPerPixel)
 * bytes each from info->s_pixels to info->d_pixels, advancing by the
 * row length plus the per-surface skip after every row.
 *
 * On i386/GCC builds with USE_ASMBLIT the CPU flags are queried and the
 * SSE or MMX row copier is used when available, followed by `emms` to
 * leave the MMX state; otherwise SDL_memcpy() is used.
 */
static void SDL_BlitCopy(SDL_BlitInfo *info)
{
	Uint8 *src, *dst;
	int w, h;
	int srcskip, dstskip;

	w = info->d_width*info->dst->BytesPerPixel;
	h = info->d_height;
	src = info->s_pixels;
	dst = info->d_pixels;
	srcskip = w+info->s_skip;
	dstskip = w+info->d_skip;
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
	{
		/* Only needed (and only declared) in the assembler build */
		Uint32 f = CPU_Flags();

		if ( (f&(MMX_CPU|SSE_CPU)) == (MMX_CPU|SSE_CPU) ) {
			while ( h-- ) {
				SDL_memcpySSE((char *)dst, (char *)src, w);
				src += srcskip;
				dst += dstskip;
			}
			/* Leave MMX state so later FPU code works */
			__asm__ __volatile__ (
			"	emms\n"
			::);
			return;
		}
		if ( (f&(MMX_CPU)) != 0 ) {
			while ( h-- ) {
				SDL_memcpyMMX((char *)dst, (char *)src, w);
				src += srcskip;
				dst += dstskip;
			}
			/* Leave MMX state so later FPU code works */
			__asm__ __volatile__ (
			"	emms\n"
			::);
			return;
		}
	}
#endif
	/* Portable fallback */
	while ( h-- ) {
		SDL_memcpy(dst, src, w);
		src += srcskip;
		dst += dstskip;
	}
}
slouken@0
   210
slouken@0
   211
/*
 * Row-by-row copy blit for source and destination areas that may overlap
 * (installed by SDL_CalculateBlit when a surface is blitted onto itself).
 *
 * When the destination starts before the source, rows are copied top to
 * bottom with SDL_memcpy(); otherwise rows are copied bottom to top with
 * SDL_revcpy(), so data is not overwritten before it has been read.
 */
static void SDL_BlitCopyOverlap(SDL_BlitInfo *info)
{
	int rowbytes = info->d_width*info->dst->BytesPerPixel;
	int rows = info->d_height;
	Uint8 *from = info->s_pixels;
	Uint8 *to = info->d_pixels;
	int from_pitch = rowbytes+info->s_skip;
	int to_pitch = rowbytes+info->d_skip;

	if ( to < from ) {
		/* Destination precedes source: walk forwards */
		for ( ; rows--; from += from_pitch, to += to_pitch ) {
			SDL_memcpy(to, from, rowbytes);
		}
		return;
	}

	/* Destination at or after source: start at the last row, walk back */
	from += ((rows-1) * from_pitch);
	to += ((rows-1) * to_pitch);
	for ( ; rows--; from -= from_pitch, to -= to_pitch ) {
		SDL_revcpy(to, from, rowbytes);
	}
}
slouken@0
   239
slouken@0
   240
/* Figure out which of many blit routines to set up on a surface */
slouken@0
   241
int SDL_CalculateBlit(SDL_Surface *surface)
slouken@0
   242
{
slouken@0
   243
	int blit_index;
slouken@0
   244
slouken@0
   245
	/* Clean everything out to start */
slouken@0
   246
	if ( (surface->flags & SDL_RLEACCEL) == SDL_RLEACCEL ) {
slouken@0
   247
		SDL_UnRLESurface(surface, 1);
slouken@0
   248
	}
slouken@0
   249
	surface->map->sw_blit = NULL;
slouken@0
   250
slouken@0
   251
	/* Figure out if an accelerated hardware blit is possible */
slouken@0
   252
	surface->flags &= ~SDL_HWACCEL;
slouken@0
   253
	if ( surface->map->identity ) {
slouken@0
   254
		int hw_blit_ok;
slouken@0
   255
slouken@0
   256
		if ( (surface->flags & SDL_HWSURFACE) == SDL_HWSURFACE ) {
slouken@0
   257
			/* We only support accelerated blitting to hardware */
slouken@0
   258
			if ( surface->map->dst->flags & SDL_HWSURFACE ) {
slouken@0
   259
				hw_blit_ok = current_video->info.blit_hw;
slouken@0
   260
			} else {
slouken@0
   261
				hw_blit_ok = 0;
slouken@0
   262
			}
slouken@0
   263
			if (hw_blit_ok && (surface->flags & SDL_SRCCOLORKEY)) {
slouken@0
   264
				hw_blit_ok = current_video->info.blit_hw_CC;
slouken@0
   265
			}
slouken@0
   266
			if ( hw_blit_ok && (surface->flags & SDL_SRCALPHA) ) {
slouken@0
   267
				hw_blit_ok = current_video->info.blit_hw_A;
slouken@0
   268
			}
slouken@0
   269
		} else {
slouken@0
   270
			/* We only support accelerated blitting to hardware */
slouken@0
   271
			if ( surface->map->dst->flags & SDL_HWSURFACE ) {
slouken@0
   272
				hw_blit_ok = current_video->info.blit_sw;
slouken@0
   273
			} else {
slouken@0
   274
				hw_blit_ok = 0;
slouken@0
   275
			}
slouken@0
   276
			if (hw_blit_ok && (surface->flags & SDL_SRCCOLORKEY)) {
slouken@0
   277
				hw_blit_ok = current_video->info.blit_sw_CC;
slouken@0
   278
			}
slouken@0
   279
			if ( hw_blit_ok && (surface->flags & SDL_SRCALPHA) ) {
slouken@0
   280
				hw_blit_ok = current_video->info.blit_sw_A;
slouken@0
   281
			}
slouken@0
   282
		}
slouken@0
   283
		if ( hw_blit_ok ) {
slouken@0
   284
			SDL_VideoDevice *video = current_video;
slouken@0
   285
			SDL_VideoDevice *this  = current_video;
slouken@0
   286
			video->CheckHWBlit(this, surface, surface->map->dst);
slouken@0
   287
		}
slouken@0
   288
	}
slouken@0
   289
slouken@0
   290
	/* Get the blit function index, based on surface mode */
slouken@0
   291
	/* { 0 = nothing, 1 = colorkey, 2 = alpha, 3 = colorkey+alpha } */
slouken@0
   292
	blit_index = 0;
slouken@0
   293
	blit_index |= (!!(surface->flags & SDL_SRCCOLORKEY))      << 0;
slouken@0
   294
	if ( surface->flags & SDL_SRCALPHA
slouken@0
   295
	     && (surface->format->alpha != SDL_ALPHA_OPAQUE
slouken@0
   296
		 || surface->format->Amask) ) {
slouken@0
   297
	        blit_index |= 2;
slouken@0
   298
	}
slouken@0
   299
slouken@0
   300
	/* Check for special "identity" case -- copy blit */
slouken@0
   301
	if ( surface->map->identity && blit_index == 0 ) {
slouken@0
   302
	        surface->map->sw_data->blit = SDL_BlitCopy;
slouken@0
   303
slouken@0
   304
		/* Handle overlapping blits on the same surface */
slouken@0
   305
		if ( surface == surface->map->dst ) {
slouken@0
   306
		        surface->map->sw_data->blit = SDL_BlitCopyOverlap;
slouken@0
   307
		}
slouken@0
   308
	} else {
slouken@0
   309
		if ( surface->format->BitsPerPixel < 8 ) {
slouken@0
   310
			surface->map->sw_data->blit =
slouken@0
   311
			    SDL_CalculateBlit0(surface, blit_index);
slouken@0
   312
		} else {
slouken@0
   313
			switch ( surface->format->BytesPerPixel ) {
slouken@0
   314
			    case 1:
slouken@0
   315
				surface->map->sw_data->blit =
slouken@0
   316
				    SDL_CalculateBlit1(surface, blit_index);
slouken@0
   317
				break;
slouken@0
   318
			    case 2:
slouken@0
   319
			    case 3:
slouken@0
   320
			    case 4:
slouken@0
   321
				surface->map->sw_data->blit =
slouken@0
   322
				    SDL_CalculateBlitN(surface, blit_index);
slouken@0
   323
				break;
slouken@0
   324
			    default:
slouken@0
   325
				surface->map->sw_data->blit = NULL;
slouken@0
   326
				break;
slouken@0
   327
			}
slouken@0
   328
		}
slouken@0
   329
	}
slouken@0
   330
	/* Make sure we have a blit function */
slouken@0
   331
	if ( surface->map->sw_data->blit == NULL ) {
slouken@0
   332
		SDL_InvalidateMap(surface->map);
slouken@0
   333
		SDL_SetError("Blit combination not supported");
slouken@0
   334
		return(-1);
slouken@0
   335
	}
slouken@0
   336
slouken@0
   337
	/* Choose software blitting function */
slouken@0
   338
	if(surface->flags & SDL_RLEACCELOK
slouken@0
   339
	   && (surface->flags & SDL_HWACCEL) != SDL_HWACCEL) {
slouken@0
   340
slouken@0
   341
	        if(surface->map->identity
slouken@0
   342
		   && (blit_index == 1
slouken@0
   343
		       || (blit_index == 3 && !surface->format->Amask))) {
slouken@0
   344
		        if ( SDL_RLESurface(surface) == 0 )
slouken@0
   345
			        surface->map->sw_blit = SDL_RLEBlit;
slouken@0
   346
		} else if(blit_index == 2 && surface->format->Amask) {
slouken@0
   347
		        if ( SDL_RLESurface(surface) == 0 )
slouken@0
   348
			        surface->map->sw_blit = SDL_RLEAlphaBlit;
slouken@0
   349
		}
slouken@0
   350
	}
slouken@0
   351
	
slouken@0
   352
	if ( surface->map->sw_blit == NULL ) {
slouken@0
   353
		surface->map->sw_blit = SDL_SoftBlit;
slouken@0
   354
	}
slouken@0
   355
	return(0);
slouken@0
   356
}
slouken@0
   357