src/video/SDL_RLEaccel.c
author Sam Lantinga <slouken@libsdl.org>
Fri, 22 Aug 2003 05:51:19 +0000
changeset 689 5bb080d35049
parent 526 4314a501d7be
child 739 22dbf364c017
permissions -rw-r--r--
Date: Tue, 19 Aug 2003 17:57:00 +0200
From: Stephane Marchesin
Subject: Re: [SDL] [patch] MMX alpha blit patches with MMX detection

I think everything is correct now. I've done as much testing as I could,
but some real-world testing wouldn't hurt, I think.
The patch is here : http://icps.u-strasbg.fr/~marchesin/sdl_mmxblit.patch

If you do byte-by-byte comparison of the output between C and MMX
functions, you'll notice that the results for 555 and 565 RGB alpha
blits aren't exactly the same. This is because MMX functions for 555 and
565 RGB have a higher accuracy. If you want the exact same behaviour
that's possible by masking the three lower alpha bits in the MMX
functions. Just ask !

I removed one MMX function because after I fixed it to match its C
equivalent, it turned out to be slower than the C version on a PIII
(although a bit faster on an Athlon XP).

I've also added MMX and PIII replacements for SDL_memcpy. Those provide
some speed up in testvidinfo -benchmark (at least for me, under linux &
X11).
slouken@0
     1
/*
slouken@0
     2
    SDL - Simple DirectMedia Layer
slouken@297
     3
    Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002  Sam Lantinga
slouken@0
     4
slouken@0
     5
    This library is free software; you can redistribute it and/or
slouken@0
     6
    modify it under the terms of the GNU Library General Public
slouken@0
     7
    License as published by the Free Software Foundation; either
slouken@0
     8
    version 2 of the License, or (at your option) any later version.
slouken@0
     9
slouken@0
    10
    This library is distributed in the hope that it will be useful,
slouken@0
    11
    but WITHOUT ANY WARRANTY; without even the implied warranty of
slouken@0
    12
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
slouken@0
    13
    Library General Public License for more details.
slouken@0
    14
slouken@0
    15
    You should have received a copy of the GNU Library General Public
slouken@0
    16
    License along with this library; if not, write to the Free
slouken@0
    17
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
slouken@0
    18
slouken@0
    19
    Sam Lantinga
slouken@252
    20
    slouken@libsdl.org
slouken@0
    21
*/
slouken@0
    22
slouken@0
    23
#ifdef SAVE_RCSID
/*
 * Revision-control identification string, compiled in only when SAVE_RCSID
 * is defined.  It is declared as a character array: a single `char' cannot
 * be initialized from a string literal.
 */
static const char rcsid[] =
 "@(#) $Id$";
#endif
slouken@0
    27
slouken@0
    28
/*
slouken@0
    29
 * RLE encoding for software colorkey and alpha-channel acceleration
slouken@0
    30
 *
slouken@0
    31
 * Original version by Sam Lantinga
slouken@0
    32
 *
slouken@0
    33
 * Mattias Engdegård (Yorick): Rewrite. New encoding format, encoder and
slouken@0
    34
 * decoder. Added per-surface alpha blitter. Added per-pixel alpha
slouken@0
    35
 * format, encoder and blitter.
slouken@0
    36
 *
slouken@0
    37
 * Many thanks to Xark and johns for hints, benchmarks and useful comments
slouken@0
    38
 * leading to this code.
slouken@0
    39
 *
slouken@0
    40
 * Welcome to Macro Mayhem.
slouken@0
    41
 */
slouken@0
    42
slouken@0
    43
/*
slouken@0
    44
 * The encoding translates the image data to a stream of segments of the form
slouken@0
    45
 *
slouken@0
    46
 * <skip> <run> <data>
slouken@0
    47
 *
slouken@0
    48
 * where <skip> is the number of transparent pixels to skip,
slouken@0
    49
 *       <run>  is the number of opaque pixels to blit,
slouken@0
    50
 * and   <data> are the pixels themselves.
slouken@0
    51
 *
slouken@0
    52
 * This basic structure is used both for colorkeyed surfaces, used for simple
slouken@0
    53
 * binary transparency and for per-surface alpha blending, and for surfaces
slouken@0
    54
 * with per-pixel alpha. The details differ, however:
slouken@0
    55
 *
slouken@0
    56
 * Encoding of colorkeyed surfaces:
slouken@0
    57
 *
slouken@0
    58
 *   Encoded pixels always have the same format as the target surface.
slouken@0
    59
 *   <skip> and <run> are unsigned 8 bit integers, except for 32 bit depth
slouken@0
    60
 *   where they are 16 bit. This makes the pixel data aligned at all times.
slouken@0
    61
 *   Segments never wrap around from one scan line to the next.
slouken@0
    62
 *
slouken@0
    63
 *   The end of the sequence is marked by a zero <skip>,<run> pair at the
slouken@0
    64
 *   beginning of a line.
slouken@0
    65
 *
slouken@0
    66
 * Encoding of surfaces with per-pixel alpha:
slouken@0
    67
 *
slouken@0
    68
 *   The sequence begins with a struct RLEDestFormat describing the target
slouken@0
    69
 *   pixel format, to provide reliable un-encoding.
slouken@0
    70
 *
slouken@0
    71
 *   Each scan line is encoded twice: First all completely opaque pixels,
slouken@0
    72
 *   encoded in the target format as described above, and then all
slouken@0
    73
 *   partially transparent (translucent) pixels (where 1 <= alpha <= 254),
slouken@0
    74
 *   in the following 32-bit format:
slouken@0
    75
 *
slouken@0
    76
 *   For 32-bit targets, each pixel has the target RGB format but with
slouken@0
    77
 *   the alpha value occupying the highest 8 bits. The <skip> and <run>
slouken@0
    78
 *   counts are 16 bit.
slouken@0
    79
 * 
slouken@0
    80
 *   For 16-bit targets, each pixel has the target RGB format, but with
slouken@0
    81
 *   the middle component (usually green) shifted 16 steps to the left,
slouken@0
    82
 *   and the hole filled with the 5 most significant bits of the alpha value.
slouken@0
    83
 *   i.e. if the target has the format         rrrrrggggggbbbbb,
slouken@0
    84
 *   the encoded pixel will be 00000gggggg00000rrrrr0aaaaabbbbb.
slouken@0
    85
 *   The <skip> and <run> counts are 8 bit for the opaque lines, 16 bit
slouken@0
    86
 *   for the translucent lines. Two padding bytes may be inserted
slouken@0
    87
 *   before each translucent line to keep them 32-bit aligned.
slouken@0
    88
 *
slouken@0
    89
 *   The end of the sequence is marked by a zero <skip>,<run> pair at the
slouken@0
    90
 *   beginning of an opaque line.
slouken@0
    91
 */
slouken@0
    92
slouken@0
    93
#include <stdio.h>
slouken@0
    94
#include <stdlib.h>
slouken@0
    95
#include <string.h>
slouken@0
    96
slouken@0
    97
#include "SDL_types.h"
slouken@0
    98
#include "SDL_video.h"
slouken@0
    99
#include "SDL_error.h"
slouken@0
   100
#include "SDL_sysvideo.h"
slouken@0
   101
#include "SDL_blit.h"
slouken@0
   102
#include "SDL_memops.h"
slouken@0
   103
#include "SDL_RLEaccel_c.h"
slouken@0
   104
slouken@689
   105
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
slouken@689
   106
#include "mmx.h"
slouken@689
   107
/* Function to check the CPU flags */
slouken@689
   108
#define MMX_CPU		0x800000
slouken@689
   109
#define CPU_Flags()	Hermes_X86_CPU()
slouken@689
   110
#define X86_ASSEMBLER
slouken@689
   111
#define HermesConverterInterface	void
slouken@689
   112
#define HermesClearInterface		void
slouken@689
   113
#define STACKCALL
slouken@689
   114
#include "HeadX86.h"
slouken@689
   115
#endif
slouken@689
   116
slouken@0
   117
/*
 * Two-way maximum / minimum, defined only if the platform headers have not
 * already provided them.  Each argument may be evaluated twice, so do not
 * pass expressions with side effects.
 */
#ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
slouken@0
   123
slouken@1
   124
/*
 * Copy `len' pixels of `bpp' bytes each from `from' to `to'.
 * For bpp == 4 the copy goes through SDL_memcpy4, which is handed the pixel
 * count itself; every other depth goes through SDL_memcpy with a byte count
 * of len * bpp.
 */
#define PIXEL_COPY(to, from, len, bpp)			\
do {							\
    if((bpp) == 4) {					\
	SDL_memcpy4(to, from, (unsigned)(len));		\
    } else {						\
	SDL_memcpy(to, from, (unsigned)(len) * (bpp));	\
    }							\
} while(0)

/*
 * Various colorkey blit methods, for opaque and per-surface alpha
 */

/*
 * Opaque blit: a plain pixel copy.  `alpha' is accepted only so that this
 * macro has the same parameter list as the ALPHA_BLIT* macros; it is unused.
 */
#define OPAQUE_BLIT(to, from, length, bpp, alpha)	\
    PIXEL_COPY(to, from, length, bpp)
slouken@0
   139
slouken@689
   140
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
slouken@689
   141
slouken@689
   142
/*
 * MMX per-surface alpha blend for 32bpp pixels.
 *
 *   to, from  destination / source pixel runs (accessed as Uint32)
 *   length    number of pixels in the run
 *   bpp       unused
 *   alpha     blend factor; replicated into every 16-bit lane of mm4
 *
 * Register setup performed below:
 *   mm3 = 0x00FF00FF in both dwords (keeps one byte per 16-bit lane)
 *   mm7 = 0xFF000000 in both dwords
 *   mm5 = ~mm7, used to clear the top byte of every pixel written
 *   mm4 = alpha in all four 16-bit lanes
 *
 * An odd leading pixel is blended on its own, the rest two at a time.
 * The macro declares locals named srcp, dstp and i; arguments must not use
 * those names.  emms() is issued before leaving.
 */
#define ALPHA_BLIT32_888MMX(to, from, length, bpp, alpha)	\
    do {							\
	Uint32 *srcp = (Uint32 *)(from);			\
	Uint32 *dstp = (Uint32 *)(to);				\
        int i = 0x00FF00FF;					\
        movd_m2r(*(&i), mm3);					\
        punpckldq_r2r(mm3, mm3);				\
        i = 0xFF000000;						\
        movd_m2r(*(&i), mm7);					\
        punpckldq_r2r(mm7, mm7);				\
        i = (alpha) | (alpha) << 16;				\
        movd_m2r(*(&i), mm4);					\
        punpckldq_r2r(mm4, mm4);				\
	pcmpeqd_r2r(mm5,mm5); /* set all bits of mm5 */		\
	pxor_r2r(mm7, mm5); /* make clear alpha mask */		\
        i = (int)(length);					\
	if(i & 1) {						\
	  /* single leading pixel */				\
          movd_m2r((*srcp), mm1); /* src -> mm1 */		\
          punpcklbw_r2r(mm1, mm1);				\
          pand_r2r(mm3, mm1);					\
	  movd_m2r((*dstp), mm2); /* dst -> mm2 */		\
          punpcklbw_r2r(mm2, mm2);				\
          pand_r2r(mm3, mm2);					\
	  psubw_r2r(mm2, mm1); /* src - dst */			\
	  pmullw_r2r(mm4, mm1); /* * alpha */			\
	  psrlw_i2r(8, mm1); /* >> 8 */				\
	  paddw_r2r(mm1, mm2); /* + dst */			\
	  pand_r2r(mm3, mm2);					\
	  packuswb_r2r(mm2, mm2);				\
	  pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */		\
	  movd_r2m(mm2, *dstp);					\
	  ++srcp;						\
	  ++dstp;						\
	  i--;							\
	}							\
	/* remaining pixels, two per iteration */		\
	for(; i > 0; i -= 2) {					\
          movq_m2r((*srcp), mm0);				\
	  movq_r2r(mm0, mm1);					\
          punpcklbw_r2r(mm0, mm0);				\
	  movq_m2r((*dstp), mm2);				\
	  punpckhbw_r2r(mm1, mm1);				\
	  movq_r2r(mm2, mm6);					\
          pand_r2r(mm3, mm0);					\
          punpcklbw_r2r(mm2, mm2);				\
	  pand_r2r(mm3, mm1);					\
	  punpckhbw_r2r(mm6, mm6);				\
          pand_r2r(mm3, mm2);					\
	  psubw_r2r(mm2, mm0);					\
	  pmullw_r2r(mm4, mm0);					\
	  pand_r2r(mm3, mm6);					\
	  psubw_r2r(mm6, mm1);					\
	  pmullw_r2r(mm4, mm1);					\
	  psrlw_i2r(8, mm0);					\
	  paddw_r2r(mm0, mm2);					\
	  psrlw_i2r(8, mm1);					\
	  paddw_r2r(mm1, mm6);					\
	  pand_r2r(mm3, mm2);					\
	  pand_r2r(mm3, mm6);					\
	  packuswb_r2r(mm2, mm2);				\
	  packuswb_r2r(mm6, mm6);				\
	  psrlq_i2r(32, mm2);					\
	  psllq_i2r(32, mm6);					\
	  por_r2r(mm6, mm2);					\
	  pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */		\
          movq_r2m(mm2, *dstp);					\
	  srcp += 2;						\
	  dstp += 2;						\
	}							\
	emms();							\
    } while(0)
slouken@689
   213
slouken@689
   214
/*
 * MMX per-surface alpha blend for 16bpp pixels whose fields are selected by
 * the masks 0xF800 / 0x07E0 / 0x001F.
 *
 *   to, from  destination / source pixel runs (accessed as Uint16)
 *   length    number of pixels in the run
 *   bpp       unused
 *   alpha     blend factor; its three low bits are ignored.  The argument
 *             is only read, never written.
 *
 * Register setup performed below:
 *   mm1 = 0xF800, mm4 = 0x07E0, mm7 = 0x001F, each in all four 16-bit lanes
 *   mm0 = masked alpha in all four 16-bit lanes
 *
 * The first (length & 3) pixels are blended with the scalar 5-bit-alpha
 * formula; the rest are processed four at a time with MMX, one field after
 * the other.  The macro declares locals named i, n, srcp, dstp, ALPHA and
 * alpha8_; arguments must not use those names.  emms() is issued before
 * leaving.
 */
#define ALPHA_BLIT16_565MMX(to, from, length, bpp, alpha)	\
    do {						\
        int i, n = 0;					\
	Uint16 *srcp = (Uint16 *)(from);		\
	Uint16 *dstp = (Uint16 *)(to);			\
	/* private copy of alpha, low 3 bits cleared */	\
	Uint32 alpha8_ = (Uint32)(alpha) & ~(Uint32)(1+2+4);	\
        Uint32 ALPHA = 0xF800;				\
	movd_m2r(*(&ALPHA), mm1);			\
        punpcklwd_r2r(mm1, mm1);			\
        punpcklwd_r2r(mm1, mm1);			\
	ALPHA = 0x07E0;					\
	movd_m2r(*(&ALPHA), mm4);			\
        punpcklwd_r2r(mm4, mm4);			\
        punpcklwd_r2r(mm4, mm4);			\
	ALPHA = 0x001F;					\
	movd_m2r(*(&ALPHA), mm7);			\
        punpcklwd_r2r(mm7, mm7);			\
        punpcklwd_r2r(mm7, mm7);			\
        i = alpha8_ | alpha8_ << 16;			\
        movd_m2r(*(&i), mm0);				\
        punpckldq_r2r(mm0, mm0);			\
        ALPHA = alpha8_ >> 3;				\
        i = ((int)(length) & 3);			\
	/* scalar head: length & 3 pixels */		\
	for(; i > 0; --i) {				\
	    Uint32 s = *srcp++;				\
	    Uint32 d = *dstp;				\
	    s = (s | s << 16) & 0x07e0f81f;		\
	    d = (d | d << 16) & 0x07e0f81f;		\
	    d += (s - d) * ALPHA >> 5;			\
	    d &= 0x07e0f81f;				\
	    *dstp++ = d | d >> 16;			\
	    n++;					\
	}						\
	i = (int)(length) - n;				\
	/* MMX body: four pixels per iteration */	\
	for(; i > 0; i -= 4) {				\
	  movq_m2r((*dstp), mm3);			\
	  movq_m2r((*srcp), mm2);			\
	  /* field 0xF800 */				\
	  movq_r2r(mm2, mm5);				\
	  pand_r2r(mm1 , mm5);				\
	  psrlq_i2r(11, mm5);				\
	  movq_r2r(mm3, mm6);				\
	  pand_r2r(mm1 , mm6);				\
	  psrlq_i2r(11, mm6);				\
	  psubw_r2r(mm6, mm5);				\
	  pmullw_r2r(mm0, mm5);				\
	  psrlw_i2r(8, mm5);				\
	  paddw_r2r(mm5, mm6);				\
	  psllq_i2r(11, mm6);				\
	  pand_r2r(mm1, mm6);				\
	  movq_r2r(mm4, mm5);				\
	  por_r2r(mm7, mm5);				\
	  pand_r2r(mm5, mm3);				\
	  por_r2r(mm6, mm3);				\
	  /* field 0x07E0 */				\
	  movq_r2r(mm2, mm5);				\
	  pand_r2r(mm4 , mm5);				\
	  psrlq_i2r(5, mm5);				\
	  movq_r2r(mm3, mm6);				\
	  pand_r2r(mm4 , mm6);				\
	  psrlq_i2r(5, mm6);				\
	  psubw_r2r(mm6, mm5);				\
	  pmullw_r2r(mm0, mm5);				\
	  psrlw_i2r(8, mm5);				\
	  paddw_r2r(mm5, mm6);				\
	  psllq_i2r(5, mm6);				\
	  pand_r2r(mm4, mm6);				\
	  movq_r2r(mm1, mm5);				\
	  por_r2r(mm7, mm5);				\
	  pand_r2r(mm5, mm3);				\
	  por_r2r(mm6, mm3);				\
	  /* field 0x001F */				\
	  movq_r2r(mm2, mm5);				\
	  pand_r2r(mm7 , mm5);				\
          movq_r2r(mm3, mm6);				\
	  pand_r2r(mm7 , mm6);				\
	  psubw_r2r(mm6, mm5);				\
	  pmullw_r2r(mm0, mm5);				\
	  psrlw_i2r(8, mm5);				\
	  paddw_r2r(mm5, mm6);				\
	  pand_r2r(mm7, mm6);				\
	  movq_r2r(mm1, mm5);				\
	  por_r2r(mm4, mm5);				\
	  pand_r2r(mm5, mm3);				\
	  por_r2r(mm6, mm3);				\
	  movq_r2m(mm3, *dstp);				\
	  srcp += 4;					\
	  dstp += 4;					\
	}						\
	emms();						\
    } while(0)
slouken@689
   303
slouken@689
   304
/*
 * MMX per-surface alpha blend for 16bpp pixels whose fields are selected by
 * the masks 0x7C00 / 0x03E0 / 0x001F.
 *
 *   to, from  destination / source pixel runs (accessed as Uint16)
 *   length    number of pixels in the run
 *   bpp       unused
 *   alpha     blend factor; its three low bits are ignored.  The argument
 *             is only read, never written.
 *
 * Register setup performed below:
 *   mm1 = 0x7C00, mm4 = 0x03E0, mm7 = 0x001F, each in all four 16-bit lanes
 *   mm0 = masked alpha in all four 16-bit lanes
 *
 * The first (length & 3) pixels are blended with the scalar 5-bit-alpha
 * formula; the rest are processed four at a time with MMX, one field after
 * the other.  The macro declares locals named i, n, srcp, dstp, ALPHA and
 * alpha8_; arguments must not use those names.  emms() is issued before
 * leaving.
 */
#define ALPHA_BLIT16_555MMX(to, from, length, bpp, alpha)	\
    do {						\
        int i, n = 0;					\
	Uint16 *srcp = (Uint16 *)(from);		\
	Uint16 *dstp = (Uint16 *)(to);			\
	/* private copy of alpha, low 3 bits cleared */	\
	Uint32 alpha8_ = (Uint32)(alpha) & ~(Uint32)(1+2+4);	\
        Uint32 ALPHA = 0x7C00;				\
	movd_m2r(*(&ALPHA), mm1);			\
        punpcklwd_r2r(mm1, mm1);			\
        punpcklwd_r2r(mm1, mm1);			\
	ALPHA = 0x03E0;					\
        movd_m2r(*(&ALPHA), mm4);			\
        punpcklwd_r2r(mm4, mm4);			\
        punpcklwd_r2r(mm4, mm4);			\
	ALPHA = 0x001F;					\
	movd_m2r(*(&ALPHA), mm7);			\
        punpcklwd_r2r(mm7, mm7);			\
        punpcklwd_r2r(mm7, mm7);			\
        i = alpha8_ | alpha8_ << 16;			\
        movd_m2r(*(&i), mm0);				\
        punpckldq_r2r(mm0, mm0);			\
        i = ((int)(length) & 3);			\
        ALPHA = alpha8_ >> 3;				\
	/* scalar head: length & 3 pixels */		\
	for(; i > 0; --i) {				\
	    Uint32 s = *srcp++;				\
	    Uint32 d = *dstp;				\
	    s = (s | s << 16) & 0x03e07c1f;		\
	    d = (d | d << 16) & 0x03e07c1f;		\
	    d += (s - d) * ALPHA >> 5;			\
	    d &= 0x03e07c1f;				\
	    *dstp++ = d | d >> 16;			\
	    n++;					\
	}						\
	i = (int)(length) - n;				\
	/* MMX body: four pixels per iteration */	\
	for(; i > 0; i -= 4) {				\
	  movq_m2r((*dstp), mm3);			\
	  movq_m2r((*srcp), mm2);			\
	  /* field 0x7C00 */				\
	  movq_r2r(mm2, mm5);				\
	  pand_r2r(mm1 , mm5);				\
	  psrlq_i2r(10, mm5);				\
	  movq_r2r(mm3, mm6);				\
	  pand_r2r(mm1 , mm6);				\
	  psrlq_i2r(10, mm6);				\
	  psubw_r2r(mm6, mm5);				\
	  pmullw_r2r(mm0, mm5);				\
	  psrlw_i2r(8, mm5);				\
	  paddw_r2r(mm5, mm6);				\
	  psllq_i2r(10, mm6);				\
	  pand_r2r(mm1, mm6);				\
	  movq_r2r(mm4, mm5);				\
	  por_r2r(mm7, mm5);				\
	  pand_r2r(mm5, mm3);				\
	  por_r2r(mm6, mm3);				\
	  /* field 0x03E0 */				\
	  movq_r2r(mm2, mm5);				\
	  pand_r2r(mm4 , mm5);				\
	  psrlq_i2r(5, mm5);				\
	  movq_r2r(mm3, mm6);				\
	  pand_r2r(mm4 , mm6);				\
	  psrlq_i2r(5, mm6);				\
	  psubw_r2r(mm6, mm5);				\
	  pmullw_r2r(mm0, mm5);				\
	  psrlw_i2r(8, mm5);				\
	  paddw_r2r(mm5, mm6);				\
	  psllq_i2r(5, mm6);				\
	  pand_r2r(mm4, mm6);				\
	  movq_r2r(mm1, mm5);				\
	  por_r2r(mm7, mm5);				\
	  pand_r2r(mm5, mm3);				\
	  por_r2r(mm6, mm3);				\
	  /* field 0x001F */				\
	  movq_r2r(mm2, mm5);				\
	  pand_r2r(mm7 , mm5);				\
          movq_r2r(mm3, mm6);				\
	  pand_r2r(mm7 , mm6);				\
	  psubw_r2r(mm6, mm5);				\
	  pmullw_r2r(mm0, mm5);				\
	  psrlw_i2r(8, mm5);				\
	  paddw_r2r(mm5, mm6);				\
	  pand_r2r(mm7, mm6);				\
	  movq_r2r(mm1, mm5);				\
	  por_r2r(mm4, mm5);				\
	  pand_r2r(mm5, mm3);				\
	  por_r2r(mm6, mm3);				\
	  movq_r2m(mm3, *dstp);				\
	  srcp += 4;					\
	  dstp += 4;					\
	}						\
	emms();						\
    } while(0)
slouken@689
   393
slouken@689
   394
#endif
slouken@689
   395
slouken@0
   396
/*
slouken@0
   397
 * For 32bpp pixels on the form 0x00rrggbb:
slouken@0
   398
 * If we treat the middle component separately, we can process the two
slouken@0
   399
 * remaining in parallel. This is safe to do because of the gap to the left
slouken@0
   400
 * of each component, so the bits from the multiplication don't collide.
slouken@0
   401
 * This can be used for any RGB permutation of course.
slouken@0
   402
 */
slouken@0
   403
/*
 * Per-surface alpha blend for 32bpp pixels with 8 bits per component in the
 * low 24 bits.
 *
 *   to, from  destination / source pixel runs (accessed as Uint32)
 *   length    number of pixels in the run
 *   bpp       unused
 *   alpha     blend factor, applied as (s - d) * alpha >> 8
 *
 * The two outer components (mask 0xff00ff) are blended together in one
 * multiplication, the middle one (mask 0xff00) separately.  Bits 24..31 of
 * every written pixel are zero.  The macro declares locals named i, src,
 * dst, s, d, s1 and d1; arguments must not use those names.
 */
#define ALPHA_BLIT32_888(to, from, length, bpp, alpha)		\
    do {							\
        int i;							\
	Uint32 *src = (Uint32 *)(from);				\
	Uint32 *dst = (Uint32 *)(to);				\
	for(i = 0; i < (int)(length); i++) {			\
	    Uint32 s = *src++;					\
	    Uint32 d = *dst;					\
	    Uint32 s1 = s & 0xff00ff;				\
	    Uint32 d1 = d & 0xff00ff;				\
	    /* unsigned wrap-around makes s < d work */		\
	    d1 = (d1 + ((s1 - d1) * (alpha) >> 8)) & 0xff00ff;	\
	    s &= 0xff00;					\
	    d &= 0xff00;					\
	    d = (d + ((s - d) * (alpha) >> 8)) & 0xff00;	\
	    *dst++ = d1 | d;					\
	}							\
    } while(0)
slouken@0
   420
slouken@0
   421
/*
slouken@0
   422
 * For 16bpp pixels we can go a step further: put the middle component
slouken@0
   423
 * in the high 16 bits of a 32 bit word, and process all three RGB
slouken@0
   424
 * components at the same time. Since the smallest gap is here just
slouken@0
   425
 * 5 bits, we have to scale alpha down to 5 bits as well.
slouken@0
   426
 */
slouken@0
   427
/*
 * Per-surface alpha blend for 16bpp pixels with a 5/6/5 bit layout.
 *
 *   to, from  destination / source pixel runs (accessed as Uint16)
 *   length    number of pixels in the run
 *   bpp       unused
 *   alpha     blend factor; reduced to 5 bits (alpha >> 3) before use
 *
 * Each pixel is spread over a 32-bit word (mask 0x07e0f81f: middle field in
 * the high half, outer fields in the low half) so that all three fields are
 * blended by one multiplication, then folded back to 16 bits.  The macro
 * declares locals named i, src, dst, ALPHA, s and d; arguments must not use
 * those names.
 */
#define ALPHA_BLIT16_565(to, from, length, bpp, alpha)	\
    do {						\
        int i;						\
	Uint16 *src = (Uint16 *)(from);			\
	Uint16 *dst = (Uint16 *)(to);			\
	Uint32 ALPHA = (alpha) >> 3;			\
	for(i = 0; i < (int)(length); i++) {		\
	    Uint32 s = *src++;				\
	    Uint32 d = *dst;				\
	    s = (s | s << 16) & 0x07e0f81f;		\
	    d = (d | d << 16) & 0x07e0f81f;		\
	    d += (s - d) * ALPHA >> 5;			\
	    d &= 0x07e0f81f;				\
	    *dst++ = d | d >> 16;			\
	}						\
    } while(0)
slouken@0
   443
slouken@0
   444
/*
 * Per-surface alpha blend for 16bpp pixels with a 5/5/5 bit layout.
 *
 *   to, from  destination / source pixel runs (accessed as Uint16)
 *   length    number of pixels in the run
 *   bpp       unused
 *   alpha     blend factor; reduced to 5 bits (alpha >> 3) before use
 *
 * Each pixel is spread over a 32-bit word (mask 0x03e07c1f: middle field in
 * the high half, outer fields in the low half) so that all three fields are
 * blended by one multiplication, then folded back to 16 bits.  The macro
 * declares locals named i, src, dst, ALPHA, s and d; arguments must not use
 * those names.
 */
#define ALPHA_BLIT16_555(to, from, length, bpp, alpha)	\
    do {						\
        int i;						\
	Uint16 *src = (Uint16 *)(from);			\
	Uint16 *dst = (Uint16 *)(to);			\
	Uint32 ALPHA = (alpha) >> 3;			\
	for(i = 0; i < (int)(length); i++) {		\
	    Uint32 s = *src++;				\
	    Uint32 d = *dst;				\
	    s = (s | s << 16) & 0x03e07c1f;		\
	    d = (d | d << 16) & 0x03e07c1f;		\
	    d += (s - d) * ALPHA >> 5;			\
	    d &= 0x03e07c1f;				\
	    *dst++ = d | d >> 16;			\
	}						\
    } while(0)
slouken@0
   460
slouken@0
   461
/*
slouken@0
   462
 * The general slow catch-all function, for remaining depths and formats
slouken@0
   463
 */
slouken@0
   464
/*
 * Generic per-surface alpha blend for 2, 3 and 4 bytes per pixel.
 *
 *   to, from  destination / source pixel runs (accessed bytewise)
 *   length    number of pixels in the run
 *   bpp       bytes per pixel
 *   alpha     blend factor, applied per component as (s - d) * alpha >> 8
 *
 * A variable named `fmt' must be in scope where the macro is expanded; it
 * is handed to RGB_FROM_PIXEL / PIXEL_FROM_RGB to split and rebuild pixels.
 * 24-bit pixels are assembled and stored byte by byte according to
 * SDL_BYTEORDER.  For any other bpp nothing is written.  The macro declares
 * locals named i, src, dst, s, d, rs, gs, bs, rd, gd and bd; arguments must
 * not use those names.
 */
#define ALPHA_BLIT_ANY(to, from, length, bpp, alpha)			\
    do {								\
        int i;								\
	Uint8 *src = (Uint8 *)(from);					\
	Uint8 *dst = (Uint8 *)(to);					\
	for(i = 0; i < (int)(length); i++) {				\
	    Uint32 s, d;						\
	    unsigned rs, gs, bs, rd, gd, bd;				\
	    switch(bpp) {						\
	    case 2:							\
		s = *(Uint16 *)src;					\
		d = *(Uint16 *)dst;					\
		break;							\
	    case 3:							\
		if(SDL_BYTEORDER == SDL_BIG_ENDIAN) {			\
		    s = (src[0] << 16) | (src[1] << 8) | src[2];	\
		    d = (dst[0] << 16) | (dst[1] << 8) | dst[2];	\
		} else {						\
		    s = (src[2] << 16) | (src[1] << 8) | src[0];	\
		    d = (dst[2] << 16) | (dst[1] << 8) | dst[0];	\
		}							\
		break;							\
	    case 4:							\
		s = *(Uint32 *)src;					\
		d = *(Uint32 *)dst;					\
		break;							\
	    default:							\
		/* unsupported depth: keep s and d defined */		\
		s = d = 0;						\
		break;							\
	    }								\
	    RGB_FROM_PIXEL(s, fmt, rs, gs, bs);				\
	    RGB_FROM_PIXEL(d, fmt, rd, gd, bd);				\
	    rd += (rs - rd) * (alpha) >> 8;				\
	    gd += (gs - gd) * (alpha) >> 8;				\
	    bd += (bs - bd) * (alpha) >> 8;				\
	    PIXEL_FROM_RGB(d, fmt, rd, gd, bd);				\
	    switch(bpp) {						\
	    case 2:							\
		*(Uint16 *)dst = d;					\
		break;							\
	    case 3:							\
		if(SDL_BYTEORDER == SDL_BIG_ENDIAN) {			\
		    dst[0] = d >> 16;					\
		    dst[1] = d >> 8;					\
		    dst[2] = d;						\
		} else {						\
		    dst[0] = d;						\
		    dst[1] = d >> 8;					\
		    dst[2] = d >> 16;					\
		}							\
		break;							\
	    case 4:							\
		*(Uint32 *)dst = d;					\
		break;							\
	    default:							\
		break;							\
	    }								\
	    src += (bpp);						\
	    dst += (bpp);						\
	}								\
    } while(0)
slouken@0
   520
slouken@689
   521
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
slouken@0
   522
slouken@689
   523
/*
 * MMX 50% blend for 32bpp pixels with 8 bits per component in the low
 * 24 bits.
 *
 *   to, from  destination / source pixel runs (accessed as Uint32)
 *   length    number of pixels in the run
 *   bpp, alpha  unused
 *
 * Per pixel the result is
 *   (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) + (s & d & 0x00010101).
 * An odd leading pixel is computed in C, the rest two at a time with MMX
 * (mm4 = 0x00fefefe, mm3 = 0x00010101 in both dwords).  The macro declares
 * locals named srcp, dstp and i; arguments must not use those names.
 * emms() is issued before leaving.
 */
#define ALPHA_BLIT32_888_50MMX(to, from, length, bpp, alpha)		\
    do {								\
	Uint32 *srcp = (Uint32 *)(from);				\
	Uint32 *dstp = (Uint32 *)(to);					\
        int i = 0x00fefefe;						\
        movd_m2r(*(&i), mm4);						\
        punpckldq_r2r(mm4, mm4);					\
        i = 0x00010101;							\
        movd_m2r(*(&i), mm3);						\
        punpckldq_r2r(mm3, mm3);					\
        i = (int)(length);						\
        if( i & 1 ) {							\
	  /* single leading pixel */					\
	  Uint32 s = *srcp++;						\
	  Uint32 d = *dstp;						\
	  *dstp++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)	\
		     + (s & d & 0x00010101);				\
	  i--;								\
	}								\
	/* remaining pixels, two per iteration */			\
	for(; i > 0; i -= 2) {						\
	    movq_m2r((*dstp), mm2); /* dst -> mm2 */			\
	    movq_r2r(mm2, mm6);	/* dst -> mm6 */			\
	    movq_m2r((*srcp), mm1); /* src -> mm1 */			\
	    movq_r2r(mm1, mm5);	/* src -> mm5 */			\
	    pand_r2r(mm4, mm6);	/* dst & 0x00fefefe -> mm6 */		\
	    pand_r2r(mm4, mm5); /* src & 0x00fefefe -> mm5 */		\
	    paddd_r2r(mm6, mm5); /* (src & 0x00fefefe) + (dst & 0x00fefefe) -> mm5 */	\
	    psrld_i2r(1, mm5); /* sum >> 1 -> mm5 */			\
	    pand_r2r(mm1, mm2);	/* s & d -> mm2 */			\
	    pand_r2r(mm3, mm2);	/* s & d & 0x00010101 -> mm2 */		\
	    paddd_r2r(mm5, mm2); /* final pixels -> mm2 */		\
	    movq_r2m(mm2, (*dstp));					\
	    dstp += 2;							\
	    srcp += 2;							\
	}								\
	emms();								\
    } while(0)
slouken@689
   560
slouken@689
   561
#endif
slouken@689
   562
    
slouken@0
   563
/*
slouken@0
   564
 * Special case: 50% alpha (alpha=128)
slouken@0
   565
 * This is treated specially because it can be optimized very well, and
slouken@0
   566
 * since it is good for many cases of semi-translucency.
slouken@0
   567
 * The theory is to do all three components at the same time:
slouken@0
   568
 * First zero the lowest bit of each component, which gives us room to
slouken@0
   569
 * add them. Then shift right and add the sum of the lowest bits.
slouken@0
   570
 */
slouken@0
   571
/*
 * 50% blend for 32bpp pixels with 8 bits per component in the low 24 bits.
 *
 *   to, from  destination / source pixel runs (accessed as Uint32)
 *   length    number of pixels in the run
 *   bpp, alpha  unused
 *
 * The lowest bit of every component is masked off before the addition so
 * that the sum cannot carry into the next component; the shared low bits
 * (s & d & 0x00010101) are added back after the shift.  The macro declares
 * locals named i, src, dst, s and d; arguments must not use those names.
 */
#define ALPHA_BLIT32_888_50(to, from, length, bpp, alpha)		\
    do {								\
        int i;								\
	Uint32 *src = (Uint32 *)(from);					\
	Uint32 *dst = (Uint32 *)(to);					\
	for(i = 0; i < (int)(length); i++) {				\
	    Uint32 s = *src++;						\
	    Uint32 d = *dst;						\
	    *dst++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)	\
		     + (s & d & 0x00010101);				\
	}								\
    } while(0)
slouken@0
   583
slouken@0
   584
/*
slouken@0
   585
 * For 16bpp, we can actually blend two pixels in parallel, if we take
slouken@0
   586
 * care to shift before we add, not after.
slouken@0
   587
 */
slouken@0
   588
slouken@0
   589
/*
 * helper: blend a single 16 bit pixel at 50%.
 * `dst' and `src' must be Uint16 pointer variables; both are advanced by
 * one pixel.  `mask' selects the bits kept for the addition; its 16-bit
 * complement selects the low bit of each component, added back afterwards.
 */
#define BLEND16_50(dst, src, mask)			\
    do {						\
        Uint32 s = *src++;				\
	Uint32 d = *dst;				\
	*dst++ = (((s & (mask)) + (d & (mask))) >> 1)	\
	         + (s & d & (~(mask) & 0xffff));	\
    } while(0)

/*
 * basic 16bpp blender. mask is the pixels to keep when adding.
 *
 *   to, from  destination / source pixel runs (accessed as Uint16)
 *   length    number of pixels in the run; 0 writes nothing
 *   bpp, alpha  unused
 *   mask      16-bit mask with the lowest bit of each component cleared
 *
 * When source and destination addresses differ in their two low bits the
 * pixels are blended one by one.  Otherwise an optional leading pixel
 * brings both pointers to a 4-byte boundary, pixel pairs are blended as
 * 32-bit words, and an optional trailing pixel finishes the run.  The macro
 * declares locals named n, src, dst, mask2_, s and d; arguments must not
 * use those names.
 */
#define ALPHA_BLIT16_50(to, from, length, bpp, alpha, mask)		\
    do {								\
	unsigned n = (length);						\
	Uint16 *src = (Uint16 *)(from);					\
	Uint16 *dst = (Uint16 *)(to);					\
	/* mask for two pixels at once, built without signed overflow */ \
	const Uint32 mask2_ = (Uint32)(mask) | (Uint32)(mask) << 16;	\
	if(((unsigned long)src ^ (unsigned long)dst) & 3) {		\
	    /* source and destination not in phase, blit one by one */	\
	    while(n--)							\
		BLEND16_50(dst, src, mask);				\
	} else {							\
	    if(n && ((unsigned long)src & 3)) {				\
		/* first odd pixel (only if there is one) */		\
		BLEND16_50(dst, src, mask);				\
		n--;							\
	    }								\
	    for(; n > 1; n -= 2) {					\
		Uint32 s = *(Uint32 *)src;				\
		Uint32 d = *(Uint32 *)dst;				\
		*(Uint32 *)dst = ((s & mask2_) >> 1)			\
		               + ((d & mask2_) >> 1)			\
		               + (s & d & ~mask2_);			\
		src += 2;						\
		dst += 2;						\
	    }								\
	    if(n)							\
		BLEND16_50(dst, src, mask); /* last odd pixel */	\
	}								\
    } while(0)

/* 50% blend for 5/6/5 pixels: mask 0xf7de */
#define ALPHA_BLIT16_565_50(to, from, length, bpp, alpha)	\
    ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xf7de)

/* 50% blend for 5/5/5 pixels: mask 0xfbde */
#define ALPHA_BLIT16_555_50(to, from, length, bpp, alpha)	\
    ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xfbde)
slouken@0
   633
slouken@689
   634
/*
 * CHOOSE_BLIT_MMX(blitter, bpp, Type, c_blit, mmx_blit)
 *
 * Expands `blitter` with either the MMX or the plain C span blitter.
 * When built with the GNU assembler blitters on i386 the choice is made at
 * run time from CPU_Flags(); otherwise only the C blitter is referenced and
 * mmx_blit is never expanded.
 */
#if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
#define CHOOSE_BLIT_MMX(blitter, bpp, Type, c_blit, mmx_blit)		\
    do {								\
	if((CPU_Flags()&MMX_CPU)!=0)					\
	    blitter(bpp, Type, mmx_blit);				\
	else								\
	    blitter(bpp, Type, c_blit);					\
    } while(0)
#else
#define CHOOSE_BLIT_MMX(blitter, bpp, Type, c_blit, mmx_blit)		\
    blitter(bpp, Type, c_blit)
#endif

/*
 * CHOOSE_BLIT(blitter, alpha, fmt)
 *
 * Picks the span blitter matching the per-surface alpha and the pixel
 * format, and expands blitter(bpp, Type, do_blit) with it:
 *  - alpha == 255: OPAQUE_BLIT for every pixel size;
 *  - 8bpp with alpha != 255: nothing is blitted;
 *  - 16bpp: the 565 / 555 blitters when the masks match one of those
 *    layouts (with a dedicated blitter for alpha == 128), otherwise
 *    ALPHA_BLIT_ANY;
 *  - 24bpp: ALPHA_BLIT_ANY;
 *  - 32bpp: the 888 blitters when the colour masks cover 0x00ffffff with
 *    one channel at 0xff00, otherwise ALPHA_BLIT_ANY.
 * Type is the run-length count type handed to blitter: Uint8 for 1-3
 * bytes per pixel, Uint16 for 4.
 *
 * The macro defines the label `general16`, so it can be expanded at most
 * once per function.
 */
#define CHOOSE_BLIT(blitter, alpha, fmt)				\
    do {								\
        if((alpha) == 255) {						\
	    switch((fmt)->BytesPerPixel) {				\
	    case 1: blitter(1, Uint8, OPAQUE_BLIT); break;		\
	    case 2: blitter(2, Uint8, OPAQUE_BLIT); break;		\
	    case 3: blitter(3, Uint8, OPAQUE_BLIT); break;		\
	    case 4: blitter(4, Uint16, OPAQUE_BLIT); break;		\
	    }								\
	} else {							\
	    switch((fmt)->BytesPerPixel) {				\
	    case 1:							\
		/* No 8bpp alpha blitting */				\
		break;							\
									\
	    case 2:							\
		switch((fmt)->Rmask | (fmt)->Gmask | (fmt)->Bmask) {	\
		case 0xffff:						\
		    if((fmt)->Gmask == 0x07e0				\
		       || (fmt)->Rmask == 0x07e0			\
		       || (fmt)->Bmask == 0x07e0) {			\
			if((alpha) == 128)				\
			    blitter(2, Uint8, ALPHA_BLIT16_565_50);	\
			else						\
			    CHOOSE_BLIT_MMX(blitter, 2, Uint8,		\
					    ALPHA_BLIT16_565,		\
					    ALPHA_BLIT16_565MMX);	\
		    } else						\
			goto general16;					\
		    break;						\
									\
		case 0x7fff:						\
		    if((fmt)->Gmask == 0x03e0				\
		       || (fmt)->Rmask == 0x03e0			\
		       || (fmt)->Bmask == 0x03e0) {			\
			if((alpha) == 128)				\
			    blitter(2, Uint8, ALPHA_BLIT16_555_50);	\
			else						\
			    CHOOSE_BLIT_MMX(blitter, 2, Uint8,		\
					    ALPHA_BLIT16_555,		\
					    ALPHA_BLIT16_555MMX);	\
			break;						\
		    }							\
		    /* fallthrough */					\
									\
		default:						\
		general16:						\
		    blitter(2, Uint8, ALPHA_BLIT_ANY);			\
		}							\
		break;							\
									\
	    case 3:							\
		blitter(3, Uint8, ALPHA_BLIT_ANY);			\
		break;							\
									\
	    case 4:							\
		if(((fmt)->Rmask | (fmt)->Gmask | (fmt)->Bmask)		\
		       == 0x00ffffff					\
		   && ((fmt)->Gmask == 0xff00				\
		       || (fmt)->Rmask == 0xff00			\
		       || (fmt)->Bmask == 0xff00)) {			\
		    if((alpha) == 128)					\
			CHOOSE_BLIT_MMX(blitter, 4, Uint16,		\
					ALPHA_BLIT32_888_50,		\
					ALPHA_BLIT32_888_50MMX);	\
		    else						\
			CHOOSE_BLIT_MMX(blitter, 4, Uint16,		\
					ALPHA_BLIT32_888,		\
					ALPHA_BLIT32_888MMX);		\
		} else							\
		    blitter(4, Uint16, ALPHA_BLIT_ANY);			\
		break;							\
	    }								\
	}								\
    } while(0)
slouken@0
   791
slouken@0
   792
/*
slouken@0
   793
 * This takes care of the case when the surface is clipped on the left and/or
slouken@0
   794
 * right. Top clipping has already been taken care of.
slouken@0
   795
 */
slouken@0
   796
static void RLEClipBlit(int w, Uint8 *srcbuf, SDL_Surface *dst,
slouken@0
   797
			Uint8 *dstbuf, SDL_Rect *srcrect, unsigned alpha)
slouken@0
   798
{
slouken@0
   799
    SDL_PixelFormat *fmt = dst->format;
slouken@0
   800
slouken@0
   801
#define RLECLIPBLIT(bpp, Type, do_blit)					   \
slouken@0
   802
    do {								   \
slouken@0
   803
	int linecount = srcrect->h;					   \
slouken@0
   804
	int ofs = 0;							   \
slouken@0
   805
	int left = srcrect->x;						   \
slouken@0
   806
	int right = left + srcrect->w;					   \
slouken@0
   807
	dstbuf -= left * bpp;						   \
slouken@0
   808
	for(;;) {							   \
slouken@0
   809
	    int run;							   \
slouken@0
   810
	    ofs += *(Type *)srcbuf;					   \
slouken@0
   811
	    run = ((Type *)srcbuf)[1];					   \
slouken@0
   812
	    srcbuf += 2 * sizeof(Type);					   \
slouken@0
   813
	    if(run) {							   \
slouken@0
   814
		/* clip to left and right borders */			   \
slouken@0
   815
		if(ofs < right) {					   \
slouken@0
   816
		    int start = 0;					   \
slouken@0
   817
		    int len = run;					   \
slouken@0
   818
		    int startcol;					   \
slouken@0
   819
		    if(left - ofs > 0) {				   \
slouken@0
   820
			start = left - ofs;				   \
slouken@0
   821
			len -= start;					   \
slouken@0
   822
			if(len <= 0)					   \
slouken@0
   823
			    goto nocopy ## bpp ## do_blit;		   \
slouken@0
   824
		    }							   \
slouken@0
   825
		    startcol = ofs + start;				   \
slouken@0
   826
		    if(len > right - startcol)				   \
slouken@0
   827
			len = right - startcol;				   \
slouken@0
   828
		    do_blit(dstbuf + startcol * bpp, srcbuf + start * bpp, \
slouken@0
   829
			    len, bpp, alpha);				   \
slouken@0
   830
		}							   \
slouken@0
   831
	    nocopy ## bpp ## do_blit:					   \
slouken@0
   832
		srcbuf += run * bpp;					   \
slouken@0
   833
		ofs += run;						   \
slouken@0
   834
	    } else if(!ofs)						   \
slouken@0
   835
		break;							   \
slouken@0
   836
	    if(ofs == w) {						   \
slouken@0
   837
		ofs = 0;						   \
slouken@0
   838
		dstbuf += dst->pitch;					   \
slouken@0
   839
		if(!--linecount)					   \
slouken@0
   840
		    break;						   \
slouken@0
   841
	    }								   \
slouken@0
   842
	}								   \
slouken@0
   843
    } while(0)
slouken@0
   844
slouken@0
   845
    CHOOSE_BLIT(RLECLIPBLIT, alpha, fmt);
slouken@0
   846
slouken@0
   847
#undef RLECLIPBLIT
slouken@0
   848
slouken@0
   849
}
slouken@0
   850
slouken@0
   851
slouken@0
   852
/* blit a colorkeyed RLE surface */
slouken@0
   853
int SDL_RLEBlit(SDL_Surface *src, SDL_Rect *srcrect,
slouken@0
   854
		SDL_Surface *dst, SDL_Rect *dstrect)
slouken@0
   855
{
slouken@0
   856
	Uint8 *dstbuf;
slouken@0
   857
	Uint8 *srcbuf;
slouken@0
   858
	int x, y;
slouken@0
   859
	int w = src->w;
slouken@0
   860
	unsigned alpha;
slouken@0
   861
slouken@0
   862
	/* Lock the destination if necessary */
slouken@526
   863
	if ( SDL_MUSTLOCK(dst) ) {
slouken@526
   864
		if ( SDL_LockSurface(dst) < 0 ) {
slouken@0
   865
			return(-1);
slouken@0
   866
		}
slouken@0
   867
	}
slouken@0
   868
slouken@0
   869
	/* Set up the source and destination pointers */
slouken@0
   870
	x = dstrect->x;
slouken@0
   871
	y = dstrect->y;
slouken@526
   872
	dstbuf = (Uint8 *)dst->pixels
slouken@0
   873
	         + y * dst->pitch + x * src->format->BytesPerPixel;
slouken@0
   874
	srcbuf = (Uint8 *)src->map->sw_data->aux_data;
slouken@0
   875
slouken@0
   876
	{
slouken@0
   877
	    /* skip lines at the top if neccessary */
slouken@0
   878
	    int vskip = srcrect->y;
slouken@0
   879
	    int ofs = 0;
slouken@0
   880
	    if(vskip) {
slouken@0
   881
slouken@0
   882
#define RLESKIP(bpp, Type)			\
slouken@0
   883
		for(;;) {			\
slouken@0
   884
		    int run;			\
slouken@0
   885
		    ofs += *(Type *)srcbuf;	\
slouken@0
   886
		    run = ((Type *)srcbuf)[1];	\
slouken@0
   887
		    srcbuf += sizeof(Type) * 2;	\
slouken@0
   888
		    if(run) {			\
slouken@0
   889
			srcbuf += run * bpp;	\
slouken@0
   890
			ofs += run;		\
slouken@0
   891
		    } else if(!ofs)		\
slouken@0
   892
			goto done;		\
slouken@0
   893
		    if(ofs == w) {		\
slouken@0
   894
			ofs = 0;		\
slouken@0
   895
			if(!--vskip)		\
slouken@0
   896
			    break;		\
slouken@0
   897
		    }				\
slouken@0
   898
		}
slouken@0
   899
slouken@0
   900
		switch(src->format->BytesPerPixel) {
slouken@0
   901
		case 1: RLESKIP(1, Uint8); break;
slouken@0
   902
		case 2: RLESKIP(2, Uint8); break;
slouken@0
   903
		case 3: RLESKIP(3, Uint8); break;
slouken@0
   904
		case 4: RLESKIP(4, Uint16); break;
slouken@0
   905
		}
slouken@0
   906
slouken@0
   907
#undef RLESKIP
slouken@0
   908
slouken@0
   909
	    }
slouken@0
   910
	}
slouken@0
   911
slouken@0
   912
	alpha = (src->flags & SDL_SRCALPHA) == SDL_SRCALPHA
slouken@0
   913
	        ? src->format->alpha : 255;
slouken@0
   914
	/* if left or right edge clipping needed, call clip blit */
slouken@0
   915
	if ( srcrect->x || srcrect->w != src->w ) {
slouken@0
   916
	    RLEClipBlit(w, srcbuf, dst, dstbuf, srcrect, alpha);
slouken@0
   917
	} else {
slouken@0
   918
	    SDL_PixelFormat *fmt = src->format;
slouken@0
   919
slouken@0
   920
#define RLEBLIT(bpp, Type, do_blit)					      \
slouken@0
   921
	    do {							      \
slouken@0
   922
		int linecount = srcrect->h;				      \
slouken@0
   923
		int ofs = 0;						      \
slouken@0
   924
		for(;;) {						      \
slouken@0
   925
		    unsigned run;					      \
slouken@0
   926
		    ofs += *(Type *)srcbuf;				      \
slouken@0
   927
		    run = ((Type *)srcbuf)[1];				      \
slouken@0
   928
		    srcbuf += 2 * sizeof(Type);				      \
slouken@0
   929
		    if(run) {						      \
slouken@0
   930
			do_blit(dstbuf + ofs * bpp, srcbuf, run, bpp, alpha); \
slouken@0
   931
			srcbuf += run * bpp;				      \
slouken@0
   932
			ofs += run;					      \
slouken@0
   933
		    } else if(!ofs)					      \
slouken@0
   934
			break;						      \
slouken@0
   935
		    if(ofs == w) {					      \
slouken@0
   936
			ofs = 0;					      \
slouken@0
   937
			dstbuf += dst->pitch;				      \
slouken@0
   938
			if(!--linecount)				      \
slouken@0
   939
			    break;					      \
slouken@0
   940
		    }							      \
slouken@0
   941
		}							      \
slouken@0
   942
	    } while(0)
slouken@0
   943
slouken@0
   944
	    CHOOSE_BLIT(RLEBLIT, alpha, fmt);
slouken@0
   945
slouken@0
   946
#undef RLEBLIT
slouken@0
   947
	}
slouken@0
   948
slouken@0
   949
done:
slouken@0
   950
	/* Unlock the destination if necessary */
slouken@526
   951
	if ( SDL_MUSTLOCK(dst) ) {
slouken@526
   952
		SDL_UnlockSurface(dst);
slouken@0
   953
	}
slouken@0
   954
	return(0);
slouken@0
   955
}
slouken@0
   956
slouken@0
   957
#undef OPAQUE_BLIT
slouken@0
   958
slouken@0
   959
/*
slouken@0
   960
 * Per-pixel blitting macros for translucent pixels:
slouken@0
   961
 * These use the same techniques as the per-surface blitting macros
slouken@0
   962
 */
slouken@0
   963
slouken@0
   964
/*
 * Blend one translucent 32bpp pixel `src` onto the lvalue `dst`.
 * The alpha is taken from the top 8 bits of src. The 0xff00ff channels are
 * blended together in one multiplication, the 0xff00 channel in a second
 * one. The top 8 bits of the result written to dst are zero.
 */
#define BLIT_TRANSL_888(src, dst)					\
    do {								\
	const Uint32 blit_s_ = (src);					\
	const Uint32 blit_d_ = (dst);					\
	const unsigned blit_a_ = blit_s_ >> 24;				\
	Uint32 blit_rb_ = blit_d_ & 0xff00ff;				\
	Uint32 blit_g_ = blit_d_ & 0xff00;				\
	blit_rb_ += ((blit_s_ & 0xff00ff) - blit_rb_) * blit_a_ >> 8;	\
	blit_g_ += ((blit_s_ & 0xff00) - blit_g_) * blit_a_ >> 8;	\
	(dst) = (blit_rb_ & 0xff00ff) | (blit_g_ & 0xff00);		\
    } while(0)
slouken@0
   981
slouken@0
   982
/*
 * Blend one translucent pixel onto a 16bpp 565 destination.
 * src is a 32-bit word whose colour bits are selected by 0x07e0f81f and
 * whose 5-bit alpha sits in bits 5-9; dst is a 16-bit lvalue. The
 * destination is spread into the same 0x07e0f81f layout so that all three
 * components are blended with a single multiplication, then folded back
 * into 16 bits.
 */
#define BLIT_TRANSL_565(src, dst)					\
    do {								\
	const Uint32 blit_s_ = (src);					\
	const unsigned blit_a_ = (blit_s_ & 0x3e0) >> 5;		\
	const Uint32 blit_sc_ = blit_s_ & 0x07e0f81f;			\
	Uint32 blit_d_ = (dst);						\
	blit_d_ = (blit_d_ | blit_d_ << 16) & 0x07e0f81f;		\
	blit_d_ = (blit_d_ + ((blit_sc_ - blit_d_) * blit_a_ >> 5))	\
	          & 0x07e0f81f;						\
	(dst) = blit_d_ | blit_d_ >> 16;				\
    } while(0)
slouken@0
   997
slouken@0
   998
/*
 * Blend one translucent pixel onto a 16bpp 555 destination.
 * Same scheme as the 565 variant, with the colour bits selected by
 * 0x03e07c1f; the 5-bit alpha is read from bits 5-9 of src.
 */
#define BLIT_TRANSL_555(src, dst)					\
    do {								\
	const Uint32 blit_s_ = (src);					\
	const unsigned blit_a_ = (blit_s_ & 0x3e0) >> 5;		\
	const Uint32 blit_sc_ = blit_s_ & 0x03e07c1f;			\
	Uint32 blit_d_ = (dst);						\
	blit_d_ = (blit_d_ | blit_d_ << 16) & 0x03e07c1f;		\
	blit_d_ = (blit_d_ + ((blit_sc_ - blit_d_) * blit_a_ >> 5))	\
	          & 0x03e07c1f;						\
	(dst) = blit_d_ | blit_d_ >> 16;				\
    } while(0)
slouken@0
  1009
slouken@0
  1010
/* used to save the destination format in the encoding. Designed to be
slouken@0
  1011
   macro-compatible with SDL_PixelFormat but without the unneeded fields */
slouken@0
  1012
typedef struct {
slouken@0
  1013
    	Uint8  BytesPerPixel;
slouken@0
  1014
	Uint8  Rloss;
slouken@0
  1015
	Uint8  Gloss;
slouken@0
  1016
	Uint8  Bloss;
slouken@0
  1017
	Uint8  Rshift;
slouken@0
  1018
	Uint8  Gshift;
slouken@0
  1019
	Uint8  Bshift;
slouken@0
  1020
	Uint8  Ashift;
slouken@0
  1021
	Uint32 Rmask;
slouken@0
  1022
	Uint32 Gmask;
slouken@0
  1023
	Uint32 Bmask;
slouken@0
  1024
	Uint32 Amask;
slouken@0
  1025
} RLEDestFormat;
slouken@0
  1026
slouken@0
  1027
/* blit a pixel-alpha RLE surface clipped at the right and/or left edges */
slouken@0
  1028
static void RLEAlphaClipBlit(int w, Uint8 *srcbuf, SDL_Surface *dst,
slouken@0
  1029
			     Uint8 *dstbuf, SDL_Rect *srcrect)
slouken@0
  1030
{
slouken@0
  1031
    SDL_PixelFormat *df = dst->format;
slouken@0
  1032
    /*
slouken@0
  1033
     * clipped blitter: Ptype is the destination pixel type,
slouken@0
  1034
     * Ctype the translucent count type, and do_blend the macro
slouken@0
  1035
     * to blend one pixel.
slouken@0
  1036
     */
slouken@0
  1037
#define RLEALPHACLIPBLIT(Ptype, Ctype, do_blend)			  \
slouken@0
  1038
    do {								  \
slouken@0
  1039
	int linecount = srcrect->h;					  \
slouken@0
  1040
	int left = srcrect->x;						  \
slouken@0
  1041
	int right = left + srcrect->w;					  \
slouken@0
  1042
	dstbuf -= left * sizeof(Ptype);					  \
slouken@0
  1043
	do {								  \
slouken@0
  1044
	    int ofs = 0;						  \
slouken@0
  1045
	    /* blit opaque pixels on one line */			  \
slouken@0
  1046
	    do {							  \
slouken@0
  1047
		unsigned run;						  \
slouken@0
  1048
		ofs += ((Ctype *)srcbuf)[0];				  \
slouken@0
  1049
		run = ((Ctype *)srcbuf)[1];				  \
slouken@0
  1050
		srcbuf += 2 * sizeof(Ctype);				  \
slouken@0
  1051
		if(run) {						  \
slouken@0
  1052
		    /* clip to left and right borders */		  \
slouken@0
  1053
		    int cofs = ofs;					  \
slouken@0
  1054
		    int crun = run;					  \
slouken@0
  1055
		    if(left - cofs > 0) {				  \
slouken@0
  1056
			crun -= left - cofs;				  \
slouken@0
  1057
			cofs = left;					  \
slouken@0
  1058
		    }							  \
slouken@0
  1059
		    if(crun > right - cofs)				  \
slouken@0
  1060
			crun = right - cofs;				  \
slouken@0
  1061
		    if(crun > 0)					  \
slouken@1
  1062
			PIXEL_COPY(dstbuf + cofs * sizeof(Ptype),	  \
slouken@0
  1063
				   srcbuf + (cofs - ofs) * sizeof(Ptype), \
slouken@1
  1064
				   (unsigned)crun, sizeof(Ptype));	  \
slouken@0
  1065
		    srcbuf += run * sizeof(Ptype);			  \
slouken@0
  1066
		    ofs += run;						  \
slouken@0
  1067
		} else if(!ofs)						  \
slouken@0
  1068
		    return;						  \
slouken@0
  1069
	    } while(ofs < w);						  \
slouken@0
  1070
	    /* skip padding if necessary */				  \
slouken@0
  1071
	    if(sizeof(Ptype) == 2)					  \
slouken@0
  1072
		srcbuf += (unsigned long)srcbuf & 2;			  \
slouken@0
  1073
	    /* blit translucent pixels on the same line */		  \
slouken@0
  1074
	    ofs = 0;							  \
slouken@0
  1075
	    do {							  \
slouken@0
  1076
		unsigned run;						  \
slouken@0
  1077
		ofs += ((Uint16 *)srcbuf)[0];				  \
slouken@0
  1078
		run = ((Uint16 *)srcbuf)[1];				  \
slouken@0
  1079
		srcbuf += 4;						  \
slouken@0
  1080
		if(run) {						  \
slouken@0
  1081
		    /* clip to left and right borders */		  \
slouken@0
  1082
		    int cofs = ofs;					  \
slouken@0
  1083
		    int crun = run;					  \
slouken@0
  1084
		    if(left - cofs > 0) {				  \
slouken@0
  1085
			crun -= left - cofs;				  \
slouken@0
  1086
			cofs = left;					  \
slouken@0
  1087
		    }							  \
slouken@0
  1088
		    if(crun > right - cofs)				  \
slouken@0
  1089
			crun = right - cofs;				  \
slouken@0
  1090
		    if(crun > 0) {					  \
slouken@0
  1091
			Ptype *dst = (Ptype *)dstbuf + cofs;		  \
slouken@0
  1092
			Uint32 *src = (Uint32 *)srcbuf + (cofs - ofs);	  \
slouken@0
  1093
			int i;						  \
slouken@0
  1094
			for(i = 0; i < crun; i++)			  \
slouken@0
  1095
			    do_blend(src[i], dst[i]);			  \
slouken@0
  1096
		    }							  \
slouken@0
  1097
		    srcbuf += run * 4;					  \
slouken@0
  1098
		    ofs += run;						  \
slouken@0
  1099
		}							  \
slouken@0
  1100
	    } while(ofs < w);						  \
slouken@0
  1101
	    dstbuf += dst->pitch;					  \
slouken@0
  1102
	} while(--linecount);						  \
slouken@0
  1103
    } while(0)
slouken@0
  1104
slouken@0
  1105
    switch(df->BytesPerPixel) {
slouken@0
  1106
    case 2:
slouken@0
  1107
	if(df->Gmask == 0x07e0 || df->Rmask == 0x07e0
slouken@0
  1108
	   || df->Bmask == 0x07e0)
slouken@0
  1109
	    RLEALPHACLIPBLIT(Uint16, Uint8, BLIT_TRANSL_565);
slouken@0
  1110
	else
slouken@0
  1111
	    RLEALPHACLIPBLIT(Uint16, Uint8, BLIT_TRANSL_555);
slouken@0
  1112
	break;
slouken@0
  1113
    case 4:
slouken@0
  1114
	RLEALPHACLIPBLIT(Uint32, Uint16, BLIT_TRANSL_888);
slouken@0
  1115
	break;
slouken@0
  1116
    }
slouken@0
  1117
}
slouken@0
  1118
slouken@0
  1119
/* blit a pixel-alpha RLE surface */
slouken@0
  1120
int SDL_RLEAlphaBlit(SDL_Surface *src, SDL_Rect *srcrect,
slouken@0
  1121
		     SDL_Surface *dst, SDL_Rect *dstrect)
slouken@0
  1122
{
slouken@0
  1123
    int x, y;
slouken@0
  1124
    int w = src->w;
slouken@0
  1125
    Uint8 *srcbuf, *dstbuf;
slouken@0
  1126
    SDL_PixelFormat *df = dst->format;
slouken@0
  1127
slouken@0
  1128
    /* Lock the destination if necessary */
slouken@526
  1129
    if ( SDL_MUSTLOCK(dst) ) {
slouken@526
  1130
	if ( SDL_LockSurface(dst) < 0 ) {
slouken@0
  1131
	    return -1;
slouken@0
  1132
	}
slouken@0
  1133
    }
slouken@0
  1134
slouken@0
  1135
    x = dstrect->x;
slouken@0
  1136
    y = dstrect->y;
slouken@526
  1137
    dstbuf = (Uint8 *)dst->pixels
slouken@0
  1138
	     + y * dst->pitch + x * df->BytesPerPixel;
slouken@0
  1139
    srcbuf = (Uint8 *)src->map->sw_data->aux_data + sizeof(RLEDestFormat);
slouken@0
  1140
slouken@0
  1141
    {
slouken@0
  1142
	/* skip lines at the top if necessary */
slouken@0
  1143
	int vskip = srcrect->y;
slouken@0
  1144
	if(vskip) {
slouken@0
  1145
	    int ofs;
slouken@0
  1146
	    if(df->BytesPerPixel == 2) {
slouken@0
  1147
		/* the 16/32 interleaved format */
slouken@0
  1148
		do {
slouken@0
  1149
		    /* skip opaque line */
slouken@0
  1150
		    ofs = 0;
slouken@0
  1151
		    do {
slouken@0
  1152
			int run;
slouken@0
  1153
			ofs += srcbuf[0];
slouken@0
  1154
			run = srcbuf[1];
slouken@0
  1155
			srcbuf += 2;
slouken@0
  1156
			if(run) {
slouken@0
  1157
			    srcbuf += 2 * run;
slouken@0
  1158
			    ofs += run;
slouken@0
  1159
			} else if(!ofs)
slouken@0
  1160
			    goto done;
slouken@0
  1161
		    } while(ofs < w);
slouken@0
  1162
slouken@0
  1163
		    /* skip padding */
slouken@0
  1164
		    srcbuf += (unsigned long)srcbuf & 2;
slouken@0
  1165
slouken@0
  1166
		    /* skip translucent line */
slouken@0
  1167
		    ofs = 0;
slouken@0
  1168
		    do {
slouken@0
  1169
			int run;
slouken@0
  1170
			ofs += ((Uint16 *)srcbuf)[0];
slouken@0
  1171
			run = ((Uint16 *)srcbuf)[1];
slouken@0
  1172
			srcbuf += 4 * (run + 1);
slouken@0
  1173
			ofs += run;
slouken@0
  1174
		    } while(ofs < w);
slouken@0
  1175
		} while(--vskip);
slouken@0
  1176
	    } else {
slouken@0
  1177
		/* the 32/32 interleaved format */
slouken@0
  1178
		vskip <<= 1;	/* opaque and translucent have same format */
slouken@0
  1179
		do {
slouken@0
  1180
		    ofs = 0;
slouken@0
  1181
		    do {
slouken@0
  1182
			int run;
slouken@0
  1183
			ofs += ((Uint16 *)srcbuf)[0];
slouken@0
  1184
			run = ((Uint16 *)srcbuf)[1];
slouken@0
  1185
			srcbuf += 4;
slouken@0
  1186
			if(run) {
slouken@0
  1187
			    srcbuf += 4 * run;
slouken@0
  1188
			    ofs += run;
slouken@0
  1189
			} else if(!ofs)
slouken@0
  1190
			    goto done;
slouken@0
  1191
		    } while(ofs < w);
slouken@0
  1192
		} while(--vskip);
slouken@0
  1193
	    }
slouken@0
  1194
	}
slouken@0
  1195
    }
slouken@0
  1196
slouken@0
  1197
    /* if left or right edge clipping needed, call clip blit */
slouken@0
  1198
    if(srcrect->x || srcrect->w != src->w) {
slouken@0
  1199
	RLEAlphaClipBlit(w, srcbuf, dst, dstbuf, srcrect);
slouken@0
  1200
    } else {
slouken@0
  1201
slouken@0
  1202
	/*
slouken@0
  1203
	 * non-clipped blitter. Ptype is the destination pixel type,
slouken@0
  1204
	 * Ctype the translucent count type, and do_blend the
slouken@0
  1205
	 * macro to blend one pixel.
slouken@0
  1206
	 */
slouken@0
  1207
#define RLEALPHABLIT(Ptype, Ctype, do_blend)				 \
slouken@0
  1208
	do {								 \
slouken@0
  1209
	    int linecount = srcrect->h;					 \
slouken@0
  1210
	    do {							 \
slouken@0
  1211
		int ofs = 0;						 \
slouken@0
  1212
		/* blit opaque pixels on one line */			 \
slouken@0
  1213
		do {							 \
slouken@0
  1214
		    unsigned run;					 \
slouken@0
  1215
		    ofs += ((Ctype *)srcbuf)[0];			 \
slouken@0
  1216
		    run = ((Ctype *)srcbuf)[1];				 \
slouken@0
  1217
		    srcbuf += 2 * sizeof(Ctype);			 \
slouken@0
  1218
		    if(run) {						 \
slouken@1
  1219
			PIXEL_COPY(dstbuf + ofs * sizeof(Ptype), srcbuf, \
slouken@1
  1220
				   run, sizeof(Ptype));			 \
slouken@0
  1221
			srcbuf += run * sizeof(Ptype);			 \
slouken@0
  1222
			ofs += run;					 \
slouken@0
  1223
		    } else if(!ofs)					 \
slouken@0
  1224
			goto done;					 \
slouken@0
  1225
		} while(ofs < w);					 \
slouken@0
  1226
		/* skip padding if necessary */				 \
slouken@0
  1227
		if(sizeof(Ptype) == 2)					 \
slouken@0
  1228
		    srcbuf += (unsigned long)srcbuf & 2;		 \
slouken@0
  1229
		/* blit translucent pixels on the same line */		 \
slouken@0
  1230
		ofs = 0;						 \
slouken@0
  1231
		do {							 \
slouken@0
  1232
		    unsigned run;					 \
slouken@0
  1233
		    ofs += ((Uint16 *)srcbuf)[0];			 \
slouken@0
  1234
		    run = ((Uint16 *)srcbuf)[1];			 \
slouken@0
  1235
		    srcbuf += 4;					 \
slouken@0
  1236
		    if(run) {						 \
slouken@0
  1237
			Ptype *dst = (Ptype *)dstbuf + ofs;		 \
slouken@0
  1238
			unsigned i;					 \
slouken@0
  1239
			for(i = 0; i < run; i++) {			 \
slouken@0
  1240
			    Uint32 src = *(Uint32 *)srcbuf;		 \
slouken@0
  1241
			    do_blend(src, *dst);			 \
slouken@0
  1242
			    srcbuf += 4;				 \
slouken@0
  1243
			    dst++;					 \
slouken@0
  1244
			}						 \
slouken@0
  1245
			ofs += run;					 \
slouken@0
  1246
		    }							 \
slouken@0
  1247
		} while(ofs < w);					 \
slouken@0
  1248
		dstbuf += dst->pitch;					 \
slouken@0
  1249
	    } while(--linecount);					 \
slouken@0
  1250
	} while(0)
slouken@0
  1251
slouken@0
  1252
	switch(df->BytesPerPixel) {
slouken@0
  1253
	case 2:
slouken@0
  1254
	    if(df->Gmask == 0x07e0 || df->Rmask == 0x07e0
slouken@0
  1255
	       || df->Bmask == 0x07e0)
slouken@0
  1256
		RLEALPHABLIT(Uint16, Uint8, BLIT_TRANSL_565);
slouken@0
  1257
	    else
slouken@0
  1258
		RLEALPHABLIT(Uint16, Uint8, BLIT_TRANSL_555);
slouken@0
  1259
	    break;
slouken@0
  1260
	case 4:
slouken@0
  1261
	    RLEALPHABLIT(Uint32, Uint16, BLIT_TRANSL_888);
slouken@0
  1262
	    break;
slouken@0
  1263
	}
slouken@0
  1264
    }
slouken@0
  1265
slouken@0
  1266
 done:
slouken@0
  1267
    /* Unlock the destination if necessary */
slouken@526
  1268
    if ( SDL_MUSTLOCK(dst) ) {
slouken@526
  1269
	SDL_UnlockSurface(dst);
slouken@0
  1270
    }
slouken@0
  1271
    return 0;
slouken@0
  1272
}
slouken@0
  1273
slouken@0
  1274
/*
slouken@0
  1275
 * Auxiliary functions:
slouken@0
  1276
 * The encoding functions take 32bpp rgb + a, and
slouken@0
  1277
 * return the number of bytes copied to the destination.
slouken@0
  1278
 * The decoding functions copy to 32bpp rgb + a, and
slouken@0
  1279
 * return the number of bytes copied from the source.
slouken@0
  1280
 * These are only used in the encoder and un-RLE code and are therefore not
slouken@0
  1281
 * highly optimised.
slouken@0
  1282
 */
slouken@0
  1283
slouken@0
  1284
/* encode 32bpp rgb + a into 16bpp rgb, losing alpha */
slouken@0
  1285
static int copy_opaque_16(void *dst, Uint32 *src, int n,
slouken@0
  1286
			  SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
slouken@0
  1287
{
slouken@0
  1288
    int i;
slouken@0
  1289
    Uint16 *d = dst;
slouken@0
  1290
    for(i = 0; i < n; i++) {
slouken@0
  1291
	unsigned r, g, b;
slouken@0
  1292
	RGB_FROM_PIXEL(*src, sfmt, r, g, b);
slouken@0
  1293
	PIXEL_FROM_RGB(*d, dfmt, r, g, b);
slouken@0
  1294
	src++;
slouken@0
  1295
	d++;
slouken@0
  1296
    }
slouken@0
  1297
    return n * 2;
slouken@0
  1298
}
slouken@0
  1299
slouken@0
  1300
/* decode opaque pixels from 16bpp to 32bpp rgb + a */
slouken@0
  1301
static int uncopy_opaque_16(Uint32 *dst, void *src, int n,
slouken@0
  1302
			    RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
slouken@0
  1303
{
slouken@0
  1304
    int i;
slouken@0
  1305
    Uint16 *s = src;
slouken@0
  1306
    unsigned alpha = dfmt->Amask ? 255 : 0;
slouken@0
  1307
    for(i = 0; i < n; i++) {
slouken@0
  1308
	unsigned r, g, b;
slouken@0
  1309
	RGB_FROM_PIXEL(*s, sfmt, r, g, b);
slouken@0
  1310
	PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, alpha);
slouken@0
  1311
	s++;
slouken@0
  1312
	dst++;
slouken@0
  1313
    }
slouken@0
  1314
    return n * 2;
slouken@0
  1315
}
slouken@0
  1316
slouken@0
  1317
slouken@0
  1318
slouken@0
  1319
/* encode 32bpp rgb + a into 32bpp G0RAB format for blitting into 565 */
slouken@0
  1320
static int copy_transl_565(void *dst, Uint32 *src, int n,
slouken@0
  1321
			   SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
slouken@0
  1322
{
slouken@0
  1323
    int i;
slouken@0
  1324
    Uint32 *d = dst;
slouken@0
  1325
    for(i = 0; i < n; i++) {
slouken@0
  1326
	unsigned r, g, b, a;
slouken@0
  1327
	Uint16 pix;
slouken@0
  1328
	RGBA_FROM_8888(*src, sfmt, r, g, b, a);
slouken@0
  1329
	PIXEL_FROM_RGB(pix, dfmt, r, g, b);
slouken@0
  1330
	*d = ((pix & 0x7e0) << 16) | (pix & 0xf81f) | ((a << 2) & 0x7e0);
slouken@0
  1331
	src++;
slouken@0
  1332
	d++;
slouken@0
  1333
    }
slouken@0
  1334
    return n * 4;
slouken@0
  1335
}
slouken@0
  1336
slouken@0
  1337
/* encode 32bpp rgb + a into 32bpp G0RAB format for blitting into 555 */
slouken@0
  1338
static int copy_transl_555(void *dst, Uint32 *src, int n,
slouken@0
  1339
			   SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
slouken@0
  1340
{
slouken@0
  1341
    int i;
slouken@0
  1342
    Uint32 *d = dst;
slouken@0
  1343
    for(i = 0; i < n; i++) {
slouken@0
  1344
	unsigned r, g, b, a;
slouken@0
  1345
	Uint16 pix;
slouken@0
  1346
	RGBA_FROM_8888(*src, sfmt, r, g, b, a);
slouken@0
  1347
	PIXEL_FROM_RGB(pix, dfmt, r, g, b);
slouken@0
  1348
	*d = ((pix & 0x3e0) << 16) | (pix & 0xfc1f) | ((a << 2) & 0x3e0);
slouken@0
  1349
	src++;
slouken@0
  1350
	d++;
slouken@0
  1351
    }
slouken@0
  1352
    return n * 4;
slouken@0
  1353
}
slouken@0
  1354
slouken@0
  1355
/* decode translucent pixels from 32bpp GORAB to 32bpp rgb + a */
slouken@0
  1356
static int uncopy_transl_16(Uint32 *dst, void *src, int n,
slouken@0
  1357
			    RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
slouken@0
  1358
{
slouken@0
  1359
    int i;
slouken@0
  1360
    Uint32 *s = src;
slouken@0
  1361
    for(i = 0; i < n; i++) {
slouken@0
  1362
	unsigned r, g, b, a;
slouken@0
  1363
	Uint32 pix = *s++;
slouken@0
  1364
	a = (pix & 0x3e0) >> 2;
slouken@0
  1365
	pix = (pix & ~0x3e0) | pix >> 16;
slouken@0
  1366
	RGB_FROM_PIXEL(pix, sfmt, r, g, b);
slouken@0
  1367
	PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, a);
slouken@0
  1368
	dst++;
slouken@0
  1369
    }
slouken@0
  1370
    return n * 4;
slouken@0
  1371
}
slouken@0
  1372
slouken@0
  1373
/* encode 32bpp rgba into 32bpp rgba, keeping alpha (dual purpose) */
slouken@0
  1374
static int copy_32(void *dst, Uint32 *src, int n,
slouken@0
  1375
		   SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
slouken@0
  1376
{
slouken@0
  1377
    int i;
slouken@0
  1378
    Uint32 *d = dst;
slouken@0
  1379
    for(i = 0; i < n; i++) {
slouken@0
  1380
	unsigned r, g, b, a;
slouken@0
  1381
	Uint32 pixel;
slouken@0
  1382
	RGBA_FROM_8888(*src, sfmt, r, g, b, a);
slouken@0
  1383
	PIXEL_FROM_RGB(pixel, dfmt, r, g, b);
slouken@0
  1384
	*d++ = pixel | a << 24;
slouken@0
  1385
	src++;
slouken@0
  1386
    }
slouken@0
  1387
    return n * 4;
slouken@0
  1388
}
slouken@0
  1389
slouken@0
  1390
/* decode 32bpp rgba into 32bpp rgba, keeping alpha (dual purpose) */
slouken@0
  1391
static int uncopy_32(Uint32 *dst, void *src, int n,
slouken@0
  1392
		     RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
slouken@0
  1393
{
slouken@0
  1394
    int i;
slouken@0
  1395
    Uint32 *s = src;
slouken@0
  1396
    for(i = 0; i < n; i++) {
slouken@0
  1397
	unsigned r, g, b, a;
slouken@0
  1398
	Uint32 pixel = *s++;
slouken@0
  1399
	RGB_FROM_PIXEL(pixel, sfmt, r, g, b);
slouken@0
  1400
	a = pixel >> 24;
slouken@0
  1401
	PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, a);
slouken@0
  1402
	dst++;
slouken@0
  1403
    }
slouken@0
  1404
    return n * 4;
slouken@0
  1405
}
slouken@0
  1406
slouken@0
  1407
/*
 * Alpha classification of a pixel.  `fmt` is a pointer to a structure
 * providing Amask and Ashift; it is parenthesized so that any pointer
 * expression (e.g. `&format`) can be passed safely.
 */

/* non-zero when the alpha field of `pixel` equals 255 (fully opaque) */
#define ISOPAQUE(pixel, fmt) \
    ((((pixel) & (fmt)->Amask) >> (fmt)->Ashift) == 255)

/* non-zero when the alpha field of `pixel` lies in 1..254; subtracting 1
   as unsigned wraps alpha 0 to a huge value, so one compare checks both
   bounds */
#define ISTRANSL(pixel, fmt)	\
    ((unsigned)((((pixel) & (fmt)->Amask) >> (fmt)->Ashift) - 1U) < 254U)
slouken@0
  1411
slouken@0
  1412
/* convert surface to be quickly alpha-blittable onto dest, if possible */
slouken@0
  1413
static int RLEAlphaSurface(SDL_Surface *surface)
slouken@0
  1414
{
slouken@0
  1415
    SDL_Surface *dest;
slouken@0
  1416
    SDL_PixelFormat *df;
slouken@0
  1417
    int maxsize = 0;
slouken@0
  1418
    int max_opaque_run;
slouken@0
  1419
    int max_transl_run = 65535;
slouken@0
  1420
    unsigned masksum;
slouken@0
  1421
    Uint8 *rlebuf, *dst;
slouken@0
  1422
    int (*copy_opaque)(void *, Uint32 *, int,
slouken@0
  1423
		       SDL_PixelFormat *, SDL_PixelFormat *);
slouken@0
  1424
    int (*copy_transl)(void *, Uint32 *, int,
slouken@0
  1425
		       SDL_PixelFormat *, SDL_PixelFormat *);
slouken@0
  1426
slouken@0
  1427
    dest = surface->map->dst;
slouken@0
  1428
    if(!dest)
slouken@0
  1429
	return -1;
slouken@0
  1430
    df = dest->format;
slouken@0
  1431
    if(surface->format->BitsPerPixel != 32)
slouken@0
  1432
	return -1;		/* only 32bpp source supported */
slouken@0
  1433
slouken@0
  1434
    /* find out whether the destination is one we support,
slouken@0
  1435
       and determine the max size of the encoded result */
slouken@0
  1436
    masksum = df->Rmask | df->Gmask | df->Bmask;
slouken@0
  1437
    switch(df->BytesPerPixel) {
slouken@0
  1438
    case 2:
slouken@0
  1439
	/* 16bpp: only support 565 and 555 formats */
slouken@0
  1440
	switch(masksum) {
slouken@0
  1441
	case 0xffff:
slouken@0
  1442
	    if(df->Gmask == 0x07e0
slouken@0
  1443
	       || df->Rmask == 0x07e0 || df->Bmask == 0x07e0) {
slouken@0
  1444
		copy_opaque = copy_opaque_16;
slouken@0
  1445
		copy_transl = copy_transl_565;
slouken@0
  1446
	    } else
slouken@0
  1447
		return -1;
slouken@0
  1448
	    break;
slouken@0
  1449
	case 0x7fff:
slouken@0
  1450
	    if(df->Gmask == 0x03e0
slouken@0
  1451
	       || df->Rmask == 0x03e0 || df->Bmask == 0x03e0) {
slouken@0
  1452
		copy_opaque = copy_opaque_16;
slouken@0
  1453
		copy_transl = copy_transl_555;
slouken@0
  1454
	    } else
slouken@0
  1455
		return -1;
slouken@0
  1456
	    break;
slouken@0
  1457
	default:
slouken@0
  1458
	    return -1;
slouken@0
  1459
	}
slouken@0
  1460
	max_opaque_run = 255;	/* runs stored as bytes */
slouken@0
  1461
slouken@0
  1462
	/* worst case is alternating opaque and translucent pixels,
slouken@0
  1463
	   with room for alignment padding between lines */
slouken@0
  1464
	maxsize = surface->h * (2 + (4 + 2) * (surface->w + 1)) + 2;
slouken@0
  1465
	break;
slouken@0
  1466
    case 4:
slouken@0
  1467
	if(masksum != 0x00ffffff)
slouken@0
  1468
	    return -1;		/* requires unused high byte */
slouken@0
  1469
	copy_opaque = copy_32;
slouken@0
  1470
	copy_transl = copy_32;
slouken@0
  1471
	max_opaque_run = 255;	/* runs stored as short ints */
slouken@0
  1472
slouken@0
  1473
	/* worst case is alternating opaque and translucent pixels */
slouken@0
  1474
	maxsize = surface->h * 2 * 4 * (surface->w + 1) + 4;
slouken@0
  1475
	break;
slouken@0
  1476
    default:
slouken@0
  1477
	return -1;		/* anything else unsupported right now */
slouken@0
  1478
    }
slouken@0
  1479
slouken@0
  1480
    maxsize += sizeof(RLEDestFormat);
slouken@0
  1481
    rlebuf = (Uint8 *)malloc(maxsize);
slouken@0
  1482
    if(!rlebuf) {
slouken@0
  1483
	SDL_OutOfMemory();
slouken@0
  1484
	return -1;
slouken@0
  1485
    }
slouken@0
  1486
    {
slouken@0
  1487
	/* save the destination format so we can undo the encoding later */
slouken@0
  1488
	RLEDestFormat *r = (RLEDestFormat *)rlebuf;
slouken@0
  1489
	r->BytesPerPixel = df->BytesPerPixel;
slouken@0
  1490
	r->Rloss = df->Rloss;
slouken@0
  1491
	r->Gloss = df->Gloss;
slouken@0
  1492
	r->Bloss = df->Bloss;
slouken@0
  1493
	r->Rshift = df->Rshift;
slouken@0
  1494
	r->Gshift = df->Gshift;
slouken@0
  1495
	r->Bshift = df->Bshift;
slouken@0
  1496
	r->Ashift = df->Ashift;
slouken@0
  1497
	r->Rmask = df->Rmask;
slouken@0
  1498
	r->Gmask = df->Gmask;
slouken@0
  1499
	r->Bmask = df->Bmask;
slouken@0
  1500
	r->Amask = df->Amask;
slouken@0
  1501
    }
slouken@0
  1502
    dst = rlebuf + sizeof(RLEDestFormat);
slouken@0
  1503
slouken@0
  1504
    /* Do the actual encoding */
slouken@0
  1505
    {
slouken@0
  1506
	int x, y;
slouken@0
  1507
	int h = surface->h, w = surface->w;
slouken@0
  1508
	SDL_PixelFormat *sf = surface->format;
slouken@526
  1509
	Uint32 *src = (Uint32 *)surface->pixels;
slouken@0
  1510
	Uint8 *lastline = dst;	/* end of last non-blank line */
slouken@0
  1511
slouken@0
  1512
	/* opaque counts are 8 or 16 bits, depending on target depth */
slouken@0
  1513
#define ADD_OPAQUE_COUNTS(n, m)			\
slouken@0
  1514
	if(df->BytesPerPixel == 4) {		\
slouken@0
  1515
	    ((Uint16 *)dst)[0] = n;		\
slouken@0
  1516
	    ((Uint16 *)dst)[1] = m;		\
slouken@0
  1517
	    dst += 4;				\
slouken@0
  1518
	} else {				\
slouken@0
  1519
	    dst[0] = n;				\
slouken@0
  1520
	    dst[1] = m;				\
slouken@0
  1521
	    dst += 2;				\
slouken@0
  1522
	}
slouken@0
  1523
slouken@0
  1524
	/* translucent counts are always 16 bit */
slouken@0
  1525
#define ADD_TRANSL_COUNTS(n, m)		\
slouken@0
  1526
	(((Uint16 *)dst)[0] = n, ((Uint16 *)dst)[1] = m, dst += 4)
slouken@0
  1527
slouken@0
  1528
	for(y = 0; y < h; y++) {
slouken@0
  1529
	    int runstart, skipstart;
slouken@0
  1530
	    int blankline = 0;
slouken@0
  1531
	    /* First encode all opaque pixels of a scan line */
slouken@0
  1532
	    x = 0;
slouken@0
  1533
	    do {
slouken@0
  1534
		int run, skip, len;
slouken@0
  1535
		skipstart = x;
slouken@0
  1536
		while(x < w && !ISOPAQUE(src[x], sf))
slouken@0
  1537
		    x++;
slouken@0
  1538
		runstart = x;
slouken@0
  1539
		while(x < w && ISOPAQUE(src[x], sf))
slouken@0
  1540
		    x++;
slouken@0
  1541
		skip = runstart - skipstart;
slouken@0
  1542
		if(skip == w)
slouken@0
  1543
		    blankline = 1;
slouken@0
  1544
		run = x - runstart;
slouken@0
  1545
		while(skip > max_opaque_run) {
slouken@0
  1546
		    ADD_OPAQUE_COUNTS(max_opaque_run, 0);
slouken@0
  1547
		    skip -= max_opaque_run;
slouken@0
  1548
		}
slouken@0
  1549
		len = MIN(run, max_opaque_run);
slouken@0
  1550
		ADD_OPAQUE_COUNTS(skip, len);
slouken@0
  1551
		dst += copy_opaque(dst, src + runstart, len, sf, df);
slouken@0
  1552
		runstart += len;
slouken@0
  1553
		run -= len;
slouken@0
  1554
		while(run) {
slouken@0
  1555
		    len = MIN(run, max_opaque_run);
slouken@0
  1556
		    ADD_OPAQUE_COUNTS(0, len);
slouken@0
  1557
		    dst += copy_opaque(dst, src + runstart, len, sf, df);
slouken@0
  1558
		    runstart += len;
slouken@0
  1559
		    run -= len;
slouken@0
  1560
		}
slouken@0
  1561
	    } while(x < w);
slouken@0
  1562
slouken@0
  1563
	    /* Make sure the next output address is 32-bit aligned */
slouken@0
  1564
	    dst += (unsigned long)dst & 2;
slouken@0
  1565
slouken@0
  1566
	    /* Next, encode all translucent pixels of the same scan line */
slouken@0
  1567
	    x = 0;
slouken@0
  1568
	    do {
slouken@0
  1569
		int run, skip, len;
slouken@0
  1570
		skipstart = x;
slouken@0
  1571
		while(x < w && !ISTRANSL(src[x], sf))
slouken@0
  1572
		    x++;
slouken@0
  1573
		runstart = x;
slouken@0
  1574
		while(x < w && ISTRANSL(src[x], sf))
slouken@0
  1575
		    x++;
slouken@0
  1576
		skip = runstart - skipstart;
slouken@0
  1577
		blankline &= (skip == w);
slouken@0
  1578
		run = x - runstart;
slouken@0
  1579
		while(skip > max_transl_run) {
slouken@0
  1580
		    ADD_TRANSL_COUNTS(max_transl_run, 0);
slouken@0
  1581
		    skip -= max_transl_run;
slouken@0
  1582
		}
slouken@0
  1583
		len = MIN(run, max_transl_run);
slouken@0
  1584
		ADD_TRANSL_COUNTS(skip, len);
slouken@0
  1585
		dst += copy_transl(dst, src + runstart, len, sf, df);
slouken@0
  1586
		runstart += len;
slouken@0
  1587
		run -= len;
slouken@0
  1588
		while(run) {
slouken@0
  1589
		    len = MIN(run, max_transl_run);
slouken@0
  1590
		    ADD_TRANSL_COUNTS(0, len);
slouken@0
  1591
		    dst += copy_transl(dst, src + runstart, len, sf, df);
slouken@0
  1592
		    runstart += len;
slouken@0
  1593
		    run -= len;
slouken@0
  1594
		}
slouken@0
  1595
		if(!blankline)
slouken@0
  1596
		    lastline = dst;
slouken@0
  1597
	    } while(x < w);
slouken@0
  1598
slouken@0
  1599
	    src += surface->pitch >> 2;
slouken@0
  1600
	}
slouken@0
  1601
	dst = lastline;		/* back up past trailing blank lines */
slouken@0
  1602
	ADD_OPAQUE_COUNTS(0, 0);
slouken@0
  1603
    }
slouken@0
  1604
slouken@0
  1605
#undef ADD_OPAQUE_COUNTS
slouken@0
  1606
#undef ADD_TRANSL_COUNTS
slouken@0
  1607
slouken@0
  1608
    /* Now that we have it encoded, release the original pixels */
slouken@0
  1609
    if((surface->flags & SDL_PREALLOC) != SDL_PREALLOC
slouken@0
  1610
       && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
slouken@0
  1611
	free( surface->pixels );
slouken@0
  1612
	surface->pixels = NULL;
slouken@0
  1613
    }
slouken@0
  1614
slouken@0
  1615
    /* realloc the buffer to release unused memory */
slouken@0
  1616
    {
slouken@0
  1617
	Uint8 *p = realloc(rlebuf, dst - rlebuf);
slouken@0
  1618
	if(!p)
slouken@0
  1619
	    p = rlebuf;
slouken@0
  1620
	surface->map->sw_data->aux_data = p;
slouken@0
  1621
    }
slouken@0
  1622
slouken@0
  1623
    return 0;
slouken@0
  1624
}
slouken@0
  1625
slouken@0
  1626
/* Fetch one 8-bit pixel: returns the byte at srcbuf widened to Uint32. */
static Uint32 getpix_8(Uint8 *srcbuf)
{
    return srcbuf[0];
}
slouken@0
  1630
slouken@0
  1631
/* Fetch one 16-bit pixel: reads srcbuf through a Uint16 pointer (native
   byte order) and widens the value to Uint32.
   NOTE(review): presumably srcbuf is suitably aligned for a Uint16
   access -- confirm against callers. */
static Uint32 getpix_16(Uint8 *srcbuf)
{
    Uint16 *pix = (Uint16 *)srcbuf;
    return pix[0];
}
slouken@0
  1635
slouken@0
  1636
/* Fetch one 24-bit pixel from the three bytes at srcbuf.  On little-endian
   builds the first byte is the least significant one; otherwise the first
   byte is the most significant one. */
static Uint32 getpix_24(Uint8 *srcbuf)
{
    int b0 = srcbuf[0];
    int b1 = srcbuf[1];
    int b2 = srcbuf[2];

    if(SDL_BYTEORDER == SDL_LIL_ENDIAN)
	return b0 + (b1 << 8) + (b2 << 16);
    return (b0 << 16) + (b1 << 8) + b2;
}
slouken@0
  1643
slouken@0
  1644
/* Fetch one 32-bit pixel: reads srcbuf through a Uint32 pointer (native
   byte order).
   NOTE(review): presumably srcbuf is suitably aligned for a Uint32
   access -- confirm against callers. */
static Uint32 getpix_32(Uint8 *srcbuf)
{
    Uint32 *pix = (Uint32 *)srcbuf;
    return pix[0];
}
slouken@0
  1648
slouken@0
  1649
typedef Uint32 (*getpix_func)(Uint8 *);
slouken@0
  1650
slouken@0
  1651
static getpix_func getpixes[4] = {
slouken@0
  1652
    getpix_8, getpix_16, getpix_24, getpix_32
slouken@0
  1653
};
slouken@0
  1654
slouken@0
  1655
/*
 * RLE-encode a colorkeyed surface.
 *
 * Every scan line is stored as a sequence of segments, each consisting of
 * a (skip, run) count pair followed by `run` pixels copied verbatim from
 * the surface.  `skip` counts pixels equal to the colorkey (compared with
 * the alpha bits masked off).  Counts are 16 bits wide for 4-byte pixels
 * and 8 bits wide otherwise.  A (0, 0) pair placed after the last
 * non-blank line terminates the data.
 *
 * On success the buffer is stored in surface->map->sw_data->aux_data and
 * the original pixels are freed, unless the surface is SDL_PREALLOC or
 * SDL_HWSURFACE.
 *
 * Returns 0 on success, -1 if the encoding buffer cannot be allocated.
 */
static int RLEColorkeySurface(SDL_Surface *surface)
{
	Uint8 *rlebuf, *dst, *lastline;
	Uint8 *row;
	int bpp = surface->format->BytesPerPixel;
	int maxsize = 0;
	int maxn;
	int w, h;
	int y;
	getpix_func getpix;
	Uint32 ckey, rgbmask;

	/* calculate the worst case size for the compressed surface */
	if(bpp == 1) {
	    /* worst case is alternating opaque and transparent pixels,
	       starting with an opaque pixel */
	    maxsize = surface->h * 3 * (surface->w / 2 + 1) + 2;
	} else if(bpp == 2 || bpp == 3) {
	    /* worst case is solid runs, at most 255 pixels wide */
	    maxsize = surface->h * (2 * (surface->w / 255 + 1)
				    + surface->w * bpp) + 2;
	} else if(bpp == 4) {
	    /* worst case is solid runs, at most 65535 pixels wide */
	    maxsize = surface->h * (4 * (surface->w / 65535 + 1)
				    + surface->w * 4) + 4;
	}

	rlebuf = (Uint8 *)malloc(maxsize);
	if ( rlebuf == NULL ) {
		SDL_OutOfMemory();
		return(-1);
	}

	/* Set up the conversion */
	row = (Uint8 *)surface->pixels;
	maxn = (bpp == 4) ? 65535 : 255;
	dst = rlebuf;
	lastline = dst;
	rgbmask = ~surface->format->Amask;
	ckey = surface->format->colorkey & rgbmask;
	getpix = getpixes[bpp - 1];
	w = surface->w;
	h = surface->h;

#define ADD_COUNTS(n, m)			\
	do {					\
	    if(bpp == 4) {			\
		((Uint16 *)dst)[0] = n;		\
		((Uint16 *)dst)[1] = m;		\
		dst += 4;			\
	    } else {				\
		dst[0] = n;			\
		dst[1] = m;			\
		dst += 2;			\
	    }					\
	} while(0)

	for(y = 0; y < h; y++) {
	    int blankline = 0;
	    int x = 0;

	    do {
		int skipstart = x;
		int runstart, skip, run, len;

		/* find run of transparent, then opaque pixels */
		while(x < w && (getpix(row + x * bpp) & rgbmask) == ckey)
		    x++;
		runstart = x;
		while(x < w && (getpix(row + x * bpp) & rgbmask) != ckey)
		    x++;
		skip = runstart - skipstart;
		run = x - runstart;
		if(skip == w)
		    blankline = 1;

		/* encode segment: split skips that do not fit in one count */
		while(skip > maxn) {
		    ADD_COUNTS(maxn, 0);
		    skip -= maxn;
		}
		/* emit the run in pieces of at most maxn pixels; only the
		   first piece carries the remaining skip, and a (skip, 0)
		   pair is emitted even when the run is empty */
		do {
		    len = (run < maxn) ? run : maxn;
		    ADD_COUNTS(skip, len);
		    memcpy(dst, row + runstart * bpp, len * bpp);
		    dst += len * bpp;
		    runstart += len;
		    run -= len;
		    skip = 0;
		} while(run);

		if(!blankline)
		    lastline = dst;
	    } while(x < w);

	    row += surface->pitch;
	}
	dst = lastline;		/* back up past trailing blank lines */
	ADD_COUNTS(0, 0);

#undef ADD_COUNTS

	/* Now that we have it encoded, release the original pixels */
	if((surface->flags & SDL_PREALLOC) != SDL_PREALLOC
	   && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
	    free( surface->pixels );
	    surface->pixels = NULL;
	}

	/* shrink the buffer to release unused memory */
	{
	    /* If realloc returns NULL, the original block is left intact */
	    Uint8 *shrunk = realloc(rlebuf, dst - rlebuf);
	    surface->map->sw_data->aux_data = shrunk ? shrunk : rlebuf;
	}

	return(0);
}
slouken@0
  1785
slouken@0
  1786
/*
 * RLE-encode a surface for accelerated blitting.
 *
 * Any previous RLE conversion is undone first.  Colorkeyed surfaces are
 * encoded with RLEColorkeySurface(); surfaces with SDL_SRCALPHA and an
 * alpha mask are encoded with RLEAlphaSurface().  Surfaces with fewer than
 * 8 bits per pixel, or with neither a colorkey nor per-pixel alpha, are
 * not encoded.  The surface is locked around the encoding when
 * SDL_MUSTLOCK() says so.
 *
 * Returns 0 and sets SDL_RLEACCEL on success, -1 otherwise.
 */
int SDL_RLESurface(SDL_Surface *surface)
{
	int retcode = -1;

	/* Clear any previous RLE conversion */
	if ( (surface->flags & SDL_RLEACCEL) == SDL_RLEACCEL ) {
		SDL_UnRLESurface(surface, 1);
	}

	/* We don't support RLE encoding of bitmaps */
	if ( surface->format->BitsPerPixel < 8 ) {
		return(-1);
	}

	/* Lock the surface if it's in hardware */
	if ( (SDL_MUSTLOCK(surface)) && SDL_LockSurface(surface) < 0 ) {
		return(-1);
	}

	/* Encode; retcode stays -1 when there is neither a colorkey nor
	   per-pixel alpha (no RLE for per-surface alpha sans ckey) */
	if ( (surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY ) {
		retcode = RLEColorkeySurface(surface);
	} else if ( (surface->flags & SDL_SRCALPHA) == SDL_SRCALPHA
		    && surface->format->Amask != 0 ) {
		retcode = RLEAlphaSurface(surface);
	}

	/* Unlock the surface if it's in hardware */
	if ( SDL_MUSTLOCK(surface) ) {
		SDL_UnlockSurface(surface);
	}

	if ( retcode < 0 ) {
		return -1;
	}

	/* The surface is now accelerated */
	surface->flags |= SDL_RLEACCEL;

	return(0);
}
slouken@0
  1831
slouken@0
  1832
/*
slouken@0
  1833
 * Un-RLE a surface with pixel alpha
slouken@0
  1834
 * This may not give back exactly the image before RLE-encoding; all
slouken@0
  1835
 * completely transparent pixels will be lost, and colour and alpha depth
slouken@0
  1836
 * may have been reduced (when encoding for 16bpp targets).
slouken@0
  1837
 */
slouken@0
  1838
static void UnRLEAlpha(SDL_Surface *surface)
slouken@0
  1839
{
slouken@0
  1840
    Uint8 *srcbuf;
slouken@0
  1841
    Uint32 *dst;
slouken@0
  1842
    SDL_PixelFormat *sf = surface->format;
slouken@0
  1843
    RLEDestFormat *df = surface->map->sw_data->aux_data;
slouken@0
  1844
    int (*uncopy_opaque)(Uint32 *, void *, int,
slouken@0
  1845
			 RLEDestFormat *, SDL_PixelFormat *);
slouken@0
  1846
    int (*uncopy_transl)(Uint32 *, void *, int,
slouken@0
  1847
			 RLEDestFormat *, SDL_PixelFormat *);
slouken@0
  1848
    int w = surface->w;
slouken@0
  1849
    int bpp = df->BytesPerPixel;
slouken@0
  1850
slouken@0
  1851
    if(bpp == 2) {
slouken@0
  1852
	uncopy_opaque = uncopy_opaque_16;
slouken@0
  1853
	uncopy_transl = uncopy_transl_16;
slouken@0
  1854
    } else {
slouken@0
  1855
	uncopy_opaque = uncopy_transl = uncopy_32;
slouken@0
  1856
    }
slouken@0
  1857
slouken@0
  1858
    surface->pixels = malloc(surface->h * surface->pitch);
slouken@0
  1859
    /* fill background with transparent pixels */
slouken@0
  1860
    memset(surface->pixels, 0, surface->h * surface->pitch);
slouken@0
  1861
slouken@0
  1862
    dst = surface->pixels;
slouken@0
  1863
    srcbuf = (Uint8 *)(df + 1);
slouken@0
  1864
    for(;;) {
slouken@0
  1865
	/* copy opaque pixels */
slouken@0
  1866
	int ofs = 0;
slouken@0
  1867
	do {
slouken@0
  1868
	    unsigned run;
slouken@0
  1869
	    if(bpp == 2) {
slouken@0
  1870
		ofs += srcbuf[0];
slouken@0
  1871
		run = srcbuf[1];
slouken@0
  1872
		srcbuf += 2;
slouken@0
  1873
	    } else {
slouken@0
  1874
		ofs += ((Uint16 *)srcbuf)[0];
slouken@0
  1875
		run = ((Uint16 *)srcbuf)[1];
slouken@0
  1876
		srcbuf += 4;
slouken@0
  1877
	    }
slouken@0
  1878
	    if(run) {
slouken@0
  1879
		srcbuf += uncopy_opaque(dst + ofs, srcbuf, run, df, sf);
slouken@0
  1880
		ofs += run;
slouken@0
  1881
	    } else if(!ofs)
slouken@0
  1882
		return;
slouken@0
  1883
	} while(ofs < w);
slouken@0
  1884
slouken@0
  1885
	/* skip padding if needed */
slouken@0
  1886
	if(bpp == 2)
slouken@0
  1887
	    srcbuf += (unsigned long)srcbuf & 2;
slouken@0
  1888
	
slouken@0
  1889
	/* copy translucent pixels */
slouken@0
  1890
	ofs = 0;
slouken@0
  1891
	do {
slouken@0
  1892
	    unsigned run;
slouken@0
  1893
	    ofs += ((Uint16 *)srcbuf)[0];
slouken@0
  1894
	    run = ((Uint16 *)srcbuf)[1];
slouken@0
  1895
	    srcbuf += 4;
slouken@0
  1896
	    if(run) {
slouken@0
  1897
		srcbuf += uncopy_transl(dst + ofs, srcbuf, run, df, sf);
slouken@0
  1898
		ofs += run;
slouken@0
  1899
	    }
slouken@0
  1900
	} while(ofs < w);
slouken@0
  1901
	dst += surface->pitch >> 2;
slouken@0
  1902
    }
slouken@0
  1903
}
slouken@0
  1904
slouken@0
  1905
/*
 * Undo the RLE encoding of a surface.
 *
 * Does nothing unless SDL_RLEACCEL is set.  When `recode` is non-zero and
 * the surface is neither SDL_PREALLOC nor SDL_HWSURFACE, the pixel buffer
 * is re-created from the encoded data: colorkeyed surfaces are filled
 * with the colorkey and the encoded image is blitted on top, all other
 * surfaces are decoded by UnRLEAlpha().  Afterwards the encoded data is
 * freed.
 *
 * If the pixel buffer for a colorkeyed surface cannot be allocated, an
 * out-of-memory error is reported, SDL_RLEACCEL is restored and the
 * encoded data is left untouched, so the surface remains usable in its
 * encoded form.
 */
void SDL_UnRLESurface(SDL_Surface *surface, int recode)
{
    if ( (surface->flags & SDL_RLEACCEL) == SDL_RLEACCEL ) {
	surface->flags &= ~SDL_RLEACCEL;

	if(recode && (surface->flags & SDL_PREALLOC) != SDL_PREALLOC
	   && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
	    if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
		SDL_Rect full;
		unsigned alpha_flag;

		/* re-create the original surface */
		surface->pixels = malloc(surface->h * surface->pitch);
		if ( !surface->pixels ) {
		    /* cannot decode: keep the surface RLE-encoded rather
		       than filling and blitting into a NULL buffer */
		    SDL_OutOfMemory();
		    surface->flags |= SDL_RLEACCEL;
		    return;
		}

		/* fill it with the background colour */
		SDL_FillRect(surface, NULL, surface->format->colorkey);

		/* now render the encoded surface */
		full.x = full.y = 0;
		full.w = surface->w;
		full.h = surface->h;
		alpha_flag = surface->flags & SDL_SRCALPHA;
		surface->flags &= ~SDL_SRCALPHA; /* opaque blit */
		SDL_RLEBlit(surface, &full, surface, &full);
		surface->flags |= alpha_flag;
	    } else
		UnRLEAlpha(surface);
	}

	/* the encoded data is no longer needed */
	if ( surface->map && surface->map->sw_data->aux_data ) {
	    free(surface->map->sw_data->aux_data);
	    surface->map->sw_data->aux_data = NULL;
	}
    }
}
slouken@0
  1940
slouken@0
  1941