src/video/SDL_RLEaccel.c
author Sam Lantinga <slouken@libsdl.org>
Fri, 22 Aug 2003 05:51:19 +0000
changeset 689 5bb080d35049
parent 526 4314a501d7be
child 739 22dbf364c017
permissions -rw-r--r--
Date: Tue, 19 Aug 2003 17:57:00 +0200
From: Stephane Marchesin
Subject: Re: [SDL] [patch] MMX alpha blit patches with MMX detection

I think everything is correct now. I've done as much testing as I could,
but some real-world testing wouldn't hurt, I think.
The patch is here : http://icps.u-strasbg.fr/~marchesin/sdl_mmxblit.patch

If you do byte-by-byte comparison of the output between C and MMX
functions, you'll notice that the results for 555 and 565 RGB alpha
blits aren't exactly the same. This is because MMX functions for 555 and
565 RGB have a higher accuracy. If you want the exact same behaviour
that's possible by masking the three lower alpha bits in the MMX
functions. Just ask !

I removed one MMX function because after I fixed it to match its C
equivalent, it turned out to be slower than the C version on a PIII
(although a bit faster on an Athlon XP).

I've also added MMX and PIII replacements for SDL_memcpy. Those provide
some speedup in testvidinfo -benchmark (at least for me, under Linux &
X11).
     1 /*
     2     SDL - Simple DirectMedia Layer
     3     Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002  Sam Lantinga
     4 
     5     This library is free software; you can redistribute it and/or
     6     modify it under the terms of the GNU Library General Public
     7     License as published by the Free Software Foundation; either
     8     version 2 of the License, or (at your option) any later version.
     9 
    10     This library is distributed in the hope that it will be useful,
    11     but WITHOUT ANY WARRANTY; without even the implied warranty of
    12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13     Library General Public License for more details.
    14 
    15     You should have received a copy of the GNU Library General Public
    16     License along with this library; if not, write to the Free
    17     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    18 
    19     Sam Lantinga
    20     slouken@libsdl.org
    21 */
    22 
    23 #ifdef SAVE_RCSID
       /*
        * Revision-control identification string, only compiled in when
        * SAVE_RCSID is defined.  It has to be an array: the previous
        * declaration `static char rcsid = "..."` tried to initialise a single
        * char from a string literal, which is not valid C.
        */
    24 static const char rcsid[] =
    25  "@(#) $Id$";
    26 #endif
    27 
    28 /*
    29  * RLE encoding for software colorkey and alpha-channel acceleration
    30  *
    31  * Original version by Sam Lantinga
    32  *
    33  * Mattias Engdegård (Yorick): Rewrite. New encoding format, encoder and
    34  * decoder. Added per-surface alpha blitter. Added per-pixel alpha
    35  * format, encoder and blitter.
    36  *
    37  * Many thanks to Xark and johns for hints, benchmarks and useful comments
    38  * leading to this code.
    39  *
    40  * Welcome to Macro Mayhem.
    41  */
    42 
    43 /*
    44  * The encoding translates the image data to a stream of segments of the form
    45  *
    46  * <skip> <run> <data>
    47  *
    48  * where <skip> is the number of transparent pixels to skip,
    49  *       <run>  is the number of opaque pixels to blit,
    50  * and   <data> are the pixels themselves.
    51  *
    52  * This basic structure is used both for colorkeyed surfaces, used for simple
    53  * binary transparency and for per-surface alpha blending, and for surfaces
    54  * with per-pixel alpha. The details differ, however:
    55  *
    56  * Encoding of colorkeyed surfaces:
    57  *
    58  *   Encoded pixels always have the same format as the target surface.
    59  *   <skip> and <run> are unsigned 8 bit integers, except for 32 bit depth
    60  *   where they are 16 bit. This makes the pixel data aligned at all times.
    61  *   Segments never wrap around from one scan line to the next.
    62  *
    63  *   The end of the sequence is marked by a zero <skip>,<run> pair at the
    64  *   beginning of a line.
    65  *
    66  * Encoding of surfaces with per-pixel alpha:
    67  *
    68  *   The sequence begins with a struct RLEDestFormat describing the target
    69  *   pixel format, to provide reliable un-encoding.
    70  *
    71  *   Each scan line is encoded twice: First all completely opaque pixels,
    72  *   encoded in the target format as described above, and then all
    73  *   partially transparent (translucent) pixels (where 1 <= alpha <= 254),
    74  *   in the following 32-bit format:
    75  *
    76  *   For 32-bit targets, each pixel has the target RGB format but with
    77  *   the alpha value occupying the highest 8 bits. The <skip> and <run>
    78  *   counts are 16 bit.
    79  * 
    80  *   For 16-bit targets, each pixel has the target RGB format, but with
    81  *   the middle component (usually green) shifted 16 steps to the left,
    82  *   and the hole filled with the 5 most significant bits of the alpha value.
    83  *   i.e. if the target has the format         rrrrrggggggbbbbb,
    84  *   the encoded pixel will be 00000gggggg00000rrrrr0aaaaabbbbb.
    85  *   The <skip> and <run> counts are 8 bit for the opaque lines, 16 bit
    86  *   for the translucent lines. Two padding bytes may be inserted
    87  *   before each translucent line to keep them 32-bit aligned.
    88  *
    89  *   The end of the sequence is marked by a zero <skip>,<run> pair at the
    90  *   beginning of an opaque line.
    91  */
    92 
    93 #include <stdio.h>
    94 #include <stdlib.h>
    95 #include <string.h>
    96 
    97 #include "SDL_types.h"
    98 #include "SDL_video.h"
    99 #include "SDL_error.h"
   100 #include "SDL_sysvideo.h"
   101 #include "SDL_blit.h"
   102 #include "SDL_memops.h"
   103 #include "SDL_RLEaccel_c.h"
   104 
   105 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
       /* MMX support is only compiled for i386 + GCC builds with USE_ASMBLIT. */
   106 #include "mmx.h"
   107 /* Function to check the CPU flags */
       /* MMX_CPU is the mask tested against CPU_Flags() in CHOOSE_BLIT.
        * NOTE(review): 0x800000 is bit 23, presumably the CPUID MMX feature
        * bit -- confirm against the Hermes sources. */
   108 #define MMX_CPU		0x800000
   109 #define CPU_Flags()	Hermes_X86_CPU()
       /* NOTE(review): the defines below look like the minimum HeadX86.h needs
        * in order to be included outside of Hermes -- confirm. */
   110 #define X86_ASSEMBLER
   111 #define HermesConverterInterface	void
   112 #define HermesClearInterface		void
   113 #define STACKCALL
   114 #include "HeadX86.h"
   115 #endif
   116 
   117 #ifndef MAX
       /* Larger of a and b.  The selected argument is evaluated twice, so do
        * not pass expressions with side effects. */
   118 #define MAX(a, b) ((a) > (b) ? (a) : (b))
   119 #endif
   120 #ifndef MIN
       /* Smaller of a and b.  The selected argument is evaluated twice, so do
        * not pass expressions with side effects. */
   121 #define MIN(a, b) ((a) < (b) ? (a) : (b))
   122 #endif
   123 
   /*
    * Copy `len` pixels of `bpp` bytes each from `from` to `to`.
    * For bpp == 4 the pixel count is handed to SDL_memcpy4; for every other
    * depth SDL_memcpy receives the byte count len * bpp.
    * `bpp` is used unparenthesised in the comparison, so pass a simple
    * expression.
    */
   124 #define PIXEL_COPY(to, from, len, bpp)			\
   125 do {							\
   126     if(bpp == 4) {					\
   127 	SDL_memcpy4(to, from, (unsigned)(len));		\
   128     } else {						\
   129 	SDL_memcpy(to, from, (unsigned)(len) * (bpp));	\
   130     }							\
   131 } while(0)
   132 
   133 /*
   134  * Various colorkey blit methods, for opaque and per-surface alpha
        *
        * All blit macros below share the argument list
        * (to, from, length, bpp, alpha), so CHOOSE_BLIT can hand any of them
        * to a blitter macro, which invokes it as
        * do_blit(dst, src, len, bpp, alpha).
   135  */
   136 
       /* Opaque blit: a plain pixel copy; the alpha argument is ignored. */
   137 #define OPAQUE_BLIT(to, from, length, bpp, alpha)	\
   138     PIXEL_COPY(to, from, length, bpp)
   139 
   140 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
   141 
   /*
    * MMX blend of `length` 32-bit pixels from `from` onto `to` using the
    * per-surface `alpha`.
    *
    * Register setup: mm3 = 0x00FF00FF in both halves (per-word byte mask),
    * mm7 = 0xFF000000 in both halves, mm4 = alpha replicated into four
    * 16-bit words, mm5 = ~mm7 (mask that clears the top byte of each pixel).
    *
    * An odd leading pixel is blended on its own; the remaining pixels are
    * processed two per loop iteration (the loop decrements i twice).
    * Every stored pixel is ANDed with mm5, so its top byte is written as 0.
    * `bpp` is unused.  emms() is issued once at the end.
    */
   142 #define ALPHA_BLIT32_888MMX(to, from, length, bpp, alpha)	\
   143     do {							\
   144 	Uint32 *srcp = (Uint32 *)(from);			\
   145 	Uint32 *dstp = (Uint32 *)(to);				\
   146         int i = 0x00FF00FF;					\
   147         movd_m2r(*(&i), mm3);					\
   148         punpckldq_r2r(mm3, mm3);				\
   149         i = 0xFF000000;						\
   150         movd_m2r(*(&i), mm7);					\
   151         punpckldq_r2r(mm7, mm7);				\
   152         i = alpha | alpha << 16;				\
   153         movd_m2r(*(&i), mm4);					\
   154         punpckldq_r2r(mm4, mm4);				\
   155 	pcmpeqd_r2r(mm5,mm5); /* set every bit of mm5 */	\
   156 	pxor_r2r(mm7, mm5); /* make clear alpha mask */		\
   157         i = length;						\
   158 	if(i & 1) {						\
   159           movd_m2r((*srcp), mm1); /* src -> mm1 */		\
   160           punpcklbw_r2r(mm1, mm1);				\
   161           pand_r2r(mm3, mm1);					\
   162 	  movd_m2r((*dstp), mm2); /* dst -> mm2 */		\
   163           punpcklbw_r2r(mm2, mm2);				\
   164           pand_r2r(mm3, mm2);					\
   165 	  psubw_r2r(mm2, mm1);					\
   166 	  pmullw_r2r(mm4, mm1);					\
   167 	  psrlw_i2r(8, mm1);					\
   168 	  paddw_r2r(mm1, mm2);					\
   169 	  pand_r2r(mm3, mm2);					\
   170 	  packuswb_r2r(mm2, mm2);				\
   171 	  pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */		\
   172 	  movd_r2m(mm2, *dstp);					\
   173 	  ++srcp;						\
   174 	  ++dstp;						\
   175 	  i--;							\
   176 	}							\
   177 	for(; i > 0; --i) {					\
   178           movq_m2r((*srcp), mm0);				\
   179 	  movq_r2r(mm0, mm1);					\
   180           punpcklbw_r2r(mm0, mm0);				\
   181 	  movq_m2r((*dstp), mm2);				\
   182 	  punpckhbw_r2r(mm1, mm1);				\
   183 	  movq_r2r(mm2, mm6);					\
   184           pand_r2r(mm3, mm0);					\
   185           punpcklbw_r2r(mm2, mm2);				\
   186 	  pand_r2r(mm3, mm1);					\
   187 	  punpckhbw_r2r(mm6, mm6);				\
   188           pand_r2r(mm3, mm2);					\
   189 	  psubw_r2r(mm2, mm0);					\
   190 	  pmullw_r2r(mm4, mm0);					\
   191 	  pand_r2r(mm3, mm6);					\
   192 	  psubw_r2r(mm6, mm1);					\
   193 	  pmullw_r2r(mm4, mm1);					\
   194 	  psrlw_i2r(8, mm0);					\
   195 	  paddw_r2r(mm0, mm2);					\
   196 	  psrlw_i2r(8, mm1);					\
   197 	  paddw_r2r(mm1, mm6);					\
   198 	  pand_r2r(mm3, mm2);					\
   199 	  pand_r2r(mm3, mm6);					\
   200 	  packuswb_r2r(mm2, mm2);				\
   201 	  packuswb_r2r(mm6, mm6);				\
   202 	  psrlq_i2r(32, mm2);					\
   203 	  psllq_i2r(32, mm6);					\
   204 	  por_r2r(mm6, mm2);					\
   205 	  pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */		\
   206          movq_r2m(mm2, *dstp);					\
   207 	  srcp += 2;						\
   208 	  dstp += 2;						\
   209 	  i--;							\
   210 	}							\
   211 	emms();							\
   212     } while(0)
   213 
   /*
    * MMX blend of `length` 16-bit 5-6-5 pixels from `from` onto `to` using
    * the per-surface `alpha`.
    *
    * Register setup: mm1, mm4 and mm7 hold the field masks 0xF800, 0x07E0
    * and 0x001F replicated into four 16-bit words; mm0 holds alpha in four
    * words.
    *
    * The first (length & 3) pixels are blended by the scalar loop (same
    * arithmetic as ALPHA_BLIT16_565); the remaining pixels are processed
    * four per MMX iteration (i is decreased by 4 in total each pass), one
    * colour field after another.
    *
    * NOTE: `alpha &= ~(1+2+4)` writes to the macro argument itself, so
    * `alpha` must be a modifiable lvalue and the caller's variable loses
    * its low three bits.  `bpp` is unused.
    */
   214 #define ALPHA_BLIT16_565MMX(to, from, length, bpp, alpha)	\
   215     do {						\
   216         int i, n = 0;					\
   217 	Uint16 *srcp = (Uint16 *)(from);		\
   218 	Uint16 *dstp = (Uint16 *)(to);			\
   219         Uint32 ALPHA = 0xF800;				\
   220 	movd_m2r(*(&ALPHA), mm1);			\
   221         punpcklwd_r2r(mm1, mm1);			\
   222         punpcklwd_r2r(mm1, mm1);			\
   223 	ALPHA = 0x07E0;					\
   224 	movd_m2r(*(&ALPHA), mm4);			\
   225         punpcklwd_r2r(mm4, mm4);			\
   226         punpcklwd_r2r(mm4, mm4);			\
   227 	ALPHA = 0x001F;					\
   228 	movd_m2r(*(&ALPHA), mm7);			\
   229         punpcklwd_r2r(mm7, mm7);			\
   230         punpcklwd_r2r(mm7, mm7);			\
   231 	alpha &= ~(1+2+4);				\
   232         i = (Uint32)alpha | (Uint32)alpha << 16;	\
   233         movd_m2r(*(&i), mm0);				\
   234         punpckldq_r2r(mm0, mm0);			\
   235         ALPHA = alpha >> 3;				\
   236         i = ((int)(length) & 3);			\
   237 	for(; i > 0; --i) {				\
   238 	    Uint32 s = *srcp++;				\
   239 	    Uint32 d = *dstp;				\
   240 	    s = (s | s << 16) & 0x07e0f81f;		\
   241 	    d = (d | d << 16) & 0x07e0f81f;		\
   242 	    d += (s - d) * ALPHA >> 5;			\
   243 	    d &= 0x07e0f81f;				\
   244 	    *dstp++ = d | d >> 16;			\
   245 	    n++;					\
   246 	}						\
   247 	i = (int)(length) - n;				\
   248 	for(; i > 0; --i) {				\
   249 	  movq_m2r((*dstp), mm3);			\
   250 	  movq_m2r((*srcp), mm2);			\
   251 	  movq_r2r(mm2, mm5);				\
   252 	  pand_r2r(mm1 , mm5);				\
   253 	  psrlq_i2r(11, mm5);				\
   254 	  movq_r2r(mm3, mm6);				\
   255 	  pand_r2r(mm1 , mm6);				\
   256 	  psrlq_i2r(11, mm6);				\
   257 	  psubw_r2r(mm6, mm5);				\
   258 	  pmullw_r2r(mm0, mm5);				\
   259 	  psrlw_i2r(8, mm5);				\
   260 	  paddw_r2r(mm5, mm6);				\
   261 	  psllq_i2r(11, mm6);				\
   262 	  pand_r2r(mm1, mm6);				\
   263 	  movq_r2r(mm4, mm5);				\
   264 	  por_r2r(mm7, mm5);				\
   265 	  pand_r2r(mm5, mm3);				\
   266 	  por_r2r(mm6, mm3);				\
   267 	  movq_r2r(mm2, mm5);				\
   268 	  pand_r2r(mm4 , mm5);				\
   269 	  psrlq_i2r(5, mm5);				\
   270 	  movq_r2r(mm3, mm6);				\
   271 	  pand_r2r(mm4 , mm6);				\
   272 	  psrlq_i2r(5, mm6);				\
   273 	  psubw_r2r(mm6, mm5);				\
   274 	  pmullw_r2r(mm0, mm5);				\
   275 	  psrlw_i2r(8, mm5);				\
   276 	  paddw_r2r(mm5, mm6);				\
   277 	  psllq_i2r(5, mm6);				\
   278 	  pand_r2r(mm4, mm6);				\
   279 	  movq_r2r(mm1, mm5);				\
   280 	  por_r2r(mm7, mm5);				\
   281 	  pand_r2r(mm5, mm3);				\
   282 	  por_r2r(mm6, mm3);				\
   283 	  movq_r2r(mm2, mm5);				\
   284 	  pand_r2r(mm7 , mm5);				\
   285           movq_r2r(mm3, mm6);				\
   286 	  pand_r2r(mm7 , mm6);				\
   287 	  psubw_r2r(mm6, mm5);				\
   288 	  pmullw_r2r(mm0, mm5);				\
   289 	  psrlw_i2r(8, mm5);				\
   290 	  paddw_r2r(mm5, mm6);				\
   291 	  pand_r2r(mm7, mm6);				\
   292 	  movq_r2r(mm1, mm5);				\
   293 	  por_r2r(mm4, mm5);				\
   294 	  pand_r2r(mm5, mm3);				\
   295 	  por_r2r(mm6, mm3);				\
   296 	  movq_r2m(mm3, *dstp);				\
   297 	  srcp += 4;					\
   298 	  dstp += 4;					\
   299 	  i -= 3;					\
   300 	}						\
   301 	emms();						\
   302     } while(0)
   303 
   /*
    * MMX blend of `length` 16-bit 5-5-5 pixels from `from` onto `to` using
    * the per-surface `alpha`.
    *
    * Register setup: mm1, mm4 and mm7 hold the field masks 0x7C00, 0x03E0
    * and 0x001F replicated into four 16-bit words; mm0 holds alpha in four
    * words.
    *
    * The first (length & 3) pixels are blended by the scalar loop (same
    * arithmetic as ALPHA_BLIT16_555); the remaining pixels are processed
    * four per MMX iteration (i is decreased by 4 in total each pass), one
    * colour field after another.
    *
    * NOTE: `alpha &= ~(1+2+4)` writes to the macro argument itself, so
    * `alpha` must be a modifiable lvalue and the caller's variable loses
    * its low three bits.  `bpp` is unused.
    */
   304 #define ALPHA_BLIT16_555MMX(to, from, length, bpp, alpha)	\
   305     do {						\
   306         int i, n = 0;					\
   307 	Uint16 *srcp = (Uint16 *)(from);		\
   308 	Uint16 *dstp = (Uint16 *)(to);			\
   309         Uint32 ALPHA = 0x7C00;				\
   310 	movd_m2r(*(&ALPHA), mm1);			\
   311         punpcklwd_r2r(mm1, mm1);			\
   312         punpcklwd_r2r(mm1, mm1);			\
   313 	ALPHA = 0x03E0;					\
   314         movd_m2r(*(&ALPHA), mm4);			\
   315         punpcklwd_r2r(mm4, mm4);			\
   316         punpcklwd_r2r(mm4, mm4);			\
   317 	ALPHA = 0x001F;					\
   318 	movd_m2r(*(&ALPHA), mm7);			\
   319         punpcklwd_r2r(mm7, mm7);			\
   320         punpcklwd_r2r(mm7, mm7);			\
   321 	alpha &= ~(1+2+4);				\
   322         i = (Uint32)alpha | (Uint32)alpha << 16;	\
   323         movd_m2r(*(&i), mm0);				\
   324         punpckldq_r2r(mm0, mm0);			\
   325         i = ((int)(length) & 3);				\
   326         ALPHA = alpha >> 3;				\
   327 	for(; i > 0; --i) {				\
   328 	    Uint32 s = *srcp++;				\
   329 	    Uint32 d = *dstp;				\
   330 	    s = (s | s << 16) & 0x03e07c1f;		\
   331 	    d = (d | d << 16) & 0x03e07c1f;		\
   332 	    d += (s - d) * ALPHA >> 5;			\
   333 	    d &= 0x03e07c1f;				\
   334 	    *dstp++ = d | d >> 16;			\
   335 	    n++;					\
   336 	}						\
   337 	i = (int)(length) - n;				\
   338 	for(; i > 0; --i) {				\
   339 	  movq_m2r((*dstp), mm3);			\
   340 	  movq_m2r((*srcp), mm2);			\
   341 	  movq_r2r(mm2, mm5);				\
   342 	  pand_r2r(mm1 , mm5);				\
   343 	  psrlq_i2r(10, mm5);				\
   344 	  movq_r2r(mm3, mm6);				\
   345 	  pand_r2r(mm1 , mm6);				\
   346 	  psrlq_i2r(10, mm6);				\
   347 	  psubw_r2r(mm6, mm5);				\
   348 	  pmullw_r2r(mm0, mm5);				\
   349 	  psrlw_i2r(8, mm5);				\
   350 	  paddw_r2r(mm5, mm6);				\
   351 	  psllq_i2r(10, mm6);				\
   352 	  pand_r2r(mm1, mm6);				\
   353 	  movq_r2r(mm4, mm5);				\
   354 	  por_r2r(mm7, mm5);				\
   355 	  pand_r2r(mm5, mm3);				\
   356 	  por_r2r(mm6, mm3);				\
   357 	  movq_r2r(mm2, mm5);				\
   358 	  pand_r2r(mm4 , mm5);				\
   359 	  psrlq_i2r(5, mm5);				\
   360 	  movq_r2r(mm3, mm6);				\
   361 	  pand_r2r(mm4 , mm6);				\
   362 	  psrlq_i2r(5, mm6);				\
   363 	  psubw_r2r(mm6, mm5);				\
   364 	  pmullw_r2r(mm0, mm5);				\
   365 	  psrlw_i2r(8, mm5);				\
   366 	  paddw_r2r(mm5, mm6);				\
   367 	  psllq_i2r(5, mm6);				\
   368 	  pand_r2r(mm4, mm6);				\
   369 	  movq_r2r(mm1, mm5);				\
   370 	  por_r2r(mm7, mm5);				\
   371 	  pand_r2r(mm5, mm3);				\
   372 	  por_r2r(mm6, mm3);				\
   373 	  movq_r2r(mm2, mm5);				\
   374 	  pand_r2r(mm7 , mm5);				\
   375           movq_r2r(mm3, mm6);				\
   376 	  pand_r2r(mm7 , mm6);				\
   377 	  psubw_r2r(mm6, mm5);				\
   378 	  pmullw_r2r(mm0, mm5);				\
   379 	  psrlw_i2r(8, mm5);				\
   380 	  paddw_r2r(mm5, mm6);				\
   381 	  pand_r2r(mm7, mm6);				\
   382 	  movq_r2r(mm1, mm5);				\
   383 	  por_r2r(mm4, mm5);				\
   384 	  pand_r2r(mm5, mm3);				\
   385 	  por_r2r(mm6, mm3);				\
   386 	  movq_r2m(mm3, *dstp);				\
   387 	  srcp += 4;					\
   388 	  dstp += 4;					\
   389 	  i -= 3;					\
   390 	}						\
   391 	emms();						\
   392     } while(0)
   393 
   394 #endif
   395 
   396 /*
   397  * For 32bpp pixels on the form 0x00rrggbb:
   398  * If we treat the middle component separately, we can process the two
   399  * remaining in parallel. This is safe to do because of the gap to the left
   400  * of each component, so the bits from the multiplication don't collide.
   401  * This can be used for any RGB permutation of course.
        *
        * Blends `length` pixels from `from` onto `to` as
        * d + ((s - d) * alpha >> 8), computed once for the 0xff00ff pair and
        * once for the 0xff00 component.  Both partial results are masked, so
        * the top byte of every written pixel is 0.  `bpp` is unused.
   402  */
   403 #define ALPHA_BLIT32_888(to, from, length, bpp, alpha)		\
   404     do {							\
   405         int i;							\
   406 	Uint32 *src = (Uint32 *)(from);				\
   407 	Uint32 *dst = (Uint32 *)(to);				\
   408 	for(i = 0; i < (int)(length); i++) {			\
   409 	    Uint32 s = *src++;					\
   410 	    Uint32 d = *dst;					\
   411 	    Uint32 s1 = s & 0xff00ff;				\
   412 	    Uint32 d1 = d & 0xff00ff;				\
   413 	    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;	\
   414 	    s &= 0xff00;					\
   415 	    d &= 0xff00;					\
   416 	    d = (d + ((s - d) * alpha >> 8)) & 0xff00;		\
   417 	    *dst++ = d1 | d;					\
   418 	}							\
   419     } while(0)
   420 
   421 /*
   422  * For 16bpp pixels we can go a step further: put the middle component
   423  * in the high 16 bits of a 32 bit word, and process all three RGB
   424  * components at the same time. Since the smallest gap is here just
   425  * 5 bits, we have to scale alpha down to 5 bits as well.
        *
        * 5-6-5 variant: each pixel is spread out with (p | p << 16) & 0x07e0f81f,
        * blended with ALPHA = alpha >> 3 and a final >> 5, masked again and
        * folded back to 16 bits with d | d >> 16.  `bpp` is unused.
   426  */
   427 #define ALPHA_BLIT16_565(to, from, length, bpp, alpha)	\
   428     do {						\
   429         int i;						\
   430 	Uint16 *src = (Uint16 *)(from);			\
   431 	Uint16 *dst = (Uint16 *)(to);			\
   432 	Uint32 ALPHA = alpha >> 3;			\
   433 	for(i = 0; i < (int)(length); i++) {		\
   434 	    Uint32 s = *src++;				\
   435 	    Uint32 d = *dst;				\
   436 	    s = (s | s << 16) & 0x07e0f81f;		\
   437 	    d = (d | d << 16) & 0x07e0f81f;		\
   438 	    d += (s - d) * ALPHA >> 5;			\
   439 	    d &= 0x07e0f81f;				\
   440 	    *dst++ = d | d >> 16;			\
   441 	}						\
   442     } while(0)
   443 
   /*
    * 5-5-5 variant of the 16bpp per-surface alpha blend: identical to
    * ALPHA_BLIT16_565 except that the spread-out field mask is 0x03e07c1f.
    * Alpha is reduced to 5 bits (alpha >> 3).  `bpp` is unused.
    */
   444 #define ALPHA_BLIT16_555(to, from, length, bpp, alpha)	\
   445     do {						\
   446         int i;						\
   447 	Uint16 *src = (Uint16 *)(from);			\
   448 	Uint16 *dst = (Uint16 *)(to);			\
   449 	Uint32 ALPHA = alpha >> 3;			\
   450 	for(i = 0; i < (int)(length); i++) {		\
   451 	    Uint32 s = *src++;				\
   452 	    Uint32 d = *dst;				\
   453 	    s = (s | s << 16) & 0x03e07c1f;		\
   454 	    d = (d | d << 16) & 0x03e07c1f;		\
   455 	    d += (s - d) * ALPHA >> 5;			\
   456 	    d &= 0x03e07c1f;				\
   457 	    *dst++ = d | d >> 16;			\
   458 	}						\
   459     } while(0)
   460 
   461 /*
   462  * The general slow catch-all function, for remaining depths and formats
        *
        * For each of `length` pixels: load source and destination (2, 3 or 4
        * bytes; the 3-byte case is assembled byte by byte according to
        * SDL_BYTEORDER), split both into R/G/B with RGB_FROM_PIXEL, blend each
        * channel as c += (cs - c) * alpha >> 8, repack with PIXEL_FROM_RGB and
        * store.
        *
        * Requirements at the expansion site:
        *  - a variable named `fmt` must be in scope; it is passed to
        *    RGB_FROM_PIXEL / PIXEL_FROM_RGB but is not a macro parameter;
        *  - `from` and `to` are assigned to Uint8 pointers without a cast;
        *  - `bpp` must be 2, 3 or 4: any other value leaves s and d unset.
   463  */
   464 #define ALPHA_BLIT_ANY(to, from, length, bpp, alpha)			\
   465     do {								\
   466         int i;								\
   467 	Uint8 *src = from;						\
   468 	Uint8 *dst = to;						\
   469 	for(i = 0; i < (int)(length); i++) {				\
   470 	    Uint32 s, d;						\
   471 	    unsigned rs, gs, bs, rd, gd, bd;				\
   472 	    switch(bpp) {						\
   473 	    case 2:							\
   474 		s = *(Uint16 *)src;					\
   475 		d = *(Uint16 *)dst;					\
   476 		break;							\
   477 	    case 3:							\
   478 		if(SDL_BYTEORDER == SDL_BIG_ENDIAN) {			\
   479 		    s = (src[0] << 16) | (src[1] << 8) | src[2];	\
   480 		    d = (dst[0] << 16) | (dst[1] << 8) | dst[2];	\
   481 		} else {						\
   482 		    s = (src[2] << 16) | (src[1] << 8) | src[0];	\
   483 		    d = (dst[2] << 16) | (dst[1] << 8) | dst[0];	\
   484 		}							\
   485 		break;							\
   486 	    case 4:							\
   487 		s = *(Uint32 *)src;					\
   488 		d = *(Uint32 *)dst;					\
   489 		break;							\
   490 	    }								\
   491 	    RGB_FROM_PIXEL(s, fmt, rs, gs, bs);				\
   492 	    RGB_FROM_PIXEL(d, fmt, rd, gd, bd);				\
   493 	    rd += (rs - rd) * alpha >> 8;				\
   494 	    gd += (gs - gd) * alpha >> 8;				\
   495 	    bd += (bs - bd) * alpha >> 8;				\
   496 	    PIXEL_FROM_RGB(d, fmt, rd, gd, bd);				\
   497 	    switch(bpp) {						\
   498 	    case 2:							\
   499 		*(Uint16 *)dst = d;					\
   500 		break;							\
   501 	    case 3:							\
   502 		if(SDL_BYTEORDER == SDL_BIG_ENDIAN) {			\
   503 		    dst[0] = d >> 16;					\
   504 		    dst[1] = d >> 8;					\
   505 		    dst[2] = d;						\
   506 		} else {						\
   507 		    dst[0] = d;						\
   508 		    dst[1] = d >> 8;					\
   509 		    dst[2] = d >> 16;					\
   510 		}							\
   511 		break;							\
   512 	    case 4:							\
   513 		*(Uint32 *)dst = d;					\
   514 		break;							\
   515 	    }								\
   516 	    src += bpp;							\
   517 	    dst += bpp;							\
   518 	}								\
   519     } while(0)
   520 
   521 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
   522 
   /*
    * MMX version of the 50% blend for 32-bit pixels.
    * mm4 = 0x00fefefe and mm3 = 0x00010101, each replicated into both
    * halves of the register.  An odd leading pixel is blended in plain C
    * with the same formula as ALPHA_BLIT32_888_50; the remaining pixels are
    * processed two per loop iteration (the loop decrements i twice).
    * `bpp` and `alpha` are unused.
    */
   523 #define ALPHA_BLIT32_888_50MMX(to, from, length, bpp, alpha)		\
   524     do {								\
   525 	Uint32 *srcp = (Uint32 *)(from);				\
   526 	Uint32 *dstp = (Uint32 *)(to);					\
   527         int i = 0x00fefefe;						\
   528         movd_m2r(*(&i), mm4);						\
   529         punpckldq_r2r(mm4, mm4);					\
   530         i = 0x00010101;							\
   531         movd_m2r(*(&i), mm3);						\
   532         punpckldq_r2r(mm3, mm3);					\
   533         i = (int)(length);						\
   534         if( i & 1 ) {							\
   535 	  Uint32 s = *srcp++;						\
   536 	  Uint32 d = *dstp;						\
   537 	  *dstp++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)	\
   538 		     + (s & d & 0x00010101);				\
   539 	  i--;								\
   540 	}								\
   541 	for(; i > 0; --i) {						\
   542 	    movq_m2r((*dstp), mm2); /* dst -> mm2 */			\
   543 	    movq_r2r(mm2, mm6);	/* dst -> mm6 */			\
   544 	    movq_m2r((*srcp), mm1); /* src -> mm1 */			\
   545 	    movq_r2r(mm1, mm5);	/* src -> mm5 */			\
   546 	    pand_r2r(mm4, mm6);	/* dst & 0x00fefefe -> mm6 */		\
   547 	    pand_r2r(mm4, mm5); /* src & 0x00fefefe -> mm5 */		\
   548 	    paddd_r2r(mm6, mm5); /* (src & 0x00fefefe) + (dst & 0x00fefefe) -> mm5 */	\
   549 	    psrld_i2r(1, mm5);						\
   550 	    pand_r2r(mm1, mm2);	/* s & d -> mm2 */			\
   551 	    pand_r2r(mm3, mm2);	/* s & d & 0x00010101 -> mm2 */		\
   552 	    paddd_r2r(mm5, mm2);					\
   553 	    movq_r2m(mm2, (*dstp));					\
   554 	    dstp += 2;							\
   555 	    srcp += 2;							\
   556 	    i--;							\
   557 	}								\
   558 	emms();								\
   559     } while(0)
   560 
   561 #endif
   562     
   563 /*
   564  * Special case: 50% alpha (alpha=128)
   565  * This is treated specially because it can be optimized very well, and
   566  * since it is good for many cases of semi-translucency.
   567  * The theory is to do all three components at the same time:
   568  * First zero the lowest bit of each component, which gives us room to
   569  * add them. Then shift right and add the sum of the lowest bits.
        *
        * In the code the correction term is s & d & 0x00010101, i.e. one is
        * added to a component only when its lowest bit is set in both source
        * and destination.  The masks leave the top byte of the result 0.
        * `bpp` and `alpha` are unused.
   570  */
   571 #define ALPHA_BLIT32_888_50(to, from, length, bpp, alpha)		\
   572     do {								\
   573         int i;								\
   574 	Uint32 *src = (Uint32 *)(from);					\
   575 	Uint32 *dst = (Uint32 *)(to);					\
   576 	for(i = 0; i < (int)(length); i++) {				\
   577 	    Uint32 s = *src++;						\
   578 	    Uint32 d = *dst;						\
   579 	    *dst++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)	\
   580 		     + (s & d & 0x00010101);				\
   581 	}								\
   582     } while(0)
   583 
   584 /*
   585  * For 16bpp, we can actually blend two pixels in parallel, if we take
   586  * care to shift before we add, not after.
   587  */
   588 
   589 /* helper: blend a single 16 bit pixel at 50% */
       /*
        * `dst` and `src` must be modifiable pointer lvalues: both are advanced
        * by one pixel.  `mask` selects the bits kept for the addition; the
        * bits outside it (~mask & 0xffff) are added back when set in both
        * source and destination.
        */
   590 #define BLEND16_50(dst, src, mask)			\
   591     do {						\
   592         Uint32 s = *src++;				\
   593 	Uint32 d = *dst;				\
   594 	*dst++ = (((s & mask) + (d & mask)) >> 1)	\
   595 	         + (s & d & (~mask & 0xffff));		\
   596     } while(0)
   597 
   598 /* basic 16bpp blender. mask is the bits to keep when adding. */
       /*
        * Blends `length` 16-bit pixels at 50%.
        *  - If the source and destination addresses differ in their low two
        *    bits, every pixel goes through BLEND16_50 one at a time.
        *  - Otherwise a leading pixel is blended alone when src is not
        *    4-byte aligned, then pixels are blended in pairs through 32-bit
        *    loads and stores (each half shifted before the add), and a
        *    remaining last pixel is blended alone.
        * `bpp` and `alpha` are unused.
        */
   599 #define ALPHA_BLIT16_50(to, from, length, bpp, alpha, mask)		\
   600     do {								\
   601 	unsigned n = (length);						\
   602 	Uint16 *src = (Uint16 *)(from);					\
   603 	Uint16 *dst = (Uint16 *)(to);					\
   604 	if(((unsigned long)src ^ (unsigned long)dst) & 3) {		\
   605 	    /* source and destination not in phase, blit one by one */	\
   606 	    while(n--)							\
   607 		BLEND16_50(dst, src, mask);				\
   608 	} else {							\
   609 	    if((unsigned long)src & 3) {				\
   610 		/* first odd pixel */					\
   611 		BLEND16_50(dst, src, mask);				\
   612 		n--;							\
   613 	    }								\
   614 	    for(; n > 1; n -= 2) {					\
   615 		Uint32 s = *(Uint32 *)src;				\
   616 		Uint32 d = *(Uint32 *)dst;				\
   617 		*(Uint32 *)dst = ((s & (mask | mask << 16)) >> 1)	\
   618 		               + ((d & (mask | mask << 16)) >> 1)	\
   619 		               + (s & d & (~(mask | mask << 16)));	\
   620 		src += 2;						\
   621 		dst += 2;						\
   622 	    }								\
   623 	    if(n)							\
   624 		BLEND16_50(dst, src, mask); /* last odd pixel */	\
   625 	}								\
   626     } while(0)
   627 
   /* 50% blend for 5-6-5 pixels: mask 0xf7de has bits 0, 5 and 11 cleared. */
   628 #define ALPHA_BLIT16_565_50(to, from, length, bpp, alpha)	\
   629     ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xf7de)
   630 
       /* 50% blend for 5-5-5 pixels: mask 0xfbde has bits 0, 5 and 10 cleared. */
   631 #define ALPHA_BLIT16_555_50(to, from, length, bpp, alpha)	\
   632     ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xfbde)
   633 
   634 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
   635 
   /*
    * CHOOSE_BLIT, MMX-capable build.
    *
    * Expands `blitter(bpp, Type, do_blit)` exactly once, selecting the
    * arguments from `alpha` and the pixel format `fmt`:
    *  - bpp is fmt->BytesPerPixel;
    *  - Type is the integer type of the <skip>/<run> counts: Uint8 for
    *    1-3 bytes per pixel, Uint16 for 4;
    *  - do_blit is one of the blit macros defined above.
    *
    * alpha == 255 always selects OPAQUE_BLIT.  Otherwise:
    *  - 1 byte per pixel: nothing is expanded (no 8bpp alpha blitting);
    *  - 2 bytes per pixel: 565 or 555 blitters when the channel masks
    *    match, else ALPHA_BLIT_ANY; alpha == 128 picks the C *_50 variant,
    *    any other alpha picks the MMX variant when
    *    (CPU_Flags() & MMX_CPU) is non-zero at run time;
    *  - 3 bytes per pixel: ALPHA_BLIT_ANY;
    *  - 4 bytes per pixel: the 888 blitters when the masks cover 0x00ffffff
    *    with one channel at 0xff00 (MMX variants when the CPU reports MMX),
    *    else ALPHA_BLIT_ANY.
    *
    * The expansion contains the label `general16`, so it can be used at
    * most once per function.
    */
   636 #define CHOOSE_BLIT(blitter, alpha, fmt)				\
   637     do {								\
   638         if(alpha == 255) {						\
   639 	    switch(fmt->BytesPerPixel) {				\
   640 	    case 1: blitter(1, Uint8, OPAQUE_BLIT); break;		\
   641 	    case 2: blitter(2, Uint8, OPAQUE_BLIT); break;		\
   642 	    case 3: blitter(3, Uint8, OPAQUE_BLIT); break;		\
   643 	    case 4: blitter(4, Uint16, OPAQUE_BLIT); break;		\
   644 	    }								\
   645 	} else {							\
   646 	    switch(fmt->BytesPerPixel) {				\
   647 	    case 1:							\
   648 		/* No 8bpp alpha blitting */				\
   649 		break;							\
   650 									\
   651 	    case 2:							\
   652 		switch(fmt->Rmask | fmt->Gmask | fmt->Bmask) {		\
   653 		case 0xffff:						\
   654 		    if(fmt->Gmask == 0x07e0				\
   655 		       || fmt->Rmask == 0x07e0				\
   656 		       || fmt->Bmask == 0x07e0) {			\
   657 			if(alpha == 128)				\
   658 			    blitter(2, Uint8, ALPHA_BLIT16_565_50);	\
   659 			else {						\
   660 			    if((CPU_Flags()&MMX_CPU)!=0)		\
   661 				blitter(2, Uint8, ALPHA_BLIT16_565MMX);	\
   662 			    else					\
   663 				blitter(2, Uint8, ALPHA_BLIT16_565);	\
   664 			}						\
   665 		    } else						\
   666 			goto general16;					\
   667 		    break;						\
   668 									\
   669 		case 0x7fff:						\
   670 		    if(fmt->Gmask == 0x03e0				\
   671 		       || fmt->Rmask == 0x03e0				\
   672 		       || fmt->Bmask == 0x03e0) {			\
   673 			if(alpha == 128)				\
   674 			    blitter(2, Uint8, ALPHA_BLIT16_555_50);	\
   675 			else {						\
   676 			    if((CPU_Flags()&MMX_CPU)!=0)		\
   677 				blitter(2, Uint8, ALPHA_BLIT16_555MMX);	\
   678 			    else					\
   679 				blitter(2, Uint8, ALPHA_BLIT16_555);	\
   680 			}						\
   681 			break;						\
   682 		    }							\
   683 		    /* fallthrough */					\
   684 									\
   685 		default:						\
   686 		general16:						\
   687 		    blitter(2, Uint8, ALPHA_BLIT_ANY);			\
   688 		}							\
   689 		break;							\
   690 									\
   691 	    case 3:							\
   692 		blitter(3, Uint8, ALPHA_BLIT_ANY);			\
   693 		break;							\
   694 									\
   695 	    case 4:							\
   696 		if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff	\
   697 		   && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00	\
   698 		       || fmt->Bmask == 0xff00)) {			\
   699 		    if(alpha == 128)					\
   700 		    {							\
   701 			if((CPU_Flags()&MMX_CPU)!=0)			\
   702 				blitter(4, Uint16, ALPHA_BLIT32_888_50MMX);\
   703 			else						\
   704 				blitter(4, Uint16, ALPHA_BLIT32_888_50);\
   705 		    }							\
   706 		    else						\
   707 		    {							\
   708 			if((CPU_Flags()&MMX_CPU)!=0)			\
   709 				blitter(4, Uint16, ALPHA_BLIT32_888MMX);\
   710 			else						\
   711 				blitter(4, Uint16, ALPHA_BLIT32_888);	\
   712 		    }							\
   713 		} else							\
   714 		    blitter(4, Uint16, ALPHA_BLIT_ANY);			\
   715 		break;							\
   716 	    }								\
   717 	}								\
   718     } while(0)
   719 
   720 #else
   721 	
   /*
    * CHOOSE_BLIT, portable build (no MMX).
    *
    * Expands `blitter(bpp, Type, do_blit)` exactly once, selecting the
    * arguments from `alpha` and the pixel format `fmt`:
    *  - bpp is fmt->BytesPerPixel;
    *  - Type is the integer type of the <skip>/<run> counts: Uint8 for
    *    1-3 bytes per pixel, Uint16 for 4;
    *  - do_blit is one of the blit macros defined above.
    *
    * alpha == 255 always selects OPAQUE_BLIT.  Otherwise:
    *  - 1 byte per pixel: nothing is expanded (no 8bpp alpha blitting);
    *  - 2 bytes per pixel: 565 or 555 blitters when the channel masks
    *    match (the *_50 variant for alpha == 128), else ALPHA_BLIT_ANY;
    *  - 3 bytes per pixel: ALPHA_BLIT_ANY;
    *  - 4 bytes per pixel: ALPHA_BLIT32_888_50 / ALPHA_BLIT32_888 when the
    *    masks cover 0x00ffffff with one channel at 0xff00, else
    *    ALPHA_BLIT_ANY.
    *
    * The expansion contains the label `general16`, so it can be used at
    * most once per function.
    */
   722 #define CHOOSE_BLIT(blitter, alpha, fmt)				\
   723     do {								\
   724         if(alpha == 255) {						\
   725 	    switch(fmt->BytesPerPixel) {				\
   726 	    case 1: blitter(1, Uint8, OPAQUE_BLIT); break;		\
   727 	    case 2: blitter(2, Uint8, OPAQUE_BLIT); break;		\
   728 	    case 3: blitter(3, Uint8, OPAQUE_BLIT); break;		\
   729 	    case 4: blitter(4, Uint16, OPAQUE_BLIT); break;		\
   730 	    }								\
   731 	} else {							\
   732 	    switch(fmt->BytesPerPixel) {				\
   733 	    case 1:							\
   734 		/* No 8bpp alpha blitting */				\
   735 		break;							\
   736 									\
   737 	    case 2:							\
   738 		switch(fmt->Rmask | fmt->Gmask | fmt->Bmask) {		\
   739 		case 0xffff:						\
   740 		    if(fmt->Gmask == 0x07e0				\
   741 		       || fmt->Rmask == 0x07e0				\
   742 		       || fmt->Bmask == 0x07e0) {			\
   743 			if(alpha == 128)				\
   744 			    blitter(2, Uint8, ALPHA_BLIT16_565_50);	\
   745 			else {						\
   746 			    blitter(2, Uint8, ALPHA_BLIT16_565);	\
   747 			}						\
   748 		    } else						\
   749 			goto general16;					\
   750 		    break;						\
   751 									\
   752 		case 0x7fff:						\
   753 		    if(fmt->Gmask == 0x03e0				\
   754 		       || fmt->Rmask == 0x03e0				\
   755 		       || fmt->Bmask == 0x03e0) {			\
   756 			if(alpha == 128)				\
   757 			    blitter(2, Uint8, ALPHA_BLIT16_555_50);	\
   758 			else {						\
   759 			    blitter(2, Uint8, ALPHA_BLIT16_555);	\
   760 			}						\
   761 			break;						\
   762 		    }							\
   763 		    /* fallthrough */					\
   764 									\
   765 		default:						\
   766 		general16:						\
   767 		    blitter(2, Uint8, ALPHA_BLIT_ANY);			\
   768 		}							\
   769 		break;							\
   770 									\
   771 	    case 3:							\
   772 		blitter(3, Uint8, ALPHA_BLIT_ANY);			\
   773 		break;							\
   774 									\
   775 	    case 4:							\
   776 		if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff	\
   777 		   && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00	\
   778 		       || fmt->Bmask == 0xff00)) {			\
   779 		    if(alpha == 128)					\
   780 			blitter(4, Uint16, ALPHA_BLIT32_888_50);	\
   781 		    else						\
   782 			blitter(4, Uint16, ALPHA_BLIT32_888);		\
   783 		} else							\
   784 		    blitter(4, Uint16, ALPHA_BLIT_ANY);			\
   785 		break;							\
   786 	    }								\
   787 	}								\
   788     } while(0)
   789 
   790 #endif
   791 
   792 /*
   793  * This takes care of the case when the surface is clipped on the left and/or
   794  * right. Top clipping has already been taken care of.
        *
        * w       - width of a full encoded scan line in pixels; a line ends
        *           when the running offset `ofs` reaches this value.
        * srcbuf  - RLE stream positioned at the first line to draw; read as
        *           <skip>,<run> pairs of `Type` followed by run * bpp bytes of
        *           pixel data.
        * dst     - destination surface; supplies the pixel format and pitch.
        * dstbuf  - destination pointer for the left edge of the clipped area;
        *           it is moved back by left * bpp so that encoded column
        *           offsets can be added to it directly.
        * srcrect - clip rectangle: x and w give the visible columns
        *           [left, right), h the number of lines to draw.
        * alpha   - per-surface alpha, passed through to the selected blitter.
        *
        * Each run is trimmed to [left, right) before being blitted; runs that
        * fall entirely outside are only skipped in the stream.  The loop ends
        * after srcrect->h lines, or at a zero <skip>,<run> pair at the start
        * of a line.  The `nocopy` label has bpp and the blitter name pasted
        * onto it so the label is distinct in each expansion of RLECLIPBLIT.
   795  */
   796 static void RLEClipBlit(int w, Uint8 *srcbuf, SDL_Surface *dst,
   797 			Uint8 *dstbuf, SDL_Rect *srcrect, unsigned alpha)
   798 {
   799     SDL_PixelFormat *fmt = dst->format;
   800 
   801 #define RLECLIPBLIT(bpp, Type, do_blit)					   \
   802     do {								   \
   803 	int linecount = srcrect->h;					   \
   804 	int ofs = 0;							   \
   805 	int left = srcrect->x;						   \
   806 	int right = left + srcrect->w;					   \
   807 	dstbuf -= left * bpp;						   \
   808 	for(;;) {							   \
   809 	    int run;							   \
   810 	    ofs += *(Type *)srcbuf;					   \
   811 	    run = ((Type *)srcbuf)[1];					   \
   812 	    srcbuf += 2 * sizeof(Type);					   \
   813 	    if(run) {							   \
   814 		/* clip to left and right borders */			   \
   815 		if(ofs < right) {					   \
   816 		    int start = 0;					   \
   817 		    int len = run;					   \
   818 		    int startcol;					   \
   819 		    if(left - ofs > 0) {				   \
   820 			start = left - ofs;				   \
   821 			len -= start;					   \
   822 			if(len <= 0)					   \
   823 			    goto nocopy ## bpp ## do_blit;		   \
   824 		    }							   \
   825 		    startcol = ofs + start;				   \
   826 		    if(len > right - startcol)				   \
   827 			len = right - startcol;				   \
   828 		    do_blit(dstbuf + startcol * bpp, srcbuf + start * bpp, \
   829 			    len, bpp, alpha);				   \
   830 		}							   \
   831 	    nocopy ## bpp ## do_blit:					   \
   832 		srcbuf += run * bpp;					   \
   833 		ofs += run;						   \
   834 	    } else if(!ofs)						   \
   835 		break;							   \
   836 	    if(ofs == w) {						   \
   837 		ofs = 0;						   \
   838 		dstbuf += dst->pitch;					   \
   839 		if(!--linecount)					   \
   840 		    break;					   \
   841 	    }								   \
   842 	}								   \
   843     } while(0)
   844 
   845     CHOOSE_BLIT(RLECLIPBLIT, alpha, fmt);
   846 
   847 #undef RLECLIPBLIT
   848 
   849 }
   850 
   851 
   852 /* blit a colorkeyed RLE surface */
   853 int SDL_RLEBlit(SDL_Surface *src, SDL_Rect *srcrect,
   854 		SDL_Surface *dst, SDL_Rect *dstrect)
   855 {
   856 	Uint8 *dstbuf;
   857 	Uint8 *srcbuf;
   858 	int x, y;
   859 	int w = src->w;
   860 	unsigned alpha;
   861 
   862 	/* Lock the destination if necessary */
   863 	if ( SDL_MUSTLOCK(dst) ) {
   864 		if ( SDL_LockSurface(dst) < 0 ) {
   865 			return(-1);
   866 		}
   867 	}
   868 
   869 	/* Set up the source and destination pointers */
   870 	x = dstrect->x;
   871 	y = dstrect->y;
   872 	dstbuf = (Uint8 *)dst->pixels
   873 	         + y * dst->pitch + x * src->format->BytesPerPixel;
   874 	srcbuf = (Uint8 *)src->map->sw_data->aux_data;
   875 
   876 	{
   877 	    /* skip lines at the top if neccessary */
   878 	    int vskip = srcrect->y;
   879 	    int ofs = 0;
   880 	    if(vskip) {
   881 
   882 #define RLESKIP(bpp, Type)			\
   883 		for(;;) {			\
   884 		    int run;			\
   885 		    ofs += *(Type *)srcbuf;	\
   886 		    run = ((Type *)srcbuf)[1];	\
   887 		    srcbuf += sizeof(Type) * 2;	\
   888 		    if(run) {			\
   889 			srcbuf += run * bpp;	\
   890 			ofs += run;		\
   891 		    } else if(!ofs)		\
   892 			goto done;		\
   893 		    if(ofs == w) {		\
   894 			ofs = 0;		\
   895 			if(!--vskip)		\
   896 			    break;		\
   897 		    }				\
   898 		}
   899 
   900 		switch(src->format->BytesPerPixel) {
   901 		case 1: RLESKIP(1, Uint8); break;
   902 		case 2: RLESKIP(2, Uint8); break;
   903 		case 3: RLESKIP(3, Uint8); break;
   904 		case 4: RLESKIP(4, Uint16); break;
   905 		}
   906 
   907 #undef RLESKIP
   908 
   909 	    }
   910 	}
   911 
   912 	alpha = (src->flags & SDL_SRCALPHA) == SDL_SRCALPHA
   913 	        ? src->format->alpha : 255;
   914 	/* if left or right edge clipping needed, call clip blit */
   915 	if ( srcrect->x || srcrect->w != src->w ) {
   916 	    RLEClipBlit(w, srcbuf, dst, dstbuf, srcrect, alpha);
   917 	} else {
   918 	    SDL_PixelFormat *fmt = src->format;
   919 
   920 #define RLEBLIT(bpp, Type, do_blit)					      \
   921 	    do {							      \
   922 		int linecount = srcrect->h;				      \
   923 		int ofs = 0;						      \
   924 		for(;;) {						      \
   925 		    unsigned run;					      \
   926 		    ofs += *(Type *)srcbuf;				      \
   927 		    run = ((Type *)srcbuf)[1];				      \
   928 		    srcbuf += 2 * sizeof(Type);				      \
   929 		    if(run) {						      \
   930 			do_blit(dstbuf + ofs * bpp, srcbuf, run, bpp, alpha); \
   931 			srcbuf += run * bpp;				      \
   932 			ofs += run;					      \
   933 		    } else if(!ofs)					      \
   934 			break;						      \
   935 		    if(ofs == w) {					      \
   936 			ofs = 0;					      \
   937 			dstbuf += dst->pitch;				      \
   938 			if(!--linecount)				      \
   939 			    break;					      \
   940 		    }							      \
   941 		}							      \
   942 	    } while(0)
   943 
   944 	    CHOOSE_BLIT(RLEBLIT, alpha, fmt);
   945 
   946 #undef RLEBLIT
   947 	}
   948 
   949 done:
   950 	/* Unlock the destination if necessary */
   951 	if ( SDL_MUSTLOCK(dst) ) {
   952 		SDL_UnlockSurface(dst);
   953 	}
   954 	return(0);
   955 }
   956 
   957 #undef OPAQUE_BLIT
   958 
   959 /*
   960  * Per-pixel blitting macros for translucent pixels:
   961  * These use the same techniques as the per-surface blitting macros
   962  */
   963 
   964 /*
   965  * For 32bpp pixels, we have made sure the alpha is stored in the top
   966  * 8 bits, so proceed as usual
   967  */
   968 #define BLIT_TRANSL_888(src, dst)				\
   969     do {							\
   970         Uint32 s = src;						\
   971 	Uint32 d = dst;						\
   972 	unsigned alpha = s >> 24;				\
   973 	Uint32 s1 = s & 0xff00ff;				\
   974 	Uint32 d1 = d & 0xff00ff;				\
   975 	d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;	\
   976 	s &= 0xff00;						\
   977 	d &= 0xff00;						\
   978 	d = (d + ((s - d) * alpha >> 8)) & 0xff00;		\
   979 	dst = d1 | d;						\
   980     } while(0)
   981 
   982 /*
   983  * For 16bpp pixels, we have stored the 5 most significant alpha bits in
   984  * bits 5-10. As before, we can process all 3 RGB components at the same time.
   985  */
   986 #define BLIT_TRANSL_565(src, dst)		\
   987     do {					\
   988         Uint32 s = src;				\
   989 	Uint32 d = dst;				\
   990 	unsigned alpha = (s & 0x3e0) >> 5;	\
   991 	s &= 0x07e0f81f;			\
   992 	d = (d | d << 16) & 0x07e0f81f;		\
   993 	d += (s - d) * alpha >> 5;		\
   994 	d &= 0x07e0f81f;			\
   995 	dst = d | d >> 16;			\
   996     } while(0)
   997 
   998 #define BLIT_TRANSL_555(src, dst)		\
   999     do {					\
  1000         Uint32 s = src;				\
  1001 	Uint32 d = dst;				\
  1002 	unsigned alpha = (s & 0x3e0) >> 5;	\
  1003 	s &= 0x03e07c1f;			\
  1004 	d = (d | d << 16) & 0x03e07c1f;		\
  1005 	d += (s - d) * alpha >> 5;		\
  1006 	d &= 0x03e07c1f;			\
  1007 	dst = d | d >> 16;			\
  1008     } while(0)
  1009 
  1010 /* used to save the destination format in the encoding. Designed to be
  1011    macro-compatible with SDL_PixelFormat but without the unneeded fields */
  1012 typedef struct {
  1013     	Uint8  BytesPerPixel;
  1014 	Uint8  Rloss;
  1015 	Uint8  Gloss;
  1016 	Uint8  Bloss;
  1017 	Uint8  Rshift;
  1018 	Uint8  Gshift;
  1019 	Uint8  Bshift;
  1020 	Uint8  Ashift;
  1021 	Uint32 Rmask;
  1022 	Uint32 Gmask;
  1023 	Uint32 Bmask;
  1024 	Uint32 Amask;
  1025 } RLEDestFormat;
  1026 
  1027 /* blit a pixel-alpha RLE surface clipped at the right and/or left edges */
  1028 static void RLEAlphaClipBlit(int w, Uint8 *srcbuf, SDL_Surface *dst,
  1029 			     Uint8 *dstbuf, SDL_Rect *srcrect)
  1030 {
  1031     SDL_PixelFormat *df = dst->format;
  1032     /*
  1033      * clipped blitter: Ptype is the destination pixel type,
  1034      * Ctype the translucent count type, and do_blend the macro
  1035      * to blend one pixel.
  1036      */
  1037 #define RLEALPHACLIPBLIT(Ptype, Ctype, do_blend)			  \
  1038     do {								  \
  1039 	int linecount = srcrect->h;					  \
  1040 	int left = srcrect->x;						  \
  1041 	int right = left + srcrect->w;					  \
  1042 	dstbuf -= left * sizeof(Ptype);					  \
  1043 	do {								  \
  1044 	    int ofs = 0;						  \
  1045 	    /* blit opaque pixels on one line */			  \
  1046 	    do {							  \
  1047 		unsigned run;						  \
  1048 		ofs += ((Ctype *)srcbuf)[0];				  \
  1049 		run = ((Ctype *)srcbuf)[1];				  \
  1050 		srcbuf += 2 * sizeof(Ctype);				  \
  1051 		if(run) {						  \
  1052 		    /* clip to left and right borders */		  \
  1053 		    int cofs = ofs;					  \
  1054 		    int crun = run;					  \
  1055 		    if(left - cofs > 0) {				  \
  1056 			crun -= left - cofs;				  \
  1057 			cofs = left;					  \
  1058 		    }							  \
  1059 		    if(crun > right - cofs)				  \
  1060 			crun = right - cofs;				  \
  1061 		    if(crun > 0)					  \
  1062 			PIXEL_COPY(dstbuf + cofs * sizeof(Ptype),	  \
  1063 				   srcbuf + (cofs - ofs) * sizeof(Ptype), \
  1064 				   (unsigned)crun, sizeof(Ptype));	  \
  1065 		    srcbuf += run * sizeof(Ptype);			  \
  1066 		    ofs += run;						  \
  1067 		} else if(!ofs)						  \
  1068 		    return;						  \
  1069 	    } while(ofs < w);						  \
  1070 	    /* skip padding if necessary */				  \
  1071 	    if(sizeof(Ptype) == 2)					  \
  1072 		srcbuf += (unsigned long)srcbuf & 2;			  \
  1073 	    /* blit translucent pixels on the same line */		  \
  1074 	    ofs = 0;							  \
  1075 	    do {							  \
  1076 		unsigned run;						  \
  1077 		ofs += ((Uint16 *)srcbuf)[0];				  \
  1078 		run = ((Uint16 *)srcbuf)[1];				  \
  1079 		srcbuf += 4;						  \
  1080 		if(run) {						  \
  1081 		    /* clip to left and right borders */		  \
  1082 		    int cofs = ofs;					  \
  1083 		    int crun = run;					  \
  1084 		    if(left - cofs > 0) {				  \
  1085 			crun -= left - cofs;				  \
  1086 			cofs = left;					  \
  1087 		    }							  \
  1088 		    if(crun > right - cofs)				  \
  1089 			crun = right - cofs;				  \
  1090 		    if(crun > 0) {					  \
  1091 			Ptype *dst = (Ptype *)dstbuf + cofs;		  \
  1092 			Uint32 *src = (Uint32 *)srcbuf + (cofs - ofs);	  \
  1093 			int i;						  \
  1094 			for(i = 0; i < crun; i++)			  \
  1095 			    do_blend(src[i], dst[i]);			  \
  1096 		    }							  \
  1097 		    srcbuf += run * 4;					  \
  1098 		    ofs += run;						  \
  1099 		}							  \
  1100 	    } while(ofs < w);						  \
  1101 	    dstbuf += dst->pitch;					  \
  1102 	} while(--linecount);						  \
  1103     } while(0)
  1104 
  1105     switch(df->BytesPerPixel) {
  1106     case 2:
  1107 	if(df->Gmask == 0x07e0 || df->Rmask == 0x07e0
  1108 	   || df->Bmask == 0x07e0)
  1109 	    RLEALPHACLIPBLIT(Uint16, Uint8, BLIT_TRANSL_565);
  1110 	else
  1111 	    RLEALPHACLIPBLIT(Uint16, Uint8, BLIT_TRANSL_555);
  1112 	break;
  1113     case 4:
  1114 	RLEALPHACLIPBLIT(Uint32, Uint16, BLIT_TRANSL_888);
  1115 	break;
  1116     }
  1117 }
  1118 
  1119 /* blit a pixel-alpha RLE surface */
  1120 int SDL_RLEAlphaBlit(SDL_Surface *src, SDL_Rect *srcrect,
  1121 		     SDL_Surface *dst, SDL_Rect *dstrect)
  1122 {
  1123     int x, y;
  1124     int w = src->w;
  1125     Uint8 *srcbuf, *dstbuf;
  1126     SDL_PixelFormat *df = dst->format;
  1127 
  1128     /* Lock the destination if necessary */
  1129     if ( SDL_MUSTLOCK(dst) ) {
  1130 	if ( SDL_LockSurface(dst) < 0 ) {
  1131 	    return -1;
  1132 	}
  1133     }
  1134 
  1135     x = dstrect->x;
  1136     y = dstrect->y;
  1137     dstbuf = (Uint8 *)dst->pixels
  1138 	     + y * dst->pitch + x * df->BytesPerPixel;
  1139     srcbuf = (Uint8 *)src->map->sw_data->aux_data + sizeof(RLEDestFormat);
  1140 
  1141     {
  1142 	/* skip lines at the top if necessary */
  1143 	int vskip = srcrect->y;
  1144 	if(vskip) {
  1145 	    int ofs;
  1146 	    if(df->BytesPerPixel == 2) {
  1147 		/* the 16/32 interleaved format */
  1148 		do {
  1149 		    /* skip opaque line */
  1150 		    ofs = 0;
  1151 		    do {
  1152 			int run;
  1153 			ofs += srcbuf[0];
  1154 			run = srcbuf[1];
  1155 			srcbuf += 2;
  1156 			if(run) {
  1157 			    srcbuf += 2 * run;
  1158 			    ofs += run;
  1159 			} else if(!ofs)
  1160 			    goto done;
  1161 		    } while(ofs < w);
  1162 
  1163 		    /* skip padding */
  1164 		    srcbuf += (unsigned long)srcbuf & 2;
  1165 
  1166 		    /* skip translucent line */
  1167 		    ofs = 0;
  1168 		    do {
  1169 			int run;
  1170 			ofs += ((Uint16 *)srcbuf)[0];
  1171 			run = ((Uint16 *)srcbuf)[1];
  1172 			srcbuf += 4 * (run + 1);
  1173 			ofs += run;
  1174 		    } while(ofs < w);
  1175 		} while(--vskip);
  1176 	    } else {
  1177 		/* the 32/32 interleaved format */
  1178 		vskip <<= 1;	/* opaque and translucent have same format */
  1179 		do {
  1180 		    ofs = 0;
  1181 		    do {
  1182 			int run;
  1183 			ofs += ((Uint16 *)srcbuf)[0];
  1184 			run = ((Uint16 *)srcbuf)[1];
  1185 			srcbuf += 4;
  1186 			if(run) {
  1187 			    srcbuf += 4 * run;
  1188 			    ofs += run;
  1189 			} else if(!ofs)
  1190 			    goto done;
  1191 		    } while(ofs < w);
  1192 		} while(--vskip);
  1193 	    }
  1194 	}
  1195     }
  1196 
  1197     /* if left or right edge clipping needed, call clip blit */
  1198     if(srcrect->x || srcrect->w != src->w) {
  1199 	RLEAlphaClipBlit(w, srcbuf, dst, dstbuf, srcrect);
  1200     } else {
  1201 
  1202 	/*
  1203 	 * non-clipped blitter. Ptype is the destination pixel type,
  1204 	 * Ctype the translucent count type, and do_blend the
  1205 	 * macro to blend one pixel.
  1206 	 */
  1207 #define RLEALPHABLIT(Ptype, Ctype, do_blend)				 \
  1208 	do {								 \
  1209 	    int linecount = srcrect->h;					 \
  1210 	    do {							 \
  1211 		int ofs = 0;						 \
  1212 		/* blit opaque pixels on one line */			 \
  1213 		do {							 \
  1214 		    unsigned run;					 \
  1215 		    ofs += ((Ctype *)srcbuf)[0];			 \
  1216 		    run = ((Ctype *)srcbuf)[1];				 \
  1217 		    srcbuf += 2 * sizeof(Ctype);			 \
  1218 		    if(run) {						 \
  1219 			PIXEL_COPY(dstbuf + ofs * sizeof(Ptype), srcbuf, \
  1220 				   run, sizeof(Ptype));			 \
  1221 			srcbuf += run * sizeof(Ptype);			 \
  1222 			ofs += run;					 \
  1223 		    } else if(!ofs)					 \
  1224 			goto done;					 \
  1225 		} while(ofs < w);					 \
  1226 		/* skip padding if necessary */				 \
  1227 		if(sizeof(Ptype) == 2)					 \
  1228 		    srcbuf += (unsigned long)srcbuf & 2;		 \
  1229 		/* blit translucent pixels on the same line */		 \
  1230 		ofs = 0;						 \
  1231 		do {							 \
  1232 		    unsigned run;					 \
  1233 		    ofs += ((Uint16 *)srcbuf)[0];			 \
  1234 		    run = ((Uint16 *)srcbuf)[1];			 \
  1235 		    srcbuf += 4;					 \
  1236 		    if(run) {						 \
  1237 			Ptype *dst = (Ptype *)dstbuf + ofs;		 \
  1238 			unsigned i;					 \
  1239 			for(i = 0; i < run; i++) {			 \
  1240 			    Uint32 src = *(Uint32 *)srcbuf;		 \
  1241 			    do_blend(src, *dst);			 \
  1242 			    srcbuf += 4;				 \
  1243 			    dst++;					 \
  1244 			}						 \
  1245 			ofs += run;					 \
  1246 		    }							 \
  1247 		} while(ofs < w);					 \
  1248 		dstbuf += dst->pitch;					 \
  1249 	    } while(--linecount);					 \
  1250 	} while(0)
  1251 
  1252 	switch(df->BytesPerPixel) {
  1253 	case 2:
  1254 	    if(df->Gmask == 0x07e0 || df->Rmask == 0x07e0
  1255 	       || df->Bmask == 0x07e0)
  1256 		RLEALPHABLIT(Uint16, Uint8, BLIT_TRANSL_565);
  1257 	    else
  1258 		RLEALPHABLIT(Uint16, Uint8, BLIT_TRANSL_555);
  1259 	    break;
  1260 	case 4:
  1261 	    RLEALPHABLIT(Uint32, Uint16, BLIT_TRANSL_888);
  1262 	    break;
  1263 	}
  1264     }
  1265 
  1266  done:
  1267     /* Unlock the destination if necessary */
  1268     if ( SDL_MUSTLOCK(dst) ) {
  1269 	SDL_UnlockSurface(dst);
  1270     }
  1271     return 0;
  1272 }
  1273 
  1274 /*
  1275  * Auxiliary functions:
  1276  * The encoding functions take 32bpp rgb + a, and
  1277  * return the number of bytes copied to the destination.
  1278  * The decoding functions copy to 32bpp rgb + a, and
  1279  * return the number of bytes copied from the source.
  1280  * These are only used in the encoder and un-RLE code and are therefore not
  1281  * highly optimised.
  1282  */
  1283 
  1284 /* encode 32bpp rgb + a into 16bpp rgb, losing alpha */
  1285 static int copy_opaque_16(void *dst, Uint32 *src, int n,
  1286 			  SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
  1287 {
  1288     int i;
  1289     Uint16 *d = dst;
  1290     for(i = 0; i < n; i++) {
  1291 	unsigned r, g, b;
  1292 	RGB_FROM_PIXEL(*src, sfmt, r, g, b);
  1293 	PIXEL_FROM_RGB(*d, dfmt, r, g, b);
  1294 	src++;
  1295 	d++;
  1296     }
  1297     return n * 2;
  1298 }
  1299 
  1300 /* decode opaque pixels from 16bpp to 32bpp rgb + a */
  1301 static int uncopy_opaque_16(Uint32 *dst, void *src, int n,
  1302 			    RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
  1303 {
  1304     int i;
  1305     Uint16 *s = src;
  1306     unsigned alpha = dfmt->Amask ? 255 : 0;
  1307     for(i = 0; i < n; i++) {
  1308 	unsigned r, g, b;
  1309 	RGB_FROM_PIXEL(*s, sfmt, r, g, b);
  1310 	PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, alpha);
  1311 	s++;
  1312 	dst++;
  1313     }
  1314     return n * 2;
  1315 }
  1316 
  1317 
  1318 
  1319 /* encode 32bpp rgb + a into 32bpp G0RAB format for blitting into 565 */
  1320 static int copy_transl_565(void *dst, Uint32 *src, int n,
  1321 			   SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
  1322 {
  1323     int i;
  1324     Uint32 *d = dst;
  1325     for(i = 0; i < n; i++) {
  1326 	unsigned r, g, b, a;
  1327 	Uint16 pix;
  1328 	RGBA_FROM_8888(*src, sfmt, r, g, b, a);
  1329 	PIXEL_FROM_RGB(pix, dfmt, r, g, b);
  1330 	*d = ((pix & 0x7e0) << 16) | (pix & 0xf81f) | ((a << 2) & 0x7e0);
  1331 	src++;
  1332 	d++;
  1333     }
  1334     return n * 4;
  1335 }
  1336 
  1337 /* encode 32bpp rgb + a into 32bpp G0RAB format for blitting into 555 */
  1338 static int copy_transl_555(void *dst, Uint32 *src, int n,
  1339 			   SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
  1340 {
  1341     int i;
  1342     Uint32 *d = dst;
  1343     for(i = 0; i < n; i++) {
  1344 	unsigned r, g, b, a;
  1345 	Uint16 pix;
  1346 	RGBA_FROM_8888(*src, sfmt, r, g, b, a);
  1347 	PIXEL_FROM_RGB(pix, dfmt, r, g, b);
  1348 	*d = ((pix & 0x3e0) << 16) | (pix & 0xfc1f) | ((a << 2) & 0x3e0);
  1349 	src++;
  1350 	d++;
  1351     }
  1352     return n * 4;
  1353 }
  1354 
  1355 /* decode translucent pixels from 32bpp GORAB to 32bpp rgb + a */
  1356 static int uncopy_transl_16(Uint32 *dst, void *src, int n,
  1357 			    RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
  1358 {
  1359     int i;
  1360     Uint32 *s = src;
  1361     for(i = 0; i < n; i++) {
  1362 	unsigned r, g, b, a;
  1363 	Uint32 pix = *s++;
  1364 	a = (pix & 0x3e0) >> 2;
  1365 	pix = (pix & ~0x3e0) | pix >> 16;
  1366 	RGB_FROM_PIXEL(pix, sfmt, r, g, b);
  1367 	PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, a);
  1368 	dst++;
  1369     }
  1370     return n * 4;
  1371 }
  1372 
  1373 /* encode 32bpp rgba into 32bpp rgba, keeping alpha (dual purpose) */
  1374 static int copy_32(void *dst, Uint32 *src, int n,
  1375 		   SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
  1376 {
  1377     int i;
  1378     Uint32 *d = dst;
  1379     for(i = 0; i < n; i++) {
  1380 	unsigned r, g, b, a;
  1381 	Uint32 pixel;
  1382 	RGBA_FROM_8888(*src, sfmt, r, g, b, a);
  1383 	PIXEL_FROM_RGB(pixel, dfmt, r, g, b);
  1384 	*d++ = pixel | a << 24;
  1385 	src++;
  1386     }
  1387     return n * 4;
  1388 }
  1389 
  1390 /* decode 32bpp rgba into 32bpp rgba, keeping alpha (dual purpose) */
  1391 static int uncopy_32(Uint32 *dst, void *src, int n,
  1392 		     RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
  1393 {
  1394     int i;
  1395     Uint32 *s = src;
  1396     for(i = 0; i < n; i++) {
  1397 	unsigned r, g, b, a;
  1398 	Uint32 pixel = *s++;
  1399 	RGB_FROM_PIXEL(pixel, sfmt, r, g, b);
  1400 	a = pixel >> 24;
  1401 	PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, a);
  1402 	dst++;
  1403     }
  1404     return n * 4;
  1405 }
  1406 
  1407 #define ISOPAQUE(pixel, fmt) ((((pixel) & fmt->Amask) >> fmt->Ashift) == 255)
  1408 
  1409 #define ISTRANSL(pixel, fmt)	\
  1410     ((unsigned)((((pixel) & fmt->Amask) >> fmt->Ashift) - 1U) < 254U)
  1411 
  1412 /* convert surface to be quickly alpha-blittable onto dest, if possible */
  1413 static int RLEAlphaSurface(SDL_Surface *surface)
  1414 {
  1415     SDL_Surface *dest;
  1416     SDL_PixelFormat *df;
  1417     int maxsize = 0;
  1418     int max_opaque_run;
  1419     int max_transl_run = 65535;
  1420     unsigned masksum;
  1421     Uint8 *rlebuf, *dst;
  1422     int (*copy_opaque)(void *, Uint32 *, int,
  1423 		       SDL_PixelFormat *, SDL_PixelFormat *);
  1424     int (*copy_transl)(void *, Uint32 *, int,
  1425 		       SDL_PixelFormat *, SDL_PixelFormat *);
  1426 
  1427     dest = surface->map->dst;
  1428     if(!dest)
  1429 	return -1;
  1430     df = dest->format;
  1431     if(surface->format->BitsPerPixel != 32)
  1432 	return -1;		/* only 32bpp source supported */
  1433 
  1434     /* find out whether the destination is one we support,
  1435        and determine the max size of the encoded result */
  1436     masksum = df->Rmask | df->Gmask | df->Bmask;
  1437     switch(df->BytesPerPixel) {
  1438     case 2:
  1439 	/* 16bpp: only support 565 and 555 formats */
  1440 	switch(masksum) {
  1441 	case 0xffff:
  1442 	    if(df->Gmask == 0x07e0
  1443 	       || df->Rmask == 0x07e0 || df->Bmask == 0x07e0) {
  1444 		copy_opaque = copy_opaque_16;
  1445 		copy_transl = copy_transl_565;
  1446 	    } else
  1447 		return -1;
  1448 	    break;
  1449 	case 0x7fff:
  1450 	    if(df->Gmask == 0x03e0
  1451 	       || df->Rmask == 0x03e0 || df->Bmask == 0x03e0) {
  1452 		copy_opaque = copy_opaque_16;
  1453 		copy_transl = copy_transl_555;
  1454 	    } else
  1455 		return -1;
  1456 	    break;
  1457 	default:
  1458 	    return -1;
  1459 	}
  1460 	max_opaque_run = 255;	/* runs stored as bytes */
  1461 
  1462 	/* worst case is alternating opaque and translucent pixels,
  1463 	   with room for alignment padding between lines */
  1464 	maxsize = surface->h * (2 + (4 + 2) * (surface->w + 1)) + 2;
  1465 	break;
  1466     case 4:
  1467 	if(masksum != 0x00ffffff)
  1468 	    return -1;		/* requires unused high byte */
  1469 	copy_opaque = copy_32;
  1470 	copy_transl = copy_32;
  1471 	max_opaque_run = 255;	/* runs stored as short ints */
  1472 
  1473 	/* worst case is alternating opaque and translucent pixels */
  1474 	maxsize = surface->h * 2 * 4 * (surface->w + 1) + 4;
  1475 	break;
  1476     default:
  1477 	return -1;		/* anything else unsupported right now */
  1478     }
  1479 
  1480     maxsize += sizeof(RLEDestFormat);
  1481     rlebuf = (Uint8 *)malloc(maxsize);
  1482     if(!rlebuf) {
  1483 	SDL_OutOfMemory();
  1484 	return -1;
  1485     }
  1486     {
  1487 	/* save the destination format so we can undo the encoding later */
  1488 	RLEDestFormat *r = (RLEDestFormat *)rlebuf;
  1489 	r->BytesPerPixel = df->BytesPerPixel;
  1490 	r->Rloss = df->Rloss;
  1491 	r->Gloss = df->Gloss;
  1492 	r->Bloss = df->Bloss;
  1493 	r->Rshift = df->Rshift;
  1494 	r->Gshift = df->Gshift;
  1495 	r->Bshift = df->Bshift;
  1496 	r->Ashift = df->Ashift;
  1497 	r->Rmask = df->Rmask;
  1498 	r->Gmask = df->Gmask;
  1499 	r->Bmask = df->Bmask;
  1500 	r->Amask = df->Amask;
  1501     }
  1502     dst = rlebuf + sizeof(RLEDestFormat);
  1503 
  1504     /* Do the actual encoding */
  1505     {
  1506 	int x, y;
  1507 	int h = surface->h, w = surface->w;
  1508 	SDL_PixelFormat *sf = surface->format;
  1509 	Uint32 *src = (Uint32 *)surface->pixels;
  1510 	Uint8 *lastline = dst;	/* end of last non-blank line */
  1511 
  1512 	/* opaque counts are 8 or 16 bits, depending on target depth */
  1513 #define ADD_OPAQUE_COUNTS(n, m)			\
  1514 	if(df->BytesPerPixel == 4) {		\
  1515 	    ((Uint16 *)dst)[0] = n;		\
  1516 	    ((Uint16 *)dst)[1] = m;		\
  1517 	    dst += 4;				\
  1518 	} else {				\
  1519 	    dst[0] = n;				\
  1520 	    dst[1] = m;				\
  1521 	    dst += 2;				\
  1522 	}
  1523 
  1524 	/* translucent counts are always 16 bit */
  1525 #define ADD_TRANSL_COUNTS(n, m)		\
  1526 	(((Uint16 *)dst)[0] = n, ((Uint16 *)dst)[1] = m, dst += 4)
  1527 
  1528 	for(y = 0; y < h; y++) {
  1529 	    int runstart, skipstart;
  1530 	    int blankline = 0;
  1531 	    /* First encode all opaque pixels of a scan line */
  1532 	    x = 0;
  1533 	    do {
  1534 		int run, skip, len;
  1535 		skipstart = x;
  1536 		while(x < w && !ISOPAQUE(src[x], sf))
  1537 		    x++;
  1538 		runstart = x;
  1539 		while(x < w && ISOPAQUE(src[x], sf))
  1540 		    x++;
  1541 		skip = runstart - skipstart;
  1542 		if(skip == w)
  1543 		    blankline = 1;
  1544 		run = x - runstart;
  1545 		while(skip > max_opaque_run) {
  1546 		    ADD_OPAQUE_COUNTS(max_opaque_run, 0);
  1547 		    skip -= max_opaque_run;
  1548 		}
  1549 		len = MIN(run, max_opaque_run);
  1550 		ADD_OPAQUE_COUNTS(skip, len);
  1551 		dst += copy_opaque(dst, src + runstart, len, sf, df);
  1552 		runstart += len;
  1553 		run -= len;
  1554 		while(run) {
  1555 		    len = MIN(run, max_opaque_run);
  1556 		    ADD_OPAQUE_COUNTS(0, len);
  1557 		    dst += copy_opaque(dst, src + runstart, len, sf, df);
  1558 		    runstart += len;
  1559 		    run -= len;
  1560 		}
  1561 	    } while(x < w);
  1562 
  1563 	    /* Make sure the next output address is 32-bit aligned */
  1564 	    dst += (unsigned long)dst & 2;
  1565 
  1566 	    /* Next, encode all translucent pixels of the same scan line */
  1567 	    x = 0;
  1568 	    do {
  1569 		int run, skip, len;
  1570 		skipstart = x;
  1571 		while(x < w && !ISTRANSL(src[x], sf))
  1572 		    x++;
  1573 		runstart = x;
  1574 		while(x < w && ISTRANSL(src[x], sf))
  1575 		    x++;
  1576 		skip = runstart - skipstart;
  1577 		blankline &= (skip == w);
  1578 		run = x - runstart;
  1579 		while(skip > max_transl_run) {
  1580 		    ADD_TRANSL_COUNTS(max_transl_run, 0);
  1581 		    skip -= max_transl_run;
  1582 		}
  1583 		len = MIN(run, max_transl_run);
  1584 		ADD_TRANSL_COUNTS(skip, len);
  1585 		dst += copy_transl(dst, src + runstart, len, sf, df);
  1586 		runstart += len;
  1587 		run -= len;
  1588 		while(run) {
  1589 		    len = MIN(run, max_transl_run);
  1590 		    ADD_TRANSL_COUNTS(0, len);
  1591 		    dst += copy_transl(dst, src + runstart, len, sf, df);
  1592 		    runstart += len;
  1593 		    run -= len;
  1594 		}
  1595 		if(!blankline)
  1596 		    lastline = dst;
  1597 	    } while(x < w);
  1598 
  1599 	    src += surface->pitch >> 2;
  1600 	}
  1601 	dst = lastline;		/* back up past trailing blank lines */
  1602 	ADD_OPAQUE_COUNTS(0, 0);
  1603     }
  1604 
  1605 #undef ADD_OPAQUE_COUNTS
  1606 #undef ADD_TRANSL_COUNTS
  1607 
  1608     /* Now that we have it encoded, release the original pixels */
  1609     if((surface->flags & SDL_PREALLOC) != SDL_PREALLOC
  1610        && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
  1611 	free( surface->pixels );
  1612 	surface->pixels = NULL;
  1613     }
  1614 
  1615     /* realloc the buffer to release unused memory */
  1616     {
  1617 	Uint8 *p = realloc(rlebuf, dst - rlebuf);
  1618 	if(!p)
  1619 	    p = rlebuf;
  1620 	surface->map->sw_data->aux_data = p;
  1621     }
  1622 
  1623     return 0;
  1624 }
  1625 
  1626 static Uint32 getpix_8(Uint8 *srcbuf)
  1627 {
  1628     return *srcbuf;
  1629 }
  1630 
  1631 static Uint32 getpix_16(Uint8 *srcbuf)
  1632 {
  1633     return *(Uint16 *)srcbuf;
  1634 }
  1635 
  1636 static Uint32 getpix_24(Uint8 *srcbuf)
  1637 {
  1638     if(SDL_BYTEORDER == SDL_LIL_ENDIAN)
  1639 	return srcbuf[0] + (srcbuf[1] << 8) + (srcbuf[2] << 16);
  1640     else
  1641 	return (srcbuf[0] << 16) + (srcbuf[1] << 8) + srcbuf[2];
  1642 }
  1643 
  1644 static Uint32 getpix_32(Uint8 *srcbuf)
  1645 {
  1646     return *(Uint32 *)srcbuf;
  1647 }
  1648 
  1649 typedef Uint32 (*getpix_func)(Uint8 *);
  1650 
  1651 static getpix_func getpixes[4] = {
  1652     getpix_8, getpix_16, getpix_24, getpix_32
  1653 };
  1654 
  1655 static int RLEColorkeySurface(SDL_Surface *surface)
  1656 {
  1657         Uint8 *rlebuf, *dst;
  1658 	int maxn;
  1659 	int y;
  1660 	Uint8 *srcbuf, *curbuf, *lastline;
  1661 	int maxsize = 0;
  1662 	int skip, run;
  1663 	int bpp = surface->format->BytesPerPixel;
  1664 	getpix_func getpix;
  1665 	Uint32 ckey, rgbmask;
  1666 	int w, h;
  1667 
  1668 	/* calculate the worst case size for the compressed surface */
  1669 	switch(bpp) {
  1670 	case 1:
  1671 	    /* worst case is alternating opaque and transparent pixels,
  1672 	       starting with an opaque pixel */
  1673 	    maxsize = surface->h * 3 * (surface->w / 2 + 1) + 2;
  1674 	    break;
  1675 	case 2:
  1676 	case 3:
  1677 	    /* worst case is solid runs, at most 255 pixels wide */
  1678 	    maxsize = surface->h * (2 * (surface->w / 255 + 1)
  1679 				    + surface->w * bpp) + 2;
  1680 	    break;
  1681 	case 4:
  1682 	    /* worst case is solid runs, at most 65535 pixels wide */
  1683 	    maxsize = surface->h * (4 * (surface->w / 65535 + 1)
  1684 				    + surface->w * 4) + 4;
  1685 	    break;
  1686 	}
  1687 
  1688 	rlebuf = (Uint8 *)malloc(maxsize);
  1689 	if ( rlebuf == NULL ) {
  1690 		SDL_OutOfMemory();
  1691 		return(-1);
  1692 	}
  1693 
  1694 	/* Set up the conversion */
  1695 	srcbuf = (Uint8 *)surface->pixels;
  1696 	curbuf = srcbuf;
  1697 	maxn = bpp == 4 ? 65535 : 255;
  1698 	skip = run = 0;
  1699 	dst = rlebuf;
  1700 	rgbmask = ~surface->format->Amask;
  1701 	ckey = surface->format->colorkey & rgbmask;
  1702 	lastline = dst;
  1703 	getpix = getpixes[bpp - 1];
  1704 	w = surface->w;
  1705 	h = surface->h;
  1706 
  1707 #define ADD_COUNTS(n, m)			\
  1708 	if(bpp == 4) {				\
  1709 	    ((Uint16 *)dst)[0] = n;		\
  1710 	    ((Uint16 *)dst)[1] = m;		\
  1711 	    dst += 4;				\
  1712 	} else {				\
  1713 	    dst[0] = n;				\
  1714 	    dst[1] = m;				\
  1715 	    dst += 2;				\
  1716 	}
  1717 
  1718 	for(y = 0; y < h; y++) {
  1719 	    int x = 0;
  1720 	    int blankline = 0;
  1721 	    do {
  1722 		int run, skip, len;
  1723 		int runstart;
  1724 		int skipstart = x;
  1725 
  1726 		/* find run of transparent, then opaque pixels */
  1727 		while(x < w && (getpix(srcbuf + x * bpp) & rgbmask) == ckey)
  1728 		    x++;
  1729 		runstart = x;
  1730 		while(x < w && (getpix(srcbuf + x * bpp) & rgbmask) != ckey)
  1731 		    x++;
  1732 		skip = runstart - skipstart;
  1733 		if(skip == w)
  1734 		    blankline = 1;
  1735 		run = x - runstart;
  1736 
  1737 		/* encode segment */
  1738 		while(skip > maxn) {
  1739 		    ADD_COUNTS(maxn, 0);
  1740 		    skip -= maxn;
  1741 		}
  1742 		len = MIN(run, maxn);
  1743 		ADD_COUNTS(skip, len);
  1744 		memcpy(dst, srcbuf + runstart * bpp, len * bpp);
  1745 		dst += len * bpp;
  1746 		run -= len;
  1747 		runstart += len;
  1748 		while(run) {
  1749 		    len = MIN(run, maxn);
  1750 		    ADD_COUNTS(0, len);
  1751 		    memcpy(dst, srcbuf + runstart * bpp, len * bpp);
  1752 		    dst += len * bpp;
  1753 		    runstart += len;
  1754 		    run -= len;
  1755 		}
  1756 		if(!blankline)
  1757 		    lastline = dst;
  1758 	    } while(x < w);
  1759 
  1760 	    srcbuf += surface->pitch;
  1761 	}
  1762 	dst = lastline;		/* back up bast trailing blank lines */
  1763 	ADD_COUNTS(0, 0);
  1764 
  1765 #undef ADD_COUNTS
  1766 
  1767 	/* Now that we have it encoded, release the original pixels */
  1768 	if((surface->flags & SDL_PREALLOC) != SDL_PREALLOC
  1769 	   && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
  1770 	    free( surface->pixels );
  1771 	    surface->pixels = NULL;
  1772 	}
  1773 
  1774 	/* realloc the buffer to release unused memory */
  1775 	{
  1776 	    /* If realloc returns NULL, the original block is left intact */
  1777 	    Uint8 *p = realloc(rlebuf, dst - rlebuf);
  1778 	    if(!p)
  1779 		p = rlebuf;
  1780 	    surface->map->sw_data->aux_data = p;
  1781 	}
  1782 
  1783 	return(0);
  1784 }
  1785 
  1786 int SDL_RLESurface(SDL_Surface *surface)
  1787 {
  1788 	int retcode;
  1789 
  1790 	/* Clear any previous RLE conversion */
  1791 	if ( (surface->flags & SDL_RLEACCEL) == SDL_RLEACCEL ) {
  1792 		SDL_UnRLESurface(surface, 1);
  1793 	}
  1794 
  1795 	/* We don't support RLE encoding of bitmaps */
  1796 	if ( surface->format->BitsPerPixel < 8 ) {
  1797 		return(-1);
  1798 	}
  1799 
  1800 	/* Lock the surface if it's in hardware */
  1801 	if ( SDL_MUSTLOCK(surface) ) {
  1802 		if ( SDL_LockSurface(surface) < 0 ) {
  1803 			return(-1);
  1804 		}
  1805 	}
  1806 
  1807 	/* Encode */
  1808 	if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
  1809 	    retcode = RLEColorkeySurface(surface);
  1810 	} else {
  1811 	    if((surface->flags & SDL_SRCALPHA) == SDL_SRCALPHA
  1812 	       && surface->format->Amask != 0)
  1813 		retcode = RLEAlphaSurface(surface);
  1814 	    else
  1815 		retcode = -1;	/* no RLE for per-surface alpha sans ckey */
  1816 	}
  1817 
  1818 	/* Unlock the surface if it's in hardware */
  1819 	if ( SDL_MUSTLOCK(surface) ) {
  1820 		SDL_UnlockSurface(surface);
  1821 	}
  1822 
  1823 	if(retcode < 0)
  1824 	    return -1;
  1825 
  1826 	/* The surface is now accelerated */
  1827 	surface->flags |= SDL_RLEACCEL;
  1828 
  1829 	return(0);
  1830 }
  1831 
  1832 /*
  1833  * Un-RLE a surface with pixel alpha
  1834  * This may not give back exactly the image before RLE-encoding; all
  1835  * completely transparent pixels will be lost, and colour and alpha depth
  1836  * may have been reduced (when encoding for 16bpp targets).
  1837  */
  1838 static void UnRLEAlpha(SDL_Surface *surface)
  1839 {
  1840     Uint8 *srcbuf;
  1841     Uint32 *dst;
  1842     SDL_PixelFormat *sf = surface->format;
  1843     RLEDestFormat *df = surface->map->sw_data->aux_data;
  1844     int (*uncopy_opaque)(Uint32 *, void *, int,
  1845 			 RLEDestFormat *, SDL_PixelFormat *);
  1846     int (*uncopy_transl)(Uint32 *, void *, int,
  1847 			 RLEDestFormat *, SDL_PixelFormat *);
  1848     int w = surface->w;
  1849     int bpp = df->BytesPerPixel;
  1850 
  1851     if(bpp == 2) {
  1852 	uncopy_opaque = uncopy_opaque_16;
  1853 	uncopy_transl = uncopy_transl_16;
  1854     } else {
  1855 	uncopy_opaque = uncopy_transl = uncopy_32;
  1856     }
  1857 
  1858     surface->pixels = malloc(surface->h * surface->pitch);
  1859     /* fill background with transparent pixels */
  1860     memset(surface->pixels, 0, surface->h * surface->pitch);
  1861 
  1862     dst = surface->pixels;
  1863     srcbuf = (Uint8 *)(df + 1);
  1864     for(;;) {
  1865 	/* copy opaque pixels */
  1866 	int ofs = 0;
  1867 	do {
  1868 	    unsigned run;
  1869 	    if(bpp == 2) {
  1870 		ofs += srcbuf[0];
  1871 		run = srcbuf[1];
  1872 		srcbuf += 2;
  1873 	    } else {
  1874 		ofs += ((Uint16 *)srcbuf)[0];
  1875 		run = ((Uint16 *)srcbuf)[1];
  1876 		srcbuf += 4;
  1877 	    }
  1878 	    if(run) {
  1879 		srcbuf += uncopy_opaque(dst + ofs, srcbuf, run, df, sf);
  1880 		ofs += run;
  1881 	    } else if(!ofs)
  1882 		return;
  1883 	} while(ofs < w);
  1884 
  1885 	/* skip padding if needed */
  1886 	if(bpp == 2)
  1887 	    srcbuf += (unsigned long)srcbuf & 2;
  1888 	
  1889 	/* copy translucent pixels */
  1890 	ofs = 0;
  1891 	do {
  1892 	    unsigned run;
  1893 	    ofs += ((Uint16 *)srcbuf)[0];
  1894 	    run = ((Uint16 *)srcbuf)[1];
  1895 	    srcbuf += 4;
  1896 	    if(run) {
  1897 		srcbuf += uncopy_transl(dst + ofs, srcbuf, run, df, sf);
  1898 		ofs += run;
  1899 	    }
  1900 	} while(ofs < w);
  1901 	dst += surface->pitch >> 2;
  1902     }
  1903 }
  1904 
  1905 void SDL_UnRLESurface(SDL_Surface *surface, int recode)
  1906 {
  1907     if ( (surface->flags & SDL_RLEACCEL) == SDL_RLEACCEL ) {
  1908 	surface->flags &= ~SDL_RLEACCEL;
  1909 
  1910 	if(recode && (surface->flags & SDL_PREALLOC) != SDL_PREALLOC
  1911 	   && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
  1912 	    if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
  1913 		SDL_Rect full;
  1914 		unsigned alpha_flag;
  1915 
  1916 		/* re-create the original surface */
  1917 		surface->pixels = malloc(surface->h * surface->pitch);
  1918 
  1919 		/* fill it with the background colour */
  1920 		SDL_FillRect(surface, NULL, surface->format->colorkey);
  1921 
  1922 		/* now render the encoded surface */
  1923 		full.x = full.y = 0;
  1924 		full.w = surface->w;
  1925 		full.h = surface->h;
  1926 		alpha_flag = surface->flags & SDL_SRCALPHA;
  1927 		surface->flags &= ~SDL_SRCALPHA; /* opaque blit */
  1928 		SDL_RLEBlit(surface, &full, surface, &full);
  1929 		surface->flags |= alpha_flag;
  1930 	    } else
  1931 		UnRLEAlpha(surface);
  1932 	}
  1933 
  1934 	if ( surface->map && surface->map->sw_data->aux_data ) {
  1935 	    free(surface->map->sw_data->aux_data);
  1936 	    surface->map->sw_data->aux_data = NULL;
  1937 	}
  1938     }
  1939 }
  1940 
  1941