/*
 * SDL_blit_N.c — recovered from a scrape of an archived (read-only since
 * 2021-02-11) GitHub repository; web-UI line numbers have been removed.
 */
/*
    SDL - Simple DirectMedia Layer
    Copyright (C) 1997-2006 Sam Lantinga

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

    Sam Lantinga
    slouken@libsdl.org
*/
#include "SDL_config.h"

#include "SDL_video.h"
#include "SDL_endian.h"
#include "SDL_cpuinfo.h"
#include "SDL_blit.h"

/* Functions to blit from N-bit surfaces to other surfaces */

#if SDL_ALTIVEC_BLITTERS
#if __MWERKS__
#pragma altivec_model on
#endif
#ifdef HAVE_ALTIVEC_H
#include <altivec.h>
#endif
/* Compile out assert() in this translation unit; the asserts below are
   documentation of loop invariants, not runtime checks. */
#define assert(X)
#ifdef __MACOSX__
40
#include <sys/sysctl.h>
41
static size_t
42
GetL3CacheSize(void)
43
44
45
{
const char key[] = "hw.l3cachesize";
u_int64_t result = 0;
46
size_t typeSize = sizeof(result);
47
48
49
int err = sysctlbyname(key, &result, &typeSize, NULL, 0);
50
51
if (0 != err)
return 0;
52
53
54
55
return result;
}
#else
56
static size_t
57
GetL3CacheSize(void)
58
59
60
61
{
/* XXX: Just guess G4 */
return 2097152;
}
62
#endif /* __MACOSX__ */
/* Vector literal syntax differs between old Apple GCC (parentheses) and
   standard AltiVec C (braces); hide the difference behind macros. */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
    (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
    (vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
    (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
    (vector unsigned short) { a,b,c,d,e,f,g,h }
#endif
/* Nonzero when x is not on a 16-byte (AltiVec vector) boundary. */
#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
/* Build a vec_perm control vector that picks byte (a,b,c,d) of each of
   four 32-bit lanes. */
#define VSWIZZLE32(a,b,c,d) (vector unsigned char) \
                             ( 0x00+a, 0x00+b, 0x00+c, 0x00+d, \
                               0x04+a, 0x04+b, 0x04+c, 0x04+d, \
                               0x08+a, 0x08+b, 0x08+c, 0x08+d, \
                               0x0C+a, 0x0C+b, 0x0C+c, 0x0C+d )

/* Assemble a 32-bit pixel from r/g/b/a using the destination format's
   shifts and masks. */
#define MAKE8888(dstfmt, r, g, b, a)  \
    ( ((r<<dstfmt->Rshift)&dstfmt->Rmask) | \
      ((g<<dstfmt->Gshift)&dstfmt->Gmask) | \
      ((b<<dstfmt->Bshift)&dstfmt->Bmask) | \
      ((a<<dstfmt->Ashift)&dstfmt->Amask) )

/*
 * Data Stream Touch...Altivec cache prefetching.
 *
 *  Don't use this on a G5...however, the speed boost is very significant
 *   on a G4.
 */
#define DST_CHAN_SRC 1
#define DST_CHAN_DEST 2

/* macro to set DST control word value... */
#define DST_CTRL(size, count, stride) \
    (((size) << 24) | ((count) << 16) | (stride))

/* Permute control for loading possibly-unaligned source vectors with
   vec_lvsl; the aligned case is biased by 8 so the same vec_perm works. */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, src) \
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
/* Calculate the permute vector used for 32->32 swizzling */
107
static vector unsigned char
108
calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
109
110
{
/*
111
* We have to assume that the bits that aren't used by other
112
113
114
115
116
117
118
119
120
* colors is alpha, and it's one complete byte, since some formats
* leave alpha with a zero mask, but we should still swizzle the bits.
*/
/* ARGB */
const static struct SDL_PixelFormat default_pixel_format = {
NULL, 0, 0,
0, 0, 0, 0,
16, 8, 0, 24,
0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
121
122
0, 0
};
123
124
125
126
127
128
if (!srcfmt) {
srcfmt = &default_pixel_format;
}
if (!dstfmt) {
dstfmt = &default_pixel_format;
}
129
130
131
132
133
const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
0x04, 0x04, 0x04, 0x04,
0x08, 0x08, 0x08, 0x08,
0x0C, 0x0C, 0x0C,
0x0C);
134
135
136
vector unsigned char vswiz;
vector unsigned int srcvec;
#define RESHIFT(X) (3 - ((X) >> 3))
137
138
139
Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
140
141
142
Uint32 amask;
/* Use zero for alpha if either surface doesn't have alpha */
if (dstfmt->Amask) {
143
amask =
144
145
((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->
Ashift);
146
147
148
149
150
151
152
} else {
amask =
0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
0xFFFFFFFF);
}
#undef RESHIFT
((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
153
vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
154
return (vswiz);
155
156
}
static void Blit_RGB888_RGB565(SDL_BlitInfo * info);

/* AltiVec blit: 32-bit RGB888 source to 16-bit RGB565 destination.
   Scalar pixels are emitted until dst is 16-byte aligned, then 8 pixels
   are converted per vector iteration, then the remainder is done scalar. */
static void
Blit_RGB888_RGB565Altivec(SDL_BlitInfo * info)
{
    int height = info->d_height;
    Uint8 *src = (Uint8 *) info->s_pixels;
    int srcskip = info->s_skip;
    Uint8 *dst = (Uint8 *) info->d_pixels;
    int dstskip = info->d_skip;
    SDL_PixelFormat *srcfmt = info->src;
    vector unsigned char valpha = vec_splat_u8(0);
    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
    /* Gathers the green bytes of 8 pixels from the two source vectors. */
    vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
                                                    0x00, 0x0a, 0x00, 0x0e,
                                                    0x00, 0x12, 0x00, 0x16,
                                                    0x00, 0x1a, 0x00, 0x1e);
    vector unsigned short v1 = vec_splat_u16(1);
    vector unsigned short v3 = vec_splat_u16(3);
    vector unsigned short v3f =
        VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
                          0x003f, 0x003f, 0x003f, 0x003f);
    vector unsigned short vfc =
        VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
                          0x00fc, 0x00fc, 0x00fc, 0x00fc);
    /* 0xF800 per lane: splat 0xF9, shift left 8. */
    vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
    vf800 = vec_sl(vf800, vec_splat_u16(8));

    while (height--) {
        vector unsigned char valigner;
        vector unsigned char voverflow;
        vector unsigned char vsrc;

        int width = info->d_width;
        int extrawidth;

        /* do scalar until we can align... */
#define ONE_PIXEL_BLEND(condition, widthvar) \
    while (condition) { \
        Uint32 Pixel; \
        unsigned sR, sG, sB, sA; \
        DISEMBLE_RGBA((Uint8 *)src, 4, srcfmt, Pixel, \
                      sR, sG, sB, sA); \
        *(Uint16 *)(dst) = (((sR << 8) & 0x0000F800) | \
                            ((sG << 3) & 0x000007E0) | \
                            ((sB >> 3) & 0x0000001F)); \
        dst += 2; \
        src += 4; \
        widthvar--; \
    }
        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);

        /* After all that work, here's the vector part! */
        extrawidth = (width % 8);       /* trailing unaligned stores */
        width -= extrawidth;
        vsrc = vec_ld(0, src);
        valigner = VEC_ALIGNER(src);

        while (width) {
            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
            vector unsigned int vsrc1, vsrc2;
            vector unsigned char vdst;

            voverflow = vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);
            vsrc1 = (vector unsigned int) vec_perm(vsrc, valpha, vpermute);
            src += 16;
            vsrc = voverflow;
            voverflow = vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);
            vsrc2 = (vector unsigned int) vec_perm(vsrc, valpha, vpermute);
            /* 1555 */
            vpixel = (vector unsigned short) vec_packpx(vsrc1, vsrc2);
            vgpixel = (vector unsigned short) vec_perm(vsrc1, vsrc2, vgmerge);
            vgpixel = vec_and(vgpixel, vfc);
            vgpixel = vec_sl(vgpixel, v3);
            vrpixel = vec_sl(vpixel, v1);
            vrpixel = vec_and(vrpixel, vf800);
            vbpixel = vec_and(vpixel, v3f);
            vdst =
                vec_or((vector unsigned char) vrpixel,
                       (vector unsigned char) vgpixel);
            /* 565 */
            vdst = vec_or(vdst, (vector unsigned char) vbpixel);
            vec_st(vdst, 0, dst);

            width -= 8;
            src += 16;
            dst += 16;
            vsrc = voverflow;
        }

        assert(width == 0);

        /* do scalar until we can align... */
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND

        src += srcskip;         /* move to next row, accounting for pitch. */
        dst += dstskip;
    }
}
static void
263
Blit_RGB565_32Altivec(SDL_BlitInfo * info)
264
{
265
266
267
268
269
270
271
272
273
274
275
int height = info->d_height;
Uint8 *src = (Uint8 *) info->s_pixels;
int srcskip = info->s_skip;
Uint8 *dst = (Uint8 *) info->d_pixels;
int dstskip = info->d_skip;
SDL_PixelFormat *srcfmt = info->src;
SDL_PixelFormat *dstfmt = info->dst;
unsigned alpha;
vector unsigned char valpha;
vector unsigned char vpermute;
vector unsigned short vf800;
276
277
278
279
vector unsigned int v8 = vec_splat_u32(8);
vector unsigned int v16 = vec_add(v8, v8);
vector unsigned short v2 = vec_splat_u16(2);
vector unsigned short v3 = vec_splat_u16(3);
280
/*
281
282
283
284
0x10 - 0x1f is the alpha
0x00 - 0x0e evens are the red
0x01 - 0x0f odds are zero
*/
285
286
287
288
289
vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
0x10, 0x02, 0x01, 0x01,
0x10, 0x04, 0x01, 0x01,
0x10, 0x06, 0x01,
0x01);
290
291
vector unsigned char vredalpha2 =
(vector unsigned
292
char) (vec_add((vector unsigned int) vredalpha1, vec_sl(v8, v16))
293
);
294
/*
295
296
297
0x00 - 0x0f is ARxx ARxx ARxx ARxx
0x11 - 0x0f odds are blue
*/
298
299
300
301
vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
0x04, 0x05, 0x06, 0x13,
0x08, 0x09, 0x0a, 0x15,
0x0c, 0x0d, 0x0e, 0x17);
302
vector unsigned char vblue2 =
303
(vector unsigned char) (vec_add((vector unsigned int) vblue1, v8)
304
);
305
/*
306
307
308
0x00 - 0x0f is ARxB ARxB ARxB ARxB
0x10 - 0x0e evens are green
*/
309
310
311
312
vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
0x04, 0x05, 0x12, 0x07,
0x08, 0x09, 0x14, 0x0b,
0x0c, 0x0d, 0x16, 0x0f);
313
314
vector unsigned char vgreen2 =
(vector unsigned
315
char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8, v8))
316
317
318
);
319
320
assert(srcfmt->BytesPerPixel == 2);
assert(dstfmt->BytesPerPixel == 4);
321
322
323
vf800 = (vector unsigned short) vec_splat_u8(-7);
vf800 = vec_sl(vf800, vec_splat_u16(8));
324
325
if (dstfmt->Amask && srcfmt->alpha) {
326
((unsigned char *) &valpha)[0] = alpha = srcfmt->alpha;
327
valpha = vec_splat(valpha, 0);
328
329
} else {
alpha = 0;
330
valpha = vec_splat_u8(0);
331
332
}
333
vpermute = calc_swizzle32(NULL, dstfmt);
334
335
336
337
338
339
340
341
342
343
344
345
while (height--) {
vector unsigned char valigner;
vector unsigned char voverflow;
vector unsigned char vsrc;
int width = info->d_width;
int extrawidth;
/* do scalar until we can align... */
#define ONE_PIXEL_BLEND(condition, widthvar) \
while (condition) { \
unsigned sR, sG, sB; \
346
347
348
349
unsigned short Pixel = *((unsigned short *)src); \
sR = (Pixel >> 8) & 0xf8; \
sG = (Pixel >> 3) & 0xfc; \
sB = (Pixel << 3) & 0xf8; \
350
351
352
353
354
ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
src += 2; \
dst += 4; \
widthvar--; \
}
355
ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
356
357
/* After all that work, here's the vector part! */
358
extrawidth = (width % 8); /* trailing unaligned stores */
359
width -= extrawidth;
360
361
vsrc = vec_ld(0, src);
valigner = VEC_ALIGNER(src);
362
363
364
365
366
while (width) {
vector unsigned short vR, vG, vB;
vector unsigned char vdst1, vdst2;
367
368
voverflow = vec_ld(15, src);
vsrc = vec_perm(vsrc, voverflow, valigner);
369
370
371
372
vR = vec_and((vector unsigned short) vsrc, vf800);
vB = vec_sl((vector unsigned short) vsrc, v3);
vG = vec_sl(vB, v2);
373
374
vdst1 =
375
376
377
378
379
380
(vector unsigned char) vec_perm((vector unsigned char) vR,
valpha, vredalpha1);
vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
vdst1 = vec_perm(vdst1, valpha, vpermute);
vec_st(vdst1, 0, dst);
381
382
vdst2 =
383
384
385
386
387
388
(vector unsigned char) vec_perm((vector unsigned char) vR,
valpha, vredalpha2);
vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
vdst2 = vec_perm(vdst2, valpha, vpermute);
vec_st(vdst2, 16, dst);
389
390
391
392
393
394
395
width -= 8;
dst += 32;
src += 16;
vsrc = voverflow;
}
396
assert(width == 0);
397
398
399
/* do scalar until we can align... */
400
ONE_PIXEL_BLEND((extrawidth), extrawidth);
401
402
#undef ONE_PIXEL_BLEND
403
src += srcskip; /* move to next row, accounting for pitch. */
404
405
406
407
408
dst += dstskip;
}
}
static void
411
Blit_RGB555_32Altivec(SDL_BlitInfo * info)
412
{
413
414
415
416
417
418
419
420
421
422
423
int height = info->d_height;
Uint8 *src = (Uint8 *) info->s_pixels;
int srcskip = info->s_skip;
Uint8 *dst = (Uint8 *) info->d_pixels;
int dstskip = info->d_skip;
SDL_PixelFormat *srcfmt = info->src;
SDL_PixelFormat *dstfmt = info->dst;
unsigned alpha;
vector unsigned char valpha;
vector unsigned char vpermute;
vector unsigned short vf800;
424
425
426
427
vector unsigned int v8 = vec_splat_u32(8);
vector unsigned int v16 = vec_add(v8, v8);
vector unsigned short v1 = vec_splat_u16(1);
vector unsigned short v3 = vec_splat_u16(3);
428
/*
429
430
431
432
0x10 - 0x1f is the alpha
0x00 - 0x0e evens are the red
0x01 - 0x0f odds are zero
*/
433
434
435
436
437
vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
0x10, 0x02, 0x01, 0x01,
0x10, 0x04, 0x01, 0x01,
0x10, 0x06, 0x01,
0x01);
438
439
vector unsigned char vredalpha2 =
(vector unsigned
440
char) (vec_add((vector unsigned int) vredalpha1, vec_sl(v8, v16))
441
);
442
/*
443
444
445
0x00 - 0x0f is ARxx ARxx ARxx ARxx
0x11 - 0x0f odds are blue
*/
446
447
448
449
vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
0x04, 0x05, 0x06, 0x13,
0x08, 0x09, 0x0a, 0x15,
0x0c, 0x0d, 0x0e, 0x17);
450
vector unsigned char vblue2 =
451
(vector unsigned char) (vec_add((vector unsigned int) vblue1, v8)
452
);
453
/*
454
455
456
0x00 - 0x0f is ARxB ARxB ARxB ARxB
0x10 - 0x0e evens are green
*/
457
458
459
460
vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
0x04, 0x05, 0x12, 0x07,
0x08, 0x09, 0x14, 0x0b,
0x0c, 0x0d, 0x16, 0x0f);
461
462
vector unsigned char vgreen2 =
(vector unsigned
463
char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8, v8))
464
465
466
);
467
468
assert(srcfmt->BytesPerPixel == 2);
assert(dstfmt->BytesPerPixel == 4);
469
470
471
vf800 = (vector unsigned short) vec_splat_u8(-7);
vf800 = vec_sl(vf800, vec_splat_u16(8));
472
473
if (dstfmt->Amask && srcfmt->alpha) {
474
((unsigned char *) &valpha)[0] = alpha = srcfmt->alpha;
475
valpha = vec_splat(valpha, 0);
476
477
} else {
alpha = 0;
478
valpha = vec_splat_u8(0);
479
480
}
481
vpermute = calc_swizzle32(NULL, dstfmt);
482
483
484
485
486
487
488
489
490
491
492
493
while (height--) {
vector unsigned char valigner;
vector unsigned char voverflow;
vector unsigned char vsrc;
int width = info->d_width;
int extrawidth;
/* do scalar until we can align... */
#define ONE_PIXEL_BLEND(condition, widthvar) \
while (condition) { \
unsigned sR, sG, sB; \
494
495
496
497
unsigned short Pixel = *((unsigned short *)src); \
sR = (Pixel >> 7) & 0xf8; \
sG = (Pixel >> 2) & 0xf8; \
sB = (Pixel << 3) & 0xf8; \
498
499
500
501
502
ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
src += 2; \
dst += 4; \
widthvar--; \
}
503
ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
504
505
/* After all that work, here's the vector part! */
506
extrawidth = (width % 8); /* trailing unaligned stores */
507
width -= extrawidth;
508
509
vsrc = vec_ld(0, src);
valigner = VEC_ALIGNER(src);
510
511
512
513
514
while (width) {
vector unsigned short vR, vG, vB;
vector unsigned char vdst1, vdst2;
515
516
voverflow = vec_ld(15, src);
vsrc = vec_perm(vsrc, voverflow, valigner);
517
518
519
520
vR = vec_and(vec_sl((vector unsigned short) vsrc, v1), vf800);
vB = vec_sl((vector unsigned short) vsrc, v3);
vG = vec_sl(vB, v3);
521
522
vdst1 =
523
524
525
526
527
528
(vector unsigned char) vec_perm((vector unsigned char) vR,
valpha, vredalpha1);
vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
vdst1 = vec_perm(vdst1, valpha, vpermute);
vec_st(vdst1, 0, dst);
529
530
vdst2 =
531
532
533
534
535
536
(vector unsigned char) vec_perm((vector unsigned char) vR,
valpha, vredalpha2);
vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
vdst2 = vec_perm(vdst2, valpha, vpermute);
vec_st(vdst2, 16, dst);
537
538
539
540
541
542
543
width -= 8;
dst += 32;
src += 16;
vsrc = voverflow;
}
544
assert(width == 0);
545
546
547
/* do scalar until we can align... */
548
ONE_PIXEL_BLEND((extrawidth), extrawidth);
549
550
#undef ONE_PIXEL_BLEND
551
src += srcskip; /* move to next row, accounting for pitch. */
552
553
554
555
556
dst += dstskip;
}
}
static void BlitNtoNKey(SDL_BlitInfo * info);
static void BlitNtoNKeyCopyAlpha(SDL_BlitInfo * info);

/* AltiVec blit: 32-bit to 32-bit with a source colorkey.  Pixels whose
   RGB bits equal the colorkey are left untouched in the destination
   (implemented with vec_sel, 4 pixels per vector iteration).  Narrow
   surfaces fall back to the scalar N->N key blitters. */
static void
Blit32to32KeyAltivec(SDL_BlitInfo * info)
{
    int height = info->d_height;
    Uint32 *srcp = (Uint32 *) info->s_pixels;
    int srcskip = info->s_skip;
    Uint32 *dstp = (Uint32 *) info->d_pixels;
    int dstskip = info->d_skip;
    SDL_PixelFormat *srcfmt = info->src;
    int srcbpp = srcfmt->BytesPerPixel;
    SDL_PixelFormat *dstfmt = info->dst;
    int dstbpp = dstfmt->BytesPerPixel;
    int copy_alpha = (srcfmt->Amask && dstfmt->Amask);
    unsigned alpha = dstfmt->Amask ? srcfmt->alpha : 0;
    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
    Uint32 ckey = info->src->colorkey;
    vector unsigned int valpha;
    vector unsigned char vpermute;
    vector unsigned char vzero;
    vector unsigned int vckey;
    vector unsigned int vrgbmask;
    vpermute = calc_swizzle32(srcfmt, dstfmt);
    if (info->d_width < 16) {
        /* too narrow to be worth the vector setup cost */
        if (copy_alpha) {
            BlitNtoNKeyCopyAlpha(info);
        } else {
            BlitNtoNKey(info);
        }
        return;
    }
    vzero = vec_splat_u8(0);
    if (alpha) {
        ((unsigned char *) &valpha)[0] = (unsigned char) alpha;
        valpha =
            (vector unsigned int) vec_splat((vector unsigned char) valpha, 0);
    } else {
        valpha = (vector unsigned int) vzero;
    }
    ckey &= rgbmask;
    ((unsigned int *) (char *) &vckey)[0] = ckey;
    vckey = vec_splat(vckey, 0);
    ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
    vrgbmask = vec_splat(vrgbmask, 0);

    while (height--) {
#define ONE_PIXEL_BLEND(condition, widthvar) \
        if (copy_alpha) { \
            while (condition) { \
                Uint32 Pixel; \
                unsigned sR, sG, sB, sA; \
                DISEMBLE_RGBA((Uint8 *)srcp, srcbpp, srcfmt, Pixel, \
                              sR, sG, sB, sA); \
                if ( (Pixel & rgbmask) != ckey ) { \
                    ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
                                  sR, sG, sB, sA); \
                } \
                dstp = (Uint32 *) (((Uint8 *) dstp) + dstbpp); \
                srcp = (Uint32 *) (((Uint8 *) srcp) + srcbpp); \
                widthvar--; \
            } \
        } else { \
            while (condition) { \
                Uint32 Pixel; \
                unsigned sR, sG, sB; \
                RETRIEVE_RGB_PIXEL((Uint8 *)srcp, srcbpp, Pixel); \
                if ( Pixel != ckey ) { \
                    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
                    ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
                                  sR, sG, sB, alpha); \
                } \
                dstp = (Uint32 *) (((Uint8 *)dstp) + dstbpp); \
                srcp = (Uint32 *) (((Uint8 *)srcp) + srcbpp); \
                widthvar--; \
            } \
        }
        int width = info->d_width;
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        assert(width > 0);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned int vs = vec_ld(0, srcp);
            width -= extrawidth;
            assert(width >= 4);
            while (width) {
                vector unsigned char vsel;
                vector unsigned int vd;
                vector unsigned int voverflow = vec_ld(15, srcp);
                /* load the source vec */
                vs = vec_perm(vs, voverflow, valigner);
                /* vsel is set for items that match the key.  Compare the
                   RGB-masked pixel (not the raw pixel) against the masked
                   key, matching the scalar path's (Pixel & rgbmask) != ckey;
                   previously the masked value was computed and then
                   discarded, so pixels with alpha bits set never matched. */
                vsel = (vector unsigned char) vec_and(vs, vrgbmask);
                vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
                                                        vsel, vckey);
                /* permute the src vec to the dest format */
                vs = vec_perm(vs, valpha, vpermute);
                /* load the destination vec */
                vd = vec_ld(0, dstp);
                /* select the source and dest into vs */
                vd = (vector unsigned int) vec_sel((vector unsigned char) vs,
                                                   (vector unsigned char) vd,
                                                   vsel);
                vec_st(vd, 0, dstp);
                srcp += 4;
                width -= 4;
                dstp += 4;
                vs = voverflow;
            }
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
            srcp += srcskip >> 2;
            dstp += dstskip >> 2;
        }
    }
}
/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
/* Use this on a G5 */
677
static void
678
ConvertAltivec32to32_noprefetch(SDL_BlitInfo * info)
679
680
681
682
683
684
685
686
{
int height = info->d_height;
Uint32 *src = (Uint32 *) info->s_pixels;
int srcskip = info->s_skip;
Uint32 *dst = (Uint32 *) info->d_pixels;
int dstskip = info->d_skip;
SDL_PixelFormat *srcfmt = info->src;
SDL_PixelFormat *dstfmt = info->dst;
687
688
vector unsigned int vzero = vec_splat_u32(0);
vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
689
690
691
if (dstfmt->Amask && !srcfmt->Amask) {
if (srcfmt->alpha) {
vector unsigned char valpha;
692
((unsigned char *) &valpha)[0] = srcfmt->alpha;
693
vzero = (vector unsigned int) vec_splat(valpha, 0);
694
695
696
}
}
697
698
assert(srcfmt->BytesPerPixel == 4);
assert(dstfmt->BytesPerPixel == 4);
699
700
701
702
703
704
705
706
707
708
709
710
while (height--) {
vector unsigned char valigner;
vector unsigned int vbits;
vector unsigned int voverflow;
Uint32 bits;
Uint8 r, g, b, a;
int width = info->d_width;
int extrawidth;
/* do scalar until we can align... */
711
while ((UNALIGNED_PTR(dst)) && (width)) {
712
bits = *(src++);
713
714
RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
*(dst++) = MAKE8888(dstfmt, r, g, b, a);
715
716
717
718
719
720
width--;
}
/* After all that work, here's the vector part! */
extrawidth = (width % 4);
width -= extrawidth;
721
722
valigner = VEC_ALIGNER(src);
vbits = vec_ld(0, src);
723
724
while (width) {
725
voverflow = vec_ld(15, src);
726
727
src += 4;
width -= 4;
728
729
730
vbits = vec_perm(vbits, voverflow, valigner); /* src is ready. */
vbits = vec_perm(vbits, vzero, vpermute); /* swizzle it. */
vec_st(vbits, 0, dst); /* store it back out. */
731
732
733
734
dst += 4;
vbits = voverflow;
}
735
assert(width == 0);
736
737
738
/* cover pixels at the end of the row that didn't fit in 16 bytes. */
while (extrawidth) {
739
bits = *(src++); /* max 7 pixels, don't bother with prefetch. */
740
741
RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
*(dst++) = MAKE8888(dstfmt, r, g, b, a);
742
743
744
extrawidth--;
}
745
src += srcskip >> 2; /* move to next row, accounting for pitch. */
746
747
748
749
750
751
752
dst += dstskip >> 2;
}
}
/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
/* Use this on a G4 */
753
static void
754
ConvertAltivec32to32_prefetch(SDL_BlitInfo * info)
755
{
756
757
const int scalar_dst_lead = sizeof(Uint32) * 4;
const int vector_dst_lead = sizeof(Uint32) * 16;
758
759
760
761
762
763
764
765
int height = info->d_height;
Uint32 *src = (Uint32 *) info->s_pixels;
int srcskip = info->s_skip;
Uint32 *dst = (Uint32 *) info->d_pixels;
int dstskip = info->d_skip;
SDL_PixelFormat *srcfmt = info->src;
SDL_PixelFormat *dstfmt = info->dst;
766
767
vector unsigned int vzero = vec_splat_u32(0);
vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
768
769
770
if (dstfmt->Amask && !srcfmt->Amask) {
if (srcfmt->alpha) {
vector unsigned char valpha;
771
((unsigned char *) &valpha)[0] = srcfmt->alpha;
772
vzero = (vector unsigned int) vec_splat(valpha, 0);
773
774
775
}
}
776
777
assert(srcfmt->BytesPerPixel == 4);
assert(dstfmt->BytesPerPixel == 4);
778
779
780
781
782
783
784
785
786
787
788
789
while (height--) {
vector unsigned char valigner;
vector unsigned int vbits;
vector unsigned int voverflow;
Uint32 bits;
Uint8 r, g, b, a;
int width = info->d_width;
int extrawidth;
/* do scalar until we can align... */
790
791
792
793
794
while ((UNALIGNED_PTR(dst)) && (width)) {
vec_dstt(src + scalar_dst_lead, DST_CTRL(2, 32, 1024),
DST_CHAN_SRC);
vec_dstst(dst + scalar_dst_lead, DST_CTRL(2, 32, 1024),
DST_CHAN_DEST);
795
bits = *(src++);
796
797
RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
*(dst++) = MAKE8888(dstfmt, r, g, b, a);
798
799
800
801
802
803
width--;
}
/* After all that work, here's the vector part! */
extrawidth = (width % 4);
width -= extrawidth;
804
805
valigner = VEC_ALIGNER(src);
vbits = vec_ld(0, src);
806
807
while (width) {
808
809
810
811
812
vec_dstt(src + vector_dst_lead, DST_CTRL(2, 32, 1024),
DST_CHAN_SRC);
vec_dstst(dst + vector_dst_lead, DST_CTRL(2, 32, 1024),
DST_CHAN_DEST);
voverflow = vec_ld(15, src);
813
814
src += 4;
width -= 4;
815
816
817
vbits = vec_perm(vbits, voverflow, valigner); /* src is ready. */
vbits = vec_perm(vbits, vzero, vpermute); /* swizzle it. */
vec_st(vbits, 0, dst); /* store it back out. */
818
819
820
dst += 4;
vbits = voverflow;
}
821
822
assert(width == 0);
823
824
825
/* cover pixels at the end of the row that didn't fit in 16 bytes. */
while (extrawidth) {
826
bits = *(src++); /* max 7 pixels, don't bother with prefetch. */
827
828
RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
*(dst++) = MAKE8888(dstfmt, r, g, b, a);
829
830
831
extrawidth--;
}
832
src += srcskip >> 2; /* move to next row, accounting for pitch. */
833
834
835
dst += dstskip >> 2;
}
836
837
vec_dss(DST_CHAN_SRC);
vec_dss(DST_CHAN_DEST);
838
839
}
static Uint32
841
GetBlitFeatures(void)
842
843
844
845
{
static Uint32 features = 0xffffffff;
if (features == 0xffffffff) {
/* Provide an override for testing .. */
846
char *override = SDL_getenv("SDL_ALTIVEC_BLIT_FEATURES");
847
848
if (override) {
features = 0;
849
SDL_sscanf(override, "%u", &features);
850
} else {
851
852
features = (0
/* Feature 1 is has-MMX */
853
| ((SDL_HasMMX())? 1 : 0)
854
/* Feature 2 is has-AltiVec */
855
| ((SDL_HasAltiVec())? 2 : 0)
856
857
/* Feature 4 is dont-use-prefetch */
/* !!!! FIXME: Check for G5 or later, not the cache size! Always prefetch on a G4. */
858
| ((GetL3CacheSize() == 0) ? 4 : 0)
859
);
860
861
862
863
}
}
return features;
}
#if __MWERKS__
#pragma altivec_model off
#endif
#else
/* Non-AltiVec build: only MMX detection matters. */
/* Feature 1 is has-MMX */
#define GetBlitFeatures() ((Uint32)(SDL_HasMMX() ? 1 : 0))
#endif
/* This is now endian dependent */
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
#define HI  1
#define LO  0
#else /* SDL_BYTEORDER == SDL_BIG_ENDIAN */
#define HI  0
#define LO  1
#endif
#if SDL_HERMES_BLITTERS

/* Heheheh, we coerce Hermes into using SDL blit information */
#define X86_ASSEMBLER
#define HermesConverterInterface SDL_BlitInfo
#define HermesClearInterface void
#define STACKCALL

#include "../hermes/HeadMMX.h"
#include "../hermes/HeadX86.h"

#else

/* Special optimized blit for RGB 8-8-8 --> RGB 3-3-2 */
/* Keeps the top 3 bits of R and G and the top 2 bits of B. */
#define RGB888_RGB332(dst, src) { \
    dst = (Uint8)((((src)&0x00E00000)>>16)| \
                  (((src)&0x0000E000)>>11)| \
                  (((src)&0x000000C0)>>6)); \
}
static void
902
Blit_RGB888_index8(SDL_BlitInfo * info)
903
904
{
#ifndef USE_DUFFS_LOOP
905
int c;
906
#endif
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
int width, height;
Uint32 *src;
const Uint8 *map;
Uint8 *dst;
int srcskip, dstskip;
/* Set up some basic variables */
width = info->d_width;
height = info->d_height;
src = (Uint32 *) info->s_pixels;
srcskip = info->s_skip / 4;
dst = info->d_pixels;
dstskip = info->d_skip;
map = info->table;
if (map == NULL) {
while (height--) {
924
#ifdef USE_DUFFS_LOOP
925
/* *INDENT-OFF* */
926
927
928
DUFFS_LOOP(
RGB888_RGB332(*dst++, *src);
, width);
929
/* *INDENT-ON* */
930
#else
931
932
933
for (c = width / 4; c; --c) {
/* Pack RGB into 8bit pixel */
++src;
934
RGB888_RGB332(*dst++, *src);
935
++src;
936
RGB888_RGB332(*dst++, *src);
937
++src;
938
RGB888_RGB332(*dst++, *src);
939
940
941
942
++src;
}
switch (width & 3) {
case 3:
943
RGB888_RGB332(*dst++, *src);
944
945
++src;
case 2:
946
RGB888_RGB332(*dst++, *src);
947
948
++src;
case 1:
949
RGB888_RGB332(*dst++, *src);
950
951
++src;
}
952
#endif /* USE_DUFFS_LOOP */
953
954
955
956
957
src += srcskip;
dst += dstskip;
}
} else {
int Pixel;
958
959
while (height--) {
960
#ifdef USE_DUFFS_LOOP
961
/* *INDENT-OFF* */
962
DUFFS_LOOP(
963
964
RGB888_RGB332(Pixel, *src);
*dst++ = map[Pixel];
965
966
++src;
, width);
967
/* *INDENT-ON* */
968
#else
969
970
for (c = width / 4; c; --c) {
/* Pack RGB into 8bit pixel */
971
RGB888_RGB332(Pixel, *src);
972
973
*dst++ = map[Pixel];
++src;
974
RGB888_RGB332(Pixel, *src);
975
976
*dst++ = map[Pixel];
++src;
977
RGB888_RGB332(Pixel, *src);
978
979
*dst++ = map[Pixel];
++src;
980
RGB888_RGB332(Pixel, *src);
981
982
983
984
985
*dst++ = map[Pixel];
++src;
}
switch (width & 3) {
case 3:
986
RGB888_RGB332(Pixel, *src);
987
988
989
*dst++ = map[Pixel];
++src;
case 2:
990
RGB888_RGB332(Pixel, *src);
991
992
993
*dst++ = map[Pixel];
++src;
case 1:
994
RGB888_RGB332(Pixel, *src);
995
996
997
*dst++ = map[Pixel];
++src;
}
998
#endif /* USE_DUFFS_LOOP */
999
1000
src += srcskip;
dst += dstskip;