This repository has been archived by the owner on Feb 11, 2021. It is now read-only.
/
SDL_blit_N.c
2503 lines (2342 loc) · 85.2 KB
1
2
/*
    SDL - Simple DirectMedia Layer
    Copyright (C) 1997-2006 Sam Lantinga

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

    Sam Lantinga
    slouken@libsdl.org
*/
22
#include "SDL_config.h"
23
24
#include "SDL_video.h"
25
#include "SDL_endian.h"
26
#include "SDL_cpuinfo.h"
27
#include "SDL_blit.h"
28
29
30
/* Functions to blit from N-bit surfaces to other surfaces */

#if SDL_ALTIVEC_BLITTERS
#if __MWERKS__
#pragma altivec_model on
#endif
#ifdef HAVE_ALTIVEC_H
#include <altivec.h>
#endif
/* Assertions are compiled out of the AltiVec blitters; the assert()
   calls below document invariants only. */
#define assert(X)
39
#ifdef __MACOSX__
#include <sys/sysctl.h>

/* Query Mac OS X for the L3 cache size, in bytes.
   Returns 0 when the sysctl lookup fails (e.g. the CPU has no L3). */
static size_t
GetL3CacheSize(void)
{
    u_int64_t cache_bytes = 0;
    size_t value_size = sizeof(cache_bytes);

    if (sysctlbyname("hw.l3cachesize", &cache_bytes, &value_size, NULL, 0) != 0) {
        return 0;
    }
    return cache_bytes;
}
#else
/* No portable way to query the cache size on this platform. */
static size_t
GetL3CacheSize(void)
{
    /* XXX: Just guess G4 */
    return 2097152;
}
#endif /* __MACOSX__ */
63
64
/* Vector-literal spelling differs by compiler: pre-gcc4 Apple compilers
   use parentheses, everything else uses braces. */
#if (defined(__MACOSX__) && (__GNUC__ < 4))
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
    (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
    (vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
    (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
    (vector unsigned short) { a,b,c,d,e,f,g,h }
#endif
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/* Nonzero if the pointer is not aligned to a 16-byte (vector) boundary. */
#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
/* Build a vec_perm control vector selecting byte (a,b,c,d) from each of
   the four 32-bit pixels in a vector. */
#define VSWIZZLE32(a,b,c,d) (vector unsigned char) \
                             ( 0x00+a, 0x00+b, 0x00+c, 0x00+d, \
                               0x04+a, 0x04+b, 0x04+c, 0x04+d, \
                               0x08+a, 0x08+b, 0x08+c, 0x08+d, \
                               0x0C+a, 0x0C+b, 0x0C+c, 0x0C+d )
/* Assemble a 32-bit pixel from 8-bit r,g,b,a using dstfmt's shifts/masks. */
#define MAKE8888(dstfmt, r, g, b, a)  \
    ( ((r<<dstfmt->Rshift)&dstfmt->Rmask) | \
      ((g<<dstfmt->Gshift)&dstfmt->Gmask) | \
      ((b<<dstfmt->Bshift)&dstfmt->Bmask) | \
      ((a<<dstfmt->Ashift)&dstfmt->Amask) )
/*
 * Data Stream Touch...Altivec cache prefetching.
 *
 * Don't use this on a G5...however, the speed boost is very significant
 * on a G4.
 */
#define DST_CHAN_SRC 1
#define DST_CHAN_DEST 2
/* macro to set DST control word value... */
#define DST_CTRL(size, count, stride) \
    (((size) << 24) | ((count) << 16) | (stride))
/* Permute control for aligning a (possibly unaligned) source stream that
   is read with two overlapping vec_ld's per step. */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
    ? vec_lvsl(0, src) \
    : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
/* Calculate the permute vector used for 32->32 swizzling.
   Builds a vec_perm control vector that reorders the four channel bytes
   of each source pixel into the destination format's byte order.  A NULL
   srcfmt or dstfmt stands in for the default ARGB8888 layout. */
static vector unsigned char
calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
{
    /*
     * We have to assume that the bits that aren't used by other
     * colors is alpha, and it's one complete byte, since some formats
     * leave alpha with a zero mask, but we should still swizzle the bits.
     */
    /* ARGB */
    /* NOTE(review): initializer order must match this tree's
       SDL_PixelFormat field order -- verify against SDL_pixels.h. */
    const static struct SDL_PixelFormat default_pixel_format = {
        NULL, 0, 0,
        0, 0, 0, 0,
        16, 8, 0, 24,
        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
        0, 0
    };
    if (!srcfmt) {
        srcfmt = &default_pixel_format;
    }
    if (!dstfmt) {
        dstfmt = &default_pixel_format;
    }
    /* Per-pixel byte offsets (0,4,8,12 repeated) added to the splatted
       channel-index byte to form the final permute vector. */
    const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
                                                       0x04, 0x04, 0x04, 0x04,
                                                       0x08, 0x08, 0x08, 0x08,
                                                       0x0C, 0x0C, 0x0C,
                                                       0x0C);
    vector unsigned char vswiz;
    vector unsigned int srcvec;
    /* Map a channel's bit shift (0/8/16/24) to its byte index (3..0)
       within a 32-bit pixel. */
#define RESHIFT(X) (3 - ((X) >> 3))
    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
    Uint32 amask;
    /* Use zero for alpha if either surface doesn't have alpha */
    if (dstfmt->Amask) {
        /* 0x10 selects from the second vec_perm operand (the alpha/zero
           vector) when the source has no alpha channel of its own. */
        amask =
            ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->
                                                                   Ashift);
    } else {
        /* Fill the destination's unused byte(s) from the zero vector. */
        amask =
            0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
                          0xFFFFFFFF);
    }
#undef RESHIFT
    /* Splat the packed per-channel byte indices across all four lanes,
       then add the per-pixel offsets. */
    ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
    vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
    return (vswiz);
}
157
158
159
160
static void Blit_RGB888_RGB565(SDL_BlitInfo * info);

/* AltiVec blit: 32-bit RGB888 source to 16-bit RGB565 destination.
   Leading pixels are done scalar until the destination is 16-byte
   aligned, the aligned middle is done 8 pixels per vector iteration
   (16 source bytes -> 8 destination shorts, twice per pass), and the
   trailing remainder is done scalar again. */
static void
Blit_RGB888_RGB565Altivec(SDL_BlitInfo * info)
{
    int height = info->dst_h;
    Uint8 *src = (Uint8 *) info->src;
    int srcskip = info->src_skip;
    Uint8 *dst = (Uint8 *) info->dst;
    int dstskip = info->dst_skip;
    /* NOTE(review): info->src used as the source pixel format -- later
       SDL trees call this field src_fmt; verify against SDL_BlitInfo. */
    SDL_PixelFormat *srcfmt = info->src;
    vector unsigned char valpha = vec_splat_u8(0);
    /* Permute source pixels into canonical ARGB order before packing. */
    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
    /* Gathers the green bytes of 8 pixels spread across vsrc1/vsrc2. */
    vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
                                                    0x00, 0x0a, 0x00, 0x0e,
                                                    0x00, 0x12, 0x00, 0x16,
                                                    0x00, 0x1a, 0x00, 0x1e);
    vector unsigned short v1 = vec_splat_u16(1);
    vector unsigned short v3 = vec_splat_u16(3);
    vector unsigned short v3f =
        VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
                          0x003f, 0x003f, 0x003f, 0x003f);
    vector unsigned short vfc =
        VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
                          0x00fc, 0x00fc, 0x00fc, 0x00fc);
    /* vf800 = 0xf800 in every lane: splat 0xf9 bytes, shift left 8. */
    vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
    vf800 = vec_sl(vf800, vec_splat_u16(8));
    while (height--) {
        vector unsigned char valigner;
        vector unsigned char voverflow;
        vector unsigned char vsrc;
        int width = info->dst_w;
        int extrawidth;
        /* do scalar until we can align... */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            Uint32 Pixel; \
            unsigned sR, sG, sB, sA; \
            DISEMBLE_RGBA((Uint8 *)src, 4, srcfmt, Pixel, \
                          sR, sG, sB, sA); \
            *(Uint16 *)(dst) = (((sR << 8) & 0x0000F800) | \
                                ((sG << 3) & 0x000007E0) | \
                                ((sB >> 3) & 0x0000001F)); \
            dst += 2; \
            src += 4; \
            widthvar--; \
        }
        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
        /* After all that work, here's the vector part! */
        extrawidth = (width % 8);       /* trailing unaligned stores */
        width -= extrawidth;
        vsrc = vec_ld(0, src);
        valigner = VEC_ALIGNER(src);
        while (width) {
            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
            vector unsigned int vsrc1, vsrc2;
            vector unsigned char vdst;
            /* Two overlapping loads + perm handle unaligned source. */
            voverflow = vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);
            vsrc1 = (vector unsigned int) vec_perm(vsrc, valpha, vpermute);
            src += 16;
            vsrc = voverflow;
            voverflow = vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);
            vsrc2 = (vector unsigned int) vec_perm(vsrc, valpha, vpermute);
            /* 1555 */
            vpixel = (vector unsigned short) vec_packpx(vsrc1, vsrc2);
            /* Rebuild the 6-bit green field that packpx truncated to 5. */
            vgpixel = (vector unsigned short) vec_perm(vsrc1, vsrc2, vgmerge);
            vgpixel = vec_and(vgpixel, vfc);
            vgpixel = vec_sl(vgpixel, v3);
            vrpixel = vec_sl(vpixel, v1);
            vrpixel = vec_and(vrpixel, vf800);
            vbpixel = vec_and(vpixel, v3f);
            vdst =
                vec_or((vector unsigned char) vrpixel,
                       (vector unsigned char) vgpixel);
            /* 565 */
            vdst = vec_or(vdst, (vector unsigned char) vbpixel);
            vec_st(vdst, 0, dst);
            width -= 8;
            src += 16;
            dst += 16;
            vsrc = voverflow;
        }
        assert(width == 0);
        /* do scalar until we can align... */
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
        src += srcskip;         /* move to next row, accounting for pitch. */
        dst += dstskip;
    }
}
262
263
264
/* AltiVec blit: 16-bit RGB565 source to 32-bit destination.
   Expands 8 source pixels per vector iteration into two destination
   vectors via a chain of vec_perm merges (alpha+red, then blue, then
   green), then swizzles into the destination's channel order. */
static void
Blit_RGB565_32Altivec(SDL_BlitInfo * info)
{
    int height = info->dst_h;
    Uint8 *src = (Uint8 *) info->src;
    int srcskip = info->src_skip;
    Uint8 *dst = (Uint8 *) info->dst;
    int dstskip = info->dst_skip;
    /* NOTE(review): info->src/info->dst used as pixel formats -- later
       SDL trees call these src_fmt/dst_fmt; verify SDL_BlitInfo. */
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    unsigned alpha;
    vector unsigned char valpha;
    vector unsigned char vpermute;
    vector unsigned short vf800;
    vector unsigned int v8 = vec_splat_u32(8);
    vector unsigned int v16 = vec_add(v8, v8);
    vector unsigned short v2 = vec_splat_u16(2);
    vector unsigned short v3 = vec_splat_u16(3);
    /*
       0x10 - 0x1f is the alpha
       0x00 - 0x0e evens are the red
       0x01 - 0x0f odds are zero
     */
    vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
                                                       0x10, 0x02, 0x01, 0x01,
                                                       0x10, 0x04, 0x01, 0x01,
                                                       0x10, 0x06, 0x01,
                                                       0x01);
    /* Same selector shifted to cover pixels 4-7 (indices +8). */
    vector unsigned char vredalpha2 =
        (vector unsigned
         char) (vec_add((vector unsigned int) vredalpha1, vec_sl(v8, v16))
        );
    /*
       0x00 - 0x0f is ARxx ARxx ARxx ARxx
       0x11 - 0x0f odds are blue
     */
    vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
                                                   0x04, 0x05, 0x06, 0x13,
                                                   0x08, 0x09, 0x0a, 0x15,
                                                   0x0c, 0x0d, 0x0e, 0x17);
    vector unsigned char vblue2 =
        (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8)
        );
    /*
       0x00 - 0x0f is ARxB ARxB ARxB ARxB
       0x10 - 0x0e evens are green
     */
    vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
                                                    0x04, 0x05, 0x12, 0x07,
                                                    0x08, 0x09, 0x14, 0x0b,
                                                    0x0c, 0x0d, 0x16, 0x0f);
    vector unsigned char vgreen2 =
        (vector unsigned
         char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8, v8))
        );
    assert(srcfmt->BytesPerPixel == 2);
    assert(dstfmt->BytesPerPixel == 4);
    /* vf800 = 0xf800 per lane (red field mask). */
    vf800 = (vector unsigned short) vec_splat_u8(-7);
    vf800 = vec_sl(vf800, vec_splat_u16(8));
    if (dstfmt->Amask && info->a) {
        ((unsigned char *) &valpha)[0] = alpha = info->a;
        valpha = vec_splat(valpha, 0);
    } else {
        alpha = 0;
        valpha = vec_splat_u8(0);
    }
    vpermute = calc_swizzle32(NULL, dstfmt);
    while (height--) {
        vector unsigned char valigner;
        vector unsigned char voverflow;
        vector unsigned char vsrc;
        int width = info->dst_w;
        int extrawidth;
        /* do scalar until we can align... */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            unsigned sR, sG, sB; \
            unsigned short Pixel = *((unsigned short *)src); \
            sR = (Pixel >> 8) & 0xf8; \
            sG = (Pixel >> 3) & 0xfc; \
            sB = (Pixel << 3) & 0xf8; \
            ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
            src += 2; \
            dst += 4; \
            widthvar--; \
        }
        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
        /* After all that work, here's the vector part! */
        extrawidth = (width % 8);       /* trailing unaligned stores */
        width -= extrawidth;
        vsrc = vec_ld(0, src);
        valigner = VEC_ALIGNER(src);
        while (width) {
            vector unsigned short vR, vG, vB;
            vector unsigned char vdst1, vdst2;
            voverflow = vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);
            /* Isolate the channel fields in the high byte of each lane. */
            vR = vec_and((vector unsigned short) vsrc, vf800);
            vB = vec_sl((vector unsigned short) vsrc, v3);
            vG = vec_sl(vB, v2);
            /* First 4 pixels: merge A+R, then B, then G, then swizzle. */
            vdst1 =
                (vector unsigned char) vec_perm((vector unsigned char) vR,
                                                valpha, vredalpha1);
            vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
            vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
            vdst1 = vec_perm(vdst1, valpha, vpermute);
            vec_st(vdst1, 0, dst);
            /* Second 4 pixels. */
            vdst2 =
                (vector unsigned char) vec_perm((vector unsigned char) vR,
                                                valpha, vredalpha2);
            vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
            vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
            vdst2 = vec_perm(vdst2, valpha, vpermute);
            vec_st(vdst2, 16, dst);
            width -= 8;
            dst += 32;
            src += 16;
            vsrc = voverflow;
        }
        assert(width == 0);
        /* do scalar until we can align... */
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
        src += srcskip;         /* move to next row, accounting for pitch. */
        dst += dstskip;
    }
}
409
410
411
412
/* AltiVec blit: 16-bit RGB555 source to 32-bit destination.
   Identical structure to Blit_RGB565_32Altivec; only the shift amounts
   differ (5-bit green: red pre-shifted left 1, green = blue << 3). */
static void
Blit_RGB555_32Altivec(SDL_BlitInfo * info)
{
    int height = info->dst_h;
    Uint8 *src = (Uint8 *) info->src;
    int srcskip = info->src_skip;
    Uint8 *dst = (Uint8 *) info->dst;
    int dstskip = info->dst_skip;
    /* NOTE(review): info->src/info->dst used as pixel formats -- later
       SDL trees call these src_fmt/dst_fmt; verify SDL_BlitInfo. */
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    unsigned alpha;
    vector unsigned char valpha;
    vector unsigned char vpermute;
    vector unsigned short vf800;
    vector unsigned int v8 = vec_splat_u32(8);
    vector unsigned int v16 = vec_add(v8, v8);
    vector unsigned short v1 = vec_splat_u16(1);
    vector unsigned short v3 = vec_splat_u16(3);
    /*
       0x10 - 0x1f is the alpha
       0x00 - 0x0e evens are the red
       0x01 - 0x0f odds are zero
     */
    vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
                                                       0x10, 0x02, 0x01, 0x01,
                                                       0x10, 0x04, 0x01, 0x01,
                                                       0x10, 0x06, 0x01,
                                                       0x01);
    /* Same selector shifted to cover pixels 4-7 (indices +8). */
    vector unsigned char vredalpha2 =
        (vector unsigned
         char) (vec_add((vector unsigned int) vredalpha1, vec_sl(v8, v16))
        );
    /*
       0x00 - 0x0f is ARxx ARxx ARxx ARxx
       0x11 - 0x0f odds are blue
     */
    vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
                                                   0x04, 0x05, 0x06, 0x13,
                                                   0x08, 0x09, 0x0a, 0x15,
                                                   0x0c, 0x0d, 0x0e, 0x17);
    vector unsigned char vblue2 =
        (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8)
        );
    /*
       0x00 - 0x0f is ARxB ARxB ARxB ARxB
       0x10 - 0x0e evens are green
     */
    vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
                                                    0x04, 0x05, 0x12, 0x07,
                                                    0x08, 0x09, 0x14, 0x0b,
                                                    0x0c, 0x0d, 0x16, 0x0f);
    vector unsigned char vgreen2 =
        (vector unsigned
         char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8, v8))
        );
    assert(srcfmt->BytesPerPixel == 2);
    assert(dstfmt->BytesPerPixel == 4);
    /* vf800 = 0xf800 per lane (red mask after the <<1 below). */
    vf800 = (vector unsigned short) vec_splat_u8(-7);
    vf800 = vec_sl(vf800, vec_splat_u16(8));
    if (dstfmt->Amask && info->a) {
        ((unsigned char *) &valpha)[0] = alpha = info->a;
        valpha = vec_splat(valpha, 0);
    } else {
        alpha = 0;
        valpha = vec_splat_u8(0);
    }
    vpermute = calc_swizzle32(NULL, dstfmt);
    while (height--) {
        vector unsigned char valigner;
        vector unsigned char voverflow;
        vector unsigned char vsrc;
        int width = info->dst_w;
        int extrawidth;
        /* do scalar until we can align... */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        while (condition) { \
            unsigned sR, sG, sB; \
            unsigned short Pixel = *((unsigned short *)src); \
            sR = (Pixel >> 7) & 0xf8; \
            sG = (Pixel >> 2) & 0xf8; \
            sB = (Pixel << 3) & 0xf8; \
            ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
            src += 2; \
            dst += 4; \
            widthvar--; \
        }
        ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
        /* After all that work, here's the vector part! */
        extrawidth = (width % 8);       /* trailing unaligned stores */
        width -= extrawidth;
        vsrc = vec_ld(0, src);
        valigner = VEC_ALIGNER(src);
        while (width) {
            vector unsigned short vR, vG, vB;
            vector unsigned char vdst1, vdst2;
            voverflow = vec_ld(15, src);
            vsrc = vec_perm(vsrc, voverflow, valigner);
            /* Isolate channel fields in the high byte of each lane. */
            vR = vec_and(vec_sl((vector unsigned short) vsrc, v1), vf800);
            vB = vec_sl((vector unsigned short) vsrc, v3);
            vG = vec_sl(vB, v3);
            /* First 4 pixels: merge A+R, then B, then G, then swizzle. */
            vdst1 =
                (vector unsigned char) vec_perm((vector unsigned char) vR,
                                                valpha, vredalpha1);
            vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
            vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
            vdst1 = vec_perm(vdst1, valpha, vpermute);
            vec_st(vdst1, 0, dst);
            /* Second 4 pixels. */
            vdst2 =
                (vector unsigned char) vec_perm((vector unsigned char) vR,
                                                valpha, vredalpha2);
            vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
            vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
            vdst2 = vec_perm(vdst2, valpha, vpermute);
            vec_st(vdst2, 16, dst);
            width -= 8;
            dst += 32;
            src += 16;
            vsrc = voverflow;
        }
        assert(width == 0);
        /* do scalar until we can align... */
        ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
        src += srcskip;         /* move to next row, accounting for pitch. */
        dst += dstskip;
    }
}
557
558
559
560
static void BlitNtoNKey(SDL_BlitInfo * info);
static void BlitNtoNKeyCopyAlpha(SDL_BlitInfo * info);

/*
 * AltiVec colorkey blit, 32-bit -> 32-bit: copies source pixels whose
 * RGB channels differ from the colorkey, leaving keyed pixels untouched
 * in the destination.  Rows narrower than 16 pixels fall back to the
 * scalar BlitNtoNKey / BlitNtoNKeyCopyAlpha blitters.
 *
 * Fix: the key comparison previously tested the UNMASKED source pixels
 * against the masked key (ckey &= rgbmask below), so any pixel carrying
 * nonzero alpha/padding bits could never match the key.  The vector
 * path computed vec_and(vs, vrgbmask) and then discarded it; the
 * non-copy-alpha scalar path compared the raw Pixel.  Both now mask
 * before comparing, matching the copy-alpha scalar branch.
 */
static void
Blit32to32KeyAltivec(SDL_BlitInfo * info)
{
    int height = info->dst_h;
    Uint32 *srcp = (Uint32 *) info->src;
    int srcskip = info->src_skip;
    Uint32 *dstp = (Uint32 *) info->dst;
    int dstskip = info->dst_skip;
    /* NOTE(review): info->src/info->dst used as pixel formats -- later
       SDL trees call these src_fmt/dst_fmt; verify SDL_BlitInfo. */
    SDL_PixelFormat *srcfmt = info->src;
    int srcbpp = srcfmt->BytesPerPixel;
    SDL_PixelFormat *dstfmt = info->dst;
    int dstbpp = dstfmt->BytesPerPixel;
    int copy_alpha = (srcfmt->Amask && dstfmt->Amask);
    /* if the source has no alpha then copy alpha from the per-surface value */
    unsigned alpha = dstfmt->Amask ? info->a : 0;
    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
    Uint32 ckey = info->colorkey;
    vector unsigned int valpha;
    vector unsigned char vpermute;
    vector unsigned char vzero;
    vector unsigned int vckey;
    vector unsigned int vrgbmask;

    vpermute = calc_swizzle32(srcfmt, dstfmt);
    /* Narrow rows can't amortize the vector setup; use scalar blitters. */
    if (info->dst_w < 16) {
        if (copy_alpha) {
            BlitNtoNKeyCopyAlpha(info);
        } else {
            BlitNtoNKey(info);
        }
        return;
    }
    vzero = vec_splat_u8(0);
    if (alpha) {
        ((unsigned char *) &valpha)[0] = (unsigned char) alpha;
        valpha =
            (vector unsigned int) vec_splat((vector unsigned char) valpha, 0);
    } else {
        valpha = (vector unsigned int) vzero;
    }
    /* Reduce the key to its RGB bits and splat key/mask to all lanes. */
    ckey &= rgbmask;
    ((unsigned int *) (char *) &vckey)[0] = ckey;
    vckey = vec_splat(vckey, 0);
    ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
    vrgbmask = vec_splat(vrgbmask, 0);

    while (height--) {
        /* Scalar per-pixel loop used to reach destination alignment and
           for the row remainder; the copy_alpha variant carries the
           source's own alpha through. */
#define ONE_PIXEL_BLEND(condition, widthvar) \
        if (copy_alpha) { \
            while (condition) { \
                Uint32 Pixel; \
                unsigned sR, sG, sB, sA; \
                DISEMBLE_RGBA((Uint8 *)srcp, srcbpp, srcfmt, Pixel, \
                              sR, sG, sB, sA); \
                if ( (Pixel & rgbmask) != ckey ) { \
                    ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
                                  sR, sG, sB, sA); \
                } \
                dstp = (Uint32 *) (((Uint8 *) dstp) + dstbpp); \
                srcp = (Uint32 *) (((Uint8 *) srcp) + srcbpp); \
                widthvar--; \
            } \
        } else { \
            while (condition) { \
                Uint32 Pixel; \
                unsigned sR, sG, sB; \
                RETRIEVE_RGB_PIXEL((Uint8 *)srcp, srcbpp, Pixel); \
                /* FIX: mask before comparing, as the branch above does. */ \
                if ( (Pixel & rgbmask) != ckey ) { \
                    RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
                    ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
                                  sR, sG, sB, alpha); \
                } \
                dstp = (Uint32 *) (((Uint8 *)dstp) + dstbpp); \
                srcp = (Uint32 *) (((Uint8 *)srcp) + srcbpp); \
                widthvar--; \
            } \
        }
        int width = info->dst_w;
        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
        /* dst_w >= 16 and at most 3 pixels consumed above, so width > 0
           and the row-skip advance below always executes. */
        assert(width > 0);
        if (width > 0) {
            int extrawidth = (width % 4);
            vector unsigned char valigner = VEC_ALIGNER(srcp);
            vector unsigned int vs = vec_ld(0, srcp);
            width -= extrawidth;
            assert(width >= 4);
            while (width) {
                vector unsigned char vsel;
                vector unsigned int vd;
                vector unsigned int voverflow = vec_ld(15, srcp);
                /* load the source vec */
                vs = vec_perm(vs, voverflow, valigner);
                /* vsel lanes are all-ones where the masked pixel matches
                   the key.  FIX: compare the masked value (previously the
                   vec_and result was overwritten and the raw vs compared,
                   so alpha/padding bits defeated the key). */
                vsel = (vector unsigned char) vec_and(vs, vrgbmask);
                vsel = (vector unsigned char)
                    vec_cmpeq((vector unsigned int) vsel, vckey);
                /* permute the src vec to the dest format */
                vs = vec_perm(vs, valpha, vpermute);
                /* load the destination vec */
                vd = vec_ld(0, dstp);
                /* select the source and dest into vs */
                vd = (vector unsigned int) vec_sel((vector unsigned char) vs,
                                                   (vector unsigned char) vd,
                                                   vsel);
                vec_st(vd, 0, dstp);
                srcp += 4;
                width -= 4;
                dstp += 4;
                vs = voverflow;
            }
            ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
            /* skips are in bytes; srcp/dstp are Uint32 pointers. */
            srcp += srcskip >> 2;
            dstp += dstskip >> 2;
        }
    }
}
/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
/* Use this on a G5 */
static void
ConvertAltivec32to32_noprefetch(SDL_BlitInfo * info)
{
    int height = info->dst_h;
    Uint32 *src = (Uint32 *) info->src;
    int srcskip = info->src_skip;
    Uint32 *dst = (Uint32 *) info->dst;
    int dstskip = info->dst_skip;
    /* NOTE(review): info->src/info->dst used as pixel formats -- later
       SDL trees call these src_fmt/dst_fmt; verify SDL_BlitInfo. */
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    /* vzero doubles as the alpha-fill operand of the swizzle perm; when
       the destination gains an alpha channel, fill it with info->a. */
    vector unsigned int vzero = vec_splat_u32(0);
    vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
    if (dstfmt->Amask && !srcfmt->Amask) {
        if (info->a) {
            vector unsigned char valpha;
            ((unsigned char *) &valpha)[0] = info->a;
            vzero = (vector unsigned int) vec_splat(valpha, 0);
        }
    }
    assert(srcfmt->BytesPerPixel == 4);
    assert(dstfmt->BytesPerPixel == 4);
    while (height--) {
        vector unsigned char valigner;
        vector unsigned int vbits;
        vector unsigned int voverflow;
        Uint32 bits;
        Uint8 r, g, b, a;
        int width = info->dst_w;
        int extrawidth;
        /* do scalar until we can align... */
        while ((UNALIGNED_PTR(dst)) && (width)) {
            bits = *(src++);
            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
            width--;
        }
        /* After all that work, here's the vector part! */
        extrawidth = (width % 4);
        width -= extrawidth;
        valigner = VEC_ALIGNER(src);
        vbits = vec_ld(0, src);
        while (width) {
            voverflow = vec_ld(15, src);
            src += 4;
            width -= 4;
            vbits = vec_perm(vbits, voverflow, valigner);       /* src is ready. */
            vbits = vec_perm(vbits, vzero, vpermute);   /* swizzle it. */
            vec_st(vbits, 0, dst);      /* store it back out. */
            dst += 4;
            vbits = voverflow;
        }
        assert(width == 0);
        /* cover pixels at the end of the row that didn't fit in 16 bytes. */
        while (extrawidth) {
            bits = *(src++);    /* max 7 pixels, don't bother with prefetch. */
            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
            extrawidth--;
        }
        src += srcskip >> 2;    /* move to next row, accounting for pitch. */
        dst += dstskip >> 2;
    }
}
/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
/* Use this on a G4 */
static void
ConvertAltivec32to32_prefetch(SDL_BlitInfo * info)
{
    /* How far ahead (in pixels) to issue Data Stream Touch prefetches
       for the scalar and vector loops respectively. */
    const int scalar_dst_lead = sizeof(Uint32) * 4;
    const int vector_dst_lead = sizeof(Uint32) * 16;
    int height = info->dst_h;
    Uint32 *src = (Uint32 *) info->src;
    int srcskip = info->src_skip;
    Uint32 *dst = (Uint32 *) info->dst;
    int dstskip = info->dst_skip;
    /* NOTE(review): info->src/info->dst used as pixel formats -- later
       SDL trees call these src_fmt/dst_fmt; verify SDL_BlitInfo. */
    SDL_PixelFormat *srcfmt = info->src;
    SDL_PixelFormat *dstfmt = info->dst;
    /* vzero doubles as the alpha-fill operand of the swizzle perm; when
       the destination gains an alpha channel, fill it with info->a. */
    vector unsigned int vzero = vec_splat_u32(0);
    vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
    if (dstfmt->Amask && !srcfmt->Amask) {
        if (info->a) {
            vector unsigned char valpha;
            ((unsigned char *) &valpha)[0] = info->a;
            vzero = (vector unsigned int) vec_splat(valpha, 0);
        }
    }
    assert(srcfmt->BytesPerPixel == 4);
    assert(dstfmt->BytesPerPixel == 4);
    while (height--) {
        vector unsigned char valigner;
        vector unsigned int vbits;
        vector unsigned int voverflow;
        Uint32 bits;
        Uint8 r, g, b, a;
        int width = info->dst_w;
        int extrawidth;
        /* do scalar until we can align... */
        while ((UNALIGNED_PTR(dst)) && (width)) {
            /* read-stream prefetch on src, write-stream on dst. */
            vec_dstt(src + scalar_dst_lead, DST_CTRL(2, 32, 1024),
                     DST_CHAN_SRC);
            vec_dstst(dst + scalar_dst_lead, DST_CTRL(2, 32, 1024),
                      DST_CHAN_DEST);
            bits = *(src++);
            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
            width--;
        }
        /* After all that work, here's the vector part! */
        extrawidth = (width % 4);
        width -= extrawidth;
        valigner = VEC_ALIGNER(src);
        vbits = vec_ld(0, src);
        while (width) {
            vec_dstt(src + vector_dst_lead, DST_CTRL(2, 32, 1024),
                     DST_CHAN_SRC);
            vec_dstst(dst + vector_dst_lead, DST_CTRL(2, 32, 1024),
                      DST_CHAN_DEST);
            voverflow = vec_ld(15, src);
            src += 4;
            width -= 4;
            vbits = vec_perm(vbits, voverflow, valigner);       /* src is ready. */
            vbits = vec_perm(vbits, vzero, vpermute);   /* swizzle it. */
            vec_st(vbits, 0, dst);      /* store it back out. */
            dst += 4;
            vbits = voverflow;
        }
        assert(width == 0);
        /* cover pixels at the end of the row that didn't fit in 16 bytes. */
        while (extrawidth) {
            bits = *(src++);    /* max 7 pixels, don't bother with prefetch. */
            RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
            *(dst++) = MAKE8888(dstfmt, r, g, b, a);
            extrawidth--;
        }
        src += srcskip >> 2;    /* move to next row, accounting for pitch. */
        dst += dstskip >> 2;
    }
    /* Shut the prefetch streams down. */
    vec_dss(DST_CHAN_SRC);
    vec_dss(DST_CHAN_DEST);
}
840
841
/* Report available CPU blit capabilities as a bitmask:
   bit 0 = has MMX, bit 1 = has AltiVec, bit 2 = don't use AltiVec
   prefetch.  The result is probed once and cached; the environment
   variable SDL_ALTIVEC_BLIT_FEATURES overrides detection for testing. */
static Uint32
GetBlitFeatures(void)
{
    static Uint32 features = 0xffffffff;        /* 0xffffffff == not probed yet */

    if (features != 0xffffffff) {
        return features;
    }

    /* Provide an override for testing .. */
    char *override = SDL_getenv("SDL_ALTIVEC_BLIT_FEATURES");
    if (override) {
        features = 0;
        SDL_sscanf(override, "%u", &features);
        return features;
    }

    Uint32 detected = 0;
    /* Feature 1 is has-MMX */
    if (SDL_HasMMX()) {
        detected |= 1;
    }
    /* Feature 2 is has-AltiVec */
    if (SDL_HasAltiVec()) {
        detected |= 2;
    }
    /* Feature 4 is dont-use-prefetch */
    /* !!!! FIXME: Check for G5 or later, not the cache size! Always prefetch on a G4. */
    if (GetL3CacheSize() == 0) {
        detected |= 4;
    }
    features = detected;
    return features;
}
864
865
866
867
#if __MWERKS__
#pragma altivec_model off
#endif
#else
/* No AltiVec blitters compiled in: only the MMX bit can be reported. */
/* Feature 1 is has-MMX */
#define GetBlitFeatures() ((Uint32)(SDL_HasMMX() ? 1 : 0))
#endif

/* This is now endian dependent */
/* HI/LO give the array index of the high/low half of a 32-bit value
   when viewed as two 16-bit words in native byte order. */
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
#define HI 1
#define LO 0
#else /* SDL_BYTEORDER == SDL_BIG_ENDIAN */
#define HI 0
#define LO 1
#endif
882
883
/* Special optimized blit for RGB 8-8-8 --> RGB 3-3-2 */
/* Packs the top 3 bits of red and green and the top 2 bits of blue of a
   32-bit 00RRGGBB pixel into one byte: RRRGGGBB. */
#define RGB888_RGB332(dst, src) { \
    dst = (Uint8)((((src)&0x00E00000)>>16)| \
                  (((src)&0x0000E000)>>11)| \
                  (((src)&0x000000C0)>>6)); \
}
888
889
/*
 * Blit 32-bit RGB888 source pixels to an 8-bit indexed destination.
 * With no lookup table the pixels are packed straight to RGB332; with a
 * table each packed RGB332 value is mapped through the 256-entry table.
 *
 * Fix: in the unmapped (map == NULL) path, the non-DUFFS unrolled loop
 * advanced src BEFORE the first conversion and converted only 3 pixels
 * per group of 4 -- skipping the first pixel of every group and writing
 * 3 destination bytes per 4 source pixels -- and the DUFFS body never
 * advanced src at all.  Both now convert every pixel, matching the
 * mapped path below.
 */
static void
Blit_RGB888_index8(SDL_BlitInfo * info)
{
#ifndef USE_DUFFS_LOOP
    int c;
#endif
    int width, height;
    Uint32 *src;
    const Uint8 *map;
    Uint8 *dst;
    int srcskip, dstskip;

    /* Set up some basic variables */
    width = info->dst_w;
    height = info->dst_h;
    src = (Uint32 *) info->src;
    srcskip = info->src_skip / 4;       /* skip is in bytes; src is Uint32* */
    dst = info->dst;
    dstskip = info->dst_skip;
    map = info->table;

    if (map == NULL) {
        while (height--) {
#ifdef USE_DUFFS_LOOP
            /* *INDENT-OFF* */
            DUFFS_LOOP(
                RGB888_RGB332(*dst++, *src);
                ++src;
            , width);
            /* *INDENT-ON* */
#else
            for (c = width / 4; c; --c) {
                /* Pack RGB into 8bit pixel */
                RGB888_RGB332(*dst++, *src);
                ++src;
                RGB888_RGB332(*dst++, *src);
                ++src;
                RGB888_RGB332(*dst++, *src);
                ++src;
                RGB888_RGB332(*dst++, *src);
                ++src;
            }
            switch (width & 3) {
            case 3:
                RGB888_RGB332(*dst++, *src);
                ++src;
                /* fallthrough */
            case 2:
                RGB888_RGB332(*dst++, *src);
                ++src;
                /* fallthrough */
            case 1:
                RGB888_RGB332(*dst++, *src);
                ++src;
            }
#endif /* USE_DUFFS_LOOP */
            src += srcskip;
            dst += dstskip;
        }
    } else {
        int Pixel;

        while (height--) {
#ifdef USE_DUFFS_LOOP
            /* *INDENT-OFF* */
            DUFFS_LOOP(
                RGB888_RGB332(Pixel, *src);
                *dst++ = map[Pixel];
                ++src;
            , width);
            /* *INDENT-ON* */
#else
            for (c = width / 4; c; --c) {
                /* Pack RGB into 8bit pixel */
                RGB888_RGB332(Pixel, *src);
                *dst++ = map[Pixel];
                ++src;
                RGB888_RGB332(Pixel, *src);
                *dst++ = map[Pixel];
                ++src;
                RGB888_RGB332(Pixel, *src);
                *dst++ = map[Pixel];
                ++src;
                RGB888_RGB332(Pixel, *src);
                *dst++ = map[Pixel];
                ++src;
            }
            switch (width & 3) {
            case 3:
                RGB888_RGB332(Pixel, *src);
                *dst++ = map[Pixel];
                ++src;
                /* fallthrough */
            case 2:
                RGB888_RGB332(Pixel, *src);
                *dst++ = map[Pixel];
                ++src;
                /* fallthrough */
            case 1:
                RGB888_RGB332(Pixel, *src);
                *dst++ = map[Pixel];
                ++src;
            }
#endif /* USE_DUFFS_LOOP */
            src += srcskip;
            dst += dstskip;
        }
    }
}
991
992
993
/* Special optimized blit for RGB 8-8-8 --> RGB 5-5-5 */
/* Packs the top 5 bits of each channel of a 32-bit 00RRGGBB pixel into
   a 16-bit 0RRRRRGGGGGBBBBB value stored through dst. */
#define RGB888_RGB555(dst, src) { \
    *(Uint16 *)(dst) = (Uint16)((((*src)&0x00F80000)>>9)| \
                                (((*src)&0x0000F800)>>6)| \
                                (((*src)&0x000000F8)>>3)); \
}
#define RGB888_RGB555_TWO(dst, src) { \
*(Uint32 *)(dst) = (((((src[HI])&0x00F80000)>>9)| \
(((src[HI])&0x0000F800)>>6)| \