/* SDL_blit_N.c */
/*
    SDL - Simple DirectMedia Layer
    Copyright (C) 1997-2010 Sam Lantinga

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

    Sam Lantinga
    slouken@libsdl.org
*/
22
#include "SDL_config.h"
23
24
#include "SDL_video.h"
25
#include "SDL_endian.h"
26
#include "SDL_cpuinfo.h"
27
#include "SDL_blit.h"
28
29
30
/* Functions to blit from N-bit surfaces to other surfaces */
31
#if SDL_ALTIVEC_BLITTERS
32
33
34
#if __MWERKS__
#pragma altivec_model on
#endif
35
#ifdef HAVE_ALTIVEC_H
36
#include <altivec.h>
37
#endif
38
#define assert(X)
39
#ifdef __MACOSX__
40
#include <sys/sysctl.h>
41
42
static size_t
GetL3CacheSize(void)
43
44
45
{
const char key[] = "hw.l3cachesize";
u_int64_t result = 0;
46
size_t typeSize = sizeof(result);
47
48
49
50
51
int err = sysctlbyname(key, &result, &typeSize, NULL, 0);
if (0 != err)
return 0;
52
53
54
55
return result;
}
#else
56
57
static size_t
GetL3CacheSize(void)
58
59
60
61
{
/* XXX: Just guess G4 */
return 2097152;
}
62
#endif /* __MACOSX__ */
63
64
#if (defined(__MACOSX__) && (__GNUC__ < 4))
65
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
66
(vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
67
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
68
69
(vector unsigned short) ( a,b,c,d,e,f,g,h )
#else
70
#define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
71
(vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
72
#define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
73
74
75
(vector unsigned short) { a,b,c,d,e,f,g,h }
#endif
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
#define VSWIZZLE32(a,b,c,d) (vector unsigned char) \
( 0x00+a, 0x00+b, 0x00+c, 0x00+d, \
0x04+a, 0x04+b, 0x04+c, 0x04+d, \
0x08+a, 0x08+b, 0x08+c, 0x08+d, \
0x0C+a, 0x0C+b, 0x0C+c, 0x0C+d )
#define MAKE8888(dstfmt, r, g, b, a) \
( ((r<<dstfmt->Rshift)&dstfmt->Rmask) | \
((g<<dstfmt->Gshift)&dstfmt->Gmask) | \
((b<<dstfmt->Bshift)&dstfmt->Bmask) | \
((a<<dstfmt->Ashift)&dstfmt->Amask) )
/*
* Data Stream Touch...Altivec cache prefetching.
*
* Don't use this on a G5...however, the speed boost is very significant
* on a G4.
*/
#define DST_CHAN_SRC 1
#define DST_CHAN_DEST 2
/* macro to set DST control word value... */
#define DST_CTRL(size, count, stride) \
(((size) << 24) | ((count) << 16) | (stride))
#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
? vec_lvsl(0, src) \
: vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
/* Calculate the permute vector used for 32->32 swizzling */
107
108
static vector unsigned char
calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
109
110
{
/*
111
* We have to assume that the bits that aren't used by other
112
113
114
115
* colors is alpha, and it's one complete byte, since some formats
* leave alpha with a zero mask, but we should still swizzle the bits.
*/
/* ARGB */
116
const static const struct SDL_PixelFormat default_pixel_format = {
117
NULL, 32, 4,
118
119
0, 0, 0, 0,
16, 8, 0, 24,
120
0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000
121
};
122
123
124
125
126
127
if (!srcfmt) {
srcfmt = &default_pixel_format;
}
if (!dstfmt) {
dstfmt = &default_pixel_format;
}
128
129
130
131
132
const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
0x04, 0x04, 0x04, 0x04,
0x08, 0x08, 0x08, 0x08,
0x0C, 0x0C, 0x0C,
0x0C);
133
134
135
136
137
138
139
140
141
vector unsigned char vswiz;
vector unsigned int srcvec;
#define RESHIFT(X) (3 - ((X) >> 3))
Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
Uint32 amask;
/* Use zero for alpha if either surface doesn't have alpha */
if (dstfmt->Amask) {
142
amask =
143
144
((srcfmt->Amask) ? RESHIFT(srcfmt->
Ashift) : 0x10) << (dstfmt->Ashift);
145
146
147
148
149
150
151
152
153
} else {
amask =
0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
0xFFFFFFFF);
}
#undef RESHIFT
((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
return (vswiz);
154
155
}
156
157
158
159
static void Blit_RGB888_RGB565(SDL_BlitInfo * info);
static void
Blit_RGB888_RGB565Altivec(SDL_BlitInfo * info)
{
160
161
int height = info->dst_h;
Uint8 *src = (Uint8 *) info->src;
162
int srcskip = info->src_skip;
163
Uint8 *dst = (Uint8 *) info->dst;
164
int dstskip = info->dst_skip;
165
SDL_PixelFormat *srcfmt = info->src_fmt;
166
167
vector unsigned char valpha = vec_splat_u8(0);
vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
168
169
170
171
vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
0x00, 0x0a, 0x00, 0x0e,
0x00, 0x12, 0x00, 0x16,
0x00, 0x1a, 0x00, 0x1e);
172
173
vector unsigned short v1 = vec_splat_u16(1);
vector unsigned short v3 = vec_splat_u16(3);
174
175
176
177
178
179
180
vector unsigned short v3f =
VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
0x003f, 0x003f, 0x003f, 0x003f);
vector unsigned short vfc =
VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
0x00fc, 0x00fc, 0x00fc, 0x00fc);
vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
181
182
183
184
185
186
187
vf800 = vec_sl(vf800, vec_splat_u16(8));
while (height--) {
vector unsigned char valigner;
vector unsigned char voverflow;
vector unsigned char vsrc;
188
int width = info->dst_w;
189
190
191
192
193
int extrawidth;
/* do scalar until we can align... */
#define ONE_PIXEL_BLEND(condition, widthvar) \
while (condition) { \
194
Uint32 Pixel; \
195
unsigned sR, sG, sB, sA; \
196
DISEMBLE_RGBA((Uint8 *)src, 4, srcfmt, Pixel, \
197
198
199
200
201
202
203
204
205
206
207
208
sR, sG, sB, sA); \
*(Uint16 *)(dst) = (((sR << 8) & 0x0000F800) | \
((sG << 3) & 0x000007E0) | \
((sB >> 3) & 0x0000001F)); \
dst += 2; \
src += 4; \
widthvar--; \
}
ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
/* After all that work, here's the vector part! */
209
extrawidth = (width % 8); /* trailing unaligned stores */
210
211
212
213
214
215
216
217
218
219
220
width -= extrawidth;
vsrc = vec_ld(0, src);
valigner = VEC_ALIGNER(src);
while (width) {
vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
vector unsigned int vsrc1, vsrc2;
vector unsigned char vdst;
voverflow = vec_ld(15, src);
vsrc = vec_perm(vsrc, voverflow, valigner);
221
vsrc1 = (vector unsigned int) vec_perm(vsrc, valpha, vpermute);
222
223
224
225
src += 16;
vsrc = voverflow;
voverflow = vec_ld(15, src);
vsrc = vec_perm(vsrc, voverflow, valigner);
226
vsrc2 = (vector unsigned int) vec_perm(vsrc, valpha, vpermute);
227
/* 1555 */
228
229
vpixel = (vector unsigned short) vec_packpx(vsrc1, vsrc2);
vgpixel = (vector unsigned short) vec_perm(vsrc1, vsrc2, vgmerge);
230
231
232
233
234
vgpixel = vec_and(vgpixel, vfc);
vgpixel = vec_sl(vgpixel, v3);
vrpixel = vec_sl(vpixel, v1);
vrpixel = vec_and(vrpixel, vf800);
vbpixel = vec_and(vpixel, v3f);
235
236
237
vdst =
vec_or((vector unsigned char) vrpixel,
(vector unsigned char) vgpixel);
238
/* 565 */
239
vdst = vec_or(vdst, (vector unsigned char) vbpixel);
240
241
242
243
244
245
246
247
248
249
250
251
252
253
vec_st(vdst, 0, dst);
width -= 8;
src += 16;
dst += 16;
vsrc = voverflow;
}
assert(width == 0);
/* do scalar until we can align... */
ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
254
src += srcskip; /* move to next row, accounting for pitch. */
255
256
257
258
259
260
dst += dstskip;
}
}
261
262
263
static void
Blit_RGB565_32Altivec(SDL_BlitInfo * info)
{
264
265
int height = info->dst_h;
Uint8 *src = (Uint8 *) info->src;
266
int srcskip = info->src_skip;
267
Uint8 *dst = (Uint8 *) info->dst;
268
int dstskip = info->dst_skip;
269
270
SDL_PixelFormat *srcfmt = info->src_fmt;
SDL_PixelFormat *dstfmt = info->dst_fmt;
271
272
273
274
275
276
277
278
279
unsigned alpha;
vector unsigned char valpha;
vector unsigned char vpermute;
vector unsigned short vf800;
vector unsigned int v8 = vec_splat_u32(8);
vector unsigned int v16 = vec_add(v8, v8);
vector unsigned short v2 = vec_splat_u16(2);
vector unsigned short v3 = vec_splat_u16(3);
/*
280
281
282
283
284
285
286
287
288
289
290
291
292
0x10 - 0x1f is the alpha
0x00 - 0x0e evens are the red
0x01 - 0x0f odds are zero
*/
vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
0x10, 0x02, 0x01, 0x01,
0x10, 0x04, 0x01, 0x01,
0x10, 0x06, 0x01,
0x01);
vector unsigned char vredalpha2 =
(vector unsigned
char) (vec_add((vector unsigned int) vredalpha1, vec_sl(v8, v16))
);
293
/*
294
295
296
297
298
299
300
301
302
303
0x00 - 0x0f is ARxx ARxx ARxx ARxx
0x11 - 0x0f odds are blue
*/
vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
0x04, 0x05, 0x06, 0x13,
0x08, 0x09, 0x0a, 0x15,
0x0c, 0x0d, 0x0e, 0x17);
vector unsigned char vblue2 =
(vector unsigned char) (vec_add((vector unsigned int) vblue1, v8)
);
304
/*
305
306
307
308
309
310
311
312
313
314
315
316
0x00 - 0x0f is ARxB ARxB ARxB ARxB
0x10 - 0x0e evens are green
*/
vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
0x04, 0x05, 0x12, 0x07,
0x08, 0x09, 0x14, 0x0b,
0x0c, 0x0d, 0x16, 0x0f);
vector unsigned char vgreen2 =
(vector unsigned
char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8, v8))
);
317
318
319
320
assert(srcfmt->BytesPerPixel == 2);
assert(dstfmt->BytesPerPixel == 4);
321
vf800 = (vector unsigned short) vec_splat_u8(-7);
322
323
vf800 = vec_sl(vf800, vec_splat_u16(8));
324
325
if (dstfmt->Amask && info->a) {
((unsigned char *) &valpha)[0] = alpha = info->a;
326
327
328
329
330
331
332
333
334
335
336
337
valpha = vec_splat(valpha, 0);
} else {
alpha = 0;
valpha = vec_splat_u8(0);
}
vpermute = calc_swizzle32(NULL, dstfmt);
while (height--) {
vector unsigned char valigner;
vector unsigned char voverflow;
vector unsigned char vsrc;
338
int width = info->dst_w;
339
340
341
342
343
344
int extrawidth;
/* do scalar until we can align... */
#define ONE_PIXEL_BLEND(condition, widthvar) \
while (condition) { \
unsigned sR, sG, sB; \
345
346
347
348
unsigned short Pixel = *((unsigned short *)src); \
sR = (Pixel >> 8) & 0xf8; \
sG = (Pixel >> 3) & 0xfc; \
sB = (Pixel << 3) & 0xf8; \
349
350
351
352
353
354
355
356
ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
src += 2; \
dst += 4; \
widthvar--; \
}
ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
/* After all that work, here's the vector part! */
357
extrawidth = (width % 8); /* trailing unaligned stores */
358
359
360
361
362
363
364
365
366
367
368
width -= extrawidth;
vsrc = vec_ld(0, src);
valigner = VEC_ALIGNER(src);
while (width) {
vector unsigned short vR, vG, vB;
vector unsigned char vdst1, vdst2;
voverflow = vec_ld(15, src);
vsrc = vec_perm(vsrc, voverflow, valigner);
369
370
vR = vec_and((vector unsigned short) vsrc, vf800);
vB = vec_sl((vector unsigned short) vsrc, v3);
371
372
vG = vec_sl(vB, v2);
373
374
375
376
377
vdst1 =
(vector unsigned char) vec_perm((vector unsigned char) vR,
valpha, vredalpha1);
vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
378
379
380
vdst1 = vec_perm(vdst1, valpha, vpermute);
vec_st(vdst1, 0, dst);
381
382
383
384
385
vdst2 =
(vector unsigned char) vec_perm((vector unsigned char) vR,
valpha, vredalpha2);
vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
386
387
vdst2 = vec_perm(vdst2, valpha, vpermute);
vec_st(vdst2, 16, dst);
388
389
390
391
392
393
394
395
396
397
398
399
400
401
width -= 8;
dst += 32;
src += 16;
vsrc = voverflow;
}
assert(width == 0);
/* do scalar until we can align... */
ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
402
src += srcskip; /* move to next row, accounting for pitch. */
403
404
405
406
407
dst += dstskip;
}
}
408
409
410
411
static void
Blit_RGB555_32Altivec(SDL_BlitInfo * info)
{
412
413
int height = info->dst_h;
Uint8 *src = (Uint8 *) info->src;
414
int srcskip = info->src_skip;
415
Uint8 *dst = (Uint8 *) info->dst;
416
int dstskip = info->dst_skip;
417
418
SDL_PixelFormat *srcfmt = info->src_fmt;
SDL_PixelFormat *dstfmt = info->dst_fmt;
419
420
421
422
423
424
425
426
427
unsigned alpha;
vector unsigned char valpha;
vector unsigned char vpermute;
vector unsigned short vf800;
vector unsigned int v8 = vec_splat_u32(8);
vector unsigned int v16 = vec_add(v8, v8);
vector unsigned short v1 = vec_splat_u16(1);
vector unsigned short v3 = vec_splat_u16(3);
/*
428
429
430
431
432
433
434
435
436
437
438
439
440
0x10 - 0x1f is the alpha
0x00 - 0x0e evens are the red
0x01 - 0x0f odds are zero
*/
vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
0x10, 0x02, 0x01, 0x01,
0x10, 0x04, 0x01, 0x01,
0x10, 0x06, 0x01,
0x01);
vector unsigned char vredalpha2 =
(vector unsigned
char) (vec_add((vector unsigned int) vredalpha1, vec_sl(v8, v16))
);
441
/*
442
443
444
445
446
447
448
449
450
451
0x00 - 0x0f is ARxx ARxx ARxx ARxx
0x11 - 0x0f odds are blue
*/
vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
0x04, 0x05, 0x06, 0x13,
0x08, 0x09, 0x0a, 0x15,
0x0c, 0x0d, 0x0e, 0x17);
vector unsigned char vblue2 =
(vector unsigned char) (vec_add((vector unsigned int) vblue1, v8)
);
452
/*
453
454
455
456
457
458
459
460
461
462
463
464
0x00 - 0x0f is ARxB ARxB ARxB ARxB
0x10 - 0x0e evens are green
*/
vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
0x04, 0x05, 0x12, 0x07,
0x08, 0x09, 0x14, 0x0b,
0x0c, 0x0d, 0x16, 0x0f);
vector unsigned char vgreen2 =
(vector unsigned
char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8, v8))
);
465
466
467
468
assert(srcfmt->BytesPerPixel == 2);
assert(dstfmt->BytesPerPixel == 4);
469
vf800 = (vector unsigned short) vec_splat_u8(-7);
470
471
vf800 = vec_sl(vf800, vec_splat_u16(8));
472
473
if (dstfmt->Amask && info->a) {
((unsigned char *) &valpha)[0] = alpha = info->a;
474
475
476
477
478
479
480
481
482
483
484
485
valpha = vec_splat(valpha, 0);
} else {
alpha = 0;
valpha = vec_splat_u8(0);
}
vpermute = calc_swizzle32(NULL, dstfmt);
while (height--) {
vector unsigned char valigner;
vector unsigned char voverflow;
vector unsigned char vsrc;
486
int width = info->dst_w;
487
488
489
490
491
492
int extrawidth;
/* do scalar until we can align... */
#define ONE_PIXEL_BLEND(condition, widthvar) \
while (condition) { \
unsigned sR, sG, sB; \
493
494
495
496
unsigned short Pixel = *((unsigned short *)src); \
sR = (Pixel >> 7) & 0xf8; \
sG = (Pixel >> 2) & 0xf8; \
sB = (Pixel << 3) & 0xf8; \
497
498
499
500
501
502
503
504
ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
src += 2; \
dst += 4; \
widthvar--; \
}
ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
/* After all that work, here's the vector part! */
505
extrawidth = (width % 8); /* trailing unaligned stores */
506
507
508
509
510
511
512
513
514
515
516
width -= extrawidth;
vsrc = vec_ld(0, src);
valigner = VEC_ALIGNER(src);
while (width) {
vector unsigned short vR, vG, vB;
vector unsigned char vdst1, vdst2;
voverflow = vec_ld(15, src);
vsrc = vec_perm(vsrc, voverflow, valigner);
517
518
vR = vec_and(vec_sl((vector unsigned short) vsrc, v1), vf800);
vB = vec_sl((vector unsigned short) vsrc, v3);
519
520
vG = vec_sl(vB, v3);
521
522
523
524
525
vdst1 =
(vector unsigned char) vec_perm((vector unsigned char) vR,
valpha, vredalpha1);
vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
526
527
528
vdst1 = vec_perm(vdst1, valpha, vpermute);
vec_st(vdst1, 0, dst);
529
530
531
532
533
vdst2 =
(vector unsigned char) vec_perm((vector unsigned char) vR,
valpha, vredalpha2);
vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
534
535
vdst2 = vec_perm(vdst2, valpha, vpermute);
vec_st(vdst2, 16, dst);
536
537
538
539
540
541
542
543
544
545
546
547
548
549
width -= 8;
dst += 32;
src += 16;
vsrc = voverflow;
}
assert(width == 0);
/* do scalar until we can align... */
ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
550
src += srcskip; /* move to next row, accounting for pitch. */
551
552
553
554
555
dst += dstskip;
}
}
556
557
558
559
static void BlitNtoNKey(SDL_BlitInfo * info);
static void BlitNtoNKeyCopyAlpha(SDL_BlitInfo * info);
static void
Blit32to32KeyAltivec(SDL_BlitInfo * info)
560
{
561
562
int height = info->dst_h;
Uint32 *srcp = (Uint32 *) info->src;
563
int srcskip = info->src_skip / 4;
564
Uint32 *dstp = (Uint32 *) info->dst;
565
566
int dstskip = info->dst_skip / 4;
SDL_PixelFormat *srcfmt = info->src_fmt;
567
int srcbpp = srcfmt->BytesPerPixel;
568
SDL_PixelFormat *dstfmt = info->dst_fmt;
569
570
int dstbpp = dstfmt->BytesPerPixel;
int copy_alpha = (srcfmt->Amask && dstfmt->Amask);
571
unsigned alpha = dstfmt->Amask ? info->a : 0;
572
Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
573
Uint32 ckey = info->colorkey;
574
575
576
577
578
579
vector unsigned int valpha;
vector unsigned char vpermute;
vector unsigned char vzero;
vector unsigned int vckey;
vector unsigned int vrgbmask;
vpermute = calc_swizzle32(srcfmt, dstfmt);
580
if (info->dst_w < 16) {
581
if (copy_alpha) {
582
BlitNtoNKeyCopyAlpha(info);
583
} else {
584
BlitNtoNKey(info);
585
}
586
return;
587
588
589
}
vzero = vec_splat_u8(0);
if (alpha) {
590
591
592
((unsigned char *) &valpha)[0] = (unsigned char) alpha;
valpha =
(vector unsigned int) vec_splat((vector unsigned char) valpha, 0);
593
} else {
594
valpha = (vector unsigned int) vzero;
595
596
}
ckey &= rgbmask;
597
((unsigned int *) (char *) &vckey)[0] = ckey;
598
vckey = vec_splat(vckey, 0);
599
((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
600
601
602
603
604
605
vrgbmask = vec_splat(vrgbmask, 0);
while (height--) {
#define ONE_PIXEL_BLEND(condition, widthvar) \
if (copy_alpha) { \
while (condition) { \
606
Uint32 Pixel; \
607
unsigned sR, sG, sB, sA; \
608
DISEMBLE_RGBA((Uint8 *)srcp, srcbpp, srcfmt, Pixel, \
609
sR, sG, sB, sA); \
610
if ( (Pixel & rgbmask) != ckey ) { \
611
612
613
ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
sR, sG, sB, sA); \
} \
614
615
dstp = (Uint32 *) (((Uint8 *) dstp) + dstbpp); \
srcp = (Uint32 *) (((Uint8 *) srcp) + srcbpp); \
616
617
618
619
widthvar--; \
} \
} else { \
while (condition) { \
620
Uint32 Pixel; \
621
unsigned sR, sG, sB; \
622
623
624
RETRIEVE_RGB_PIXEL((Uint8 *)srcp, srcbpp, Pixel); \
if ( Pixel != ckey ) { \
RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
625
626
627
ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
sR, sG, sB, alpha); \
} \
628
629
dstp = (Uint32 *) (((Uint8 *)dstp) + dstbpp); \
srcp = (Uint32 *) (((Uint8 *)srcp) + srcbpp); \
630
631
632
widthvar--; \
} \
}
633
int width = info->dst_w;
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
assert(width > 0);
if (width > 0) {
int extrawidth = (width % 4);
vector unsigned char valigner = VEC_ALIGNER(srcp);
vector unsigned int vs = vec_ld(0, srcp);
width -= extrawidth;
assert(width >= 4);
while (width) {
vector unsigned char vsel;
vector unsigned int vd;
vector unsigned int voverflow = vec_ld(15, srcp);
/* load the source vec */
vs = vec_perm(vs, voverflow, valigner);
/* vsel is set for items that match the key */
649
650
vsel = (vector unsigned char) vec_and(vs, vrgbmask);
vsel = (vector unsigned char) vec_cmpeq(vs, vckey);
651
652
653
654
655
/* permute the src vec to the dest format */
vs = vec_perm(vs, valpha, vpermute);
/* load the destination vec */
vd = vec_ld(0, dstp);
/* select the source and dest into vs */
656
657
658
659
vd = (vector unsigned int) vec_sel((vector unsigned char) vs,
(vector unsigned char) vd,
vsel);
660
661
662
663
664
665
666
667
vec_st(vd, 0, dstp);
srcp += 4;
width -= 4;
dstp += 4;
vs = voverflow;
}
ONE_PIXEL_BLEND((extrawidth), extrawidth);
#undef ONE_PIXEL_BLEND
668
669
srcp += srcskip;
dstp += dstskip;
670
671
672
673
674
675
}
}
}
/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
/* Use this on a G5 */
676
677
static void
ConvertAltivec32to32_noprefetch(SDL_BlitInfo * info)
678
{
679
680
int height = info->dst_h;
Uint32 *src = (Uint32 *) info->src;
681
int srcskip = info->src_skip / 4;
682
Uint32 *dst = (Uint32 *) info->dst;
683
684
685
int dstskip = info->dst_skip / 4;
SDL_PixelFormat *srcfmt = info->src_fmt;
SDL_PixelFormat *dstfmt = info->dst_fmt;
686
687
688
vector unsigned int vzero = vec_splat_u32(0);
vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
if (dstfmt->Amask && !srcfmt->Amask) {
689
if (info->a) {
690
vector unsigned char valpha;
691
((unsigned char *) &valpha)[0] = info->a;
692
vzero = (vector unsigned int) vec_splat(valpha, 0);
693
694
695
}
}
696
697
assert(srcfmt->BytesPerPixel == 4);
assert(dstfmt->BytesPerPixel == 4);
698
699
700
701
702
703
704
705
while (height--) {
vector unsigned char valigner;
vector unsigned int vbits;
vector unsigned int voverflow;
Uint32 bits;
Uint8 r, g, b, a;
706
int width = info->dst_w;
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
int extrawidth;
/* do scalar until we can align... */
while ((UNALIGNED_PTR(dst)) && (width)) {
bits = *(src++);
RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
*(dst++) = MAKE8888(dstfmt, r, g, b, a);
width--;
}
/* After all that work, here's the vector part! */
extrawidth = (width % 4);
width -= extrawidth;
valigner = VEC_ALIGNER(src);
vbits = vec_ld(0, src);
723
while (width) {
724
725
726
voverflow = vec_ld(15, src);
src += 4;
width -= 4;
727
728
729
vbits = vec_perm(vbits, voverflow, valigner); /* src is ready. */
vbits = vec_perm(vbits, vzero, vpermute); /* swizzle it. */
vec_st(vbits, 0, dst); /* store it back out. */
730
731
732
733
734
735
736
737
dst += 4;
vbits = voverflow;
}
assert(width == 0);
/* cover pixels at the end of the row that didn't fit in 16 bytes. */
while (extrawidth) {
738
bits = *(src++); /* max 7 pixels, don't bother with prefetch. */
739
740
741
742
743
RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
*(dst++) = MAKE8888(dstfmt, r, g, b, a);
extrawidth--;
}
744
745
src += srcskip;
dst += dstskip;
746
747
748
749
750
751
}
}
/* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
/* Use this on a G4 */
752
753
static void
ConvertAltivec32to32_prefetch(SDL_BlitInfo * info)
754
{
755
756
const int scalar_dst_lead = sizeof(Uint32) * 4;
const int vector_dst_lead = sizeof(Uint32) * 16;
757
758
759
int height = info->dst_h;
Uint32 *src = (Uint32 *) info->src;
760
int srcskip = info->src_skip / 4;
761
Uint32 *dst = (Uint32 *) info->dst;
762
763
764
int dstskip = info->dst_skip / 4;
SDL_PixelFormat *srcfmt = info->src_fmt;
SDL_PixelFormat *dstfmt = info->dst_fmt;
765
766
767
vector unsigned int vzero = vec_splat_u32(0);
vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
if (dstfmt->Amask && !srcfmt->Amask) {
768
if (info->a) {
769
vector unsigned char valpha;
770
((unsigned char *) &valpha)[0] = info->a;
771
vzero = (vector unsigned int) vec_splat(valpha, 0);
772
773
774
}
}
775
776
assert(srcfmt->BytesPerPixel == 4);
assert(dstfmt->BytesPerPixel == 4);
777
778
779
780
781
782
783
784
while (height--) {
vector unsigned char valigner;
vector unsigned int vbits;
vector unsigned int voverflow;
Uint32 bits;
Uint8 r, g, b, a;
785
int width = info->dst_w;
786
787
788
789
int extrawidth;
/* do scalar until we can align... */
while ((UNALIGNED_PTR(dst)) && (width)) {
790
791
792
793
vec_dstt(src + scalar_dst_lead, DST_CTRL(2, 32, 1024),
DST_CHAN_SRC);
vec_dstst(dst + scalar_dst_lead, DST_CTRL(2, 32, 1024),
DST_CHAN_DEST);
794
795
796
797
798
799
800
801
802
803
804
805
806
bits = *(src++);
RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
*(dst++) = MAKE8888(dstfmt, r, g, b, a);
width--;
}
/* After all that work, here's the vector part! */
extrawidth = (width % 4);
width -= extrawidth;
valigner = VEC_ALIGNER(src);
vbits = vec_ld(0, src);
while (width) {
807
808
809
810
vec_dstt(src + vector_dst_lead, DST_CTRL(2, 32, 1024),
DST_CHAN_SRC);
vec_dstst(dst + vector_dst_lead, DST_CTRL(2, 32, 1024),
DST_CHAN_DEST);
811
812
813
voverflow = vec_ld(15, src);
src += 4;
width -= 4;
814
815
816
vbits = vec_perm(vbits, voverflow, valigner); /* src is ready. */
vbits = vec_perm(vbits, vzero, vpermute); /* swizzle it. */
vec_st(vbits, 0, dst); /* store it back out. */
817
818
819
dst += 4;
vbits = voverflow;
}
820
821
822
823
824
assert(width == 0);
/* cover pixels at the end of the row that didn't fit in 16 bytes. */
while (extrawidth) {
825
bits = *(src++); /* max 7 pixels, don't bother with prefetch. */
826
827
828
829
830
RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
*(dst++) = MAKE8888(dstfmt, r, g, b, a);
extrawidth--;
}
831
832
src += srcskip;
dst += dstskip;
833
834
835
836
837
838
}
vec_dss(DST_CHAN_SRC);
vec_dss(DST_CHAN_DEST);
}
839
840
static Uint32
GetBlitFeatures(void)
841
842
843
844
{
static Uint32 features = 0xffffffff;
if (features == 0xffffffff) {
/* Provide an override for testing .. */
845
char *override = SDL_getenv("SDL_ALTIVEC_BLIT_FEATURES");
846
847
if (override) {
features = 0;
848
SDL_sscanf(override, "%u", &features);
849
} else {
850
851
852
853
854
855
856
857
858
features = (0
/* Feature 1 is has-MMX */
| ((SDL_HasMMX())? 1 : 0)
/* Feature 2 is has-AltiVec */
| ((SDL_HasAltiVec())? 2 : 0)
/* Feature 4 is dont-use-prefetch */
/* !!!! FIXME: Check for G5 or later, not the cache size! Always prefetch on a G4. */
| ((GetL3CacheSize() == 0) ? 4 : 0)
);
859
860
861
862
}
}
return features;
}
863
864
865
866
#if __MWERKS__
#pragma altivec_model off
#endif
867
868
869
870
871
#else
/* Feature 1 is has-MMX */
#define GetBlitFeatures() ((Uint32)(SDL_HasMMX() ? 1 : 0))
#endif
872
/* This is now endian dependent */
873
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
874
875
#define HI 1
#define LO 0
876
#else /* SDL_BYTEORDER == SDL_BIG_ENDIAN */
877
878
879
880
#define HI 0
#define LO 1
#endif
881
882
/* Special optimized blit for RGB 8-8-8 --> RGB 3-3-2 */
#define RGB888_RGB332(dst, src) { \
883
884
885
dst = (Uint8)((((src)&0x00E00000)>>16)| \
(((src)&0x0000E000)>>11)| \
(((src)&0x000000C0)>>6)); \
886
}
887
888
static void
Blit_RGB888_index8(SDL_BlitInfo * info)
889
890
{
#ifndef USE_DUFFS_LOOP
891
int c;
892
#endif
893
894
895
896
897
898
899
int width, height;
Uint32 *src;
const Uint8 *map;
Uint8 *dst;
int srcskip, dstskip;
/* Set up some basic variables */
900
901
902
width = info->dst_w;
height = info->dst_h;
src = (Uint32 *) info->src;
903
srcskip = info->src_skip / 4;
904
dst = info->dst;
905
dstskip = info->dst_skip;
906
907
908
909
map = info->table;
if (map == NULL) {
while (height--) {
910
#ifdef USE_DUFFS_LOOP
911
/* *INDENT-OFF* */
912
913
914
DUFFS_LOOP(
RGB888_RGB332(*dst++, *src);
, width);
915
/* *INDENT-ON* */
916
#else
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
for (c = width / 4; c; --c) {
/* Pack RGB into 8bit pixel */
++src;
RGB888_RGB332(*dst++, *src);
++src;
RGB888_RGB332(*dst++, *src);
++src;
RGB888_RGB332(*dst++, *src);
++src;
}
switch (width & 3) {
case 3:
RGB888_RGB332(*dst++, *src);
++src;
case 2:
RGB888_RGB332(*dst++, *src);
++src;
case 1:
RGB888_RGB332(*dst++, *src);
++src;
}
938
#endif /* USE_DUFFS_LOOP */
939
940
941
942
943
src += srcskip;
dst += dstskip;
}
} else {
int Pixel;
944
945
while (height--) {
946
#ifdef USE_DUFFS_LOOP
947
/* *INDENT-OFF* */
948
DUFFS_LOOP(
949
950
RGB888_RGB332(Pixel, *src);
*dst++ = map[Pixel];
951
952
++src;
, width);
953
/* *INDENT-ON* */
954
#else
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
for (c = width / 4; c; --c) {
/* Pack RGB into 8bit pixel */
RGB888_RGB332(Pixel, *src);
*dst++ = map[Pixel];
++src;
RGB888_RGB332(Pixel, *src);
*dst++ = map[Pixel];
++src;
RGB888_RGB332(Pixel, *src);
*dst++ = map[Pixel];
++src;
RGB888_RGB332(Pixel, *src);
*dst++ = map[Pixel];
++src;
}
switch (width & 3) {
case 3:
RGB888_RGB332(Pixel, *src);
*dst++ = map[Pixel];
++src;
case 2:
RGB888_RGB332(Pixel, *src);
*dst++ = map[Pixel];
++src;
case 1:
RGB888_RGB332(Pixel, *src);
*dst++ = map[Pixel];
++src;
}
984
#endif /* USE_DUFFS_LOOP */
985
986
987
988
src += srcskip;
dst += dstskip;
}
}
989
}
990
991
992
/* Special optimized blit for RGB 8-8-8 --> RGB 5-5-5 */
#define RGB888_RGB555(dst, src) { \
993
994
995
*(Uint16 *)(dst) = (Uint16)((((*src)&0x00F80000)>>9)| \
(((*src)&0x0000F800)>>6)| \
(((*src)&0x000000F8)>>3)); \
996
997
998
999
1000
}
#define RGB888_RGB555_TWO(dst, src) { \
*(Uint32 *)(dst) = (((((src[HI])&0x00F80000)>>9)| \
(((src[HI])&0x0000F800)>>6)| \
(((src[HI])&0x000000F8)>>3))<<16)| \