1.1 --- a/src/video/SDL_blit_A.c Wed Aug 14 23:30:10 2013 -0700
1.2 +++ b/src/video/SDL_blit_A.c Fri Aug 16 06:59:19 2013 -0700
1.3 @@ -337,15 +337,14 @@
1.4 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1.5 Uint32 amask = sf->Amask;
1.6 Uint32 ashift = sf->Ashift;
1.7 - Uint64 multmask;
1.8 + Uint64 multmask, multmask2;
1.9
1.10 - __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1.11 + __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
1.12
1.13 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1.14 - multmask = 0xFFFF;
1.15 - multmask <<= (ashift * 2);
1.16 - multmask = ~multmask;
1.17 - dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
1.18 + multmask = 0x00FF;
1.19 + multmask <<= (ashift * 2);
1.20 + multmask2 = 0x00FF00FF00FF00FF;
1.21
1.22 while (height--) {
1.23 /* *INDENT-OFF* */
1.24 @@ -353,9 +352,8 @@
1.25 Uint32 alpha = *srcp & amask;
1.26 if (alpha == 0) {
1.27 /* do nothing */
1.28 - } else if (alpha == amask) {
1.29 - /* opaque alpha -- copy RGB, keep dst alpha */
1.30 - *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1.31 + } else if (alpha == amask || (*dstp & amask) == 0) {
1.32 + *dstp = *srcp;
1.33 } else {
1.34 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1.35 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1.36 @@ -366,15 +364,17 @@
1.37 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1.38 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1.39 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1.40 - mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1.41 - mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1.42 + mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
1.43 + mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha*/
1.44 + mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha2 -> mm_alpha2*/
1.45
1.46 /* blend */
1.47 - src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
1.48 - src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
1.49 - src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1.50 - dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
1.51 - dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1.52 + src1 = _mm_mullo_pi16(src1, mm_alpha);
1.53 + src1 = _mm_srli_pi16(src1, 8);
1.54 + dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
1.55 + dst1 = _mm_srli_pi16(dst1, 8);
1.56 + dst1 = _mm_add_pi16(src1, dst1);
1.57 + dst1 = _mm_packs_pu16(dst1, mm_zero);
1.58
1.59 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1.60 }
1.61 @@ -481,23 +481,24 @@
1.62 compositioning used (>>8 instead of /255) doesn't handle
1.63 it correctly. Also special-case alpha=0 for speed?
1.64 Benchmark this! */
1.65 - if(alpha) {
1.66 - if(alpha == SDL_ALPHA_OPAQUE) {
1.67 - *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
1.68 + if (alpha) {
1.69 + if (alpha == SDL_ALPHA_OPAQUE) {
1.70 + *dstp = *srcp;
1.71 } else {
1.72 /*
1.73 * take out the middle component (green), and process
1.74 * the other two in parallel. One multiply less.
1.75 */
1.76 d = *dstp;
1.77 - dalpha = d & 0xff000000;
1.78 + dalpha = d >> 24;
1.79 s1 = s & 0xff00ff;
1.80 d1 = d & 0xff00ff;
1.81 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
1.82 s &= 0xff00;
1.83 d &= 0xff00;
1.84 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
1.85 - *dstp = d1 | d | dalpha;
1.86 + dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
1.87 + *dstp = d1 | d | (dalpha << 24);
1.88 }
1.89 }
1.90 ++srcp;
1.91 @@ -524,15 +525,14 @@
1.92 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1.93 Uint32 amask = sf->Amask;
1.94 Uint32 ashift = sf->Ashift;
1.95 - Uint64 multmask;
1.96 + Uint64 multmask, multmask2;
1.97
1.98 - __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1.99 + __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
1.100
1.101 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1.102 - multmask = 0xFFFF;
1.103 + multmask = 0x00FF;
1.104 multmask <<= (ashift * 2);
1.105 - multmask = ~multmask;
1.106 - dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
1.107 + multmask2 = 0x00FF00FF00FF00FF;
1.108
1.109 while (height--) {
1.110 /* *INDENT-OFF* */
1.111 @@ -545,9 +545,8 @@
1.112 alpha = *srcp & amask;
1.113 if (alpha == 0) {
1.114 /* do nothing */
1.115 - } else if (alpha == amask) {
1.116 - /* copy RGB, keep dst alpha */
1.117 - *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1.118 + } else if (alpha == amask || (*dstp & amask) == 0) {
1.119 + *dstp = *srcp;
1.120 } else {
1.121 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1.122 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1.123 @@ -558,15 +557,18 @@
1.124 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1.125 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1.126 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1.127 - mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1.128 - mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1.129 + mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
1.130 + mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha*/
1.131 + mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha2 -> mm_alpha2*/
1.132 +
1.133
1.134 /* blend */
1.135 - src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1.136 - src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1.137 - src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1.138 - dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1.139 - dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1.140 + src1 = _mm_mullo_pi16(src1, mm_alpha);
1.141 + src1 = _mm_srli_pi16(src1, 8);
1.142 + dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
1.143 + dst1 = _mm_srli_pi16(dst1, 8);
1.144 + dst1 = _mm_add_pi16(src1, dst1);
1.145 + dst1 = _mm_packs_pu16(dst1, mm_zero);
1.146
1.147 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1.148 }