Fixed alpha blending for the MMX blit functions
author Sam Lantinga <slouken@libsdl.org>
Fri, 16 Aug 2013 06:59:19 -0700
changeset 7640 38284657fc79
parent 7639 9406b7dd2f2d
child 7641 0cd36d20df2b
Fixed alpha blending for the MMX blit functions

The Remarks for the function SDL_BlitSurface state that "when SDL_BLENDMODE_BLEND, we have dstA = srcA + (dstA * (1-srcA))". However, I tested some pictures, and the result implies that "dstA = srcA" is what actually happens. I stepped into the source code and found that, after I set SDL_BLENDMODE_BLEND for the source surface, the final blit function is BlitRGBtoRGBPixelAlphaMMX when I use SDL_BlitSurface on my computer. There I found this code:

else if (alpha == amask) {
/* opaque alpha -- copy RGB, keep dst alpha */
*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);

The same code is used in BlitRGBtoRGBPixelAlphaMMX3DNOW and BlitRGBtoRGBPixelAlpha. So I think they still keep dst alpha.

Best regards,
Jianyu Guan
src/video/SDL_blit_A.c
     1.1 --- a/src/video/SDL_blit_A.c	Wed Aug 14 23:30:10 2013 -0700
     1.2 +++ b/src/video/SDL_blit_A.c	Fri Aug 16 06:59:19 2013 -0700
     1.3 @@ -337,15 +337,14 @@
     1.4      Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
     1.5      Uint32 amask = sf->Amask;
     1.6      Uint32 ashift = sf->Ashift;
     1.7 -    Uint64 multmask;
     1.8 +    Uint64 multmask, multmask2;
     1.9  
    1.10 -    __m64 src1, dst1, mm_alpha, mm_zero, dmask;
    1.11 +    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
    1.12  
    1.13      mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
    1.14 -    multmask = 0xFFFF;
    1.15 -    multmask <<= (ashift * 2);
    1.16 -    multmask = ~multmask;
    1.17 -    dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
    1.18 +    multmask = 0x00FF;
    1.19 +	multmask <<= (ashift * 2);
    1.20 +	multmask2 = 0x00FF00FF00FF00FF;
    1.21  
    1.22      while (height--) {
    1.23  		/* *INDENT-OFF* */
    1.24 @@ -353,9 +352,8 @@
    1.25  		Uint32 alpha = *srcp & amask;
    1.26  		if (alpha == 0) {
    1.27  			/* do nothing */
    1.28 -		} else if (alpha == amask) {
    1.29 -			/* opaque alpha -- copy RGB, keep dst alpha */
    1.30 -			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
    1.31 +		} else if (alpha == amask || (*dstp & amask) == 0) {
    1.32 +			*dstp = *srcp;
    1.33  		} else {
    1.34  			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
    1.35  			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
    1.36 @@ -366,15 +364,17 @@
    1.37  			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
    1.38  			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
    1.39  			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
    1.40 -			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
    1.41 -			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
    1.42 +			mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
    1.43 +			mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);	/* 0F0A0A0A -> mm_alpha*/
    1.44 +			mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);	/* 255 - mm_alpha -> mm_alpha*/
    1.45  
    1.46  			/* blend */		    
    1.47 -			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
    1.48 -			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
    1.49 -			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
    1.50 -			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
    1.51 -			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
    1.52 +			src1 = _mm_mullo_pi16(src1, mm_alpha);
    1.53 +			src1 = _mm_srli_pi16(src1, 8);
    1.54 +			dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
    1.55 +			dst1 = _mm_srli_pi16(dst1, 8);
    1.56 +			dst1 = _mm_add_pi16(src1, dst1);
    1.57 +			dst1 = _mm_packs_pu16(dst1, mm_zero);
    1.58  			
    1.59  			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
    1.60  		}
    1.61 @@ -481,23 +481,24 @@
    1.62  		   compositioning used (>>8 instead of /255) doesn't handle
    1.63  		   it correctly. Also special-case alpha=0 for speed?
    1.64  		   Benchmark this! */
    1.65 -		if(alpha) {   
    1.66 -		  if(alpha == SDL_ALPHA_OPAQUE) {
    1.67 -		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
    1.68 +		if (alpha) {
    1.69 +		  if (alpha == SDL_ALPHA_OPAQUE) {
    1.70 +			  *dstp = *srcp;
    1.71  		  } else {
    1.72  		    /*
    1.73  		     * take out the middle component (green), and process
    1.74  		     * the other two in parallel. One multiply less.
    1.75  		     */
    1.76  		    d = *dstp;
    1.77 -		    dalpha = d & 0xff000000;
    1.78 +			dalpha = d >> 24;
    1.79  		    s1 = s & 0xff00ff;
    1.80  		    d1 = d & 0xff00ff;
    1.81  		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
    1.82  		    s &= 0xff00;
    1.83  		    d &= 0xff00;
    1.84  		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
    1.85 -		    *dstp = d1 | d | dalpha;
    1.86 +			dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
    1.87 +		    *dstp = d1 | d | (dalpha << 24);
    1.88  		  }
    1.89  		}
    1.90  		++srcp;
    1.91 @@ -524,15 +525,14 @@
    1.92      Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
    1.93      Uint32 amask = sf->Amask;
    1.94      Uint32 ashift = sf->Ashift;
    1.95 -    Uint64 multmask;
    1.96 +    Uint64 multmask, multmask2;
    1.97  
    1.98 -    __m64 src1, dst1, mm_alpha, mm_zero, dmask;
    1.99 +    __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
   1.100  
   1.101      mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   1.102 -    multmask = 0xFFFF;
   1.103 +    multmask = 0x00FF;
   1.104      multmask <<= (ashift * 2);
   1.105 -    multmask = ~multmask;
   1.106 -    dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   1.107 +    multmask2 = 0x00FF00FF00FF00FF;
   1.108  
   1.109      while (height--) {
   1.110  	    /* *INDENT-OFF* */
   1.111 @@ -545,9 +545,8 @@
   1.112  		alpha = *srcp & amask;
   1.113  		if (alpha == 0) {
   1.114  			/* do nothing */
   1.115 -		} else if (alpha == amask) {
   1.116 -			/* copy RGB, keep dst alpha */
   1.117 -			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
   1.118 +		} else if (alpha == amask || (*dstp & amask) == 0) {
   1.119 +			*dstp = *srcp;
   1.120  		} else {
   1.121  			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   1.122  			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   1.123 @@ -558,15 +557,18 @@
   1.124  			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   1.125  			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   1.126  			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   1.127 -			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   1.128 -			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   1.129 +			mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
   1.130 +			mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);	/* 0F0A0A0A -> mm_alpha*/
   1.131 +			mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);	/* 255 - mm_alpha -> mm_alpha*/
   1.132 +
   1.133  
   1.134  			/* blend */		    
   1.135 -			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
   1.136 -			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
   1.137 -			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   1.138 -			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
   1.139 -			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   1.140 +			src1 = _mm_mullo_pi16(src1, mm_alpha);
   1.141 +			src1 = _mm_srli_pi16(src1, 8);
   1.142 +			dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
   1.143 +			dst1 = _mm_srli_pi16(dst1, 8);
   1.144 +			dst1 = _mm_add_pi16(src1, dst1);
   1.145 +			dst1 = _mm_packs_pu16(dst1, mm_zero);
   1.146  			
   1.147  			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   1.148  		}