src/video/SDL_blit_A.c
changeset 7640 38284657fc79
parent 7502 6ff02ff3cf06
child 7641 0cd36d20df2b
equal deleted inserted replaced
7639:9406b7dd2f2d 7640:38284657fc79
   335     int dstskip = info->dst_skip >> 2;
   335     int dstskip = info->dst_skip >> 2;
   336     SDL_PixelFormat *sf = info->src_fmt;
   336     SDL_PixelFormat *sf = info->src_fmt;
   337     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   337     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   338     Uint32 amask = sf->Amask;
   338     Uint32 amask = sf->Amask;
   339     Uint32 ashift = sf->Ashift;
   339     Uint32 ashift = sf->Ashift;
   340     Uint64 multmask;
   340     Uint64 multmask, multmask2;
   341 
   341 
   342     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   342     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
   343 
   343 
   344     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   344     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   345     multmask = 0xFFFF;
   345     multmask = 0x00FF;
   346     multmask <<= (ashift * 2);
   346 	multmask <<= (ashift * 2);
   347     multmask = ~multmask;
   347 	multmask2 = 0x00FF00FF00FF00FF;
   348     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
       
   349 
   348 
   350     while (height--) {
   349     while (height--) {
   351 		/* *INDENT-OFF* */
   350 		/* *INDENT-OFF* */
   352 		DUFFS_LOOP4({
   351 		DUFFS_LOOP4({
   353 		Uint32 alpha = *srcp & amask;
   352 		Uint32 alpha = *srcp & amask;
   354 		if (alpha == 0) {
   353 		if (alpha == 0) {
   355 			/* do nothing */
   354 			/* do nothing */
   356 		} else if (alpha == amask) {
   355 		} else if (alpha == amask || (*dstp & amask) == 0) {
   357 			/* opaque alpha -- copy RGB, keep dst alpha */
   356 			*dstp = *srcp;
   358 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
       
   359 		} else {
   357 		} else {
   360 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   358 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   361 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   359 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   362 
   360 
   363 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   361 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   364 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   362 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   365 
   363 
   366 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   364 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   367 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   365 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   368 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   366 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   369 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   367 			mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
   370 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   368 			mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);	/* 0F0A0A0A -> mm_alpha*/
       
   369 			mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);	/* 255 - alpha (per 16-bit lane) -> mm_alpha2 */
   371 
   370 
   372 			/* blend */		    
   371 			/* blend */		    
   373 			src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
   372 			src1 = _mm_mullo_pi16(src1, mm_alpha);
   374 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
   373 			src1 = _mm_srli_pi16(src1, 8);
   375 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   374 			dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
   376 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
   375 			dst1 = _mm_srli_pi16(dst1, 8);
   377 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   376 			dst1 = _mm_add_pi16(src1, dst1);
       
   377 			dst1 = _mm_packs_pu16(dst1, mm_zero);
   378 			
   378 			
   379 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   379 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   380 		}
   380 		}
   381 		++srcp;
   381 		++srcp;
   382 		++dstp;
   382 		++dstp;
   479 		Uint32 alpha = s >> 24;
   479 		Uint32 alpha = s >> 24;
   480 		/* FIXME: Here we special-case opaque alpha since the
   480 		/* FIXME: Here we special-case opaque alpha since the
   481 		   compositioning used (>>8 instead of /255) doesn't handle
   481 		   compositioning used (>>8 instead of /255) doesn't handle
   482 		   it correctly. Also special-case alpha=0 for speed?
   482 		   it correctly. Also special-case alpha=0 for speed?
   483 		   Benchmark this! */
   483 		   Benchmark this! */
   484 		if(alpha) {   
   484 		if (alpha) {
   485 		  if(alpha == SDL_ALPHA_OPAQUE) {
   485 		  if (alpha == SDL_ALPHA_OPAQUE) {
   486 		    *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
   486 			  *dstp = *srcp;
   487 		  } else {
   487 		  } else {
   488 		    /*
   488 		    /*
   489 		     * take out the middle component (green), and process
   489 		     * take out the middle component (green), and process
   490 		     * the other two in parallel. One multiply less.
   490 		     * the other two in parallel. One multiply less.
   491 		     */
   491 		     */
   492 		    d = *dstp;
   492 		    d = *dstp;
   493 		    dalpha = d & 0xff000000;
   493 			dalpha = d >> 24;
   494 		    s1 = s & 0xff00ff;
   494 		    s1 = s & 0xff00ff;
   495 		    d1 = d & 0xff00ff;
   495 		    d1 = d & 0xff00ff;
   496 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
   496 		    d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
   497 		    s &= 0xff00;
   497 		    s &= 0xff00;
   498 		    d &= 0xff00;
   498 		    d &= 0xff00;
   499 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   499 		    d = (d + ((s - d) * alpha >> 8)) & 0xff00;
   500 		    *dstp = d1 | d | dalpha;
   500 			dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
       
   501 		    *dstp = d1 | d | (dalpha << 24);
   501 		  }
   502 		  }
   502 		}
   503 		}
   503 		++srcp;
   504 		++srcp;
   504 		++dstp;
   505 		++dstp;
   505 	    }, width);
   506 	    }, width);
   522     int dstskip = info->dst_skip >> 2;
   523     int dstskip = info->dst_skip >> 2;
   523     SDL_PixelFormat *sf = info->src_fmt;
   524     SDL_PixelFormat *sf = info->src_fmt;
   524     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   525     Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
   525     Uint32 amask = sf->Amask;
   526     Uint32 amask = sf->Amask;
   526     Uint32 ashift = sf->Ashift;
   527     Uint32 ashift = sf->Ashift;
   527     Uint64 multmask;
   528     Uint64 multmask, multmask2;
   528 
   529 
   529     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   530     __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
   530 
   531 
   531     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   532     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   532     multmask = 0xFFFF;
   533     multmask = 0x00FF;
   533     multmask <<= (ashift * 2);
   534     multmask <<= (ashift * 2);
   534     multmask = ~multmask;
   535     multmask2 = 0x00FF00FF00FF00FF;
   535     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
       
   536 
   536 
   537     while (height--) {
   537     while (height--) {
   538 	    /* *INDENT-OFF* */
   538 	    /* *INDENT-OFF* */
   539 	    DUFFS_LOOP4({
   539 	    DUFFS_LOOP4({
   540 		Uint32 alpha;
   540 		Uint32 alpha;
   543 		_m_prefetch(dstp + 16);
   543 		_m_prefetch(dstp + 16);
   544 
   544 
   545 		alpha = *srcp & amask;
   545 		alpha = *srcp & amask;
   546 		if (alpha == 0) {
   546 		if (alpha == 0) {
   547 			/* do nothing */
   547 			/* do nothing */
   548 		} else if (alpha == amask) {
   548 		} else if (alpha == amask || (*dstp & amask) == 0) {
   549 			/* copy RGB, keep dst alpha */
   549 			*dstp = *srcp;
   550 			*dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
       
   551 		} else {
   550 		} else {
   552 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   551 			src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
   553 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   552 			src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
   554 
   553 
   555 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   554 			dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
   556 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   555 			dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   557 
   556 
   558 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   557 			mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
   559 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   558 			mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
   560 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   559 			mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
   561 			mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
   560 			mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
   562 			mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
   561 			mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask);	/* 0F0A0A0A -> mm_alpha*/
       
   562 			mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2);	/* 255 - alpha (per 16-bit lane) -> mm_alpha2 */
       
   563 
   563 
   564 
   564 			/* blend */		    
   565 			/* blend */		    
   565 			src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
   566 			src1 = _mm_mullo_pi16(src1, mm_alpha);
   566 			src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
   567 			src1 = _mm_srli_pi16(src1, 8);
   567 			src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
   568 			dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
   568 			dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
   569 			dst1 = _mm_srli_pi16(dst1, 8);
   569 			dst1 = _mm_packs_pu16(dst1, mm_zero);  /* 0000ARGB -> dst1 */
   570 			dst1 = _mm_add_pi16(src1, dst1);
       
   571 			dst1 = _mm_packs_pu16(dst1, mm_zero);
   570 			
   572 			
   571 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   573 			*dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   572 		}
   574 		}
   573 		++srcp;
   575 		++srcp;
   574 		++dstp;
   576 		++dstp;