src/video/SDL_blit_A.c
branch SDL-1.3
changeset 1668 4da1ee79c9af
parent 1662 782fd950bd46
child 1720 a1ebb17f9c52
equal deleted inserted replaced
1667:1fddae038bc8 1668:4da1ee79c9af
    45 
    45 
    46 /* Functions to perform alpha blended blitting */
    46 /* Functions to perform alpha blended blitting */
    47 
    47 
    48 /* N->1 blending with per-surface alpha */
    48 /* N->1 blending with per-surface alpha */
    49 static void
    49 static void
    50 BlitNto1SurfaceAlpha (SDL_BlitInfo * info)
    50 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
    51 {
    51 {
    52     int width = info->d_width;
    52     int width = info->d_width;
    53     int height = info->d_height;
    53     int height = info->d_height;
    54     Uint8 *src = info->s_pixels;
    54     Uint8 *src = info->s_pixels;
    55     int srcskip = info->s_skip;
    55     int srcskip = info->s_skip;
   101     }
   101     }
   102 }
   102 }
   103 
   103 
   104 /* N->1 blending with pixel alpha */
   104 /* N->1 blending with pixel alpha */
   105 static void
   105 static void
   106 BlitNto1PixelAlpha (SDL_BlitInfo * info)
   106 BlitNto1PixelAlpha(SDL_BlitInfo * info)
   107 {
   107 {
   108     int width = info->d_width;
   108     int width = info->d_width;
   109     int height = info->d_height;
   109     int height = info->d_height;
   110     Uint8 *src = info->s_pixels;
   110     Uint8 *src = info->s_pixels;
   111     int srcskip = info->s_skip;
   111     int srcskip = info->s_skip;
   157     }
   157     }
   158 }
   158 }
   159 
   159 
   160 /* colorkeyed N->1 blending with per-surface alpha */
   160 /* colorkeyed N->1 blending with per-surface alpha */
   161 static void
   161 static void
   162 BlitNto1SurfaceAlphaKey (SDL_BlitInfo * info)
   162 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
   163 {
   163 {
   164     int width = info->d_width;
   164     int width = info->d_width;
   165     int height = info->d_height;
   165     int height = info->d_height;
   166     Uint8 *src = info->s_pixels;
   166     Uint8 *src = info->s_pixels;
   167     int srcskip = info->s_skip;
   167     int srcskip = info->s_skip;
   217 }
   217 }
   218 
   218 
   219 #if GCC_ASMBLIT
   219 #if GCC_ASMBLIT
   220 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   220 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   221 static void
   221 static void
   222 BlitRGBtoRGBSurfaceAlpha128MMX (SDL_BlitInfo * info)
   222 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   223 {
   223 {
   224     int width = info->d_width;
   224     int width = info->d_width;
   225     int height = info->d_height;
   225     int height = info->d_height;
   226     Uint32 *srcp = (Uint32 *) info->s_pixels;
   226     Uint32 *srcp = (Uint32 *) info->s_pixels;
   227     int srcskip = info->s_skip >> 2;
   227     int srcskip = info->s_skip >> 2;
   229     int dstskip = info->d_skip >> 2;
   229     int dstskip = info->d_skip >> 2;
   230     Uint32 dalpha = info->dst->Amask;
   230     Uint32 dalpha = info->dst->Amask;
   231     Uint8 load[8];
   231     Uint8 load[8];
   232 
   232 
   233     *(Uint64 *) load = 0x00fefefe00fefefeULL;   /* alpha128 mask */
   233     *(Uint64 *) load = 0x00fefefe00fefefeULL;   /* alpha128 mask */
   234     movq_m2r (*load, mm4);      /* alpha128 mask -> mm4 */
   234     movq_m2r(*load, mm4);       /* alpha128 mask -> mm4 */
   235     *(Uint64 *) load = 0x0001010100010101ULL;   /* !alpha128 mask */
   235     *(Uint64 *) load = 0x0001010100010101ULL;   /* !alpha128 mask */
   236     movq_m2r (*load, mm3);      /* !alpha128 mask -> mm3 */
   236     movq_m2r(*load, mm3);       /* !alpha128 mask -> mm3 */
   237     movd_m2r (dalpha, mm7);     /* dst alpha mask */
   237     movd_m2r(dalpha, mm7);      /* dst alpha mask */
   238     punpckldq_r2r (mm7, mm7);   /* dst alpha mask | dst alpha mask -> mm7 */
   238     punpckldq_r2r(mm7, mm7);    /* dst alpha mask | dst alpha mask -> mm7 */
   239     while (height--) {
   239     while (height--) {
   240 		/* *INDENT-OFF* */
   240 		/* *INDENT-OFF* */
   241 		DUFFS_LOOP_DOUBLE2(
   241 		DUFFS_LOOP_DOUBLE2(
   242 		{
   242 		{
   243 			Uint32 s = *srcp++;
   243 			Uint32 s = *srcp++;
   266 		}, width);
   266 		}, width);
   267 		/* *INDENT-ON* */
   267 		/* *INDENT-ON* */
   268         srcp += srcskip;
   268         srcp += srcskip;
   269         dstp += dstskip;
   269         dstp += dstskip;
   270     }
   270     }
   271     emms ();
   271     emms();
   272 }
   272 }
   273 
   273 
   274 /* fast RGB888->(A)RGB888 blending with surface alpha */
   274 /* fast RGB888->(A)RGB888 blending with surface alpha */
   275 static void
   275 static void
   276 BlitRGBtoRGBSurfaceAlphaMMX (SDL_BlitInfo * info)
   276 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   277 {
   277 {
   278     SDL_PixelFormat *df = info->dst;
   278     SDL_PixelFormat *df = info->dst;
   279     unsigned alpha = info->src->alpha;
   279     unsigned alpha = info->src->alpha;
   280 
   280 
   281     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   281     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   282         /* only call a128 version when R,G,B occupy lower bits */
   282         /* only call a128 version when R,G,B occupy lower bits */
   283         BlitRGBtoRGBSurfaceAlpha128MMX (info);
   283         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   284     } else {
   284     } else {
   285         int width = info->d_width;
   285         int width = info->d_width;
   286         int height = info->d_height;
   286         int height = info->d_height;
   287         Uint32 *srcp = (Uint32 *) info->s_pixels;
   287         Uint32 *srcp = (Uint32 *) info->s_pixels;
   288         int srcskip = info->s_skip >> 2;
   288         int srcskip = info->s_skip >> 2;
   289         Uint32 *dstp = (Uint32 *) info->d_pixels;
   289         Uint32 *dstp = (Uint32 *) info->d_pixels;
   290         int dstskip = info->d_skip >> 2;
   290         int dstskip = info->d_skip >> 2;
   291 
   291 
   292         pxor_r2r (mm5, mm5);    /* 0 -> mm5 */
   292         pxor_r2r(mm5, mm5);     /* 0 -> mm5 */
   293         /* form the alpha mult */
   293         /* form the alpha mult */
   294         movd_m2r (alpha, mm4);  /* 0000000A -> mm4 */
   294         movd_m2r(alpha, mm4);   /* 0000000A -> mm4 */
   295         punpcklwd_r2r (mm4, mm4);       /* 00000A0A -> mm4 */
   295         punpcklwd_r2r(mm4, mm4);        /* 00000A0A -> mm4 */
   296         punpckldq_r2r (mm4, mm4);       /* 0A0A0A0A -> mm4 */
   296         punpckldq_r2r(mm4, mm4);        /* 0A0A0A0A -> mm4 */
   297         alpha =
   297         alpha =
   298             (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->
   298             (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->
   299                                                            Bshift);
   299                                                            Bshift);
   300         movd_m2r (alpha, mm0);  /* 00000FFF -> mm0 */
   300         movd_m2r(alpha, mm0);   /* 00000FFF -> mm0 */
   301         punpcklbw_r2r (mm0, mm0);       /* 00FFFFFF -> mm0 */
   301         punpcklbw_r2r(mm0, mm0);        /* 00FFFFFF -> mm0 */
   302         pand_r2r (mm0, mm4);    /* 0A0A0A0A -> mm4, minus 1 chan */
   302         pand_r2r(mm0, mm4);     /* 0A0A0A0A -> mm4, minus 1 chan */
   303         /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
   303         /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
   304         movd_m2r (df->Amask, mm7);      /* dst alpha mask */
   304         movd_m2r(df->Amask, mm7);       /* dst alpha mask */
   305         punpckldq_r2r (mm7, mm7);       /* dst alpha mask | dst alpha mask -> mm7 */
   305         punpckldq_r2r(mm7, mm7);        /* dst alpha mask | dst alpha mask -> mm7 */
   306 
   306 
   307         while (height--) {
   307         while (height--) {
   308 			/* *INDENT-OFF* */
   308 			/* *INDENT-OFF* */
   309 			DUFFS_LOOP_DOUBLE2({
   309 			DUFFS_LOOP_DOUBLE2({
   310 				/* One Pixel Blend */
   310 				/* One Pixel Blend */
   355   			}, width);
   355   			}, width);
   356 			/* *INDENT-ON* */
   356 			/* *INDENT-ON* */
   357             srcp += srcskip;
   357             srcp += srcskip;
   358             dstp += dstskip;
   358             dstp += dstskip;
   359         }
   359         }
   360         emms ();
   360         emms();
   361     }
   361     }
   362 }
   362 }
   363 
   363 
   364 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   364 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   365 static void
   365 static void
   366 BlitRGBtoRGBPixelAlphaMMX (SDL_BlitInfo * info)
   366 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   367 {
   367 {
   368     int width = info->d_width;
   368     int width = info->d_width;
   369     int height = info->d_height;
   369     int height = info->d_height;
   370     Uint32 *srcp = (Uint32 *) info->s_pixels;
   370     Uint32 *srcp = (Uint32 *) info->s_pixels;
   371     int srcskip = info->s_skip >> 2;
   371     int srcskip = info->s_skip >> 2;
   372     Uint32 *dstp = (Uint32 *) info->d_pixels;
   372     Uint32 *dstp = (Uint32 *) info->d_pixels;
   373     int dstskip = info->d_skip >> 2;
   373     int dstskip = info->d_skip >> 2;
   374     SDL_PixelFormat *sf = info->src;
   374     SDL_PixelFormat *sf = info->src;
   375     Uint32 amask = sf->Amask;
   375     Uint32 amask = sf->Amask;
   376 
   376 
   377     pxor_r2r (mm6, mm6);        /* 0 -> mm6 */
   377     pxor_r2r(mm6, mm6);         /* 0 -> mm6 */
   378     /* form multiplication mask */
   378     /* form multiplication mask */
   379     movd_m2r (sf->Amask, mm7);  /* 0000F000 -> mm7 */
   379     movd_m2r(sf->Amask, mm7);   /* 0000F000 -> mm7 */
   380     punpcklbw_r2r (mm7, mm7);   /* FF000000 -> mm7 */
   380     punpcklbw_r2r(mm7, mm7);    /* FF000000 -> mm7 */
   381     pcmpeqb_r2r (mm0, mm0);     /* FFFFFFFF -> mm0 */
   381     pcmpeqb_r2r(mm0, mm0);      /* FFFFFFFF -> mm0 */
   382     movq_r2r (mm0, mm3);        /* FFFFFFFF -> mm3 (for later) */
   382     movq_r2r(mm0, mm3);         /* FFFFFFFF -> mm3 (for later) */
   383     pxor_r2r (mm0, mm7);        /* 00FFFFFF -> mm7 (mult mask) */
   383     pxor_r2r(mm0, mm7);         /* 00FFFFFF -> mm7 (mult mask) */
   384     /* form channel masks */
   384     /* form channel masks */
   385     movq_r2r (mm7, mm0);        /* 00FFFFFF -> mm0 */
   385     movq_r2r(mm7, mm0);         /* 00FFFFFF -> mm0 */
   386     packsswb_r2r (mm6, mm0);    /* 00000FFF -> mm0 (channel mask) */
   386     packsswb_r2r(mm6, mm0);     /* 00000FFF -> mm0 (channel mask) */
   387     packsswb_r2r (mm6, mm3);    /* 0000FFFF -> mm3 */
   387     packsswb_r2r(mm6, mm3);     /* 0000FFFF -> mm3 */
   388     pxor_r2r (mm0, mm3);        /* 0000F000 -> mm3 (~channel mask) */
   388     pxor_r2r(mm0, mm3);         /* 0000F000 -> mm3 (~channel mask) */
   389     /* get alpha channel shift */
   389     /* get alpha channel shift */
   390     movd_m2r (sf->Ashift, mm5); /* Ashift -> mm5 */
   390     movd_m2r(sf->Ashift, mm5);  /* Ashift -> mm5 */
   391 
   391 
   392     while (height--) {
   392     while (height--) {
   393 	    /* *INDENT-OFF* */
   393 	    /* *INDENT-OFF* */
   394 	    DUFFS_LOOP4({
   394 	    DUFFS_LOOP4({
   395 		Uint32 alpha = *srcp & amask;
   395 		Uint32 alpha = *srcp & amask;
   437 	    }, width);
   437 	    }, width);
   438 	    /* *INDENT-ON* */
   438 	    /* *INDENT-ON* */
   439         srcp += srcskip;
   439         srcp += srcskip;
   440         dstp += dstskip;
   440         dstp += dstskip;
   441     }
   441     }
   442     emms ();
   442     emms();
   443 }
   443 }
   444 
   444 
   445 /* End GCC_ASMBLIT */
   445 /* End GCC_ASMBLIT */
   446 
   446 
   447 #elif MSVC_ASMBLIT
   447 #elif MSVC_ASMBLIT
   448 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   448 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
   449 static void
   449 static void
   450 BlitRGBtoRGBSurfaceAlpha128MMX (SDL_BlitInfo * info)
   450 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
   451 {
   451 {
   452     int width = info->d_width;
   452     int width = info->d_width;
   453     int height = info->d_height;
   453     int height = info->d_height;
   454     Uint32 *srcp = (Uint32 *) info->s_pixels;
   454     Uint32 *srcp = (Uint32 *) info->s_pixels;
   455     int srcskip = info->s_skip >> 2;
   455     int srcskip = info->s_skip >> 2;
   457     int dstskip = info->d_skip >> 2;
   457     int dstskip = info->d_skip >> 2;
   458     Uint32 dalpha = info->dst->Amask;
   458     Uint32 dalpha = info->dst->Amask;
   459 
   459 
   460     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   460     __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
   461 
   461 
   462     hmask = _mm_set_pi32 (0x00fefefe, 0x00fefefe);      /* alpha128 mask -> hmask */
   462     hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe);       /* alpha128 mask -> hmask */
   463     lmask = _mm_set_pi32 (0x00010101, 0x00010101);      /* !alpha128 mask -> lmask */
   463     lmask = _mm_set_pi32(0x00010101, 0x00010101);       /* !alpha128 mask -> lmask */
   464     dsta = _mm_set_pi32 (dalpha, dalpha);       /* dst alpha mask -> dsta */
   464     dsta = _mm_set_pi32(dalpha, dalpha);        /* dst alpha mask -> dsta */
   465 
   465 
   466     while (height--) {
   466     while (height--) {
   467         int n = width;
   467         int n = width;
   468         if (n & 1) {
   468         if (n & 1) {
   469             Uint32 s = *srcp++;
   469             Uint32 s = *srcp++;
   478             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   478             dst2 = dst1;        /* 2 x dst -> dst2(ARGBARGB) */
   479 
   479 
   480             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   480             src1 = *(__m64 *) srcp;     /* 2 x src -> src1(ARGBARGB) */
   481             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   481             src2 = src1;        /* 2 x src -> src2(ARGBARGB) */
   482 
   482 
   483             dst2 = _mm_and_si64 (dst2, hmask);  /* dst & mask -> dst2 */
   483             dst2 = _mm_and_si64(dst2, hmask);   /* dst & mask -> dst2 */
   484             src2 = _mm_and_si64 (src2, hmask);  /* src & mask -> src2 */
   484             src2 = _mm_and_si64(src2, hmask);   /* src & mask -> src2 */
   485             src2 = _mm_add_pi32 (src2, dst2);   /* dst2 + src2 -> src2 */
   485             src2 = _mm_add_pi32(src2, dst2);    /* dst2 + src2 -> src2 */
   486             src2 = _mm_srli_pi32 (src2, 1);     /* src2 >> 1 -> src2 */
   486             src2 = _mm_srli_pi32(src2, 1);      /* src2 >> 1 -> src2 */
   487 
   487 
   488             dst1 = _mm_and_si64 (dst1, src1);   /* src & dst -> dst1 */
   488             dst1 = _mm_and_si64(dst1, src1);    /* src & dst -> dst1 */
   489             dst1 = _mm_and_si64 (dst1, lmask);  /* dst1 & !mask -> dst1 */
   489             dst1 = _mm_and_si64(dst1, lmask);   /* dst1 & !mask -> dst1 */
   490             dst1 = _mm_add_pi32 (dst1, src2);   /* src2 + dst1 -> dst1 */
   490             dst1 = _mm_add_pi32(dst1, src2);    /* src2 + dst1 -> dst1 */
   491             dst1 = _mm_or_si64 (dst1, dsta);    /* dsta(full alpha) | dst1 -> dst1 */
   491             dst1 = _mm_or_si64(dst1, dsta);     /* dsta(full alpha) | dst1 -> dst1 */
   492 
   492 
   493             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   493             *(__m64 *) dstp = dst1;     /* dst1 -> 2 x dst pixels */
   494             dstp += 2;
   494             dstp += 2;
   495             srcp += 2;
   495             srcp += 2;
   496         }
   496         }
   497 
   497 
   498         srcp += srcskip;
   498         srcp += srcskip;
   499         dstp += dstskip;
   499         dstp += dstskip;
   500     }
   500     }
   501     _mm_empty ();
   501     _mm_empty();
   502 }
   502 }
   503 
   503 
   504 /* fast RGB888->(A)RGB888 blending with surface alpha */
   504 /* fast RGB888->(A)RGB888 blending with surface alpha */
   505 static void
   505 static void
   506 BlitRGBtoRGBSurfaceAlphaMMX (SDL_BlitInfo * info)
   506 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
   507 {
   507 {
   508     SDL_PixelFormat *df = info->dst;
   508     SDL_PixelFormat *df = info->dst;
   509     Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   509     Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
   510     unsigned alpha = info->src->alpha;
   510     unsigned alpha = info->src->alpha;
   511 
   511 
   512     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   512     if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
   513         /* only call a128 version when R,G,B occupy lower bits */
   513         /* only call a128 version when R,G,B occupy lower bits */
   514         BlitRGBtoRGBSurfaceAlpha128MMX (info);
   514         BlitRGBtoRGBSurfaceAlpha128MMX(info);
   515     } else {
   515     } else {
   516         int width = info->d_width;
   516         int width = info->d_width;
   517         int height = info->d_height;
   517         int height = info->d_height;
   518         Uint32 *srcp = (Uint32 *) info->s_pixels;
   518         Uint32 *srcp = (Uint32 *) info->s_pixels;
   519         int srcskip = info->s_skip >> 2;
   519         int srcskip = info->s_skip >> 2;
   522         Uint32 dalpha = df->Amask;
   522         Uint32 dalpha = df->Amask;
   523         Uint32 amult;
   523         Uint32 amult;
   524 
   524 
   525         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   525         __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
   526 
   526 
   527         mm_zero = _mm_setzero_si64 ();  /* 0 -> mm_zero */
   527         mm_zero = _mm_setzero_si64();   /* 0 -> mm_zero */
   528         /* form the alpha mult */
   528         /* form the alpha mult */
   529         amult = alpha | (alpha << 8);
   529         amult = alpha | (alpha << 8);
   530         amult = amult | (amult << 16);
   530         amult = amult | (amult << 16);
   531         chanmask =
   531         chanmask =
   532             (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->
   532             (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->
   533                                                            Bshift);
   533                                                            Bshift);
   534         mm_alpha = _mm_set_pi32 (0, amult & chanmask);  /* 0000AAAA -> mm_alpha, minus 1 chan */
   534         mm_alpha = _mm_set_pi32(0, amult & chanmask);   /* 0000AAAA -> mm_alpha, minus 1 chan */
   535         mm_alpha = _mm_unpacklo_pi8 (mm_alpha, mm_zero);        /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   535         mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
   536         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   536         /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
   537         dsta = _mm_set_pi32 (dalpha, dalpha);   /* dst alpha mask -> dsta */
   537         dsta = _mm_set_pi32(dalpha, dalpha);    /* dst alpha mask -> dsta */
   538 
   538 
   539         while (height--) {
   539         while (height--) {
   540             int n = width;
   540             int n = width;
   541             if (n & 1) {
   541             if (n & 1) {
   542                 /* One Pixel Blend */
   542                 /* One Pixel Blend */
   543                 src2 = _mm_cvtsi32_si64 (*srcp);        /* src(ARGB) -> src2 (0000ARGB) */
   543                 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
   544                 src2 = _mm_unpacklo_pi8 (src2, mm_zero);        /* 0A0R0G0B -> src2 */
   544                 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
   545 
   545 
   546                 dst1 = _mm_cvtsi32_si64 (*dstp);        /* dst(ARGB) -> dst1 (0000ARGB) */
   546                 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
   547                 dst1 = _mm_unpacklo_pi8 (dst1, mm_zero);        /* 0A0R0G0B -> dst1 */
   547                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
   548 
   548 
   549                 src2 = _mm_sub_pi16 (src2, dst1);       /* src2 - dst2 -> src2 */
   549                 src2 = _mm_sub_pi16(src2, dst1);        /* src2 - dst2 -> src2 */
   550                 src2 = _mm_mullo_pi16 (src2, mm_alpha); /* src2 * alpha -> src2 */
   550                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   551                 src2 = _mm_srli_pi16 (src2, 8); /* src2 >> 8 -> src2 */
   551                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   552                 dst1 = _mm_add_pi8 (src2, dst1);        /* src2 + dst1 -> dst1 */
   552                 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
   553 
   553 
   554                 dst1 = _mm_packs_pu16 (dst1, mm_zero);  /* 0000ARGB -> dst1 */
   554                 dst1 = _mm_packs_pu16(dst1, mm_zero);   /* 0000ARGB -> dst1 */
   555                 dst1 = _mm_or_si64 (dst1, dsta);        /* dsta | dst1 -> dst1 */
   555                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   556                 *dstp = _mm_cvtsi64_si32 (dst1);        /* dst1 -> pixel */
   556                 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
   557 
   557 
   558                 ++srcp;
   558                 ++srcp;
   559                 ++dstp;
   559                 ++dstp;
   560 
   560 
   561                 n--;
   561                 n--;
   563 
   563 
   564             for (n >>= 1; n > 0; --n) {
   564             for (n >>= 1; n > 0; --n) {
   565                 /* Two Pixels Blend */
   565                 /* Two Pixels Blend */
   566                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   566                 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
   567                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   567                 src2 = src1;    /* 2 x src -> src2(ARGBARGB) */
   568                 src1 = _mm_unpacklo_pi8 (src1, mm_zero);        /* low - 0A0R0G0B -> src1 */
   568                 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
   569                 src2 = _mm_unpackhi_pi8 (src2, mm_zero);        /* high - 0A0R0G0B -> src2 */
   569                 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
   570 
   570 
   571                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   571                 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
   572                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   572                 dst2 = dst1;    /* 2 x dst -> dst2(ARGBARGB) */
   573                 dst1 = _mm_unpacklo_pi8 (dst1, mm_zero);        /* low - 0A0R0G0B -> dst1 */
   573                 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
   574                 dst2 = _mm_unpackhi_pi8 (dst2, mm_zero);        /* high - 0A0R0G0B -> dst2 */
   574                 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
   575 
   575 
   576                 src1 = _mm_sub_pi16 (src1, dst1);       /* src1 - dst1 -> src1 */
   576                 src1 = _mm_sub_pi16(src1, dst1);        /* src1 - dst1 -> src1 */
   577                 src1 = _mm_mullo_pi16 (src1, mm_alpha); /* src1 * alpha -> src1 */
   577                 src1 = _mm_mullo_pi16(src1, mm_alpha);  /* src1 * alpha -> src1 */
   578                 src1 = _mm_srli_pi16 (src1, 8); /* src1 >> 8 -> src1 */
   578                 src1 = _mm_srli_pi16(src1, 8);  /* src1 >> 8 -> src1 */
   579                 dst1 = _mm_add_pi8 (src1, dst1);        /* src1 + dst1(dst1) -> dst1 */
   579                 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
   580 
   580 
   581                 src2 = _mm_sub_pi16 (src2, dst2);       /* src2 - dst2 -> src2 */
   581                 src2 = _mm_sub_pi16(src2, dst2);        /* src2 - dst2 -> src2 */
   582                 src2 = _mm_mullo_pi16 (src2, mm_alpha); /* src2 * alpha -> src2 */
   582                 src2 = _mm_mullo_pi16(src2, mm_alpha);  /* src2 * alpha -> src2 */
   583                 src2 = _mm_srli_pi16 (src2, 8); /* src2 >> 8 -> src2 */
   583                 src2 = _mm_srli_pi16(src2, 8);  /* src2 >> 8 -> src2 */
   584                 dst2 = _mm_add_pi8 (src2, dst2);        /* src2 + dst2(dst2) -> dst2 */
   584                 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
   585 
   585 
   586                 dst1 = _mm_packs_pu16 (dst1, dst2);     /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   586                 dst1 = _mm_packs_pu16(dst1, dst2);      /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
   587                 dst1 = _mm_or_si64 (dst1, dsta);        /* dsta | dst1 -> dst1 */
   587                 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
   588 
   588 
   589                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   589                 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
   590 
   590 
   591                 srcp += 2;
   591                 srcp += 2;
   592                 dstp += 2;
   592                 dstp += 2;
   593             }
   593             }
   594             srcp += srcskip;
   594             srcp += srcskip;
   595             dstp += dstskip;
   595             dstp += dstskip;
   596         }
   596         }
   597         _mm_empty ();
   597         _mm_empty();
   598     }
   598     }
   599 }
   599 }
   600 
   600 
   601 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   601 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
   602 static void
   602 static void
   603 BlitRGBtoRGBPixelAlphaMMX (SDL_BlitInfo * info)
   603 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
   604 {
   604 {
   605     int width = info->d_width;
   605     int width = info->d_width;
   606     int height = info->d_height;
   606     int height = info->d_height;
   607     Uint32 *srcp = (Uint32 *) info->s_pixels;
   607     Uint32 *srcp = (Uint32 *) info->s_pixels;
   608     int srcskip = info->s_skip >> 2;
   608     int srcskip = info->s_skip >> 2;
   614     Uint32 ashift = sf->Ashift;
   614     Uint32 ashift = sf->Ashift;
   615     Uint64 multmask;
   615     Uint64 multmask;
   616 
   616 
   617     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   617     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
   618 
   618 
   619     mm_zero = _mm_setzero_si64 ();      /* 0 -> mm_zero */
   619     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
   620     multmask = ~(0xFFFFi 64 << (ashift * 2));
   620     multmask = ~(0xFFFFi 64 << (ashift * 2));
   621     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   621     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
   622 
   622 
   623     while (height--) {
   623     while (height--) {
   624 		/* *INDENT-OFF* */
   624 		/* *INDENT-OFF* */
   656 	    }, width);
   656 	    }, width);
   657 		/* *INDENT-ON* */
   657 		/* *INDENT-ON* */
   658         srcp += srcskip;
   658         srcp += srcskip;
   659         dstp += dstskip;
   659         dstp += dstskip;
   660     }
   660     }
   661     _mm_empty ();
   661     _mm_empty();
   662 }
   662 }
   663 
   663 
   664 /* End MSVC_ASMBLIT */
   664 /* End MSVC_ASMBLIT */
   665 
   665 
   666 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   666 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
   734     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   734     vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
   735 } while (0)
   735 } while (0)
   736 
   736 
   737 /* Calculate the permute vector used for 32->32 swizzling */
   737 /* Calculate the permute vector used for 32->32 swizzling */
   738 static vector unsigned char
   738 static vector unsigned char
   739 calc_swizzle32 (const SDL_PixelFormat * srcfmt,
   739 calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
   740                 const SDL_PixelFormat * dstfmt)
       
   741 {
   740 {
   742     /*
   741     /*
   743      * We have to assume that the bits that aren't used by other
   742      * We have to assume that the bits that aren't used by other
   744      *  colors is alpha, and it's one complete byte, since some formats
   743      *  colors is alpha, and it's one complete byte, since some formats
   745      *  leave alpha with a zero mask, but we should still swizzle the bits.
   744      *  leave alpha with a zero mask, but we should still swizzle the bits.
   756         srcfmt = &default_pixel_format;
   755         srcfmt = &default_pixel_format;
   757     }
   756     }
   758     if (!dstfmt) {
   757     if (!dstfmt) {
   759         dstfmt = &default_pixel_format;
   758         dstfmt = &default_pixel_format;
   760     }
   759     }
   761     const vector unsigned char plus = VECUINT8_LITERAL
   760     const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
   762         (0x00, 0x00, 0x00, 0x00,
   761                                                        0x04, 0x04, 0x04, 0x04,
   763          0x04, 0x04, 0x04, 0x04,
   762                                                        0x08, 0x08, 0x08, 0x08,
   764          0x08, 0x08, 0x08, 0x08,
   763                                                        0x0C, 0x0C, 0x0C,
   765          0x0C, 0x0C, 0x0C, 0x0C);
   764                                                        0x0C);
   766     vector unsigned char vswiz;
   765     vector unsigned char vswiz;
   767     vector unsigned int srcvec;
   766     vector unsigned int srcvec;
   768 #define RESHIFT(X) (3 - ((X) >> 3))
   767 #define RESHIFT(X) (3 - ((X) >> 3))
   769     Uint32 rmask = RESHIFT (srcfmt->Rshift) << (dstfmt->Rshift);
   768     Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
   770     Uint32 gmask = RESHIFT (srcfmt->Gshift) << (dstfmt->Gshift);
   769     Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
   771     Uint32 bmask = RESHIFT (srcfmt->Bshift) << (dstfmt->Bshift);
   770     Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
   772     Uint32 amask;
   771     Uint32 amask;
   773     /* Use zero for alpha if either surface doesn't have alpha */
   772     /* Use zero for alpha if either surface doesn't have alpha */
   774     if (dstfmt->Amask) {
   773     if (dstfmt->Amask) {
   775         amask =
   774         amask =
   776             ((srcfmt->Amask) ? RESHIFT (srcfmt->Ashift) : 0x10) << (dstfmt->
   775             ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->
   777                                                                     Ashift);
   776                                                                    Ashift);
   778     } else {
   777     } else {
   779         amask =
   778         amask =
   780             0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
   779             0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
   781                           0xFFFFFFFF);
   780                           0xFFFFFFFF);
   782     }
   781     }
   783 #undef RESHIFT
   782 #undef RESHIFT
   784     ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
   783     ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
   785     vswiz = vec_add (plus, (vector unsigned char) vec_splat (srcvec, 0));
   784     vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
   786     return (vswiz);
   785     return (vswiz);
   787 }
   786 }
   788 
   787 
   789 static void
   788 static void
   790 Blit32to565PixelAlphaAltivec (SDL_BlitInfo * info)
   789 Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
   791 {
   790 {
   792     int height = info->d_height;
   791     int height = info->d_height;
   793     Uint8 *src = (Uint8 *) info->s_pixels;
   792     Uint8 *src = (Uint8 *) info->s_pixels;
   794     int srcskip = info->s_skip;
   793     int srcskip = info->s_skip;
   795     Uint8 *dst = (Uint8 *) info->d_pixels;
   794     Uint8 *dst = (Uint8 *) info->d_pixels;
   796     int dstskip = info->d_skip;
   795     int dstskip = info->d_skip;
   797     SDL_PixelFormat *srcfmt = info->src;
   796     SDL_PixelFormat *srcfmt = info->src;
   798 
   797 
   799     vector unsigned char v0 = vec_splat_u8 (0);
   798     vector unsigned char v0 = vec_splat_u8(0);
   800     vector unsigned short v8_16 = vec_splat_u16 (8);
   799     vector unsigned short v8_16 = vec_splat_u16(8);
   801     vector unsigned short v1_16 = vec_splat_u16 (1);
   800     vector unsigned short v1_16 = vec_splat_u16(1);
   802     vector unsigned short v2_16 = vec_splat_u16 (2);
   801     vector unsigned short v2_16 = vec_splat_u16(2);
   803     vector unsigned short v3_16 = vec_splat_u16 (3);
   802     vector unsigned short v3_16 = vec_splat_u16(3);
   804     vector unsigned int v8_32 = vec_splat_u32 (8);
   803     vector unsigned int v8_32 = vec_splat_u32(8);
   805     vector unsigned int v16_32 = vec_add (v8_32, v8_32);
   804     vector unsigned int v16_32 = vec_add(v8_32, v8_32);
   806     vector unsigned short v3f =
   805     vector unsigned short v3f =
   807         VECUINT16_LITERAL (0x003f, 0x003f, 0x003f, 0x003f,
   806         VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
   808                            0x003f, 0x003f, 0x003f, 0x003f);
   807                           0x003f, 0x003f, 0x003f, 0x003f);
   809     vector unsigned short vfc =
   808     vector unsigned short vfc =
   810         VECUINT16_LITERAL (0x00fc, 0x00fc, 0x00fc, 0x00fc,
   809         VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
   811                            0x00fc, 0x00fc, 0x00fc, 0x00fc);
   810                           0x00fc, 0x00fc, 0x00fc, 0x00fc);
   812 
   811 
   813     /* 
   812     /* 
   814        0x10 - 0x1f is the alpha
   813        0x10 - 0x1f is the alpha
   815        0x00 - 0x0e evens are the red
   814        0x00 - 0x0e evens are the red
   816        0x01 - 0x0f odds are zero
   815        0x01 - 0x0f odds are zero
   817      */
   816      */
   818     vector unsigned char vredalpha1 =
   817     vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
   819         VECUINT8_LITERAL (0x10, 0x00, 0x01, 0x01,
   818                                                        0x10, 0x02, 0x01, 0x01,
   820                           0x10, 0x02, 0x01, 0x01,
   819                                                        0x10, 0x04, 0x01, 0x01,
   821                           0x10, 0x04, 0x01, 0x01,
   820                                                        0x10, 0x06, 0x01,
   822                           0x10, 0x06, 0x01, 0x01);
   821                                                        0x01);
   823     vector unsigned char vredalpha2 =
   822     vector unsigned char vredalpha2 =
   824         (vector unsigned char) (vec_add ((vector unsigned int) vredalpha1,
   823         (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
   825                                          vec_sl (v8_32, v16_32))
   824                                         vec_sl(v8_32, v16_32))
   826         );
   825         );
   827     /*
   826     /*
   828        0x00 - 0x0f is ARxx ARxx ARxx ARxx
   827        0x00 - 0x0f is ARxx ARxx ARxx ARxx
   829        0x11 - 0x1f odds are blue
   828        0x11 - 0x1f odds are blue
   830      */
   829      */
   831     vector unsigned char vblue1 = VECUINT8_LITERAL (0x00, 0x01, 0x02, 0x11,
   830     vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
   832                                                     0x04, 0x05, 0x06, 0x13,
   831                                                    0x04, 0x05, 0x06, 0x13,
   833                                                     0x08, 0x09, 0x0a, 0x15,
   832                                                    0x08, 0x09, 0x0a, 0x15,
   834                                                     0x0c, 0x0d, 0x0e, 0x17);
   833                                                    0x0c, 0x0d, 0x0e, 0x17);
   835     vector unsigned char vblue2 =
   834     vector unsigned char vblue2 =
   836         (vector unsigned char) (vec_add ((vector unsigned int) vblue1, v8_32)
   835         (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
   837         );
   836         );
   838     /*
   837     /*
   839        0x00 - 0x0f is ARxB ARxB ARxB ARxB
   838        0x00 - 0x0f is ARxB ARxB ARxB ARxB
   839        0x10 - 0x1e evens are green
   838        0x10 - 0x1e evens are green
   841      */
   840      */
   842     vector unsigned char vgreen1 = VECUINT8_LITERAL (0x00, 0x01, 0x10, 0x03,
   841     vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
   843                                                      0x04, 0x05, 0x12, 0x07,
   842                                                     0x04, 0x05, 0x12, 0x07,
   844                                                      0x08, 0x09, 0x14, 0x0b,
   843                                                     0x08, 0x09, 0x14, 0x0b,
   845                                                      0x0c, 0x0d, 0x16, 0x0f);
   844                                                     0x0c, 0x0d, 0x16, 0x0f);
   846     vector unsigned char vgreen2 =
   845     vector unsigned char vgreen2 =
   847         (vector unsigned
   846         (vector unsigned
   848          char) (vec_add ((vector unsigned int) vgreen1, vec_sl (v8_32, v8_32))
   847          char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8_32, v8_32))
   849         );
   848         );
   850     vector unsigned char vgmerge = VECUINT8_LITERAL (0x00, 0x02, 0x00, 0x06,
   849     vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
   851                                                      0x00, 0x0a, 0x00, 0x0e,
   850                                                     0x00, 0x0a, 0x00, 0x0e,
   852                                                      0x00, 0x12, 0x00, 0x16,
   851                                                     0x00, 0x12, 0x00, 0x16,
   853                                                      0x00, 0x1a, 0x00, 0x1e);
   852                                                     0x00, 0x1a, 0x00, 0x1e);
   854     vector unsigned char mergePermute = VEC_MERGE_PERMUTE ();
   853     vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
   855     vector unsigned char vpermute = calc_swizzle32 (srcfmt, NULL);
   854     vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
   856     vector unsigned char valphaPermute =
   855     vector unsigned char valphaPermute =
   857         vec_and (vec_lvsl (0, (int *) NULL), vec_splat_u8 (0xC));
   856         vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
   858 
   857 
   859     vector unsigned short vf800 = (vector unsigned short) vec_splat_u8 (-7);
   858     vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
   860     vf800 = vec_sl (vf800, vec_splat_u16 (8));
   859     vf800 = vec_sl(vf800, vec_splat_u16(8));
   861 
   860 
   862     while (height--) {
   861     while (height--) {
   863         int extrawidth;
   862         int extrawidth;
   864         vector unsigned char valigner;
   863         vector unsigned char valigner;
   865         vector unsigned char vsrc;
   864         vector unsigned char vsrc;
   883             } \
   882             } \
   884             src += 4; \
   883             src += 4; \
   885             dst += 2; \
   884             dst += 2; \
   886             widthvar--; \
   885             widthvar--; \
   887         }
   886         }
   888         ONE_PIXEL_BLEND ((UNALIGNED_PTR (dst)) && (width), width);
   887         ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
   889         extrawidth = (width % 8);
   888         extrawidth = (width % 8);
   890         valigner = VEC_ALIGNER (src);
   889         valigner = VEC_ALIGNER(src);
   891         vsrc = (vector unsigned char) vec_ld (0, src);
   890         vsrc = (vector unsigned char) vec_ld(0, src);
   892         width -= extrawidth;
   891         width -= extrawidth;
   893         while (width) {
   892         while (width) {
   894             vector unsigned char valpha;
   893             vector unsigned char valpha;
   895             vector unsigned char vsrc1, vsrc2;
   894             vector unsigned char vsrc1, vsrc2;
   896             vector unsigned char vdst1, vdst2;
   895             vector unsigned char vdst1, vdst2;
   897             vector unsigned short vR, vG, vB;
   896             vector unsigned short vR, vG, vB;
   898             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   897             vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
   899 
   898 
   900             /* Load 8 pixels from src as ARGB */
   899             /* Load 8 pixels from src as ARGB */
   901             voverflow = (vector unsigned char) vec_ld (15, src);
   900             voverflow = (vector unsigned char) vec_ld(15, src);
   902             vsrc = vec_perm (vsrc, voverflow, valigner);
   901             vsrc = vec_perm(vsrc, voverflow, valigner);
   903             vsrc1 = vec_perm (vsrc, vsrc, vpermute);
   902             vsrc1 = vec_perm(vsrc, vsrc, vpermute);
   904             src += 16;
   903             src += 16;
   905             vsrc = (vector unsigned char) vec_ld (15, src);
   904             vsrc = (vector unsigned char) vec_ld(15, src);
   906             voverflow = vec_perm (voverflow, vsrc, valigner);
   905             voverflow = vec_perm(voverflow, vsrc, valigner);
   907             vsrc2 = vec_perm (voverflow, voverflow, vpermute);
   906             vsrc2 = vec_perm(voverflow, voverflow, vpermute);
   908             src += 16;
   907             src += 16;
   909 
   908 
   910             /* Load 8 pixels from dst as XRGB */
   909             /* Load 8 pixels from dst as XRGB */
   911             voverflow = vec_ld (0, dst);
   910             voverflow = vec_ld(0, dst);
   912             vR = vec_and ((vector unsigned short) voverflow, vf800);
   911             vR = vec_and((vector unsigned short) voverflow, vf800);
   913             vB = vec_sl ((vector unsigned short) voverflow, v3_16);
   912             vB = vec_sl((vector unsigned short) voverflow, v3_16);
   914             vG = vec_sl (vB, v2_16);
   913             vG = vec_sl(vB, v2_16);
   915             vdst1 =
   914             vdst1 =
   916                 (vector unsigned char) vec_perm ((vector unsigned char) vR,
   915                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   917                                                  (vector unsigned char) vR,
   916                                                 (vector unsigned char) vR,
   918                                                  vredalpha1);
   917                                                 vredalpha1);
   919             vdst1 = vec_perm (vdst1, (vector unsigned char) vB, vblue1);
   918             vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
   920             vdst1 = vec_perm (vdst1, (vector unsigned char) vG, vgreen1);
   919             vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
   921             vdst2 =
   920             vdst2 =
   922                 (vector unsigned char) vec_perm ((vector unsigned char) vR,
   921                 (vector unsigned char) vec_perm((vector unsigned char) vR,
   923                                                  (vector unsigned char) vR,
   922                                                 (vector unsigned char) vR,
   924                                                  vredalpha2);
   923                                                 vredalpha2);
   925             vdst2 = vec_perm (vdst2, (vector unsigned char) vB, vblue2);
   924             vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
   926             vdst2 = vec_perm (vdst2, (vector unsigned char) vG, vgreen2);
   925             vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
   927 
   926 
   928             /* Alpha blend 8 pixels as ARGB */
   927             /* Alpha blend 8 pixels as ARGB */
   929             valpha = vec_perm (vsrc1, v0, valphaPermute);
   928             valpha = vec_perm(vsrc1, v0, valphaPermute);
   930             VEC_MULTIPLY_ALPHA (vsrc1, vdst1, valpha, mergePermute, v1_16,
   929             VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
   931                                 v8_16);
   930                                v8_16);
   932             valpha = vec_perm (vsrc2, v0, valphaPermute);
   931             valpha = vec_perm(vsrc2, v0, valphaPermute);
   933             VEC_MULTIPLY_ALPHA (vsrc2, vdst2, valpha, mergePermute, v1_16,
   932             VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
   934                                 v8_16);
   933                                v8_16);
   935 
   934 
   936             /* Convert 8 pixels to 565 */
   935             /* Convert 8 pixels to 565 */
   937             vpixel = (vector unsigned short) vec_packpx ((vector unsigned int)
   936             vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
   938                                                          vdst1,
   937                                                         vdst1,
   939                                                          (vector unsigned int)
   938                                                         (vector unsigned int)
   940                                                          vdst2);
   939                                                         vdst2);
   941             vgpixel =
   940             vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
   942                 (vector unsigned short) vec_perm (vdst1, vdst2, vgmerge);
   941             vgpixel = vec_and(vgpixel, vfc);
   943             vgpixel = vec_and (vgpixel, vfc);
   942             vgpixel = vec_sl(vgpixel, v3_16);
   944             vgpixel = vec_sl (vgpixel, v3_16);
   943             vrpixel = vec_sl(vpixel, v1_16);
   945             vrpixel = vec_sl (vpixel, v1_16);
   944             vrpixel = vec_and(vrpixel, vf800);
   946             vrpixel = vec_and (vrpixel, vf800);
   945             vbpixel = vec_and(vpixel, v3f);
   947             vbpixel = vec_and (vpixel, v3f);
       
   948             vdst1 =
   946             vdst1 =
   949                 vec_or ((vector unsigned char) vrpixel,
   947                 vec_or((vector unsigned char) vrpixel,
   950                         (vector unsigned char) vgpixel);
   948                        (vector unsigned char) vgpixel);
   951             vdst1 = vec_or (vdst1, (vector unsigned char) vbpixel);
   949             vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);
   952 
   950 
   953             /* Store 8 pixels */
   951             /* Store 8 pixels */
   954             vec_st (vdst1, 0, dst);
   952             vec_st(vdst1, 0, dst);
   955 
   953 
   956             width -= 8;
   954             width -= 8;
   957             dst += 16;
   955             dst += 16;
   958         }
   956         }
   959         ONE_PIXEL_BLEND ((extrawidth), extrawidth);
   957         ONE_PIXEL_BLEND((extrawidth), extrawidth);
   960 #undef ONE_PIXEL_BLEND
   958 #undef ONE_PIXEL_BLEND
   961         src += srcskip;
   959         src += srcskip;
   962         dst += dstskip;
   960         dst += dstskip;
   963     }
   961     }
   964 }
   962 }
   965 
   963 
   966 static void
   964 static void
   967 Blit32to32SurfaceAlphaKeyAltivec (SDL_BlitInfo * info)
   965 Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
   968 {
   966 {
   969     unsigned alpha = info->src->alpha;
   967     unsigned alpha = info->src->alpha;
   970     int height = info->d_height;
   968     int height = info->d_height;
   971     Uint32 *srcp = (Uint32 *) info->s_pixels;
   969     Uint32 *srcp = (Uint32 *) info->s_pixels;
   972     int srcskip = info->s_skip >> 2;
   970     int srcskip = info->s_skip >> 2;
   989     vector unsigned short v1;
   987     vector unsigned short v1;
   990     vector unsigned short v8;
   988     vector unsigned short v8;
   991     vector unsigned int vckey;
   989     vector unsigned int vckey;
   992     vector unsigned int vrgbmask;
   990     vector unsigned int vrgbmask;
   993 
   991 
   994     mergePermute = VEC_MERGE_PERMUTE ();
   992     mergePermute = VEC_MERGE_PERMUTE();
   995     v0 = vec_splat_u8 (0);
   993     v0 = vec_splat_u8(0);
   996     v1 = vec_splat_u16 (1);
   994     v1 = vec_splat_u16(1);
   997     v8 = vec_splat_u16 (8);
   995     v8 = vec_splat_u16(8);
   998 
   996 
   999     /* set the alpha to 255 on the destination surf */
   997     /* set the alpha to 255 on the destination surf */
  1000     valphamask = VEC_ALPHA_MASK ();
   998     valphamask = VEC_ALPHA_MASK();
  1001 
   999 
  1002     vsrcPermute = calc_swizzle32 (srcfmt, NULL);
  1000     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1003     vdstPermute = calc_swizzle32 (NULL, dstfmt);
  1001     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1004     vsdstPermute = calc_swizzle32 (dstfmt, NULL);
  1002     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1005 
  1003 
  1006     /* set a vector full of alpha and 255-alpha */
  1004     /* set a vector full of alpha and 255-alpha */
  1007     ((unsigned char *) &valpha)[0] = alpha;
  1005     ((unsigned char *) &valpha)[0] = alpha;
  1008     valpha = vec_splat (valpha, 0);
  1006     valpha = vec_splat(valpha, 0);
  1009     vbits = (vector unsigned char) vec_splat_s8 (-1);
  1007     vbits = (vector unsigned char) vec_splat_s8(-1);
  1010 
  1008 
  1011     ckey &= rgbmask;
  1009     ckey &= rgbmask;
  1012     ((unsigned int *) (char *) &vckey)[0] = ckey;
  1010     ((unsigned int *) (char *) &vckey)[0] = ckey;
  1013     vckey = vec_splat (vckey, 0);
  1011     vckey = vec_splat(vckey, 0);
  1014     ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
  1012     ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
  1015     vrgbmask = vec_splat (vrgbmask, 0);
  1013     vrgbmask = vec_splat(vrgbmask, 0);
  1016 
  1014 
  1017     while (height--) {
  1015     while (height--) {
  1018         int width = info->d_width;
  1016         int width = info->d_width;
  1019 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1017 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1020         while (condition) { \
  1018         while (condition) { \
  1029             } \
  1027             } \
  1030             dstp++; \
  1028             dstp++; \
  1031             srcp++; \
  1029             srcp++; \
  1032             widthvar--; \
  1030             widthvar--; \
  1033         }
  1031         }
  1034         ONE_PIXEL_BLEND ((UNALIGNED_PTR (dstp)) && (width), width);
  1032         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1035         if (width > 0) {
  1033         if (width > 0) {
  1036             int extrawidth = (width % 4);
  1034             int extrawidth = (width % 4);
  1037             vector unsigned char valigner = VEC_ALIGNER (srcp);
  1035             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1038             vector unsigned char vs = (vector unsigned char) vec_ld (0, srcp);
  1036             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1039             width -= extrawidth;
  1037             width -= extrawidth;
  1040             while (width) {
  1038             while (width) {
  1041                 vector unsigned char vsel;
  1039                 vector unsigned char vsel;
  1042                 vector unsigned char voverflow;
  1040                 vector unsigned char voverflow;
  1043                 vector unsigned char vd;
  1041                 vector unsigned char vd;
  1044                 vector unsigned char vd_orig;
  1042                 vector unsigned char vd_orig;
  1045 
  1043 
  1046                 /* s = *srcp */
  1044                 /* s = *srcp */
  1047                 voverflow = (vector unsigned char) vec_ld (15, srcp);
  1045                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1048                 vs = vec_perm (vs, voverflow, valigner);
  1046                 vs = vec_perm(vs, voverflow, valigner);
  1049 
  1047 
  1050                 /* vsel is set for items that match the key */
  1048                 /* vsel is set for items that match the key */
  1051                 vsel =
  1049                 vsel =
  1052                     (vector unsigned char) vec_and ((vector unsigned int) vs,
  1050                     (vector unsigned char) vec_and((vector unsigned int) vs,
  1053                                                     vrgbmask);
  1051                                                    vrgbmask);
  1054                 vsel = (vector unsigned char) vec_cmpeq ((vector unsigned int)
  1052                 vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
  1055                                                          vsel, vckey);
  1053                                                         vsel, vckey);
  1056 
  1054 
  1057                 /* permute to source format */
  1055                 /* permute to source format */
  1058                 vs = vec_perm (vs, valpha, vsrcPermute);
  1056                 vs = vec_perm(vs, valpha, vsrcPermute);
  1059 
  1057 
  1060                 /* d = *dstp */
  1058                 /* d = *dstp */
  1061                 vd = (vector unsigned char) vec_ld (0, dstp);
  1059                 vd = (vector unsigned char) vec_ld(0, dstp);
  1062                 vd_orig = vd = vec_perm (vd, v0, vsdstPermute);
  1060                 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
  1063 
  1061 
  1064                 VEC_MULTIPLY_ALPHA (vs, vd, valpha, mergePermute, v1, v8);
  1062                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1065 
  1063 
  1066                 /* set the alpha channel to full on */
  1064                 /* set the alpha channel to full on */
  1067                 vd = vec_or (vd, valphamask);
  1065                 vd = vec_or(vd, valphamask);
  1068 
  1066 
  1069                 /* mask out color key */
  1067                 /* mask out color key */
  1070                 vd = vec_sel (vd, vd_orig, vsel);
  1068                 vd = vec_sel(vd, vd_orig, vsel);
  1071 
  1069 
  1072                 /* permute to dest format */
  1070                 /* permute to dest format */
  1073                 vd = vec_perm (vd, vbits, vdstPermute);
  1071                 vd = vec_perm(vd, vbits, vdstPermute);
  1074 
  1072 
  1075                 /* *dstp = res */
  1073                 /* *dstp = res */
  1076                 vec_st ((vector unsigned int) vd, 0, dstp);
  1074                 vec_st((vector unsigned int) vd, 0, dstp);
  1077 
  1075 
  1078                 srcp += 4;
  1076                 srcp += 4;
  1079                 dstp += 4;
  1077                 dstp += 4;
  1080                 width -= 4;
  1078                 width -= 4;
  1081                 vs = voverflow;
  1079                 vs = voverflow;
  1082             }
  1080             }
  1083             ONE_PIXEL_BLEND ((extrawidth), extrawidth);
  1081             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1084         }
  1082         }
  1085 #undef ONE_PIXEL_BLEND
  1083 #undef ONE_PIXEL_BLEND
  1086 
  1084 
  1087         srcp += srcskip;
  1085         srcp += srcskip;
  1088         dstp += dstskip;
  1086         dstp += dstskip;
  1089     }
  1087     }
  1090 }
  1088 }
  1091 
  1089 
  1092 
  1090 
  1093 static void
  1091 static void
  1094 Blit32to32PixelAlphaAltivec (SDL_BlitInfo * info)
  1092 Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)
  1095 {
  1093 {
  1096     int width = info->d_width;
  1094     int width = info->d_width;
  1097     int height = info->d_height;
  1095     int height = info->d_height;
  1098     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1096     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1099     int srcskip = info->s_skip >> 2;
  1097     int srcskip = info->s_skip >> 2;
  1110     vector unsigned char vpixelmask;
  1108     vector unsigned char vpixelmask;
  1111     vector unsigned char v0;
  1109     vector unsigned char v0;
  1112     vector unsigned short v1;
  1110     vector unsigned short v1;
  1113     vector unsigned short v8;
  1111     vector unsigned short v8;
  1114 
  1112 
  1115     v0 = vec_splat_u8 (0);
  1113     v0 = vec_splat_u8(0);
  1116     v1 = vec_splat_u16 (1);
  1114     v1 = vec_splat_u16(1);
  1117     v8 = vec_splat_u16 (8);
  1115     v8 = vec_splat_u16(8);
  1118     mergePermute = VEC_MERGE_PERMUTE ();
  1116     mergePermute = VEC_MERGE_PERMUTE();
  1119     valphamask = VEC_ALPHA_MASK ();
  1117     valphamask = VEC_ALPHA_MASK();
  1120     valphaPermute = vec_and (vec_lvsl (0, (int *) NULL), vec_splat_u8 (0xC));
  1118     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  1121     vpixelmask = vec_nor (valphamask, v0);
  1119     vpixelmask = vec_nor(valphamask, v0);
  1122     vsrcPermute = calc_swizzle32 (srcfmt, NULL);
  1120     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1123     vdstPermute = calc_swizzle32 (NULL, dstfmt);
  1121     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1124     vsdstPermute = calc_swizzle32 (dstfmt, NULL);
  1122     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1125 
  1123 
  1126     while (height--) {
  1124     while (height--) {
  1127         width = info->d_width;
  1125         width = info->d_width;
  1128 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1126 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1129             Uint32 Pixel; \
  1127             Uint32 Pixel; \
  1136             } \
  1134             } \
  1137             ++srcp; \
  1135             ++srcp; \
  1138             ++dstp; \
  1136             ++dstp; \
  1139             widthvar--; \
  1137             widthvar--; \
  1140         }
  1138         }
  1141         ONE_PIXEL_BLEND ((UNALIGNED_PTR (dstp)) && (width), width);
  1139         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1142         if (width > 0) {
  1140         if (width > 0) {
  1143             /* vsrcPermute */
  1141             /* vsrcPermute */
  1144             /* vdstPermute */
  1142             /* vdstPermute */
  1145             int extrawidth = (width % 4);
  1143             int extrawidth = (width % 4);
  1146             vector unsigned char valigner = VEC_ALIGNER (srcp);
  1144             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1147             vector unsigned char vs = (vector unsigned char) vec_ld (0, srcp);
  1145             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1148             width -= extrawidth;
  1146             width -= extrawidth;
  1149             while (width) {
  1147             while (width) {
  1150                 vector unsigned char voverflow;
  1148                 vector unsigned char voverflow;
  1151                 vector unsigned char vd;
  1149                 vector unsigned char vd;
  1152                 vector unsigned char valpha;
  1150                 vector unsigned char valpha;
  1153                 vector unsigned char vdstalpha;
  1151                 vector unsigned char vdstalpha;
  1154                 /* s = *srcp */
  1152                 /* s = *srcp */
  1155                 voverflow = (vector unsigned char) vec_ld (15, srcp);
  1153                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1156                 vs = vec_perm (vs, voverflow, valigner);
  1154                 vs = vec_perm(vs, voverflow, valigner);
  1157                 vs = vec_perm (vs, v0, vsrcPermute);
  1155                 vs = vec_perm(vs, v0, vsrcPermute);
  1158 
  1156 
  1159                 valpha = vec_perm (vs, v0, valphaPermute);
  1157                 valpha = vec_perm(vs, v0, valphaPermute);
  1160 
  1158 
  1161                 /* d = *dstp */
  1159                 /* d = *dstp */
  1162                 vd = (vector unsigned char) vec_ld (0, dstp);
  1160                 vd = (vector unsigned char) vec_ld(0, dstp);
  1163                 vd = vec_perm (vd, v0, vsdstPermute);
  1161                 vd = vec_perm(vd, v0, vsdstPermute);
  1164                 vdstalpha = vec_and (vd, valphamask);
  1162                 vdstalpha = vec_and(vd, valphamask);
  1165 
  1163 
  1166                 VEC_MULTIPLY_ALPHA (vs, vd, valpha, mergePermute, v1, v8);
  1164                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1167 
  1165 
  1168                 /* set the alpha to the dest alpha */
  1166                 /* set the alpha to the dest alpha */
  1169                 vd = vec_and (vd, vpixelmask);
  1167                 vd = vec_and(vd, vpixelmask);
  1170                 vd = vec_or (vd, vdstalpha);
  1168                 vd = vec_or(vd, vdstalpha);
  1171                 vd = vec_perm (vd, v0, vdstPermute);
  1169                 vd = vec_perm(vd, v0, vdstPermute);
  1172 
  1170 
  1173                 /* *dstp = res */
  1171                 /* *dstp = res */
  1174                 vec_st ((vector unsigned int) vd, 0, dstp);
  1172                 vec_st((vector unsigned int) vd, 0, dstp);
  1175 
  1173 
  1176                 srcp += 4;
  1174                 srcp += 4;
  1177                 dstp += 4;
  1175                 dstp += 4;
  1178                 width -= 4;
  1176                 width -= 4;
  1179                 vs = voverflow;
  1177                 vs = voverflow;
  1180 
  1178 
  1181             }
  1179             }
  1182             ONE_PIXEL_BLEND ((extrawidth), extrawidth);
  1180             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1183         }
  1181         }
  1184         srcp += srcskip;
  1182         srcp += srcskip;
  1185         dstp += dstskip;
  1183         dstp += dstskip;
  1186 #undef ONE_PIXEL_BLEND
  1184 #undef ONE_PIXEL_BLEND
  1187     }
  1185     }
  1188 }
  1186 }
  1189 
  1187 
  1190 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1188 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1191 static void
  1189 static void
  1192 BlitRGBtoRGBPixelAlphaAltivec (SDL_BlitInfo * info)
  1190 BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
  1193 {
  1191 {
  1194     int width = info->d_width;
  1192     int width = info->d_width;
  1195     int height = info->d_height;
  1193     int height = info->d_height;
  1196     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1194     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1197     int srcskip = info->s_skip >> 2;
  1195     int srcskip = info->s_skip >> 2;
  1202     vector unsigned char valphamask;
  1200     vector unsigned char valphamask;
  1203     vector unsigned char vpixelmask;
  1201     vector unsigned char vpixelmask;
  1204     vector unsigned char v0;
  1202     vector unsigned char v0;
  1205     vector unsigned short v1;
  1203     vector unsigned short v1;
  1206     vector unsigned short v8;
  1204     vector unsigned short v8;
  1207     v0 = vec_splat_u8 (0);
  1205     v0 = vec_splat_u8(0);
  1208     v1 = vec_splat_u16 (1);
  1206     v1 = vec_splat_u16(1);
  1209     v8 = vec_splat_u16 (8);
  1207     v8 = vec_splat_u16(8);
  1210     mergePermute = VEC_MERGE_PERMUTE ();
  1208     mergePermute = VEC_MERGE_PERMUTE();
  1211     valphamask = VEC_ALPHA_MASK ();
  1209     valphamask = VEC_ALPHA_MASK();
  1212     valphaPermute = vec_and (vec_lvsl (0, (int *) NULL), vec_splat_u8 (0xC));
  1210     valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
  1213 
  1211 
  1214 
  1212 
  1215     vpixelmask = vec_nor (valphamask, v0);
  1213     vpixelmask = vec_nor(valphamask, v0);
  1216     while (height--) {
  1214     while (height--) {
  1217         width = info->d_width;
  1215         width = info->d_width;
  1218 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1216 #define ONE_PIXEL_BLEND(condition, widthvar) \
  1219         while ((condition)) { \
  1217         while ((condition)) { \
  1220             Uint32 dalpha; \
  1218             Uint32 dalpha; \
  1240             } \
  1238             } \
  1241             ++srcp; \
  1239             ++srcp; \
  1242             ++dstp; \
  1240             ++dstp; \
  1243             widthvar--; \
  1241             widthvar--; \
  1244 	    }
  1242 	    }
  1245         ONE_PIXEL_BLEND ((UNALIGNED_PTR (dstp)) && (width), width);
  1243         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1246         if (width > 0) {
  1244         if (width > 0) {
  1247             int extrawidth = (width % 4);
  1245             int extrawidth = (width % 4);
  1248             vector unsigned char valigner = VEC_ALIGNER (srcp);
  1246             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1249             vector unsigned char vs = (vector unsigned char) vec_ld (0, srcp);
  1247             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1250             width -= extrawidth;
  1248             width -= extrawidth;
  1251             while (width) {
  1249             while (width) {
  1252                 vector unsigned char voverflow;
  1250                 vector unsigned char voverflow;
  1253                 vector unsigned char vd;
  1251                 vector unsigned char vd;
  1254                 vector unsigned char valpha;
  1252                 vector unsigned char valpha;
  1255                 vector unsigned char vdstalpha;
  1253                 vector unsigned char vdstalpha;
  1256                 /* s = *srcp */
  1254                 /* s = *srcp */
  1257                 voverflow = (vector unsigned char) vec_ld (15, srcp);
  1255                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1258                 vs = vec_perm (vs, voverflow, valigner);
  1256                 vs = vec_perm(vs, voverflow, valigner);
  1259 
  1257 
  1260                 valpha = vec_perm (vs, v0, valphaPermute);
  1258                 valpha = vec_perm(vs, v0, valphaPermute);
  1261 
  1259 
  1262                 /* d = *dstp */
  1260                 /* d = *dstp */
  1263                 vd = (vector unsigned char) vec_ld (0, dstp);
  1261                 vd = (vector unsigned char) vec_ld(0, dstp);
  1264                 vdstalpha = vec_and (vd, valphamask);
  1262                 vdstalpha = vec_and(vd, valphamask);
  1265 
  1263 
  1266                 VEC_MULTIPLY_ALPHA (vs, vd, valpha, mergePermute, v1, v8);
  1264                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1267 
  1265 
  1268                 /* set the alpha to the dest alpha */
  1266                 /* set the alpha to the dest alpha */
  1269                 vd = vec_and (vd, vpixelmask);
  1267                 vd = vec_and(vd, vpixelmask);
  1270                 vd = vec_or (vd, vdstalpha);
  1268                 vd = vec_or(vd, vdstalpha);
  1271 
  1269 
  1272                 /* *dstp = res */
  1270                 /* *dstp = res */
  1273                 vec_st ((vector unsigned int) vd, 0, dstp);
  1271                 vec_st((vector unsigned int) vd, 0, dstp);
  1274 
  1272 
  1275                 srcp += 4;
  1273                 srcp += 4;
  1276                 dstp += 4;
  1274                 dstp += 4;
  1277                 width -= 4;
  1275                 width -= 4;
  1278                 vs = voverflow;
  1276                 vs = voverflow;
  1279             }
  1277             }
  1280             ONE_PIXEL_BLEND ((extrawidth), extrawidth);
  1278             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1281         }
  1279         }
  1282         srcp += srcskip;
  1280         srcp += srcskip;
  1283         dstp += dstskip;
  1281         dstp += dstskip;
  1284     }
  1282     }
  1285 #undef ONE_PIXEL_BLEND
  1283 #undef ONE_PIXEL_BLEND
  1286 }
  1284 }
  1287 
  1285 
  1288 static void
  1286 static void
  1289 Blit32to32SurfaceAlphaAltivec (SDL_BlitInfo * info)
  1287 Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info)
  1290 {
  1288 {
  1291     /* XXX : 6 */
  1289     /* XXX : 6 */
  1292     unsigned alpha = info->src->alpha;
  1290     unsigned alpha = info->src->alpha;
  1293     int height = info->d_height;
  1291     int height = info->d_height;
  1294     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1292     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1307     vector unsigned char valphamask;
  1305     vector unsigned char valphamask;
  1308     vector unsigned char vbits;
  1306     vector unsigned char vbits;
  1309     vector unsigned short v1;
  1307     vector unsigned short v1;
  1310     vector unsigned short v8;
  1308     vector unsigned short v8;
  1311 
  1309 
  1312     mergePermute = VEC_MERGE_PERMUTE ();
  1310     mergePermute = VEC_MERGE_PERMUTE();
  1313     v1 = vec_splat_u16 (1);
  1311     v1 = vec_splat_u16(1);
  1314     v8 = vec_splat_u16 (8);
  1312     v8 = vec_splat_u16(8);
  1315 
  1313 
  1316     /* set the alpha to 255 on the destination surf */
  1314     /* set the alpha to 255 on the destination surf */
  1317     valphamask = VEC_ALPHA_MASK ();
  1315     valphamask = VEC_ALPHA_MASK();
  1318 
  1316 
  1319     vsrcPermute = calc_swizzle32 (srcfmt, NULL);
  1317     vsrcPermute = calc_swizzle32(srcfmt, NULL);
  1320     vdstPermute = calc_swizzle32 (NULL, dstfmt);
  1318     vdstPermute = calc_swizzle32(NULL, dstfmt);
  1321     vsdstPermute = calc_swizzle32 (dstfmt, NULL);
  1319     vsdstPermute = calc_swizzle32(dstfmt, NULL);
  1322 
  1320 
  1323     /* set a vector full of alpha and 255-alpha */
  1321     /* set a vector full of alpha and 255-alpha */
  1324     ((unsigned char *) &valpha)[0] = alpha;
  1322     ((unsigned char *) &valpha)[0] = alpha;
  1325     valpha = vec_splat (valpha, 0);
  1323     valpha = vec_splat(valpha, 0);
  1326     vbits = (vector unsigned char) vec_splat_s8 (-1);
  1324     vbits = (vector unsigned char) vec_splat_s8(-1);
  1327 
  1325 
  1328     while (height--) {
  1326     while (height--) {
  1329         int width = info->d_width;
  1327         int width = info->d_width;
  1330 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1328 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1331             Uint32 Pixel; \
  1329             Uint32 Pixel; \
  1336             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1334             ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
  1337             ++srcp; \
  1335             ++srcp; \
  1338             ++dstp; \
  1336             ++dstp; \
  1339             widthvar--; \
  1337             widthvar--; \
  1340         }
  1338         }
  1341         ONE_PIXEL_BLEND ((UNALIGNED_PTR (dstp)) && (width), width);
  1339         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1342         if (width > 0) {
  1340         if (width > 0) {
  1343             int extrawidth = (width % 4);
  1341             int extrawidth = (width % 4);
  1344             vector unsigned char valigner = vec_lvsl (0, srcp);
  1342             vector unsigned char valigner = vec_lvsl(0, srcp);
  1345             vector unsigned char vs = (vector unsigned char) vec_ld (0, srcp);
  1343             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1346             width -= extrawidth;
  1344             width -= extrawidth;
  1347             while (width) {
  1345             while (width) {
  1348                 vector unsigned char voverflow;
  1346                 vector unsigned char voverflow;
  1349                 vector unsigned char vd;
  1347                 vector unsigned char vd;
  1350 
  1348 
  1351                 /* s = *srcp */
  1349                 /* s = *srcp */
  1352                 voverflow = (vector unsigned char) vec_ld (15, srcp);
  1350                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1353                 vs = vec_perm (vs, voverflow, valigner);
  1351                 vs = vec_perm(vs, voverflow, valigner);
  1354                 vs = vec_perm (vs, valpha, vsrcPermute);
  1352                 vs = vec_perm(vs, valpha, vsrcPermute);
  1355 
  1353 
  1356                 /* d = *dstp */
  1354                 /* d = *dstp */
  1357                 vd = (vector unsigned char) vec_ld (0, dstp);
  1355                 vd = (vector unsigned char) vec_ld(0, dstp);
  1358                 vd = vec_perm (vd, vd, vsdstPermute);
  1356                 vd = vec_perm(vd, vd, vsdstPermute);
  1359 
  1357 
  1360                 VEC_MULTIPLY_ALPHA (vs, vd, valpha, mergePermute, v1, v8);
  1358                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1361 
  1359 
  1362                 /* set the alpha channel to full on */
  1360                 /* set the alpha channel to full on */
  1363                 vd = vec_or (vd, valphamask);
  1361                 vd = vec_or(vd, valphamask);
  1364                 vd = vec_perm (vd, vbits, vdstPermute);
  1362                 vd = vec_perm(vd, vbits, vdstPermute);
  1365 
  1363 
  1366                 /* *dstp = res */
  1364                 /* *dstp = res */
  1367                 vec_st ((vector unsigned int) vd, 0, dstp);
  1365                 vec_st((vector unsigned int) vd, 0, dstp);
  1368 
  1366 
  1369                 srcp += 4;
  1367                 srcp += 4;
  1370                 dstp += 4;
  1368                 dstp += 4;
  1371                 width -= 4;
  1369                 width -= 4;
  1372                 vs = voverflow;
  1370                 vs = voverflow;
  1373             }
  1371             }
  1374             ONE_PIXEL_BLEND ((extrawidth), extrawidth);
  1372             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1375         }
  1373         }
  1376 #undef ONE_PIXEL_BLEND
  1374 #undef ONE_PIXEL_BLEND
  1377 
  1375 
  1378         srcp += srcskip;
  1376         srcp += srcskip;
  1379         dstp += dstskip;
  1377         dstp += dstskip;
  1382 }
  1380 }
  1383 
  1381 
  1384 
  1382 
  1385 /* fast RGB888->(A)RGB888 blending */
  1383 /* fast RGB888->(A)RGB888 blending */
  1386 static void
  1384 static void
  1387 BlitRGBtoRGBSurfaceAlphaAltivec (SDL_BlitInfo * info)
  1385 BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
  1388 {
  1386 {
  1389     unsigned alpha = info->src->alpha;
  1387     unsigned alpha = info->src->alpha;
  1390     int height = info->d_height;
  1388     int height = info->d_height;
  1391     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1389     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1392     int srcskip = info->s_skip >> 2;
  1390     int srcskip = info->s_skip >> 2;
  1396     vector unsigned char valpha;
  1394     vector unsigned char valpha;
  1397     vector unsigned char valphamask;
  1395     vector unsigned char valphamask;
  1398     vector unsigned short v1;
  1396     vector unsigned short v1;
  1399     vector unsigned short v8;
  1397     vector unsigned short v8;
  1400 
  1398 
  1401     mergePermute = VEC_MERGE_PERMUTE ();
  1399     mergePermute = VEC_MERGE_PERMUTE();
  1402     v1 = vec_splat_u16 (1);
  1400     v1 = vec_splat_u16(1);
  1403     v8 = vec_splat_u16 (8);
  1401     v8 = vec_splat_u16(8);
  1404 
  1402 
  1405     /* set the alpha to 255 on the destination surf */
  1403     /* set the alpha to 255 on the destination surf */
  1406     valphamask = VEC_ALPHA_MASK ();
  1404     valphamask = VEC_ALPHA_MASK();
  1407 
  1405 
  1408     /* set a vector full of alpha and 255-alpha */
  1406     /* set a vector full of alpha and 255-alpha */
  1409     ((unsigned char *) &valpha)[0] = alpha;
  1407     ((unsigned char *) &valpha)[0] = alpha;
  1410     valpha = vec_splat (valpha, 0);
  1408     valpha = vec_splat(valpha, 0);
  1411 
  1409 
  1412     while (height--) {
  1410     while (height--) {
  1413         int width = info->d_width;
  1411         int width = info->d_width;
  1414 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1412 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
  1415             Uint32 s = *srcp; \
  1413             Uint32 s = *srcp; \
  1424             *dstp = d1 | d | 0xff000000; \
  1422             *dstp = d1 | d | 0xff000000; \
  1425             ++srcp; \
  1423             ++srcp; \
  1426             ++dstp; \
  1424             ++dstp; \
  1427             widthvar--; \
  1425             widthvar--; \
  1428         }
  1426         }
  1429         ONE_PIXEL_BLEND ((UNALIGNED_PTR (dstp)) && (width), width);
  1427         ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
  1430         if (width > 0) {
  1428         if (width > 0) {
  1431             int extrawidth = (width % 4);
  1429             int extrawidth = (width % 4);
  1432             vector unsigned char valigner = VEC_ALIGNER (srcp);
  1430             vector unsigned char valigner = VEC_ALIGNER(srcp);
  1433             vector unsigned char vs = (vector unsigned char) vec_ld (0, srcp);
  1431             vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
  1434             width -= extrawidth;
  1432             width -= extrawidth;
  1435             while (width) {
  1433             while (width) {
  1436                 vector unsigned char voverflow;
  1434                 vector unsigned char voverflow;
  1437                 vector unsigned char vd;
  1435                 vector unsigned char vd;
  1438 
  1436 
  1439                 /* s = *srcp */
  1437                 /* s = *srcp */
  1440                 voverflow = (vector unsigned char) vec_ld (15, srcp);
  1438                 voverflow = (vector unsigned char) vec_ld(15, srcp);
  1441                 vs = vec_perm (vs, voverflow, valigner);
  1439                 vs = vec_perm(vs, voverflow, valigner);
  1442 
  1440 
  1443                 /* d = *dstp */
  1441                 /* d = *dstp */
  1444                 vd = (vector unsigned char) vec_ld (0, dstp);
  1442                 vd = (vector unsigned char) vec_ld(0, dstp);
  1445 
  1443 
  1446                 VEC_MULTIPLY_ALPHA (vs, vd, valpha, mergePermute, v1, v8);
  1444                 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
  1447 
  1445 
  1448                 /* set the alpha channel to full on */
  1446                 /* set the alpha channel to full on */
  1449                 vd = vec_or (vd, valphamask);
  1447                 vd = vec_or(vd, valphamask);
  1450 
  1448 
  1451                 /* *dstp = res */
  1449                 /* *dstp = res */
  1452                 vec_st ((vector unsigned int) vd, 0, dstp);
  1450                 vec_st((vector unsigned int) vd, 0, dstp);
  1453 
  1451 
  1454                 srcp += 4;
  1452                 srcp += 4;
  1455                 dstp += 4;
  1453                 dstp += 4;
  1456                 width -= 4;
  1454                 width -= 4;
  1457                 vs = voverflow;
  1455                 vs = voverflow;
  1458             }
  1456             }
  1459             ONE_PIXEL_BLEND ((extrawidth), extrawidth);
  1457             ONE_PIXEL_BLEND((extrawidth), extrawidth);
  1460         }
  1458         }
  1461 #undef ONE_PIXEL_BLEND
  1459 #undef ONE_PIXEL_BLEND
  1462 
  1460 
  1463         srcp += srcskip;
  1461         srcp += srcskip;
  1464         dstp += dstskip;
  1462         dstp += dstskip;
  1470 #endif
  1468 #endif
  1471 #endif /* SDL_ALTIVEC_BLITTERS */
  1469 #endif /* SDL_ALTIVEC_BLITTERS */
  1472 
  1470 
  1473 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1471 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
  1474 static void
  1472 static void
  1475 BlitRGBtoRGBSurfaceAlpha128 (SDL_BlitInfo * info)
  1473 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
  1476 {
  1474 {
  1477     int width = info->d_width;
  1475     int width = info->d_width;
  1478     int height = info->d_height;
  1476     int height = info->d_height;
  1479     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1477     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1480     int srcskip = info->s_skip >> 2;
  1478     int srcskip = info->s_skip >> 2;
  1495     }
  1493     }
  1496 }
  1494 }
  1497 
  1495 
  1498 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1496 /* fast RGB888->(A)RGB888 blending with surface alpha */
  1499 static void
  1497 static void
  1500 BlitRGBtoRGBSurfaceAlpha (SDL_BlitInfo * info)
  1498 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
  1501 {
  1499 {
  1502     unsigned alpha = info->src->alpha;
  1500     unsigned alpha = info->src->alpha;
  1503     if (alpha == 128) {
  1501     if (alpha == 128) {
  1504         BlitRGBtoRGBSurfaceAlpha128 (info);
  1502         BlitRGBtoRGBSurfaceAlpha128(info);
  1505     } else {
  1503     } else {
  1506         int width = info->d_width;
  1504         int width = info->d_width;
  1507         int height = info->d_height;
  1505         int height = info->d_height;
  1508         Uint32 *srcp = (Uint32 *) info->s_pixels;
  1506         Uint32 *srcp = (Uint32 *) info->s_pixels;
  1509         int srcskip = info->s_skip >> 2;
  1507         int srcskip = info->s_skip >> 2;
  1567     }
  1565     }
  1568 }
  1566 }
  1569 
  1567 
  1570 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1568 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
  1571 static void
  1569 static void
  1572 BlitRGBtoRGBPixelAlpha (SDL_BlitInfo * info)
  1570 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
  1573 {
  1571 {
  1574     int width = info->d_width;
  1572     int width = info->d_width;
  1575     int height = info->d_height;
  1573     int height = info->d_height;
  1576     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1574     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1577     int srcskip = info->s_skip >> 2;
  1575     int srcskip = info->s_skip >> 2;
  1620 }
  1618 }
  1621 
  1619 
  1622 #if GCC_ASMBLIT
  1620 #if GCC_ASMBLIT
  1623 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1621 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1624 inline static void
  1622 inline static void
  1625 BlitRGBtoRGBPixelAlphaMMX3DNOW (SDL_BlitInfo * info)
  1623 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1626 {
  1624 {
  1627     int width = info->d_width;
  1625     int width = info->d_width;
  1628     int height = info->d_height;
  1626     int height = info->d_height;
  1629     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1627     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1630     int srcskip = info->s_skip >> 2;
  1628     int srcskip = info->s_skip >> 2;
  1631     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1629     Uint32 *dstp = (Uint32 *) info->d_pixels;
  1632     int dstskip = info->d_skip >> 2;
  1630     int dstskip = info->d_skip >> 2;
  1633     SDL_PixelFormat *sf = info->src;
  1631     SDL_PixelFormat *sf = info->src;
  1634     Uint32 amask = sf->Amask;
  1632     Uint32 amask = sf->Amask;
  1635 
  1633 
  1636     __asm__ (
  1634     __asm__(
  1637                 /* make mm6 all zeros. */
  1635                /* make mm6 all zeros. */
  1638                 "pxor       %%mm6, %%mm6\n"
  1636                "pxor       %%mm6, %%mm6\n"
  1639                 /* Make a mask to preserve the alpha. */
  1637                /* Make a mask to preserve the alpha. */
  1640                 "movd      %0, %%mm7\n\t"       /* 0000F000 -> mm7 */
  1638                "movd      %0, %%mm7\n\t"        /* 0000F000 -> mm7 */
  1641                 "punpcklbw %%mm7, %%mm7\n\t"    /* FF000000 -> mm7 */
  1639                "punpcklbw %%mm7, %%mm7\n\t"     /* FF000000 -> mm7 */
  1642                 "pcmpeqb   %%mm4, %%mm4\n\t"    /* FFFFFFFF -> mm4 */
  1640                "pcmpeqb   %%mm4, %%mm4\n\t"     /* FFFFFFFF -> mm4 */
  1643                 "movq      %%mm4, %%mm3\n\t"    /* FFFFFFFF -> mm3 (for later) */
  1641                "movq      %%mm4, %%mm3\n\t"     /* FFFFFFFF -> mm3 (for later) */
  1644                 "pxor      %%mm4, %%mm7\n\t"    /* 00FFFFFF -> mm7 (mult mask) */
  1642                "pxor      %%mm4, %%mm7\n\t"     /* 00FFFFFF -> mm7 (mult mask) */
  1645                 /* form channel masks */
  1643                /* form channel masks */
  1646                 "movq      %%mm7, %%mm4\n\t"    /* 00FFFFFF -> mm4 */
  1644                "movq      %%mm7, %%mm4\n\t"     /* 00FFFFFF -> mm4 */
  1647                 "packsswb  %%mm6, %%mm4\n\t"    /* 00000FFF -> mm4 (channel mask) */
  1645                "packsswb  %%mm6, %%mm4\n\t"     /* 00000FFF -> mm4 (channel mask) */
  1648                 "packsswb  %%mm6, %%mm3\n\t"    /* 0000FFFF -> mm3 */
  1646                "packsswb  %%mm6, %%mm3\n\t"     /* 0000FFFF -> mm3 */
  1649                 "pxor      %%mm4, %%mm3\n\t"    /* 0000F000 -> mm3 (~channel mask) */
  1647                "pxor      %%mm4, %%mm3\n\t"     /* 0000F000 -> mm3 (~channel mask) */
  1650                 /* get alpha channel shift */
  1648                /* get alpha channel shift */
  1651                 "movd      %1, %%mm5\n\t"       /* Ashift -> mm5 */
  1649                "movd      %1, %%mm5\n\t"        /* Ashift -> mm5 */
  1652   : /* nothing */ :            "m" (sf->Amask), "m" (sf->Ashift));
  1650   : /* nothing */ :            "m"(sf->Amask), "m"(sf->Ashift));
  1653 
  1651 
  1654     while (height--) {
  1652     while (height--) {
  1655 
  1653 
  1656 	    /* *INDENT-OFF* */
  1654 	    /* *INDENT-OFF* */
  1657 	    DUFFS_LOOP4({
  1655 	    DUFFS_LOOP4({
  1728 	    /* *INDENT-ON* */
  1726 	    /* *INDENT-ON* */
  1729         srcp += srcskip;
  1727         srcp += srcskip;
  1730         dstp += dstskip;
  1728         dstp += dstskip;
  1731     }
  1729     }
  1732 
  1730 
  1733   __asm__ ("emms\n":);
  1731   __asm__("emms\n":);
  1734 }
  1732 }
  1735 
  1733 
  1736 /* End GCC_ASMBLIT*/
  1734 /* End GCC_ASMBLIT*/
  1737 
  1735 
  1738 #elif MSVC_ASMBLIT
  1736 #elif MSVC_ASMBLIT
  1739 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1737 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
  1740 static void
  1738 static void
  1741 BlitRGBtoRGBPixelAlphaMMX3DNOW (SDL_BlitInfo * info)
  1739 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
  1742 {
  1740 {
  1743     int width = info->d_width;
  1741     int width = info->d_width;
  1744     int height = info->d_height;
  1742     int height = info->d_height;
  1745     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1743     Uint32 *srcp = (Uint32 *) info->s_pixels;
  1746     int srcskip = info->s_skip >> 2;
  1744     int srcskip = info->s_skip >> 2;
  1752     Uint32 ashift = sf->Ashift;
  1750     Uint32 ashift = sf->Ashift;
  1753     Uint64 multmask;
  1751     Uint64 multmask;
  1754 
  1752 
  1755     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1753     __m64 src1, dst1, mm_alpha, mm_zero, dmask;
  1756 
  1754 
  1757     mm_zero = _mm_setzero_si64 ();      /* 0 -> mm_zero */
  1755     mm_zero = _mm_setzero_si64();       /* 0 -> mm_zero */
  1758     multmask = ~(0xFFFFi 64 << (ashift * 2));
  1756     multmask = ~(0xFFFFi 64 << (ashift * 2));
  1759     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
  1757     dmask = *(__m64 *) & multmask;      /* dst alpha mask -> dmask */
  1760 
  1758 
  1761     while (height--) {
  1759     while (height--) {
  1762 	    /* *INDENT-OFF* */
  1760 	    /* *INDENT-OFF* */
  1799 	    }, width);
  1797 	    }, width);
  1800 	    /* *INDENT-ON* */
  1798 	    /* *INDENT-ON* */
  1801         srcp += srcskip;
  1799         srcp += srcskip;
  1802         dstp += dstskip;
  1800         dstp += dstskip;
  1803     }
  1801     }
  1804     _mm_empty ();
  1802     _mm_empty();
  1805 }
  1803 }
  1806 
  1804 
  1807 /* End MSVC_ASMBLIT */
  1805 /* End MSVC_ASMBLIT */
  1808 
  1806 
  1809 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  1807 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  1818 #define BLEND2x16_50(d, s, mask)					     \
  1816 #define BLEND2x16_50(d, s, mask)					     \
  1819 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1817 	(((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) \
  1820 	 + (s & d & (~(mask | mask << 16))))
  1818 	 + (s & d & (~(mask | mask << 16))))
  1821 
  1819 
  1822 static void
  1820 static void
  1823 Blit16to16SurfaceAlpha128 (SDL_BlitInfo * info, Uint16 mask)
  1821 Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
  1824 {
  1822 {
  1825     int width = info->d_width;
  1823     int width = info->d_width;
  1826     int height = info->d_height;
  1824     int height = info->d_height;
  1827     Uint16 *srcp = (Uint16 *) info->s_pixels;
  1825     Uint16 *srcp = (Uint16 *) info->s_pixels;
  1828     int srcskip = info->s_skip >> 1;
  1826     int srcskip = info->s_skip >> 1;
  1840             int w = width;
  1838             int w = width;
  1841 
  1839 
  1842             /* handle odd destination */
  1840             /* handle odd destination */
  1843             if ((uintptr_t) dstp & 2) {
  1841             if ((uintptr_t) dstp & 2) {
  1844                 Uint16 d = *dstp, s = *srcp;
  1842                 Uint16 d = *dstp, s = *srcp;
  1845                 *dstp = BLEND16_50 (d, s, mask);
  1843                 *dstp = BLEND16_50(d, s, mask);
  1846                 dstp++;
  1844                 dstp++;
  1847                 srcp++;
  1845                 srcp++;
  1848                 w--;
  1846                 w--;
  1849             }
  1847             }
  1850             srcp++;             /* srcp is now 32-bit aligned */
  1848             srcp++;             /* srcp is now 32-bit aligned */
  1860                 s = (prev_sw << 16) + (sw >> 16);
  1858                 s = (prev_sw << 16) + (sw >> 16);
  1861 #else
  1859 #else
  1862                 s = (prev_sw >> 16) + (sw << 16);
  1860                 s = (prev_sw >> 16) + (sw << 16);
  1863 #endif
  1861 #endif
  1864                 prev_sw = sw;
  1862                 prev_sw = sw;
  1865                 *(Uint32 *) dstp = BLEND2x16_50 (dw, s, mask);
  1863                 *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
  1866                 dstp += 2;
  1864                 dstp += 2;
  1867                 srcp += 2;
  1865                 srcp += 2;
  1868                 w -= 2;
  1866                 w -= 2;
  1869             }
  1867             }
  1870 
  1868 
  1874 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1872 #if SDL_BYTEORDER == SDL_BIG_ENDIAN
  1875                 s = (Uint16) prev_sw;
  1873                 s = (Uint16) prev_sw;
  1876 #else
  1874 #else
  1877                 s = (Uint16) (prev_sw >> 16);
  1875                 s = (Uint16) (prev_sw >> 16);
  1878 #endif
  1876 #endif
  1879                 *dstp = BLEND16_50 (d, s, mask);
  1877                 *dstp = BLEND16_50(d, s, mask);
  1880                 srcp++;
  1878                 srcp++;
  1881                 dstp++;
  1879                 dstp++;
  1882             }
  1880             }
  1883             srcp += srcskip - 1;
  1881             srcp += srcskip - 1;
  1884             dstp += dstskip;
  1882             dstp += dstskip;
  1887             int w = width;
  1885             int w = width;
  1888 
  1886 
  1889             /* first odd pixel? */
  1887             /* first odd pixel? */
  1890             if ((uintptr_t) srcp & 2) {
  1888             if ((uintptr_t) srcp & 2) {
  1891                 Uint16 d = *dstp, s = *srcp;
  1889                 Uint16 d = *dstp, s = *srcp;
  1892                 *dstp = BLEND16_50 (d, s, mask);
  1890                 *dstp = BLEND16_50(d, s, mask);
  1893                 srcp++;
  1891                 srcp++;
  1894                 dstp++;
  1892                 dstp++;
  1895                 w--;
  1893                 w--;
  1896             }
  1894             }
  1897             /* srcp and dstp are now 32-bit aligned */
  1895             /* srcp and dstp are now 32-bit aligned */
  1898 
  1896 
  1899             while (w > 1) {
  1897             while (w > 1) {
  1900                 Uint32 sw = *(Uint32 *) srcp;
  1898                 Uint32 sw = *(Uint32 *) srcp;
  1901                 Uint32 dw = *(Uint32 *) dstp;
  1899                 Uint32 dw = *(Uint32 *) dstp;
  1902                 *(Uint32 *) dstp = BLEND2x16_50 (dw, sw, mask);
  1900                 *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
  1903                 srcp += 2;
  1901                 srcp += 2;
  1904                 dstp += 2;
  1902                 dstp += 2;
  1905                 w -= 2;
  1903                 w -= 2;
  1906             }
  1904             }
  1907 
  1905 
  1908             /* last odd pixel? */
  1906             /* last odd pixel? */
  1909             if (w) {
  1907             if (w) {
  1910                 Uint16 d = *dstp, s = *srcp;
  1908                 Uint16 d = *dstp, s = *srcp;
  1911                 *dstp = BLEND16_50 (d, s, mask);
  1909                 *dstp = BLEND16_50(d, s, mask);
  1912                 srcp++;
  1910                 srcp++;
  1913                 dstp++;
  1911                 dstp++;
  1914             }
  1912             }
  1915             srcp += srcskip;
  1913             srcp += srcskip;
  1916             dstp += dstskip;
  1914             dstp += dstskip;
  1919 }
  1917 }
  1920 
  1918 
  1921 #if GCC_ASMBLIT
  1919 #if GCC_ASMBLIT
  1922 /* fast RGB565->RGB565 blending with surface alpha */
  1920 /* fast RGB565->RGB565 blending with surface alpha */
  1923 static void
  1921 static void
  1924 Blit565to565SurfaceAlphaMMX (SDL_BlitInfo * info)
  1922 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  1925 {
  1923 {
  1926     unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
  1924     unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
  1927     if (alpha == 128) {
  1925     if (alpha == 128) {
  1928         Blit16to16SurfaceAlpha128 (info, 0xf7de);
  1926         Blit16to16SurfaceAlpha128(info, 0xf7de);
  1929     } else {
  1927     } else {
  1930         int width = info->d_width;
  1928         int width = info->d_width;
  1931         int height = info->d_height;
  1929         int height = info->d_height;
  1932         Uint16 *srcp = (Uint16 *) info->s_pixels;
  1930         Uint16 *srcp = (Uint16 *) info->s_pixels;
  1933         int srcskip = info->s_skip >> 1;
  1931         int srcskip = info->s_skip >> 1;
  1938 
  1936 
  1939         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  1937         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  1940         *(Uint64 *) load = alpha;
  1938         *(Uint64 *) load = alpha;
  1941         alpha >>= 3;            /* downscale alpha to 5 bits */
  1939         alpha >>= 3;            /* downscale alpha to 5 bits */
  1942 
  1940 
  1943         movq_m2r (*load, mm0);  /* alpha(0000000A) -> mm0 */
  1941         movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
  1944         punpcklwd_r2r (mm0, mm0);       /* 00000A0A -> mm0 */
  1942         punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
  1945         punpcklwd_r2r (mm0, mm0);       /* 0A0A0A0A -> mm0 */
  1943         punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
  1946         /* position alpha to allow for mullo and mulhi on diff channels
  1944         /* position alpha to allow for mullo and mulhi on diff channels
  1947            to reduce the number of operations */
  1945            to reduce the number of operations */
  1948         psllq_i2r (3, mm0);
  1946         psllq_i2r(3, mm0);
  1949 
  1947 
  1950         /* Setup the 565 color channel masks */
  1948         /* Setup the 565 color channel masks */
  1951         *(Uint64 *) load = 0x07E007E007E007E0ULL;
  1949         *(Uint64 *) load = 0x07E007E007E007E0ULL;
  1952         movq_m2r (*load, mm4);  /* MASKGREEN -> mm4 */
  1950         movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
  1953         *(Uint64 *) load = 0x001F001F001F001FULL;
  1951         *(Uint64 *) load = 0x001F001F001F001FULL;
  1954         movq_m2r (*load, mm7);  /* MASKBLUE -> mm7 */
  1952         movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
  1955         while (height--) {
  1953         while (height--) {
  1956 			/* *INDENT-OFF* */
  1954 			/* *INDENT-OFF* */
  1957 			DUFFS_LOOP_QUATRO2(
  1955 			DUFFS_LOOP_QUATRO2(
  1958 			{
  1956 			{
  1959 				s = *srcp++;
  1957 				s = *srcp++;
  2055 			}, width);			
  2053 			}, width);			
  2056 			/* *INDENT-ON* */
  2054 			/* *INDENT-ON* */
  2057             srcp += srcskip;
  2055             srcp += srcskip;
  2058             dstp += dstskip;
  2056             dstp += dstskip;
  2059         }
  2057         }
  2060         emms ();
  2058         emms();
  2061     }
  2059     }
  2062 }
  2060 }
  2063 
  2061 
  2064 /* fast RGB555->RGB555 blending with surface alpha */
  2062 /* fast RGB555->RGB555 blending with surface alpha */
  2065 static void
  2063 static void
  2066 Blit555to555SurfaceAlphaMMX (SDL_BlitInfo * info)
  2064 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  2067 {
  2065 {
  2068     unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
  2066     unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
  2069     if (alpha == 128) {
  2067     if (alpha == 128) {
  2070         Blit16to16SurfaceAlpha128 (info, 0xfbde);
  2068         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2071     } else {
  2069     } else {
  2072         int width = info->d_width;
  2070         int width = info->d_width;
  2073         int height = info->d_height;
  2071         int height = info->d_height;
  2074         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2072         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2075         int srcskip = info->s_skip >> 1;
  2073         int srcskip = info->s_skip >> 1;
  2080 
  2078 
  2081         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  2079         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  2082         *(Uint64 *) load = alpha;
  2080         *(Uint64 *) load = alpha;
  2083         alpha >>= 3;            /* downscale alpha to 5 bits */
  2081         alpha >>= 3;            /* downscale alpha to 5 bits */
  2084 
  2082 
  2085         movq_m2r (*load, mm0);  /* alpha(0000000A) -> mm0 */
  2083         movq_m2r(*load, mm0);   /* alpha(0000000A) -> mm0 */
  2086         punpcklwd_r2r (mm0, mm0);       /* 00000A0A -> mm0 */
  2084         punpcklwd_r2r(mm0, mm0);        /* 00000A0A -> mm0 */
  2087         punpcklwd_r2r (mm0, mm0);       /* 0A0A0A0A -> mm0 */
  2085         punpcklwd_r2r(mm0, mm0);        /* 0A0A0A0A -> mm0 */
  2088         /* position alpha to allow for mullo and mulhi on diff channels
  2086         /* position alpha to allow for mullo and mulhi on diff channels
  2089            to reduce the number of operations */
  2087            to reduce the number of operations */
  2090         psllq_i2r (3, mm0);
  2088         psllq_i2r(3, mm0);
  2091 
  2089 
  2092         /* Setup the 555 color channel masks */
  2090         /* Setup the 555 color channel masks */
  2093         *(Uint64 *) load = 0x03E003E003E003E0ULL;
  2091         *(Uint64 *) load = 0x03E003E003E003E0ULL;
  2094         movq_m2r (*load, mm4);  /* MASKGREEN -> mm4 */
  2092         movq_m2r(*load, mm4);   /* MASKGREEN -> mm4 */
  2095         *(Uint64 *) load = 0x001F001F001F001FULL;
  2093         *(Uint64 *) load = 0x001F001F001F001FULL;
  2096         movq_m2r (*load, mm7);  /* MASKBLUE -> mm7 */
  2094         movq_m2r(*load, mm7);   /* MASKBLUE -> mm7 */
  2097         while (height--) {
  2095         while (height--) {
  2098 			/* *INDENT-OFF* */
  2096 			/* *INDENT-OFF* */
  2099 			DUFFS_LOOP_QUATRO2(
  2097 			DUFFS_LOOP_QUATRO2(
  2100 			{
  2098 			{
  2101 				s = *srcp++;
  2099 				s = *srcp++;
  2202 			}, width);
  2200 			}, width);
  2203 			/* *INDENT-ON* */
  2201 			/* *INDENT-ON* */
  2204             srcp += srcskip;
  2202             srcp += srcskip;
  2205             dstp += dstskip;
  2203             dstp += dstskip;
  2206         }
  2204         }
  2207         emms ();
  2205         emms();
  2208     }
  2206     }
  2209 }
  2207 }
  2210 
  2208 
  2211 /* End GCC_ASMBLIT */
  2209 /* End GCC_ASMBLIT */
  2212 
  2210 
  2213 #elif MSVC_ASMBLIT
  2211 #elif MSVC_ASMBLIT
  2214 /* fast RGB565->RGB565 blending with surface alpha */
  2212 /* fast RGB565->RGB565 blending with surface alpha */
  2215 static void
  2213 static void
  2216 Blit565to565SurfaceAlphaMMX (SDL_BlitInfo * info)
  2214 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
  2217 {
  2215 {
  2218     unsigned alpha = info->src->alpha;
  2216     unsigned alpha = info->src->alpha;
  2219     if (alpha == 128) {
  2217     if (alpha == 128) {
  2220         Blit16to16SurfaceAlpha128 (info, 0xf7de);
  2218         Blit16to16SurfaceAlpha128(info, 0xf7de);
  2221     } else {
  2219     } else {
  2222         int width = info->d_width;
  2220         int width = info->d_width;
  2223         int height = info->d_height;
  2221         int height = info->d_height;
  2224         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2222         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2225         int srcskip = info->s_skip >> 1;
  2223         int srcskip = info->s_skip >> 1;
  2228         Uint32 s, d;
  2226         Uint32 s, d;
  2229 
  2227 
  2230         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  2228         __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
  2231 
  2229 
  2232         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  2230         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  2233         mm_alpha = _mm_set_pi32 (0, alpha);     /* 0000000A -> mm_alpha */
  2231         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  2234         alpha >>= 3;            /* downscale alpha to 5 bits */
  2232         alpha >>= 3;            /* downscale alpha to 5 bits */
  2235 
  2233 
  2236         mm_alpha = _mm_unpacklo_pi16 (mm_alpha, mm_alpha);      /* 00000A0A -> mm_alpha */
  2234         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  2237         mm_alpha = _mm_unpacklo_pi32 (mm_alpha, mm_alpha);      /* 0A0A0A0A -> mm_alpha */
  2235         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  2238         /* position alpha to allow for mullo and mulhi on diff channels
  2236         /* position alpha to allow for mullo and mulhi on diff channels
  2239            to reduce the number of operations */
  2237            to reduce the number of operations */
  2240         mm_alpha = _mm_slli_si64 (mm_alpha, 3);
  2238         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2241 
  2239 
  2242         /* Setup the 565 color channel masks */
  2240         /* Setup the 565 color channel masks */
  2243         gmask = _mm_set_pi32 (0x07E007E0, 0x07E007E0);  /* MASKGREEN -> gmask */
  2241         gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
  2244         bmask = _mm_set_pi32 (0x001F001F, 0x001F001F);  /* MASKBLUE -> bmask */
  2242         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  2245 
  2243 
  2246         while (height--) {
  2244         while (height--) {
  2247 			/* *INDENT-OFF* */
  2245 			/* *INDENT-OFF* */
  2248 			DUFFS_LOOP_QUATRO2(
  2246 			DUFFS_LOOP_QUATRO2(
  2249 			{
  2247 			{
  2342 			}, width);
  2340 			}, width);
  2343 			/* *INDENT-ON* */
  2341 			/* *INDENT-ON* */
  2344             srcp += srcskip;
  2342             srcp += srcskip;
  2345             dstp += dstskip;
  2343             dstp += dstskip;
  2346         }
  2344         }
  2347         _mm_empty ();
  2345         _mm_empty();
  2348     }
  2346     }
  2349 }
  2347 }
  2350 
  2348 
  2351 /* fast RGB555->RGB555 blending with surface alpha */
  2349 /* fast RGB555->RGB555 blending with surface alpha */
  2352 static void
  2350 static void
  2353 Blit555to555SurfaceAlphaMMX (SDL_BlitInfo * info)
  2351 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
  2354 {
  2352 {
  2355     unsigned alpha = info->src->alpha;
  2353     unsigned alpha = info->src->alpha;
  2356     if (alpha == 128) {
  2354     if (alpha == 128) {
  2357         Blit16to16SurfaceAlpha128 (info, 0xfbde);
  2355         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2358     } else {
  2356     } else {
  2359         int width = info->d_width;
  2357         int width = info->d_width;
  2360         int height = info->d_height;
  2358         int height = info->d_height;
  2361         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2359         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2362         int srcskip = info->s_skip >> 1;
  2360         int srcskip = info->s_skip >> 1;
  2365         Uint32 s, d;
  2363         Uint32 s, d;
  2366 
  2364 
  2367         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  2365         __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
  2368 
  2366 
  2369         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  2367         alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
  2370         mm_alpha = _mm_set_pi32 (0, alpha);     /* 0000000A -> mm_alpha */
  2368         mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
  2371         alpha >>= 3;            /* downscale alpha to 5 bits */
  2369         alpha >>= 3;            /* downscale alpha to 5 bits */
  2372 
  2370 
  2373         mm_alpha = _mm_unpacklo_pi16 (mm_alpha, mm_alpha);      /* 00000A0A -> mm_alpha */
  2371         mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
  2374         mm_alpha = _mm_unpacklo_pi32 (mm_alpha, mm_alpha);      /* 0A0A0A0A -> mm_alpha */
  2372         mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
  2375         /* position alpha to allow for mullo and mulhi on diff channels
  2373         /* position alpha to allow for mullo and mulhi on diff channels
  2376            to reduce the number of operations */
  2374            to reduce the number of operations */
  2377         mm_alpha = _mm_slli_si64 (mm_alpha, 3);
  2375         mm_alpha = _mm_slli_si64(mm_alpha, 3);
  2378 
  2376 
  2379         /* Setup the 555 color channel masks */
  2377         /* Setup the 555 color channel masks */
  2380         rmask = _mm_set_pi32 (0x7C007C00, 0x7C007C00);  /* MASKRED -> rmask */
  2378         rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
  2381         gmask = _mm_set_pi32 (0x03E003E0, 0x03E003E0);  /* MASKGREEN -> gmask */
  2379         gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
  2382         bmask = _mm_set_pi32 (0x001F001F, 0x001F001F);  /* MASKBLUE -> bmask */
  2380         bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */
  2383 
  2381 
  2384         while (height--) {
  2382         while (height--) {
  2385 			/* *INDENT-OFF* */
  2383 			/* *INDENT-OFF* */
  2386 			DUFFS_LOOP_QUATRO2(
  2384 			DUFFS_LOOP_QUATRO2(
  2387 			{
  2385 			{
  2480 			}, width);
  2478 			}, width);
  2481 			/* *INDENT-ON* */
  2479 			/* *INDENT-ON* */
  2482             srcp += srcskip;
  2480             srcp += srcskip;
  2483             dstp += dstskip;
  2481             dstp += dstskip;
  2484         }
  2482         }
  2485         _mm_empty ();
  2483         _mm_empty();
  2486     }
  2484     }
  2487 }
  2485 }
  2488 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  2486 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
  2489 
  2487 
  2490 /* fast RGB565->RGB565 blending with surface alpha */
  2488 /* fast RGB565->RGB565 blending with surface alpha */
  2491 static void
  2489 static void
  2492 Blit565to565SurfaceAlpha (SDL_BlitInfo * info)
  2490 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
  2493 {
  2491 {
  2494     unsigned alpha = info->src->alpha;
  2492     unsigned alpha = info->src->alpha;
  2495     if (alpha == 128) {
  2493     if (alpha == 128) {
  2496         Blit16to16SurfaceAlpha128 (info, 0xf7de);
  2494         Blit16to16SurfaceAlpha128(info, 0xf7de);
  2497     } else {
  2495     } else {
  2498         int width = info->d_width;
  2496         int width = info->d_width;
  2499         int height = info->d_height;
  2497         int height = info->d_height;
  2500         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2498         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2501         int srcskip = info->s_skip >> 1;
  2499         int srcskip = info->s_skip >> 1;
  2526     }
  2524     }
  2527 }
  2525 }
  2528 
  2526 
  2529 /* fast RGB555->RGB555 blending with surface alpha */
  2527 /* fast RGB555->RGB555 blending with surface alpha */
  2530 static void
  2528 static void
  2531 Blit555to555SurfaceAlpha (SDL_BlitInfo * info)
  2529 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
  2532 {
  2530 {
  2533     unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
  2531     unsigned alpha = info->src->alpha;  /* downscale alpha to 5 bits */
  2534     if (alpha == 128) {
  2532     if (alpha == 128) {
  2535         Blit16to16SurfaceAlpha128 (info, 0xfbde);
  2533         Blit16to16SurfaceAlpha128(info, 0xfbde);
  2536     } else {
  2534     } else {
  2537         int width = info->d_width;
  2535         int width = info->d_width;
  2538         int height = info->d_height;
  2536         int height = info->d_height;
  2539         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2537         Uint16 *srcp = (Uint16 *) info->s_pixels;
  2540         int srcskip = info->s_skip >> 1;
  2538         int srcskip = info->s_skip >> 1;
  2565     }
  2563     }
  2566 }
  2564 }
  2567 
  2565 
  2568 /* fast ARGB8888->RGB565 blending with pixel alpha */
  2566 /* fast ARGB8888->RGB565 blending with pixel alpha */
  2569 static void
  2567 static void
  2570 BlitARGBto565PixelAlpha (SDL_BlitInfo * info)
  2568 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
  2571 {
  2569 {
  2572     int width = info->d_width;
  2570     int width = info->d_width;
  2573     int height = info->d_height;
  2571     int height = info->d_height;
  2574     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2572     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2575     int srcskip = info->s_skip >> 2;
  2573     int srcskip = info->s_skip >> 2;
  2611     }
  2609     }
  2612 }
  2610 }
  2613 
  2611 
  2614 /* fast ARGB8888->RGB555 blending with pixel alpha */
  2612 /* fast ARGB8888->RGB555 blending with pixel alpha */
  2615 static void
  2613 static void
  2616 BlitARGBto555PixelAlpha (SDL_BlitInfo * info)
  2614 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
  2617 {
  2615 {
  2618     int width = info->d_width;
  2616     int width = info->d_width;
  2619     int height = info->d_height;
  2617     int height = info->d_height;
  2620     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2618     Uint32 *srcp = (Uint32 *) info->s_pixels;
  2621     int srcskip = info->s_skip >> 2;
  2619     int srcskip = info->s_skip >> 2;
  2658     }
  2656     }
  2659 }
  2657 }
  2660 
  2658 
  2661 /* General (slow) N->N blending with per-surface alpha */
  2659 /* General (slow) N->N blending with per-surface alpha */
  2662 static void
  2660 static void
  2663 BlitNtoNSurfaceAlpha (SDL_BlitInfo * info)
  2661 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
  2664 {
  2662 {
  2665     int width = info->d_width;
  2663     int width = info->d_width;
  2666     int height = info->d_height;
  2664     int height = info->d_height;
  2667     Uint8 *src = info->s_pixels;
  2665     Uint8 *src = info->s_pixels;
  2668     int srcskip = info->s_skip;
  2666     int srcskip = info->s_skip;
  2702     }
  2700     }
  2703 }
  2701 }
  2704 
  2702 
  2705 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2703 /* General (slow) colorkeyed N->N blending with per-surface alpha */
  2706 static void
  2704 static void
  2707 BlitNtoNSurfaceAlphaKey (SDL_BlitInfo * info)
  2705 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
  2708 {
  2706 {
  2709     int width = info->d_width;
  2707     int width = info->d_width;
  2710     int height = info->d_height;
  2708     int height = info->d_height;
  2711     Uint8 *src = info->s_pixels;
  2709     Uint8 *src = info->s_pixels;
  2712     int srcskip = info->s_skip;
  2710     int srcskip = info->s_skip;
  2748     }
  2746     }
  2749 }
  2747 }
  2750 
  2748 
  2751 /* General (slow) N->N blending with pixel alpha */
  2749 /* General (slow) N->N blending with pixel alpha */
  2752 static void
  2750 static void
  2753 BlitNtoNPixelAlpha (SDL_BlitInfo * info)
  2751 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
  2754 {
  2752 {
  2755     int width = info->d_width;
  2753     int width = info->d_width;
  2756     int height = info->d_height;
  2754     int height = info->d_height;
  2757     Uint8 *src = info->s_pixels;
  2755     Uint8 *src = info->s_pixels;
  2758     int srcskip = info->s_skip;
  2756     int srcskip = info->s_skip;
  2802     }
  2800     }
  2803 }
  2801 }
  2804 
  2802 
  2805 
  2803 
  2806 SDL_loblit
  2804 SDL_loblit
  2807 SDL_CalculateAlphaBlit (SDL_Surface * surface, int blit_index)
  2805 SDL_CalculateAlphaBlit(SDL_Surface * surface, int blit_index)
  2808 {
  2806 {
  2809     SDL_PixelFormat *sf = surface->format;
  2807     SDL_PixelFormat *sf = surface->format;
  2810     SDL_PixelFormat *df = surface->map->dst->format;
  2808     SDL_PixelFormat *df = surface->map->dst->format;
  2811 
  2809 
  2812     if (sf->Amask == 0) {
  2810     if (sf->Amask == 0) {
  2815                 return BlitNto1SurfaceAlphaKey;
  2813                 return BlitNto1SurfaceAlphaKey;
  2816             else
  2814             else
  2817 #if SDL_ALTIVEC_BLITTERS
  2815 #if SDL_ALTIVEC_BLITTERS
  2818                 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2816                 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
  2819                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2817                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2820                     && SDL_HasAltiVec ())
  2818                     && SDL_HasAltiVec())
  2821                 return Blit32to32SurfaceAlphaKeyAltivec;
  2819                 return Blit32to32SurfaceAlphaKeyAltivec;
  2822             else
  2820             else
  2823 #endif
  2821 #endif
  2824                 return BlitNtoNSurfaceAlphaKey;
  2822                 return BlitNtoNSurfaceAlphaKey;
  2825         } else {
  2823         } else {
  2830 
  2828 
  2831             case 2:
  2829             case 2:
  2832                 if (surface->map->identity) {
  2830                 if (surface->map->identity) {
  2833                     if (df->Gmask == 0x7e0) {
  2831                     if (df->Gmask == 0x7e0) {
  2834 #if MMX_ASMBLIT
  2832 #if MMX_ASMBLIT
  2835                         if (SDL_HasMMX ())
  2833                         if (SDL_HasMMX())
  2836                             return Blit565to565SurfaceAlphaMMX;
  2834                             return Blit565to565SurfaceAlphaMMX;
  2837                         else
  2835                         else
  2838 #endif
  2836 #endif
  2839                             return Blit565to565SurfaceAlpha;
  2837                             return Blit565to565SurfaceAlpha;
  2840                     } else if (df->Gmask == 0x3e0) {
  2838                     } else if (df->Gmask == 0x3e0) {
  2841 #if MMX_ASMBLIT
  2839 #if MMX_ASMBLIT
  2842                         if (SDL_HasMMX ())
  2840                         if (SDL_HasMMX())
  2843                             return Blit555to555SurfaceAlphaMMX;
  2841                             return Blit555to555SurfaceAlphaMMX;
  2844                         else
  2842                         else
  2845 #endif
  2843 #endif
  2846                             return Blit555to555SurfaceAlpha;
  2844                             return Blit555to555SurfaceAlpha;
  2847                     }
  2845                     }
  2853                     && sf->Gmask == df->Gmask
  2851                     && sf->Gmask == df->Gmask
  2854                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2852                     && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
  2855 #if MMX_ASMBLIT
  2853 #if MMX_ASMBLIT
  2856                     if (sf->Rshift % 8 == 0
  2854                     if (sf->Rshift % 8 == 0
  2857                         && sf->Gshift % 8 == 0
  2855                         && sf->Gshift % 8 == 0
  2858                         && sf->Bshift % 8 == 0 && SDL_HasMMX ())
  2856                         && sf->Bshift % 8 == 0 && SDL_HasMMX())
  2859                         return BlitRGBtoRGBSurfaceAlphaMMX;
  2857                         return BlitRGBtoRGBSurfaceAlphaMMX;
  2860 #endif
  2858 #endif
  2861                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  2859                     if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
  2862 #if SDL_ALTIVEC_BLITTERS
  2860 #if SDL_ALTIVEC_BLITTERS
  2863                         if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2861                         if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2864                             && SDL_HasAltiVec ())
  2862                             && SDL_HasAltiVec())
  2865                             return BlitRGBtoRGBSurfaceAlphaAltivec;
  2863                             return BlitRGBtoRGBSurfaceAlphaAltivec;
  2866 #endif
  2864 #endif
  2867                         return BlitRGBtoRGBSurfaceAlpha;
  2865                         return BlitRGBtoRGBSurfaceAlpha;
  2868                     }
  2866                     }
  2869                 }
  2867                 }
  2870 #if SDL_ALTIVEC_BLITTERS
  2868 #if SDL_ALTIVEC_BLITTERS
  2871                 if ((sf->BytesPerPixel == 4) &&
  2869                 if ((sf->BytesPerPixel == 4) &&
  2872                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2870                     !(surface->map->dst->flags & SDL_HWSURFACE)
  2873                     && SDL_HasAltiVec ())
  2871                     && SDL_HasAltiVec())
  2874                     return Blit32to32SurfaceAlphaAltivec;
  2872                     return Blit32to32SurfaceAlphaAltivec;
  2875                 else
  2873                 else
  2876 #endif
  2874 #endif
  2877                     return BlitNtoNSurfaceAlpha;
  2875                     return BlitNtoNSurfaceAlpha;
  2878 
  2876 
  2890         case 2:
  2888         case 2:
  2891 #if SDL_ALTIVEC_BLITTERS
  2889 #if SDL_ALTIVEC_BLITTERS
  2892             if (sf->BytesPerPixel == 4
  2890             if (sf->BytesPerPixel == 4
  2893                 && !(surface->map->dst->flags & SDL_HWSURFACE)
  2891                 && !(surface->map->dst->flags & SDL_HWSURFACE)
  2894                 && df->Gmask == 0x7e0 && df->Bmask == 0x1f
  2892                 && df->Gmask == 0x7e0 && df->Bmask == 0x1f
  2895                 && SDL_HasAltiVec ())
  2893                 && SDL_HasAltiVec())
  2896                 return Blit32to565PixelAlphaAltivec;
  2894                 return Blit32to565PixelAlphaAltivec;
  2897             else
  2895             else
  2898 #endif
  2896 #endif
  2899                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2897                 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
  2900                     && sf->Gmask == 0xff00
  2898                     && sf->Gmask == 0xff00
  2914 #if MMX_ASMBLIT
  2912 #if MMX_ASMBLIT
  2915                 if (sf->Rshift % 8 == 0
  2913                 if (sf->Rshift % 8 == 0
  2916                     && sf->Gshift % 8 == 0
  2914                     && sf->Gshift % 8 == 0
  2917                     && sf->Bshift % 8 == 0
  2915                     && sf->Bshift % 8 == 0
  2918                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  2916                     && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
  2919                     if (SDL_Has3DNow ())
  2917                     if (SDL_Has3DNow())
  2920                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2918                         return BlitRGBtoRGBPixelAlphaMMX3DNOW;
  2921                     if (SDL_HasMMX ())
  2919                     if (SDL_HasMMX())
  2922                         return BlitRGBtoRGBPixelAlphaMMX;
  2920                         return BlitRGBtoRGBPixelAlphaMMX;
  2923                 }
  2921                 }
  2924 #endif
  2922 #endif
  2925                 if (sf->Amask == 0xff000000) {
  2923                 if (sf->Amask == 0xff000000) {
  2926 #if SDL_ALTIVEC_BLITTERS
  2924 #if SDL_ALTIVEC_BLITTERS
  2927                     if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2925                     if (!(surface->map->dst->flags & SDL_HWSURFACE)
  2928                         && SDL_HasAltiVec ())
  2926                         && SDL_HasAltiVec())
  2929                         return BlitRGBtoRGBPixelAlphaAltivec;
  2927                         return BlitRGBtoRGBPixelAlphaAltivec;
  2930 #endif
  2928 #endif
  2931                     return BlitRGBtoRGBPixelAlpha;
  2929                     return BlitRGBtoRGBPixelAlpha;
  2932                 }
  2930                 }
  2933             }
  2931             }
  2934 #if SDL_ALTIVEC_BLITTERS
  2932 #if SDL_ALTIVEC_BLITTERS
  2935             if (sf->Amask && sf->BytesPerPixel == 4 &&
  2933             if (sf->Amask && sf->BytesPerPixel == 4 &&
  2936                 !(surface->map->dst->flags & SDL_HWSURFACE)
  2934                 !(surface->map->dst->flags & SDL_HWSURFACE)
  2937                 && SDL_HasAltiVec ())
  2935                 && SDL_HasAltiVec())
  2938                 return Blit32to32PixelAlphaAltivec;
  2936                 return Blit32to32PixelAlphaAltivec;
  2939             else
  2937             else
  2940 #endif
  2938 #endif
  2941                 return BlitNtoNPixelAlpha;
  2939                 return BlitNtoNPixelAlpha;
  2942 
  2940