Add fast paths in BlitNtoNKey
author Sylvain Becker <sylvain.becker@gmail.com>
Wed, 30 Jan 2019 22:50:20 +0100
changeset 12586 eb449394ec0e
parent 12585 dff36de37426
child 12587 b2d4dcb4ba33
Add fast paths in BlitNtoNKey

All of the following conversions are faster (with colorkey, but no blending).
(The speed-up ratios are only approximate.)

ABGR8888 -> BGR888 : faster x9 (2699035 -> 297425)

ARGB8888 -> RGB888 : faster x8 (2659266 -> 296137)

BGR24 -> BGR24 : faster x5 (2232482 -> 445897)
BGR24 -> RGB24 : faster x4 (2150023 -> 448576)

BGR888 -> ABGR8888 : faster x8 (2649957 -> 307595)

BGRA8888 -> BGRX8888 : faster x9 (2696041 -> 297596)

BGRX8888 -> BGRA8888 : faster x8 (2662011 -> 299463)
BGRX8888 -> BGRX8888 : faster x9 (2733346 -> 295045)

RGB24 -> BGR24 : faster x4 (2154551 -> 485262)
RGB24 -> RGB24 : faster x4 (2149878 -> 484870)

RGB888 -> ARGB8888 : faster x8 (2762877 -> 324946)

RGBA8888 -> RGBX8888 : faster x8 (2657855 -> 297753)

RGBX8888 -> RGBA8888 : faster x8 (2661360 -> 296655)
RGBX8888 -> RGBX8888 : faster x8 (2649287 -> 308268)
src/video/SDL_blit_N.c
     1.1 --- a/src/video/SDL_blit_N.c	Wed Jan 30 17:16:08 2019 +0100
     1.2 +++ b/src/video/SDL_blit_N.c	Wed Jan 30 22:50:20 2019 +0100
     1.3 @@ -2329,35 +2329,120 @@
     1.4      int dstbpp = dstfmt->BytesPerPixel;
     1.5      unsigned alpha = dstfmt->Amask ? info->a : 0;
     1.6      Uint32 rgbmask = ~srcfmt->Amask;
     1.7 +    int sfmt = srcfmt->format;
     1.8 +    int dfmt = dstfmt->format;
     1.9  
    1.10      /* Set up some basic variables */
    1.11      ckey &= rgbmask;
    1.12  
    1.13 -    /* Fastpath: same source/destination format, no Amask, bpp 32, loop is vectorized. ~10x faster */
    1.14 -    if (srcfmt->format == dstfmt->format &&
    1.15 -        (srcfmt->format == SDL_PIXELFORMAT_RGB888 || srcfmt->format == SDL_PIXELFORMAT_BGR888)) {
    1.16 +    /* BPP 4, same rgb */
    1.17 +    if (srcbpp == 4 && dstbpp == 4 && srcfmt->Rmask == dstfmt->Rmask && srcfmt->Gmask == dstfmt->Gmask && srcfmt->Bmask == dstfmt->Bmask) {
    1.18          Uint32 *src32 = (Uint32*)src;
    1.19          Uint32 *dst32 = (Uint32*)dst;
    1.20 -        srcskip /= sizeof(Uint32);
    1.21 -        dstskip /= sizeof(Uint32);
    1.22 + 
    1.23 +        if (dstfmt->Amask) {
    1.24 +            /* RGB->RGBA, SET_ALPHA */
    1.25 +            Uint32 mask = info->a << dstfmt->Ashift;
    1.26 +            while (height--) {
    1.27 +                /* *INDENT-OFF* */
    1.28 +                DUFFS_LOOP(
    1.29 +                {
    1.30 +                    if ((*src32 & rgbmask) != ckey) {
    1.31 +                        *dst32 = *src32 | mask;
    1.32 +                    }
    1.33 +                    ++dst32;
    1.34 +                    ++src32;
    1.35 +                }, width);
    1.36 +                /* *INDENT-ON* */
    1.37 +                src32 = (Uint32 *) ((Uint8 *) src32 + srcskip);
    1.38 +                dst32 = (Uint32 *) ((Uint8 *) dst32 + dstskip);
    1.39 +            }
    1.40 +            return;
    1.41 +        } else {
    1.42 +            /* RGBA->RGB, NO_ALPHA */
    1.43 +            Uint32 mask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
    1.44 +            while (height--) {
    1.45 +                /* *INDENT-OFF* */
    1.46 +                DUFFS_LOOP(
    1.47 +                {
    1.48 +                    if ((*src32 & rgbmask) != ckey) {
    1.49 +                        *dst32 = *src32 & mask;
    1.50 +                    }
    1.51 +                    ++dst32;
    1.52 +                    ++src32;
    1.53 +                }, width);
    1.54 +                /* *INDENT-ON* */
    1.55 +                src32 = (Uint32 *) ((Uint8 *) src32 + srcskip);
    1.56 +                dst32 = (Uint32 *) ((Uint8 *) dst32 + dstskip);
    1.57 +            }
    1.58 +            return;
    1.59 +        }
    1.60 +    }
    1.61 +
    1.62 +    /* BPP 3, same rgb triplet */
    1.63 +    if ((sfmt == SDL_PIXELFORMAT_RGB24 && dfmt == SDL_PIXELFORMAT_RGB24) ||
    1.64 +        (sfmt == SDL_PIXELFORMAT_BGR24 && dfmt == SDL_PIXELFORMAT_BGR24)) {
    1.65 +
    1.66 +        Uint8 k0 = ckey & 0x000000FF;
    1.67 +        Uint8 k1 = (ckey & 0x0000FF00) >> 8;
    1.68 +        Uint8 k2 = (ckey & 0x00FF0000) >> 16;
    1.69 +
    1.70          while (height--) {
    1.71              /* *INDENT-OFF* */
    1.72              DUFFS_LOOP(
    1.73              {
    1.74 -                if (*src32 != ckey) {
    1.75 -                    *dst32 = *src32;
    1.76 +                Uint8 s0 = src[0];
    1.77 +                Uint8 s1 = src[1];
    1.78 +                Uint8 s2 = src[2];
    1.79 +
    1.80 +                if (k0 != s0 || k1 != s1 || k2 != s2) {
    1.81 +                    dst[0] = s0;
    1.82 +                    dst[1] = s1;
    1.83 +                    dst[2] = s2;
    1.84                  }
    1.85 -                ++src32;
    1.86 -                ++dst32;
    1.87 +                src += 3;
    1.88 +                dst += 3;
    1.89              },
    1.90              width);
    1.91              /* *INDENT-ON* */
    1.92 -            src32 += srcskip;
    1.93 -            dst32 += dstskip;
    1.94 +            src += srcskip;
    1.95 +            dst += dstskip;
    1.96          }
    1.97          return;
    1.98      }
    1.99  
   1.100 +    /* BPP 3, inversed rgb triplet */
   1.101 +    if ((sfmt == SDL_PIXELFORMAT_RGB24 && dfmt == SDL_PIXELFORMAT_BGR24) ||
   1.102 +        (sfmt == SDL_PIXELFORMAT_BGR24 && dfmt == SDL_PIXELFORMAT_RGB24)) {
   1.103 +
   1.104 +        Uint8 k0 = ckey & 0xFF;
   1.105 +        Uint8 k1 = (ckey >> 8)  & 0xFF;
   1.106 +        Uint8 k2 = (ckey >> 16) & 0xFF;
   1.107 +
   1.108 +        while (height--) {
   1.109 +            /* *INDENT-OFF* */
   1.110 +            DUFFS_LOOP(
   1.111 +            {
   1.112 +                Uint8 s0 = src[0];
   1.113 +                Uint8 s1 = src[1];
   1.114 +                Uint8 s2 = src[2];
   1.115 +                if (k0 != s0 || k1 != s1 || k2 != s2) {
   1.116 +                    /* Inversed RGB */
   1.117 +                    dst[0] = s2;
   1.118 +                    dst[1] = s1;
   1.119 +                    dst[2] = s0;
   1.120 +                }
   1.121 +                src += 3;
   1.122 +                dst += 3;
   1.123 +            },
   1.124 +            width);
   1.125 +            /* *INDENT-ON* */
   1.126 +            src += srcskip;
   1.127 +            dst += dstskip;
   1.128 +        }
   1.129 +        return;
   1.130 +    } 
   1.131 +   
   1.132      while (height--) {
   1.133          /* *INDENT-OFF* */
   1.134          DUFFS_LOOP(
   1.135 @@ -2406,29 +2491,30 @@
   1.136      ckey &= rgbmask;
   1.137  
   1.138      /* Fastpath: same source/destination format, with Amask, bpp 32, loop is vectorized. ~10x faster */
   1.139 -    if (srcfmt->format == dstfmt->format &&
   1.140 -        (srcfmt->format == SDL_PIXELFORMAT_ARGB8888 ||
   1.141 -         srcfmt->format == SDL_PIXELFORMAT_ABGR8888 ||
   1.142 -         srcfmt->format == SDL_PIXELFORMAT_BGRA8888 ||
   1.143 -         srcfmt->format == SDL_PIXELFORMAT_RGBA8888)) {
   1.144 -        Uint32 *src32 = (Uint32*)src;
   1.145 -        Uint32 *dst32 = (Uint32*)dst;
   1.146 -        srcskip /= sizeof(Uint32);
   1.147 -        dstskip /= sizeof(Uint32);
   1.148 -        while (height--) {
   1.149 -            /* *INDENT-OFF* */
   1.150 -            DUFFS_LOOP(
   1.151 -            {
   1.152 -                if ((*src32 & rgbmask) != ckey) {
   1.153 -                    *dst32 = *src32;
   1.154 -                }
   1.155 -                ++src32;
   1.156 -                ++dst32;
   1.157 -            },
   1.158 -            width);
   1.159 -            /* *INDENT-ON* */
   1.160 -            src32 += srcskip;
   1.161 -            dst32 += dstskip;
   1.162 +    if (srcfmt->format == dstfmt->format) {
   1.163 +
   1.164 +        if (srcfmt->format == SDL_PIXELFORMAT_ARGB8888 ||
   1.165 +            srcfmt->format == SDL_PIXELFORMAT_ABGR8888 ||
   1.166 +            srcfmt->format == SDL_PIXELFORMAT_BGRA8888 ||
   1.167 +            srcfmt->format == SDL_PIXELFORMAT_RGBA8888) {
   1.168 +
   1.169 +            Uint32 *src32 = (Uint32*)src;
   1.170 +            Uint32 *dst32 = (Uint32*)dst;
   1.171 +            while (height--) {
   1.172 +                /* *INDENT-OFF* */
   1.173 +                DUFFS_LOOP(
   1.174 +                {
   1.175 +                    if ((*src32 & rgbmask) != ckey) {
   1.176 +                        *dst32 = *src32;
   1.177 +                    }
   1.178 +                    ++src32;
   1.179 +                    ++dst32;
   1.180 +                },
   1.181 +                width);
   1.182 +                /* *INDENT-ON* */
   1.183 +                src32 = (Uint32 *)((Uint8 *)src32 + srcskip);
   1.184 +                dst32 = (Uint32 *)((Uint8 *)dst32 + dstskip);
   1.185 +            }
   1.186          }
   1.187          return;
   1.188      }
   1.189 @@ -2532,8 +2618,7 @@
   1.190  
   1.191      if (dstfmt->Amask) {
   1.192          /* SET_ALPHA */
   1.193 -        Uint32 alpha = info->a;
   1.194 -        Uint32 alphashift = alpha << 24;
   1.195 +        Uint32 mask = info->a << dstfmt->Ashift;
   1.196          while (height--) {
   1.197              /* *INDENT-OFF* */
   1.198              DUFFS_LOOP(
   1.199 @@ -2542,7 +2627,7 @@
   1.200                  Uint8 s0 = src[0];
   1.201                  Uint8 s1 = src[1];
   1.202                  Uint8 s2 = src[2];
   1.203 -                *dst32 = (s0) | (s1 << 8) | (s2 << 16) | alphashift;
   1.204 +                *dst32 = (s0) | (s1 << 8) | (s2 << 16) | mask;
   1.205                  dst += dstbpp;
   1.206                  src += srcbpp;
   1.207              }, width);
   1.208 @@ -2588,7 +2673,6 @@
   1.209      int dstbpp = dstfmt->BytesPerPixel;
   1.210  
   1.211      if (dstfmt->Amask) {
   1.212 -
   1.213          if (srcfmt->Amask) {
   1.214              /* COPY_ALPHA */
   1.215              /* Only to switch ABGR8888 <-> ARGB8888 */
   1.216 @@ -2596,11 +2680,11 @@
   1.217                  /* *INDENT-OFF* */
   1.218                  DUFFS_LOOP(
   1.219                  {
   1.220 -                    Uint32  *dst32 = (Uint32*)dst;
   1.221 +                    Uint32 *dst32 = (Uint32*)dst;
   1.222                      Uint8 s0 = src[0];
   1.223                      Uint8 s1 = src[1];
   1.224                      Uint8 s2 = src[2];
   1.225 -                    Uint32 alphashift = src[3] << 24;
   1.226 +                    Uint32 alphashift = src[3] << dstfmt->Ashift;
   1.227                      /* inversed, compared to Blit_3or4_to_3or4__same_rgb */
   1.228                      *dst32 = (s0 << 16) | (s1 << 8) | (s2) | alphashift;
   1.229                      dst += dstbpp;
   1.230 @@ -2612,18 +2696,17 @@
   1.231              }
   1.232          } else {
   1.233              /* SET_ALPHA */
   1.234 -            Uint32 alpha = info->a;
   1.235 -            Uint32 alphashift = alpha << 24;
   1.236 +            Uint32 mask = info->a << dstfmt->Ashift;
   1.237              while (height--) {
   1.238                  /* *INDENT-OFF* */
   1.239                  DUFFS_LOOP(
   1.240                  {
   1.241 -                    Uint32  *dst32 = (Uint32*)dst;
   1.242 +                    Uint32 *dst32 = (Uint32*)dst;
   1.243                      Uint8 s0 = src[0];
   1.244                      Uint8 s1 = src[1];
   1.245                      Uint8 s2 = src[2];
   1.246                      /* inversed, compared to Blit_3or4_to_3or4__same_rgb */
   1.247 -                    *dst32 = (s0 << 16) | (s1 << 8) | (s2) | alphashift;
   1.248 +                    *dst32 = (s0 << 16) | (s1 << 8) | (s2) | mask;
   1.249                      dst += dstbpp;
   1.250                      src += srcbpp;
   1.251                  }, width);
   1.252 @@ -2638,7 +2721,7 @@
   1.253              /* *INDENT-OFF* */
   1.254              DUFFS_LOOP(
   1.255              {
   1.256 -                Uint32  *dst32 = (Uint32*)dst;
   1.257 +                Uint32 *dst32 = (Uint32*)dst;
   1.258                  Uint8 s0 = src[0];
   1.259                  Uint8 s1 = src[1];
   1.260                  Uint8 s2 = src[2];
   1.261 @@ -2745,7 +2828,7 @@
   1.262       0, Blit_3or4_to_3or4__inversed_rgb, NO_ALPHA | SET_ALPHA | COPY_ALPHA},
   1.263      {0x00FF0000, 0x0000FF00, 0x000000FF, 4, 0x000000FF, 0x0000FF00, 0x00FF0000,
   1.264       0, Blit_3or4_to_3or4__inversed_rgb, NO_ALPHA | SET_ALPHA | COPY_ALPHA},
   1.265 -    /* RBG 888 and RGB 565 */
   1.266 +    /* RGB 888 and RGB 565 */
   1.267      {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x0000F800, 0x000007E0, 0x0000001F,
   1.268       0, Blit_RGB888_RGB565, NO_ALPHA},
   1.269      {0x00FF0000, 0x0000FF00, 0x000000FF, 2, 0x00007C00, 0x000003E0, 0x0000001F,