ARM: SIMD assembly optimization for BGR-to-RGB 32bpp normal blits SDL-1.2
author Ben Avison <bavison@riscosopen.org>
Thu, 31 Oct 2019 14:00:28 +0300
branch SDL-1.2
changeset 13216 eb0ed9be8a68
parent 13215 84aa936d90e4
child 13217 d5d5686855d3
ARM: SIMD assembly optimization for BGR-to-RGB 32bpp normal blits
---
src/video/SDL_blit_N.c | 25 +++++++++++++++--
src/video/arm/pixman-arm-simd-asm.S | 42 +++++++++++++++++++++++++++++
2 files changed, 65 insertions(+), 2 deletions(-)
src/video/SDL_blit_N.c
src/video/arm/pixman-arm-simd-asm.S
     1.1 --- a/src/video/SDL_blit_N.c	Thu Oct 31 14:00:28 2019 +0300
     1.2 +++ b/src/video/SDL_blit_N.c	Thu Oct 31 14:00:28 2019 +0300
     1.3 @@ -31,7 +31,8 @@
     1.4  enum blit_features {
     1.5  	BLIT_FEATURE_HAS_MMX = 1,
     1.6  	BLIT_FEATURE_HAS_ALTIVEC = 2,
     1.7 -	BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH = 4
     1.8 +	BLIT_FEATURE_ALTIVEC_DONT_USE_PREFETCH = 4,
     1.9 +	BLIT_FEATURE_HAS_ARM_SIMD = 8
    1.10  };
    1.11  
    1.12  #if SDL_ALTIVEC_BLITTERS
    1.13 @@ -893,7 +894,23 @@
    1.14  #endif
    1.15  #else
    1.16  /* Feature 1 is has-MMX */
    1.17 -#define GetBlitFeatures() (SDL_HasMMX() ? BLIT_FEATURE_HAS_MMX : 0)
    1.18 +#define GetBlitFeatures() ((SDL_HasMMX() ? BLIT_FEATURE_HAS_MMX : 0) | (SDL_HasARMSIMD() ? BLIT_FEATURE_HAS_ARM_SIMD : 0))
    1.19 +#endif
    1.20 +
    1.21 +#if SDL_ARM_SIMD_BLITTERS
    1.22 +void Blit_BGR888_RGB888ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
    1.23 +
    1.24 +static void Blit_BGR888_RGB888ARMSIMD(SDL_BlitInfo *info)
    1.25 +{
    1.26 +	int32_t width = info->d_width;
    1.27 +	int32_t height = info->d_height;
    1.28 +	uint32_t *dstp = (uint32_t *)info->d_pixels;
    1.29 +	int32_t dststride = width + (info->d_skip >> 2);
    1.30 +	uint32_t *srcp = (uint32_t *)info->s_pixels;
    1.31 +	int32_t srcstride = width + (info->s_skip >> 2);
    1.32 +
    1.33 +	Blit_BGR888_RGB888ARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
    1.34 +}
    1.35  #endif
    1.36  
    1.37  /* This is now endian dependent */
    1.38 @@ -2435,6 +2452,10 @@
    1.39      { 0x00000000,0x00000000,0x00000000, 2, 0x0000F800,0x000007E0,0x0000001F,
    1.40        BLIT_FEATURE_HAS_ALTIVEC, NULL, Blit_RGB888_RGB565Altivec, NO_ALPHA },
    1.41  #endif
    1.42 +#if SDL_ARM_SIMD_BLITTERS
    1.43 +    { 0x000000FF,0x0000FF00,0x00FF0000, 4, 0x00FF0000,0x0000FF00,0x000000FF,
    1.44 +      BLIT_FEATURE_HAS_ARM_SIMD, NULL, Blit_BGR888_RGB888ARMSIMD, NO_ALPHA | COPY_ALPHA },
    1.45 +#endif
    1.46      { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x0000F800,0x000007E0,0x0000001F,
    1.47        0, NULL, Blit_RGB888_RGB565, NO_ALPHA },
    1.48      { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x00007C00,0x000003E0,0x0000001F,
     2.1 --- a/src/video/arm/pixman-arm-simd-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.2 +++ b/src/video/arm/pixman-arm-simd-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.3 @@ -363,3 +363,45 @@
     2.4      nop_macro, /* cleanup */ \
     2.5      ARGBto565PixelAlpha_process_head, \
     2.6      ARGBto565PixelAlpha_process_tail
     2.7 +
     2.8 + /******************************************************************************/
     2.9 +
    2.10 +.macro BGR888toRGB888_1pixel cond, reg, tmp
    2.11 +        uxtb16&cond  tmp, WK&reg, ror #8
    2.12 +        uxtb16&cond  WK&reg, WK&reg, ror #16
    2.13 +        orr&cond     WK&reg, WK&reg, tmp, lsl #8
    2.14 +.endm
    2.15 +
    2.16 +.macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2
    2.17 +        uxtb16&cond  tmp1, WK&reg1, ror #8
    2.18 +        uxtb16&cond  WK&reg1, WK&reg1, ror #16
    2.19 +        uxtb16&cond  tmp2, WK&reg2, ror #8
    2.20 +        uxtb16&cond  WK&reg2, WK&reg2, ror #16
    2.21 +        orr&cond     WK&reg1, WK&reg1, tmp1, lsl #8
    2.22 +        orr&cond     WK&reg2, WK&reg2, tmp2, lsl #8
    2.23 +.endm
    2.24 +
    2.25 +.macro BGR888toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    2.26 +        pixld   cond, numbytes, firstreg, SRC, unaligned_src
    2.27 +.endm
    2.28 +
    2.29 +.macro BGR888toRGB888_process_tail  cond, numbytes, firstreg
    2.30 + .if numbytes >= 8
    2.31 +        BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M
    2.32 +  .if numbytes == 16
    2.33 +        BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M
    2.34 +  .endif
    2.35 + .else @ numbytes == 4
    2.36 +        BGR888toRGB888_1pixel cond, %(firstreg+0), MASK
    2.37 + .endif
    2.38 +.endm
    2.39 +
    2.40 +generate_composite_function \
    2.41 +    Blit_BGR888_RGB888ARMSIMDAsm, 32, 0, 32, \
    2.42 +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2.43 +    2, /* prefetch distance */ \
    2.44 +    nop_macro, /* init */ \
    2.45 +    nop_macro, /* newline */ \
    2.46 +    nop_macro, /* cleanup */ \
    2.47 +    BGR888toRGB888_process_head, \
    2.48 +    BGR888toRGB888_process_tail