ARM: SIMD optimization for 4:4:4:4 to 8:8:8:8 normal blits SDL-1.2
author Ben Avison <bavison@riscosopen.org>
Thu, 31 Oct 2019 14:00:28 +0300
branch SDL-1.2
changeset 13218 8a6c0f0319d4
parent 13217 d5d5686855d3
child 13219 4f88e197acad
ARM: SIMD optimization for 4:4:4:4 to 8:8:8:8 normal blits
---
src/video/SDL_blit_N.c | 18 +++++++++
src/video/arm/pixman-arm-simd-asm.S | 57 +++++++++++++++++++++++++++++
2 files changed, 75 insertions(+)
src/video/SDL_blit_N.c
src/video/arm/pixman-arm-simd-asm.S
     1.1 --- a/src/video/SDL_blit_N.c	Thu Oct 31 14:00:28 2019 +0300
     1.2 +++ b/src/video/SDL_blit_N.c	Thu Oct 31 14:00:28 2019 +0300
     1.3 @@ -911,6 +911,20 @@
     1.4  
     1.5  	Blit_BGR888_RGB888ARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
     1.6  }
     1.7 +
     1.8 +void Blit_RGB444_RGB888ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint16_t *src, int32_t src_stride);
     1.9 +/* Unpack an SDL_BlitInfo into the flat argument list taken by Blit_RGB444_RGB888ARMSIMDAsm and call it. */
    1.10 +static void Blit_RGB444_RGB888ARMSIMD(SDL_BlitInfo *info)
    1.11 +{
    1.12 +	int32_t width = info->d_width;
    1.13 +	int32_t height = info->d_height;
    1.14 +	uint32_t *dstp = (uint32_t *)info->d_pixels;
    1.15 +	int32_t dststride = width + (info->d_skip >> 2);	/* width + d_skip/4; presumably d_skip is in bytes, making this a stride in 32-bit pixels - confirm */
    1.16 +	uint16_t *srcp = (uint16_t *)info->s_pixels;
    1.17 +	int32_t srcstride = width + (info->s_skip >> 1);	/* width + s_skip/2; presumably s_skip is in bytes, making this a stride in 16-bit pixels - confirm */
    1.18 +
    1.19 +	Blit_RGB444_RGB888ARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
    1.20 +}
    1.21  #endif
    1.22  
    1.23  /* This is now endian dependent */
    1.24 @@ -2394,6 +2408,10 @@
    1.25      { 0x00007C00,0x000003E0,0x0000001F, 4, 0x00000000,0x00000000,0x00000000,
    1.26        BLIT_FEATURE_HAS_ALTIVEC, NULL, Blit_RGB555_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
    1.27  #endif
    1.28 +#if SDL_ARM_SIMD_BLITTERS
    1.29 +    { 0x00000F00,0x000000F0,0x0000000F, 4, 0x00FF0000,0x0000FF00,0x000000FF,
    1.30 +      BLIT_FEATURE_HAS_ARM_SIMD, NULL, Blit_RGB444_RGB888ARMSIMD, NO_ALPHA | COPY_ALPHA },
    1.31 +#endif
    1.32      { 0x0000F800,0x000007E0,0x0000001F, 4, 0x00FF0000,0x0000FF00,0x000000FF,
    1.33        0, NULL, Blit_RGB565_ARGB8888, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
    1.34      { 0x0000F800,0x000007E0,0x0000001F, 4, 0x000000FF,0x0000FF00,0x00FF0000,
     2.1 --- a/src/video/arm/pixman-arm-simd-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.2 +++ b/src/video/arm/pixman-arm-simd-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.3 @@ -473,3 +473,60 @@
     2.4      nop_macro, /* cleanup */ \
     2.5      BGR888toRGB888_process_head, \
     2.6      BGR888toRGB888_process_tail
     2.7 +
     2.8 +/******************************************************************************/
     2.9 +
    2.10 +.macro RGB444toRGB888_init
    2.11 +        ldr     MASK, =0x0f0f0f0f                    /* ANDing with this keeps the low nibble of every byte */
    2.12 +        /* Set GE[3:0] to 0101: SEL then takes bytes 0 and 2 from its first source operand and bytes 1 and 3 from its second */
    2.13 +        msr     CPSR_s, #0x50000
    2.14 +.endm
    2.15 +/* Expand the 4:4:4:4 pixel in the low halfword of WK<reg> to 8:8:8:8 in place; mask is the per-byte low-nibble mask, tmp is scratch. Relies on GE[3:0] = 0101 for SEL. */
    2.16 +.macro RGB444toRGB888_1pixel reg, mask, tmp
    2.17 +        pkhbt   WK&reg, WK&reg, WK&reg, lsl #12      @ 0000aaaarrrrggggaaaarrrrggggbbbb
    2.18 +        and     WK&reg, mask, WK&reg                 @ 0000aaaa0000gggg0000rrrr0000bbbb
    2.19 +        orr     WK&reg, WK&reg, WK&reg, lsl #4       @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
    2.20 +        pkhtb   tmp, WK&reg, WK&reg, asr #8          @ aaaaaaaaggggggggggggggggrrrrrrrr
    2.21 +        pkhbt   WK&reg, WK&reg, WK&reg, lsl #8       @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
    2.22 +        sel     WK&reg, WK&reg, tmp                  @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
    2.23 +.endm
    2.24 +/* Expand the two 4:4:4:4 pixels packed in WK<in> to 8:8:8:8: low halfword -> WK<out1>, high halfword -> WK<out2>. WK<in> is only read by the first two instructions, so out1 may be the same register as in. tmp1/tmp2 are scratch. */
    2.25 +.macro RGB444toRGB888_2pixels in, out1, out2, mask, tmp1, tmp2
    2.26 +        and     tmp1, mask, WK&in                    @ 0000RRRR0000BBBB0000rrrr0000bbbb
    2.27 +        and     tmp2, mask, WK&in, lsr #4            @ 0000AAAA0000GGGG0000aaaa0000gggg
    2.28 +        orr     tmp1, tmp1, tmp1, lsl #4             @ RRRRRRRRBBBBBBBBrrrrrrrrbbbbbbbb
    2.29 +        orr     tmp2, tmp2, tmp2, lsl #4             @ AAAAAAAAGGGGGGGGaaaaaaaagggggggg
    2.30 +        pkhtb   WK&out2, tmp2, tmp1, asr #16         @ AAAAAAAAGGGGGGGGRRRRRRRRBBBBBBBB
    2.31 +        pkhbt   WK&out1, tmp1, tmp2, lsl #16         @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
    2.32 +        pkhtb   tmp2, WK&out2, WK&out2, asr #8       @ AAAAAAAAGGGGGGGGGGGGGGGGRRRRRRRR
    2.33 +        pkhtb   tmp1, WK&out1, WK&out1, asr #8       @ aaaaaaaaggggggggggggggggrrrrrrrr
    2.34 +        pkhbt   WK&out1, WK&out1, WK&out1, lsl #8    @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
    2.35 +        pkhbt   WK&out2, WK&out2, WK&out2, lsl #8    @ GGGGGGGGRRRRRRRRRRRRRRRRBBBBBBBB
    2.36 +        sel     WK&out1, WK&out1, tmp1               @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
    2.37 +        sel     WK&out2, WK&out2, tmp2               @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB
    2.38 +.endm
    2.39 +/* Head stage: hands numbytes/2 to pixld with SRC; presumably this loads the 16-bit source pixels for numbytes of 32-bit output - pixld is defined elsewhere, confirm there. */
    2.40 +.macro RGB444toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
    2.41 +        pixld   cond, numbytes/2, firstreg, SRC, unaligned_src
    2.42 +.endm
    2.43 +/* Tail stage: expand the loaded pixels in registers (numbytes 16/8/4 -> 4/2/1 pixels). For 16, firstreg+1 is expanded first because the second expansion overwrites it. STRIDE_M and SCRATCH serve as temporaries. */
    2.44 +.macro RGB444toRGB888_process_tail  cond, numbytes, firstreg
    2.45 + .if numbytes >= 8
    2.46 +  .if numbytes == 16
    2.47 +        RGB444toRGB888_2pixels %(firstreg+1), %(firstreg+2), %(firstreg+3), MASK, STRIDE_M, SCRATCH
    2.48 +  .endif
    2.49 +        RGB444toRGB888_2pixels %(firstreg+0), %(firstreg+0), %(firstreg+1), MASK, STRIDE_M, SCRATCH
    2.50 + .else @ numbytes == 4
    2.51 +        RGB444toRGB888_1pixel %(firstreg+0), MASK, SCRATCH
    2.52 + .endif
    2.53 +.endm
    2.54 +/* Instantiate Blit_RGB444_RGB888ARMSIMDAsm from the RGB444toRGB888_* macros; 16, 0, 32 are presumably source/mask/destination bits per pixel - confirm against generate_composite_function. */
    2.55 +generate_composite_function \
    2.56 +    Blit_RGB444_RGB888ARMSIMDAsm, 16, 0, 32, \
    2.57 +    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    2.58 +    2, /* prefetch distance */ \
    2.59 +    RGB444toRGB888_init, \
    2.60 +    nop_macro, /* newline */ \
    2.61 +    nop_macro, /* cleanup */ \
    2.62 +    RGB444toRGB888_process_head, \
    2.63 +    RGB444toRGB888_process_tail