ARM: SIMD assembly optimization for function BlitARGBto565PixelAlpha SDL-1.2
author Ben Avison <bavison@riscosopen.org>
Thu, 31 Oct 2019 14:00:28 +0300
branch SDL-1.2
changeset 13214 78d5c2b67346
parent 13213 db77604e083d
child 13215 84aa936d90e4
ARM: SIMD assembly optimization for function BlitARGBto565PixelAlpha
---
src/video/SDL_blit_A.c | 23 ++++
src/video/arm/pixman-arm-simd-asm.S | 197 ++++++++++++++++++++++++++++
2 files changed, 220 insertions(+)
src/video/SDL_blit_A.c
src/video/arm/pixman-arm-simd-asm.S
     1.1 --- a/src/video/SDL_blit_A.c	Thu Oct 31 14:00:28 2019 +0300
     1.2 +++ b/src/video/SDL_blit_A.c	Thu Oct 31 14:00:28 2019 +0300
     1.3 @@ -1434,6 +1434,20 @@
     1.4  #endif /* SDL_ALTIVEC_BLITTERS */
     1.5  
     1.6  #if SDL_ARM_SIMD_BLITTERS
     1.7 +void BlitARGBto565PixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
     1.8 +
     1.9 +static void BlitARGBto565PixelAlphaARMSIMD(SDL_BlitInfo *info)
    1.10 +{
    1.11 +	int32_t width = info->d_width;
    1.12 +	int32_t height = info->d_height;
    1.13 +	uint16_t *dstp = (uint16_t *)info->d_pixels;
    1.14 +	int32_t dststride = width + (info->d_skip >> 1);
    1.15 +	uint32_t *srcp = (uint32_t *)info->s_pixels;
    1.16 +	int32_t srcstride = width + (info->s_skip >> 2);
    1.17 +
    1.18 +	BlitARGBto565PixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
    1.19 +}
    1.20 +
    1.21  void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
    1.22  
    1.23  static void BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo *info)
    1.24 @@ -2832,6 +2846,15 @@
    1.25              return Blit32to565PixelAlphaAltivec;
    1.26          else
    1.27  #endif
    1.28 +#if SDL_ARM_SIMD_BLITTERS
    1.29 +		if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
    1.30 +		   && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
    1.31 +		   && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
    1.32 +		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))
    1.33 +		   && SDL_HasARMSIMD())
    1.34 +			return BlitARGBto565PixelAlphaARMSIMD;
    1.35 +		else
    1.36 +#endif
    1.37  	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
    1.38  	       && sf->Gmask == 0xff00
    1.39  	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
     2.1 --- a/src/video/arm/pixman-arm-simd-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.2 +++ b/src/video/arm/pixman-arm-simd-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.3 @@ -166,3 +166,200 @@
     2.4      RGBtoRGBPixelAlpha_process_tail
     2.5  
     2.6  /******************************************************************************/
     2.7 +
     2.8 +.macro ARGBto565PixelAlpha_init
     2.9 +        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
    2.10 +        mov     MASK, #0x001f
    2.11 +        mov     STRIDE_M, #0x0010
    2.12 +        orr     MASK, MASK, MASK, lsl #16
    2.13 +        orr     STRIDE_M, STRIDE_M, STRIDE_M, lsl #16
    2.14 +.endm
    2.15 +
    2.16 +.macro ARGBto565PixelAlpha_newline
    2.17 +        mov     STRIDE_S, #0x0200
    2.18 +.endm
    2.19 +
    2.20 +/* On entry:
    2.21 + * s holds 1 32bpp source pixel
    2.22 + * d holds 1 16bpp destination pixel
    2.23 + * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
    2.24 + * other registers are temporaries
    2.25 + * On exit:
    2.26 + * Constant registers preserved
    2.27 + */
    2.28 +
    2.29 +.macro ARGBto565PixelAlpha_1pixel_translucent  s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
    2.30 +        mov     alpha, s, lsr #27
    2.31 +        and     misc, s, #0xfc00
    2.32 +        and     g, d, #0x07e0
    2.33 +        pkhbt   rb, d, d, lsl #5
    2.34 +        rsb     misc, g, misc, lsr #5
    2.35 +        and     s, rbmask, s, lsr #3
    2.36 +        and     rb, rbmask, rb
    2.37 +        sub     s, s, rb
    2.38 +        smlabb  misc, misc, alpha, ghalf
    2.39 +        mla     s, s, alpha, rbhalf
    2.40 +        add     misc, misc, misc, lsl #5
    2.41 +        add     g, g, misc, asr #10
    2.42 +        add     s, s, s, lsl #5
    2.43 +        and     g, g, #0x07e0
    2.44 +        add     rb, rb, s, asr #10
    2.45 +        and     rb, rb, rbmask
    2.46 +        pkhbt   rb, rb, rb, lsl #11
    2.47 +        orr     d, rb, g
    2.48 +        orr     d, d, rb, lsr #16
    2.49 +.endm
    2.50 +
    2.51 +/* On entry:
    2.52 + * s holds 1 32bpp source pixel
    2.53 + * d holds 1 16bpp destination pixel
    2.54 + * rbmask holds 0x001f001f
    2.55 + * On exit:
    2.56 + * Constant registers preserved
    2.57 + */
    2.58 +
    2.59 +.macro ARGBto565PixelAlpha_1pixel_opaque  s, d, rbmask
    2.60 +        and     d, rbmask, s, lsr #3
    2.61 +        and     s, s, #0xfc00
    2.62 +        orr     d, d, d, lsr #5
    2.63 +        orr     d, d, s, lsr #5
    2.64 +.endm
    2.65 +
    2.66 +/* On entry:
    2.67 + * s1, s2 hold 2 32bpp source pixels
    2.68 + * d holds 2 16bpp destination pixels
    2.69 + * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
    2.70 + * other registers are temporaries
    2.71 + * On exit:
    2.72 + * Constant registers preserved
    2.73 + * Blended results have been written through destination pointer
    2.74 + */
    2.75 +
    2.76 +.macro ARGBto565PixelAlpha_2pixels_translucent  s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
    2.77 +        mov     alpha, s1, lsr #27
    2.78 +        and     misc, s1, #0xfc00
    2.79 +        and     g, d, #0x07e0
    2.80 +        pkhbt   rb, d, d, lsl #5
    2.81 +        rsb     misc, g, misc, lsr #5
    2.82 +        and     s1, rbmask, s1, lsr #3
    2.83 +        and     rb, rbmask, rb
    2.84 +        sub     s1, s1, rb
    2.85 +        smlabb  misc, misc, alpha, ghalf
    2.86 +        mla     s1, s1, alpha, rbhalf
    2.87 +          uxth    d, d, ror #16
    2.88 +        add     misc, misc, misc, lsl #5
    2.89 +          mov     alpha, s2, lsr #27
    2.90 +        add     g, g, misc, asr #10
    2.91 +        add     s1, s1, s1, lsl #5
    2.92 +        and     g, g, #0x07e0
    2.93 +        add     rb, rb, s1, asr #10
    2.94 +        and     rb, rb, rbmask
    2.95 +          and     misc, s2, #0xfc00
    2.96 +        pkhbt   rb, rb, rb, lsl #11
    2.97 +          and     s1, d, #0x07e0
    2.98 +          pkhbt   d, d, d, lsl #5
    2.99 +          rsb     misc, s1, misc, lsr #5
   2.100 +          and     s2, rbmask, s2, lsr #3
   2.101 +          and     d, rbmask, d
   2.102 +          sub     s2, s2, d
   2.103 +          smlabb  misc, misc, alpha, ghalf
   2.104 +          mla     s2, s2, alpha, rbhalf
   2.105 +        orr     alpha, rb, g
   2.106 +          add     misc, misc, misc, lsl #5
   2.107 +        orr     alpha, alpha, rb, lsr #16
   2.108 +          add     s1, s1, misc, asr #10
   2.109 +          add     s2, s2, s2, lsl #5
   2.110 +          and     s1, s1, #0x07e0
   2.111 +          add     d, d, s2, asr #10
   2.112 +          and     d, d, rbmask
   2.113 +        strh    alpha, [DST, #-4]
   2.114 +          pkhbt   d, d, d, lsl #11
   2.115 +          orr     alpha, d, s1
   2.116 +          orr     alpha, alpha, d, lsr #16
   2.117 +          strh    alpha, [DST, #-2]
   2.118 +.endm
   2.119 +
   2.120 +/* On entry:
   2.121 + * s1, s2 hold 2 32bpp source pixels
   2.122 + * rbmask holds 0x001f001f
   2.123 + * other registers are temporaries
   2.124 + * On exit:
   2.125 + * Constant registers preserved
   2.126 + * Blended results have been written through destination pointer
   2.127 + */
   2.128 +
   2.129 +.macro ARGBto565PixelAlpha_2pixels_opaque  s1, s2, d, rbmask, g
   2.130 +        and     g, s1, #0xfc00
   2.131 +        and     d, rbmask, s1, lsr #3
   2.132 +          and     s1, rbmask, s2, lsr #3
   2.133 +        orr     d, d, d, lsr #5
   2.134 +        orr     d, d, g, lsr #5
   2.135 +          and     g, s2, #0xfc00
   2.136 +        strh    d, [DST, #-4]
   2.137 +          orr     s1, s1, s1, lsr #5
   2.138 +          orr     s1, s1, g, lsr #5
   2.139 +          strh    s1, [DST, #-2]
   2.140 +.endm
   2.141 +
   2.142 +.macro ARGBto565PixelAlpha_2pixels_head
   2.143 +        ldrd    WK0, WK1, [SRC], #8
   2.144 +        ldr     WK2, [DST], #4
   2.145 +        orr     SCRATCH, WK0, WK1
   2.146 +        and     ORIG_W, WK0, WK1
   2.147 +        tst     SCRATCH, #0xff000000
   2.148 +.endm
   2.149 +
   2.150 +.macro ARGBto565PixelAlpha_2pixels_tail
   2.151 +        beq     20f @ all transparent
   2.152 +        cmp     ORIG_W, #0xff000000
   2.153 +        bhs     10f @ all opaque
   2.154 +        ARGBto565PixelAlpha_2pixels_translucent  WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
   2.155 +        b       20f
   2.156 +10:     ARGBto565PixelAlpha_2pixels_opaque  WK0, WK1, WK2, MASK, SCRATCH
   2.157 +20:
   2.158 +.endm
   2.159 +
   2.160 +.macro ARGBto565PixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
   2.161 + .if numbytes == 16
   2.162 +        ARGBto565PixelAlpha_2pixels_head
   2.163 +        ARGBto565PixelAlpha_2pixels_tail
   2.164 +        ARGBto565PixelAlpha_2pixels_head
   2.165 +        ARGBto565PixelAlpha_2pixels_tail
   2.166 + .endif
   2.167 + .if numbytes >= 8
   2.168 +        ARGBto565PixelAlpha_2pixels_head
   2.169 +        ARGBto565PixelAlpha_2pixels_tail
   2.170 + .endif
   2.171 + .if numbytes >= 4
   2.172 +        ARGBto565PixelAlpha_2pixels_head
   2.173 + .else // numbytes == 2
   2.174 +        ldr     WK0, [SRC], #4
   2.175 +        ldrh    WK2, [DST], #2
   2.176 +        tst     WK0, #0xff000000
   2.177 + .endif
   2.178 +.endm
   2.179 +
   2.180 +.macro ARGBto565PixelAlpha_process_tail  cond, numbytes, firstreg
   2.181 + .if numbytes >= 4
   2.182 +        ARGBto565PixelAlpha_2pixels_tail
   2.183 + .else // numbytes == 2
   2.184 +        beq     20f @ all transparent
   2.185 +        cmp     WK0, #0xff000000
   2.186 +        bhs     10f @ opaque
   2.187 +        ARGBto565PixelAlpha_1pixel_translucent  WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
   2.188 +        b       19f
   2.189 +10:     ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK
   2.190 +19:     strh    WK2, [DST, #-2]
   2.191 +20:
   2.192 + .endif
   2.193 +.endm
   2.194 +
   2.195 +generate_composite_function \
   2.196 +    BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \
   2.197 +    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
   2.198 +    2, /* prefetch distance */ \
   2.199 +    ARGBto565PixelAlpha_init, \
   2.200 +    ARGBto565PixelAlpha_newline, \
   2.201 +    nop_macro, /* cleanup */ \
   2.202 +    ARGBto565PixelAlpha_process_head, \
   2.203 +    ARGBto565PixelAlpha_process_tail