ARM: NEON assembly optimization for function BlitARGBto565PixelAlpha SDL-1.2
author Ben Avison <bavison@riscosopen.org>
Thu, 31 Oct 2019 14:00:28 +0300
branch SDL-1.2
changeset 13221 3705e81df6ff
parent 13220 0ae1ddca5e85
child 13222 c8562ecca3c9
ARM: NEON assembly optimization for function BlitARGBto565PixelAlpha
---
src/video/SDL_blit_A.c | 31 ++++++++--
src/video/arm/pixman-arm-neon-asm.S | 88 +++++++++++++++++++++++++++++
2 files changed, 114 insertions(+), 5 deletions(-)
src/video/SDL_blit_A.c
src/video/arm/pixman-arm-neon-asm.S
     1.1 --- a/src/video/SDL_blit_A.c	Thu Oct 31 14:00:28 2019 +0300
     1.2 +++ b/src/video/SDL_blit_A.c	Thu Oct 31 14:00:28 2019 +0300
     1.3 @@ -1464,6 +1464,20 @@
     1.4  #endif
     1.5  
     1.6  #if SDL_ARM_NEON_BLITTERS
     1.7 +void BlitARGBto565PixelAlphaARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride); /* NEON routine emitted by generate_composite_function in src/video/arm/pixman-arm-neon-asm.S */
     1.8 +
     1.9 +static void BlitARGBto565PixelAlphaARMNEON(SDL_BlitInfo *info) /* Adapter: unpacks an SDL_BlitInfo and forwards it to the NEON assembly blitter. */
    1.10 +{
    1.11 +    int32_t width = info->d_width;
    1.12 +    int32_t height = info->d_height;
    1.13 +    uint16_t *dstp = (uint16_t *)info->d_pixels;
    1.14 +    int32_t dststride = width + (info->d_skip >> 1); /* stride in uint16_t units; >> 1 presumably converts a byte skip to 16-bit pixels - TODO confirm d_skip is in bytes */
    1.15 +    uint32_t *srcp = (uint32_t *)info->s_pixels;
    1.16 +    int32_t srcstride = width + (info->s_skip >> 2); /* stride in uint32_t units; >> 2 presumably converts a byte skip to 32-bit pixels - TODO confirm s_skip is in bytes */
    1.17 +
    1.18 +    BlitARGBto565PixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride); /* width/height are taken from the destination fields only */
    1.19 +}
    1.20 +
    1.21  void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
    1.22  
    1.23  static void BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo *info)
    1.24 @@ -2862,14 +2876,21 @@
    1.25              return Blit32to565PixelAlphaAltivec;
    1.26          else
    1.27  #endif
    1.28 -#if SDL_ARM_SIMD_BLITTERS
    1.29 +#if SDL_ARM_NEON_BLITTERS || SDL_ARM_SIMD_BLITTERS
    1.30  		if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
    1.31  		   && sf->Gmask == 0xff00 && df->Gmask == 0x7e0
    1.32  		   && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
    1.33 -		   || (sf->Bmask == 0xff && df->Bmask == 0x1f))
    1.34 -		   && SDL_HasARMSIMD())
    1.35 -			return BlitARGBto565PixelAlphaARMSIMD;
    1.36 -		else
    1.37 +		   || (sf->Bmask == 0xff && df->Bmask == 0x1f)))
    1.38 +		{
    1.39 +#if SDL_ARM_NEON_BLITTERS
    1.40 +		    if(SDL_HasARMNEON())
    1.41 +		        return BlitARGBto565PixelAlphaARMNEON;
    1.42 +#endif
    1.43 +#if SDL_ARM_SIMD_BLITTERS
    1.44 +		    if(SDL_HasARMSIMD())
    1.45 +		        return BlitARGBto565PixelAlphaARMSIMD;
    1.46 +#endif
    1.47 +		}
    1.48  #endif
    1.49  	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
    1.50  	       && sf->Gmask == 0xff00
     2.1 --- a/src/video/arm/pixman-arm-neon-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.2 +++ b/src/video/arm/pixman-arm-neon-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.3 @@ -100,3 +100,91 @@
     2.4      RGBtoRGBPixelAlpha_process_pixblock_head, \
     2.5      RGBtoRGBPixelAlpha_process_pixblock_tail, \
     2.6      RGBtoRGBPixelAlpha_process_pixblock_tail_head
     2.7 +
     2.8 + /******************************************************************************/
     2.9 +
    2.10 +.macro ARGBto565PixelAlpha_process_pixblock_head /* Starts the blend for one block: forms src*alpha + dst*(~alpha) per channel. Inputs: d0-d3 = source byte planes, q2 = 16-bit destination pixels. Outputs: q13, q14, q15. */
    2.11 +    vmvn        d6, d3 /* d6 = bitwise NOT of source byte plane 3 (presumably alpha, given the Amask == 0xff000000 check in the C dispatcher) */
    2.12 +    vshr.u8     d1, #2 /* source byte plane 1 reduced to 6 bits (presumably green, per the Gmask == 0xff00 check) */
    2.13 +    vshr.u8     d3, #3 /* alpha reduced to 5 bits */
    2.14 +    vshr.u8     d0, #3 /* source byte plane 0 reduced to 5 bits */
    2.15 +    vshrn.u16   d7, q2, #3 /* d7 = destination bits 3..10, narrowed to bytes */
    2.16 +    vshrn.u16   d25, q2, #8 /* d25 = destination bits 8..15, narrowed to bytes */
    2.17 +    vbic.i16    q2, #0xe0 /* clear bits 5..7 so each low byte keeps only destination bits 0..4 */
    2.18 +    vshr.u8     d6, #3 /* inverted alpha reduced to 5 bits */
    2.19 +    vshr.u8     d7, #2 /* d7 = destination bits 5..10 (the 6-bit middle field) */
    2.20 +    vshr.u8     d2, #3 /* source byte plane 2 reduced to 5 bits */
    2.21 +    vmovn.u16   d24, q2 /* d24 = destination bits 0..4 (low 5-bit field) */
    2.22 +    vshr.u8     d25, #3 /* d25 = destination bits 11..15 (top 5-bit field) */
    2.23 +    vmull.u8    q13, d1, d3 /* q13 = src plane 1 * alpha ... */
    2.24 +    vmlal.u8    q13, d7, d6 /* ... + dst middle field * inverted alpha */
    2.25 +    vmull.u8    q14, d0, d3 /* q14 = src plane 0 * alpha ... */
    2.26 +    vmlal.u8    q14, d24, d6 /* ... + dst low field * inverted alpha */
    2.27 +    vmull.u8    q15, d2, d3 /* q15 = src plane 2 * alpha ... */
    2.28 +    vmlal.u8    q15, d25, d6 /* ... + dst top field * inverted alpha */
    2.29 +.endm
    2.30 +
    2.31 +.macro ARGBto565PixelAlpha_process_pixblock_tail /* Finishes the blend: scales the 16-bit sums in q13-q15 back down and packs them into 16-bit pixels in q14. */
    2.32 +    vsra.u16    q13, #5 /* x += x >> 5; with the rounding shift below this appears to approximate x / 31 (5-bit alpha scale) */
    2.33 +    vsra.u16    q14, #5
    2.34 +    vsra.u16    q15, #5
    2.35 +    vrshr.u16   q13, #5 /* rounding shift right by 5 */
    2.36 +    vrshr.u16   q14, #5
    2.37 +    vrshr.u16   q15, #5
    2.38 +    vsli.u16    q14, q13, #5 /* insert the middle field at bit 5, keeping q14's low 5 bits */
    2.39 +    vsli.u16    q14, q15, #11 /* insert the top field at bit 11, keeping q14's low 11 bits */
    2.40 +.endm
    2.41 +
    2.42 +.macro ARGBto565PixelAlpha_process_pixblock_tail_head /* Same instructions as the tail macro followed by the head macro, interleaved with the next block's loads, the store of the finished block, and PF-prefixed lines (presumably prefetch bookkeeping; PF is defined outside this diff). */
    2.43 +    vld4.8      {d0-d3}, [SRC]! /* load the next block's source bytes, deinterleaved into four byte planes */
    2.44 +                                    PF add PF_X, PF_X, #8
    2.45 +        vsra.u16    q13, #5 /* tail work for the previous block starts here (further-indented lines) */
    2.46 +                                    PF tst PF_CTL, #0xF
    2.47 +        vsra.u16    q14, #5
    2.48 +                                    PF addne PF_X, PF_X, #8
    2.49 +        vsra.u16    q15, #5
    2.50 +                                    PF subne PF_CTL, PF_CTL, #1
    2.51 +        vrshr.u16   q13, #5
    2.52 +                                    PF cmp PF_X, ORIG_W
    2.53 +        vrshr.u16   q14, #5
    2.54 +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    2.55 +        vrshr.u16   q15, #5
    2.56 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    2.57 +    vld1.8      {d4-d5}, [DST_R]! /* load the next block's destination pixels into q2 */
    2.58 +                                    PF subge PF_X, PF_X, ORIG_W
    2.59 +        vsli.u16    q14, q13, #5
    2.60 +                                    PF subges PF_CTL, PF_CTL, #0x10
    2.61 +        vsli.u16    q14, q15, #11
    2.62 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    2.63 +        vst1.8      {q14}, [DST_W :128]! /* store the finished block; must precede the head code below, which overwrites q14 */
    2.64 +    vmvn        d6, d3 /* head work for the next block starts here */
    2.65 +    vshr.u8     d1, #2
    2.66 +    vshr.u8     d3, #3
    2.67 +    vshr.u8     d0, #3
    2.68 +    vshrn.u16   d7, q2, #3
    2.69 +    vshrn.u16   d25, q2, #8
    2.70 +    vbic.i16    q2, #0xe0
    2.71 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    2.72 +    vshr.u8     d6, #3
    2.73 +    vshr.u8     d7, #2
    2.74 +    vshr.u8     d2, #3
    2.75 +    vmovn.u16   d24, q2
    2.76 +    vshr.u8     d25, #3
    2.77 +    vmull.u8    q13, d1, d3
    2.78 +    vmlal.u8    q13, d7, d6
    2.79 +    vmull.u8    q14, d0, d3
    2.80 +    vmlal.u8    q14, d24, d6
    2.81 +    vmull.u8    q15, d2, d3
    2.82 +    vmlal.u8    q15, d25, d6
    2.83 +.endm
    2.84 +
    2.85 +generate_composite_function \
    2.86 +    BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, /* symbol called from SDL_blit_A.c; the numbers are presumably src/mask/dst bits per pixel - confirm against the macro definition */ \
    2.87 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    2.88 +    8, /* number of pixels, processed in a single block */ \
    2.89 +    6, /* prefetch distance */ \
    2.90 +    default_init, \
    2.91 +    default_cleanup, \
    2.92 +    ARGBto565PixelAlpha_process_pixblock_head, \
    2.93 +    ARGBto565PixelAlpha_process_pixblock_tail, \
    2.94 +    ARGBto565PixelAlpha_process_pixblock_tail_head