ARM: NEON assembly optimization for SDL_FillRect SDL-1.2
author Ben Avison <bavison@riscosopen.org>
Thu, 31 Oct 2019 14:00:28 +0300
branch SDL-1.2
changeset 13222 c8562ecca3c9
parent 13221 3705e81df6ff
child 13223 4ceb979e228a
ARM: NEON assembly optimization for SDL_FillRect
---
src/video/SDL_surface.c | 21 +++++
src/video/arm/pixman-arm-neon-asm.S | 128 ++++++++++++++++++++++++++++
2 files changed, 149 insertions(+)
src/video/SDL_surface.c
src/video/arm/pixman-arm-neon-asm.S
     1.1 --- a/src/video/SDL_surface.c	Thu Oct 31 14:00:28 2019 +0300
     1.2 +++ b/src/video/SDL_surface.c	Thu Oct 31 14:00:28 2019 +0300
     1.3 @@ -603,6 +603,27 @@
     1.4  	}
     1.5  	row = (Uint8 *)dst->pixels+dstrect->y*dst->pitch+
     1.6  			dstrect->x*dst->format->BytesPerPixel;
     1.7 +#if SDL_ARM_NEON_BLITTERS
     1.8 +    if (SDL_HasARMNEON() && dst->format->BytesPerPixel != 3) {
     1.9 +        void FillRect8ARMNEONAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
    1.10 +        void FillRect16ARMNEONAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
    1.11 +        void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
    1.12 +        switch (dst->format->BytesPerPixel) {
    1.13 +        case 1:
    1.14 +            FillRect8ARMNEONAsm(dstrect->w, dstrect->h, (uint8_t *) row, dst->pitch >> 0, color);
    1.15 +            break;
    1.16 +        case 2:
    1.17 +            FillRect16ARMNEONAsm(dstrect->w, dstrect->h, (uint16_t *) row, dst->pitch >> 1, color);
    1.18 +            break;
    1.19 +        case 4:
    1.20 +            FillRect32ARMNEONAsm(dstrect->w, dstrect->h, (uint32_t *) row, dst->pitch >> 2, color);
    1.21 +            break;
    1.22 +        }
    1.23 +
    1.24 +        SDL_UnlockSurface(dst);
    1.25 +        return(0);
    1.26 +    }
    1.27 +#endif
    1.28  #if SDL_ARM_SIMD_BLITTERS
    1.29  	if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) {
    1.30  		void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
     2.1 --- a/src/video/arm/pixman-arm-neon-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.2 +++ b/src/video/arm/pixman-arm-neon-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.3 @@ -38,6 +38,134 @@
     2.4  
     2.5  /******************************************************************************/
     2.6  
     2.7 +/* We can actually do significantly better than the Pixman macros, at least for
     2.8 + * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
     2.9 + * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
    2.10 + */
    2.11 +
    2.12 +.macro generate_fillrect_function name, bpp, log2Bpp
    2.13 +/* Expands to a function that fills a w x h rectangle of \bpp-bit pixels with one value.
    2.14 + * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
    2.15 + * On entry:
    2.16 + * a1 = width, pixels
    2.17 + * a2 = height, rows
    2.18 + * a3 = pointer to top-left destination pixel
    2.19 + * a4 = stride, pixels (rewritten on entry to stride minus width, the end-of-row skip)
    2.20 + * [sp] = pixel value to fill with
    2.21 + * Within the function:
    2.22 + * v1 = width remaining
    2.23 + * v2 = vst offset
    2.24 + * v3 = alternate pointer
    2.25 + * ip = data ARM register
    2.26 + * NOTE(review): name/bpp are used below without a backslash; presumably relies on .altmacro being set earlier in the file -- TODO confirm */
    2.27 +pixman_asm_function name
    2.28 +    vld1.\bpp   {d0[],d1[]}, [sp]  /* replicate the fill value into every lane of q0 */
    2.29 +    sub         a4, a1  /* a4 = stride - width: pixels to skip after each row */
    2.30 +    vld1.\bpp   {d2[],d3[]}, [sp]  /* ...and into every lane of q1 */
    2.31 +    cmp         a1, #(15+64) >> \log2Bpp  /* rows narrower than 15+64 bytes use the short-row path */
    2.32 +    push        {v1-v3,lr}
    2.33 +    vmov        ip, s0  /* fill value in an ARM register for byte/halfword stores */
    2.34 +    blo         51f
    2.35 +
    2.36 +    /* Long-row case */
    2.37 +    mov         v2, #64  /* post-increment used by both inner-loop stores */
    2.38 +1:  mov         v1, a1
    2.39 +    ands        v3, a3, #15
    2.40 +    beq         2f
    2.41 +    /* Leading pixels */
    2.42 +    rsb         v3, v3, #16  /* number of leading bytes until 16-byte aligned */
    2.43 +    sub         v1, v1, v3, lsr #\log2Bpp
    2.44 +    rbit        v3, v3  /* bit-reverse so the low bits of the byte count can be shifted into the flags */
    2.45 +.if bpp <= 16
    2.46 +.if bpp == 8
    2.47 +    tst         a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
    2.48 +    strneb      ip, [a3], #1
    2.49 +    tst         v3, #1<<30  /* bit 1 of the leading byte count (moved to bit 30 by rbit) */
    2.50 +.else
    2.51 +    tst         a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
    2.52 +.endif
    2.53 +    strneh      ip, [a3], #2
    2.54 +.endif
    2.55 +    movs        v3, v3, lsl #3  /* C = 4 leading bytes needed, N = 8 leading bytes needed */
    2.56 +    vstmcs      a3!, {s0}
    2.57 +    vstmmi      a3!, {d0}
    2.58 +2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
    2.59 +    add         v3, a3, #32  /* second store pointer, 32 bytes ahead of a3 */
    2.60 +    /* Inner loop */
    2.61 +3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
    2.62 +    subs        v1, v1, #64 >> \log2Bpp
    2.63 +    vst1.\bpp   {q0-q1}, [v3 :128], v2
    2.64 +    bhs         3b
    2.65 +    /* Trailing pixels */
    2.66 +4:  movs        v1, v1, lsl #27 + \log2Bpp  /* C = 32 bytes still to store, N = 16 bytes */
    2.67 +    bcc         5f
    2.68 +    vst1.\bpp   {q0-q1}, [a3 :128]!
    2.69 +5:  bpl         6f
    2.70 +    vst1.\bpp   {q0}, [a3 :128]!
    2.71 +6:  movs        v1, v1, lsl #2  /* C = 8 bytes, N = 4 bytes */
    2.72 +    vstmcs      a3!, {d0}
    2.73 +    vstmmi      a3!, {s0}
    2.74 +.if bpp <= 16
    2.75 +    movs        v1, v1, lsl #2  /* C = 2 bytes, N = 1 byte */
    2.76 +    strcsh      ip, [a3], #2
    2.77 +.if bpp == 8
    2.78 +    strmib      ip, [a3], #1
    2.79 +.endif
    2.80 +.endif
    2.81 +    subs        a2, a2, #1
    2.82 +    add         a3, a3, a4, lsl #\log2Bpp  /* step over the end-of-row skip to the next row */
    2.83 +    bhi         1b
    2.84 +    pop         {v1-v3,pc}
    2.85 +
    2.86 +    /* Short-row case */
    2.87 +51: movs        v1, a1
    2.88 +.if bpp == 8
    2.89 +    tst         a3, #3
    2.90 +    beq         53f
    2.91 +52: subs        v1, v1, #1
    2.92 +    blo         58f  /* row used up before reaching 4-byte alignment: move on to the next row rather than returning */
    2.93 +    strb        ip, [a3], #1
    2.94 +    tst         a3, #3
    2.95 +    bne         52b
    2.96 +.elseif bpp == 16
    2.97 +    tstne       a3, #2  /* only executed when the width from movs above is non-zero */
    2.98 +    subne       v1, v1, #1
    2.99 +    strneh      ip, [a3], #2
   2.100 +.endif
   2.101 +53: cmp         v1, #32 >> \log2Bpp
   2.102 +    bcc         54f
   2.103 +    vst1.\bpp   {q0-q1}, [a3]!
   2.104 +    sub         v1, v1, #32 >> \log2Bpp
   2.105 +    /* Trailing pixels */
   2.106 +54: movs        v1, v1, lsl #27 + \log2Bpp  /* C = 32 bytes still to store, N = 16 bytes */
   2.107 +    bcc         55f
   2.108 +    vst1.\bpp   {q0-q1}, [a3]!
   2.109 +55: bpl         56f
   2.110 +    vst1.\bpp   {q0}, [a3]!
   2.111 +56: movs        v1, v1, lsl #2  /* C = 8 bytes, N = 4 bytes */
   2.112 +    vstmcs      a3!, {d0}
   2.113 +    vstmmi      a3!, {s0}
   2.114 +.if bpp <= 16
   2.115 +    movs        v1, v1, lsl #2  /* C = 2 bytes, N = 1 byte */
   2.116 +    strcsh      ip, [a3], #2
   2.117 +.if bpp == 8
   2.118 +    strmib      ip, [a3], #1
   2.119 +.endif
   2.120 +.endif
   2.121 +58: subs        a2, a2, #1
   2.122 +    add         a3, a3, a4, lsl #\log2Bpp  /* step over the end-of-row skip to the next row */
   2.123 +    bhi         51b
   2.124 +57: pop         {v1-v3,pc}
   2.125 +
   2.126 +.endfunc
   2.127 +.endm
   2.128 +
   2.129 +generate_fillrect_function FillRect32ARMNEONAsm, 32, 2  /* 32-bit pixels: 4 bytes each, log2 = 2 */
   2.130 +generate_fillrect_function FillRect16ARMNEONAsm, 16, 1  /* 16-bit pixels: 2 bytes each, log2 = 1 */
   2.131 +generate_fillrect_function FillRect8ARMNEONAsm,  8,  0  /* 8-bit pixels: 1 byte each, log2 = 0 */
   2.132 +
   2.133 +/******************************************************************************/
   2.134 +
   2.135  .macro RGBtoRGBPixelAlpha_process_pixblock_head
   2.136      vmvn        d30, d3  /* get inverted source alpha */
   2.137      vmov        d31, d7  /* dest alpha is always unchanged */