ARM: assembly optimization for SDL_FillRect SDL-1.2
author Ben Avison <bavison@riscosopen.org>
Thu, 31 Oct 2019 14:00:28 +0300
branch SDL-1.2
changeset 13217 d5d5686855d3
parent 13216 eb0ed9be8a68
child 13218 8a6c0f0319d4
ARM: assembly optimization for SDL_FillRect
---
src/video/SDL_surface.c | 22 ++++++++++
src/video/arm/pixman-arm-simd-asm.S | 68 +++++++++++++++++++++++++++++
2 files changed, 90 insertions(+)
src/video/SDL_surface.c
src/video/arm/pixman-arm-simd-asm.S
     1.1 --- a/src/video/SDL_surface.c	Thu Oct 31 14:00:28 2019 +0300
     1.2 +++ b/src/video/SDL_surface.c	Thu Oct 31 14:00:28 2019 +0300
     1.3 @@ -28,6 +28,7 @@
     1.4  #include "SDL_RLEaccel_c.h"
     1.5  #include "SDL_pixels_c.h"
     1.6  #include "SDL_leaks.h"
     1.7 +#include "SDL_cpuinfo.h"
     1.8  
     1.9  
    1.10  /* Public routines */
    1.11 @@ -602,6 +603,27 @@
    1.12  	}
    1.13  	row = (Uint8 *)dst->pixels+dstrect->y*dst->pitch+
    1.14  			dstrect->x*dst->format->BytesPerPixel;
    1.15 +#if SDL_ARM_SIMD_BLITTERS
    1.16 +	if (SDL_HasARMSIMD() && dst->format->BytesPerPixel != 3) {
    1.17 +		void FillRect8ARMSIMDAsm(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
    1.18 +		void FillRect16ARMSIMDAsm(int32_t w, int32_t h, uint16_t *dst, int32_t dst_stride, uint16_t src);
    1.19 +		void FillRect32ARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t src);
    1.20 +		switch (dst->format->BytesPerPixel) {
    1.21 +		case 1:
    1.22 +			FillRect8ARMSIMDAsm(dstrect->w, dstrect->h, (uint8_t *) row, dst->pitch >> 0, color);
    1.23 +			break;
    1.24 +		case 2:
    1.25 +			FillRect16ARMSIMDAsm(dstrect->w, dstrect->h, (uint16_t *) row, dst->pitch >> 1, color);
    1.26 +			break;
    1.27 +		case 4:
    1.28 +			FillRect32ARMSIMDAsm(dstrect->w, dstrect->h, (uint32_t *) row, dst->pitch >> 2, color);
    1.29 +			break;
    1.30 +		}
    1.31 +
    1.32 +		SDL_UnlockSurface(dst);
    1.33 +		return(0);
    1.34 +	}
    1.35 +#endif
    1.36  	if ( dst->format->palette || (color == 0) ) {
    1.37  		x = dstrect->w*dst->format->BytesPerPixel;
    1.38  		if ( !color && !((uintptr_t)row&3) && !(x&3) && !(dst->pitch&3) ) {
     2.1 --- a/src/video/arm/pixman-arm-simd-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.2 +++ b/src/video/arm/pixman-arm-simd-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.3 @@ -47,6 +47,74 @@
     2.4  
     2.5  /******************************************************************************/
     2.6  
     2.7 +.macro FillRect32_init
     2.8 +        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
     2.9 +        mov     STRIDE_S, SRC
    2.10 +        mov     MASK, SRC
    2.11 +        mov     STRIDE_M, SRC
    2.12 +.endm
    2.13 +
    2.14 +.macro FillRect16_init
    2.15 +        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
    2.16 +        orr     SRC, SRC, lsl #16
    2.17 +        mov     STRIDE_S, SRC
    2.18 +        mov     MASK, SRC
    2.19 +        mov     STRIDE_M, SRC
    2.20 +.endm
    2.21 +
    2.22 +.macro FillRect8_init
    2.23 +        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
    2.24 +        orr     SRC, SRC, lsl #8
    2.25 +        orr     SRC, SRC, lsl #16
    2.26 +        mov     STRIDE_S, SRC
    2.27 +        mov     MASK, SRC
    2.28 +        mov     STRIDE_M, SRC
    2.29 +.endm
    2.30 +
    2.31 +.macro FillRect_process_tail  cond, numbytes, firstreg
    2.32 +    WK4     .req    SRC
    2.33 +    WK5     .req    STRIDE_S
    2.34 +    WK6     .req    MASK
    2.35 +    WK7     .req    STRIDE_M
    2.36 +        pixst   cond, numbytes, 4, DST
    2.37 +    .unreq  WK4
    2.38 +    .unreq  WK5
    2.39 +    .unreq  WK6
    2.40 +    .unreq  WK7
    2.41 +.endm
    2.42 +
    2.43 +generate_composite_function \
    2.44 +    FillRect32ARMSIMDAsm, 0, 0, 32, \
    2.45 +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    2.46 +    0, /* prefetch distance doesn't apply */ \
    2.47 +    FillRect32_init \
    2.48 +    nop_macro, /* newline */ \
    2.49 +    nop_macro /* cleanup */ \
    2.50 +    nop_macro /* process head */ \
    2.51 +    FillRect_process_tail
    2.52 +
    2.53 +generate_composite_function \
    2.54 +    FillRect16ARMSIMDAsm, 0, 0, 16, \
    2.55 +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    2.56 +    0, /* prefetch distance doesn't apply */ \
    2.57 +    FillRect16_init \
    2.58 +    nop_macro, /* newline */ \
    2.59 +    nop_macro /* cleanup */ \
    2.60 +    nop_macro /* process head */ \
    2.61 +    FillRect_process_tail
    2.62 +
    2.63 +generate_composite_function \
    2.64 +    FillRect8ARMSIMDAsm, 0, 0, 8, \
    2.65 +    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
    2.66 +    0, /* prefetch distance doesn't apply */ \
    2.67 +    FillRect8_init \
    2.68 +    nop_macro, /* newline */ \
    2.69 +    nop_macro /* cleanup */ \
    2.70 +    nop_macro /* process head */ \
    2.71 +    FillRect_process_tail
    2.72 +
    2.73 +/******************************************************************************/
    2.74 +
    2.75  /* This differs from the over_8888_8888 routine in Pixman in that the destination
    2.76   * alpha component is always left unchanged, and RGB components are not
    2.77   * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that