/*
 * src/video/arm/pixman-arm-neon-asm.S
 *
 * ARM: NEON assembly optimization for SDL_FillRect
 * Author: Ben Avison <bavison@riscosopen.org>, Thu, 31 Oct 2019 14:00:28 +0300
 * Branch SDL-1.2, changeset 13222 (parent 13221)
 */
/*
 * Copyright (c) 2018 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    .text
    .fpu neon             /* permit NEON mnemonics in this file */
    .arch armv7a
    /* NOTE(review): the object is deliberately tagged as ARMv4 with the FP and
     * SIMD build attributes suppressed — presumably so the binary still loads
     * on pre-NEON CPUs and NEON paths are selected at runtime; confirm against
     * the CPU-feature checks on the C side. */
    .object_arch armv4
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    .arm                  /* ARM (not Thumb) encodings */
    .altmacro             /* macro args usable without backslash, e.g. "bpp" in .if */
    .p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

/******************************************************************************/
/* We can actually do significantly better than the Pixman macros, at least for
 * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
 * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
 */

/* Emit one solid-fill routine for a given pixel size.
 * name    = exported function name
 * bpp     = bits per pixel (8, 16 or 32)
 * log2Bpp = log2(bytes per pixel) (0, 1 or 2)
 */
.macro generate_fillrect_function name, bpp, log2Bpp
/*
 * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
 * On entry:
 * a1 = width, pixels
 * a2 = height, rows
 * a3 = pointer to top-left destination pixel
 * a4 = stride, pixels
 * [sp] = pixel value to fill with
 * Within the function:
 * v1 = width remaining
 * v2 = vst offset
 * v3 = alternate pointer
 * ip = data ARM register
 */
pixman_asm_function name
    vld1.\bpp   {d0[],d1[]}, [sp]   /* replicate fill value across q0 */
    sub         a4, a1              /* stride -= width: per-row advance in pixels */
    vld1.\bpp   {d2[],d3[]}, [sp]   /* replicate fill value across q1 too */
    cmp         a1, #(15+64) >> \log2Bpp /* wide enough for align + 64-byte loop? */
    push        {v1-v3,lr}
    vmov        ip, s0              /* scalar copy of fill value for byte/halfword stores */
    blo         51f

    /* Long-row case */
    mov         v2, #64             /* post-increment for the paired 32-byte stores */
1:  mov         v1, a1
    ands        v3, a3, #15         /* misalignment of row start within 16 bytes */
    beq         2f
    /* Leading pixels */
    rsb         v3, v3, #16  /* number of leading bytes until 16-byte aligned */
    sub         v1, v1, v3, lsr #\log2Bpp
    rbit        v3, v3       /* bit-reverse so shifts below pop the count into C/N flags */
.if bpp <= 16
.if bpp == 8
    tst         a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
    strneb      ip, [a3], #1
    tst         v3, #1<<30
.else
    tst         a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
.endif
    strneh      ip, [a3], #2
.endif
    movs        v3, v3, lsl #3      /* C = 4-byte step needed, N = 8-byte step needed */
    vstmcs      a3!, {s0}
    vstmmi      a3!, {d0}
2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
    add         v3, a3, #32         /* second pointer, 32 bytes ahead, for dual stores */
    /* Inner loop: two interleaved 32-byte stores = 64 bytes per iteration */
3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
    subs        v1, v1, #64 >> \log2Bpp
    vst1.\bpp   {q0-q1}, [v3 :128], v2
    bhs         3b
    /* Trailing pixels: shift remaining byte count so each pair of bits lands
     * in the C (larger chunk) and N (smaller chunk) flags in turn */
4:  movs        v1, v1, lsl #27 + \log2Bpp
    bcc         5f                  /* C: 32 bytes remaining */
    vst1.\bpp   {q0-q1}, [a3 :128]!
5:  bpl         6f                  /* N: 16 bytes remaining */
    vst1.\bpp   {q0}, [a3 :128]!
6:  movs        v1, v1, lsl #2      /* C: 8 bytes, N: 4 bytes */
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
.if bpp <= 16
    movs        v1, v1, lsl #2      /* C: 2 bytes, N: 1 byte */
    strcsh      ip, [a3], #2
.if bpp == 8
    strmib      ip, [a3], #1
.endif
.endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp /* advance to start of next row */
    bhi         1b
    pop         {v1-v3,pc}

    /* Short-row case */
51: movs        v1, a1
.if bpp == 8
    tst         a3, #3
    beq         53f
52: subs        v1, v1, #1          /* byte-fill until word aligned (or row done) */
    blo         57f
    strb        ip, [a3], #1
    tst         a3, #3
    bne         52b
.elseif bpp == 16
    tstne       a3, #2              /* only align if width non-zero (flags from movs above) */
    subne       v1, v1, #1
    strneh      ip, [a3], #2
.endif
53: cmp         v1, #32 >> \log2Bpp
    bcc         54f
    vst1.\bpp   {q0-q1}, [a3]!      /* one unaligned 32-byte store */
    sub         v1, v1, #32 >> \log2Bpp
    /* Trailing pixels (same flag-shifting scheme as the long-row case) */
54: movs        v1, v1, lsl #27 + \log2Bpp
    bcc         55f
    vst1.\bpp   {q0-q1}, [a3]!
55: bpl         56f
    vst1.\bpp   {q0}, [a3]!
56: movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
.if bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
.if bpp == 8
    strmib      ip, [a3], #1
.endif
.endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         51b
57: pop         {v1-v3,pc}

.endfunc
.endm
/* Instantiate the fill routine for each supported pixel size */
generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
generate_fillrect_function FillRect8ARMNEONAsm,  8,  0

/******************************************************************************/
/* Per-pixel-alpha blend of 8 deinterleaved 32bpp pixels:
 * out = src*alpha + dst*(255-alpha), per channel, then a rounding
 * approximation of /255 via vrshr #8 + vraddhn (x = (x + ((x+128)>>8)) >> 8).
 * In:  d0-d2 = source colour channels, d3 = source alpha,
 *      d4-d6 = destination colour channels, d7 = destination alpha.
 * Out: d28-d30 = blended colour channels, d31 = destination alpha.
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_head
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d4, d30
    vmull.u8    q0, d1, d3
    vmlal.u8    q0, d5, d30
    vmull.u8    q1, d2, d3
    vmlal.u8    q1, d6, d30
    vrshr.u16   q2, q14, #8
    vrshr.u16   q3, q0, #8
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm
/* No work is deferred past the head for this blitter, so the tail is empty. */
.macro RGBtoRGBPixelAlpha_process_pixblock_tail
    /* nothing */
.endm
/* Software-pipelined loop body: stores the previous block's result (d28-d31),
 * loads the next block's source/destination, and redoes the head arithmetic,
 * with the PF-conditional prefetch bookkeeping interleaved between NEON ops.
 * Instruction order is deliberate scheduling — do not reorder.
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
        vst4.8      {d28-d31}, [DST_W :128]!
                                    PF tst PF_CTL, #0xF
    vld4.8      {d4-d7}, [DST_R :128]!
                                    PF addne PF_X, PF_X, #8
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vmlal.u8    q14, d4, d30
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q0, d1, d3
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmlal.u8    q0, d5, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q1, d2, d3
                                    PF subge PF_X, PF_X, ORIG_W
    vmlal.u8    q1, d6, d30
                                    PF subges PF_CTL, PF_CTL, #0x10
    vrshr.u16   q2, q14, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vrshr.u16   q3, q0, #8
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm
/* Emit the 32bpp -> 32bpp per-pixel-alpha blitter from the pixman framework
 * macros, using the head/tail/tail_head blocks defined above. */
generate_composite_function \
    BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    RGBtoRGBPixelAlpha_process_pixblock_head, \
    RGBtoRGBPixelAlpha_process_pixblock_tail, \
    RGBtoRGBPixelAlpha_process_pixblock_tail_head

 /******************************************************************************/
/* Per-pixel-alpha blend of 8 deinterleaved 32bpp source pixels onto 16bpp
 * RGB565 destination pixels held in q2 (d4-d5).
 * The 8-bit source channels and alpha are reduced to 5/6-bit precision
 * (vshr), the 565 destination is unpacked into per-channel bytes
 * (vshrn/vbic/vmovn), and src*alpha + dst*(inv_alpha) is accumulated into
 * q13/q14/q15 at reduced precision; the tail macro finishes the divide and
 * repacks. NOTE(review): exact channel-to-register mapping (R vs B) depends
 * on the caller's pixel format — confirm against the C-side blitter setup.
 */
.macro ARGBto565PixelAlpha_process_pixblock_head
    vmvn        d6, d3               /* inverted source alpha */
    vshr.u8     d1, #2               /* source channel to 6 bits */
    vshr.u8     d3, #3               /* source alpha to 5 bits */
    vshr.u8     d0, #3               /* source channel to 5 bits */
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
    vshr.u8     d6, #3               /* inverted alpha to 5 bits */
    vshr.u8     d7, #2
    vshr.u8     d2, #3               /* source channel to 5 bits */
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm
/* Finish the blend started in the head: vsra adds acc>>5 (approximating the
 * divide by 31), vrshr #5 rounds down to final 5/6-bit channels, and vsli
 * packs the three channels into RGB565 in q14 (ready for a 16-byte store). */
.macro ARGBto565PixelAlpha_process_pixblock_tail
    vsra.u16    q13, #5
    vsra.u16    q14, #5
    vsra.u16    q15, #5
    vrshr.u16   q13, #5
    vrshr.u16   q14, #5
    vrshr.u16   q15, #5
    vsli.u16    q14, q13, #5         /* insert middle channel at bits 5-10 */
    vsli.u16    q14, q15, #11        /* insert top channel at bits 11-15 */
.endm
/* Software-pipelined loop body: completes and stores the previous block
 * (tail ops on q13-q15, then vst1 of q14), loads the next source/destination
 * block, and redoes the head arithmetic, with PF-conditional prefetch
 * bookkeeping interleaved. Instruction order is deliberate scheduling —
 * do not reorder.
 */
.macro ARGBto565PixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
        vsra.u16    q13, #5
                                    PF tst PF_CTL, #0xF
        vsra.u16    q14, #5
                                    PF addne PF_X, PF_X, #8
        vsra.u16    q15, #5
                                    PF subne PF_CTL, PF_CTL, #1
        vrshr.u16   q13, #5
                                    PF cmp PF_X, ORIG_W
        vrshr.u16   q14, #5
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        vrshr.u16   q15, #5
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vld1.8      {d4-d5}, [DST_R]!
                                    PF subge PF_X, PF_X, ORIG_W
        vsli.u16    q14, q13, #5
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsli.u16    q14, q15, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
        vst1.8      {q14}, [DST_W :128]!
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm
/* Emit the 32bpp -> 16bpp (RGB565) per-pixel-alpha blitter from the pixman
 * framework macros, using the ARGBto565 head/tail/tail_head blocks above. */
generate_composite_function \
    BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    6, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    ARGBto565PixelAlpha_process_pixblock_head, \
    ARGBto565PixelAlpha_process_pixblock_tail, \
    ARGBto565PixelAlpha_process_pixblock_tail_head