src/video/arm/pixman-arm-neon-asm.S
author Ben Avison <bavison@riscosopen.org>
Thu, 31 Oct 2019 14:00:28 +0300
branch SDL-1.2
changeset 13221 3705e81df6ff
parent 13220 0ae1ddca5e85
child 13222 c8562ecca3c9
permissions -rw-r--r--
ARM: NEON assembly optimization for function BlitARGBto565PixelAlpha
---
src/video/SDL_blit_A.c | 31 ++++++++--
src/video/arm/pixman-arm-neon-asm.S | 88 +++++++++++++++++++++++++++++
2 files changed, 114 insertions(+), 5 deletions(-)
bavison@13220
     1
/*
bavison@13220
     2
 * Copyright (c) 2018 RISC OS Open Ltd
bavison@13220
     3
 *
bavison@13220
     4
 * This software is provided 'as-is', without any express or implied
bavison@13220
     5
 * warranty.  In no event will the authors be held liable for any damages
bavison@13220
     6
 * arising from the use of this software.
bavison@13220
     7
 *
bavison@13220
     8
 * Permission is granted to anyone to use this software for any purpose,
bavison@13220
     9
 * including commercial applications, and to alter it and redistribute it
bavison@13220
    10
 * freely, subject to the following restrictions:
bavison@13220
    11
 *
bavison@13220
    12
 * 1. The origin of this software must not be misrepresented; you must not
bavison@13220
    13
 *    claim that you wrote the original software. If you use this software
bavison@13220
    14
 *    in a product, an acknowledgment in the product documentation would be
bavison@13220
    15
 *    appreciated but is not required.
bavison@13220
    16
 * 2. Altered source versions must be plainly marked as such, and must not be
bavison@13220
    17
 *    misrepresented as being the original software.
bavison@13220
    18
 * 3. This notice may not be removed or altered from any source distribution.
bavison@13220
    19
 */
bavison@13220
    20
bavison@13220
    21
/* Prevent the stack from becoming executable for no reason... */
bavison@13220
    22
#if defined(__linux__) && defined(__ELF__)
bavison@13220
    23
.section .note.GNU-stack,"",%progbits
bavison@13220
    24
#endif
bavison@13220
    25
bavison@13220
    26
    /* Assembler setup that applies to everything below. */
    .text
bavison@13220
    27
    .fpu neon                 /* accept NEON mnemonics */
bavison@13220
    28
    .arch armv7a              /* assemble for the ARMv7-A instruction set */
bavison@13220
    29
    .object_arch armv4        /* record ARMv4 in the object file instead; NOTE(review): presumably so the object links into builds for older CPUs - confirm */
bavison@13220
    30
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
bavison@13220
    31
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
bavison@13220
    32
    .arm                      /* ARM encoding, not Thumb */
bavison@13220
    33
    .altmacro                 /* alternate macro syntax; NOTE(review): presumably needed by the included macro headers - confirm */
bavison@13220
    34
    .p2align 2                /* align to 4 bytes */
bavison@13220
    35
bavison@13220
    36
#include "pixman-arm-asm.h"
bavison@13220
    37
#include "pixman-arm-neon-asm.h"
bavison@13220
    38
bavison@13220
    39
/******************************************************************************/
bavison@13220
    40
bavison@13220
    41
/*
 * Blend 8 source pixels over 8 destination pixels, one byte plane per channel.
 *
 * In:   d0-d2 = source channels, d3 = source alpha
 *       d4-d6 = destination channels, d7 = destination alpha
 * Out:  d28-d30 = blended channels, d31 = destination alpha (copied through)
 * Also overwrites q0-q3.
 *
 * Per channel: t = src * alpha + dst * ~alpha (16 bits per lane), then
 * result = (t + ((t + 128) >> 8) + 128) >> 8, i.e. a rounded division by 255.
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_head
bavison@13220
    42
    vmvn        d30, d3  /* get inverted source alpha */
bavison@13220
    43
    vmov        d31, d7  /* dest alpha is always unchanged */
bavison@13220
    44
    vmull.u8    q14, d0, d3      /* t0 = src0 * alpha */
bavison@13220
    45
    vmlal.u8    q14, d4, d30     /* t0 += dst0 * ~alpha */
bavison@13220
    46
    vmull.u8    q0, d1, d3       /* t1 = src1 * alpha; q0 overwrites d0/d1, d0 is no longer needed */
bavison@13220
    47
    vmlal.u8    q0, d5, d30      /* t1 += dst1 * ~alpha */
bavison@13220
    48
    vmull.u8    q1, d2, d3       /* t2 = src2 * alpha; q1 overwrites d2/d3, last use of the alpha in d3 */
bavison@13220
    49
    vmlal.u8    q1, d6, d30      /* t2 += dst2 * ~alpha */
bavison@13220
    50
    vrshr.u16   q2, q14, #8      /* q2 = (t0 + 128) >> 8 */
bavison@13220
    51
    vrshr.u16   q3, q0, #8       /* q3 = (t1 + 128) >> 8; d7 was already saved in d31 */
bavison@13220
    52
    vraddhn.u16 d28, q14, q2     /* d28 = (t0 + q2 + 128) >> 8 */
bavison@13220
    53
    vrshr.u16   q2, q1, #8       /* q2 = (t2 + 128) >> 8 */
bavison@13220
    54
    vraddhn.u16 d29, q0, q3      /* d29 = (t1 + q3 + 128) >> 8 */
bavison@13220
    55
    vraddhn.u16 d30, q1, q2      /* d30 = (t2 + q2 + 128) >> 8; ~alpha in d30 is no longer needed */
bavison@13220
    56
.endm
bavison@13220
    57
bavison@13220
    58
/*
 * Intentionally empty: RGBtoRGBPixelAlpha_process_pixblock_head already
 * leaves the finished pixels in d28-d31, so no further work remains.
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_tail
bavison@13220
    59
    /* nothing */
bavison@13220
    60
.endm
bavison@13220
    61
bavison@13220
    62
/*
 * Combined "finish previous block / start next block" step.
 *
 * - stores the previous result d28-d31 through DST_W (the extra-indented
 *   instruction), before d30/d31 are overwritten below,
 * - loads the next 8 source pixels through SRC into d0-d3 and the next 8
 *   destination pixels through DST_R into d4-d7, split into byte planes
 *   by vld4.8,
 * - then repeats the arithmetic of RGBtoRGBPixelAlpha_process_pixblock_head.
 *
 * The far-right "PF ..." lines are prefetch bookkeeping woven between the
 * NEON instructions: they advance PF_X, issue pld on PF_SRC/PF_DST and, once
 * PF_X reaches ORIG_W, wrap PF_X and step the prefetch pointers by one stride.
 * PF and the PF_*, ORIG_W, DUMMY and *_STRIDE symbols are not defined in this
 * file (presumably they come from pixman-arm-neon-asm.h - confirm).
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
bavison@13220
    63
    vld4.8      {d0-d3}, [SRC]!
bavison@13220
    64
                                    PF add PF_X, PF_X, #8
bavison@13220
    65
        vst4.8      {d28-d31}, [DST_W :128]!
bavison@13220
    66
                                    PF tst PF_CTL, #0xF
bavison@13220
    67
    vld4.8      {d4-d7}, [DST_R :128]!
bavison@13220
    68
                                    PF addne PF_X, PF_X, #8
bavison@13220
    69
    vmvn        d30, d3  /* get inverted source alpha */
bavison@13220
    70
    vmov        d31, d7  /* dest alpha is always unchanged */
bavison@13220
    71
    vmull.u8    q14, d0, d3      /* t0 = src0 * alpha */
bavison@13220
    72
                                    PF subne PF_CTL, PF_CTL, #1
bavison@13220
    73
    vmlal.u8    q14, d4, d30     /* t0 += dst0 * ~alpha */
bavison@13220
    74
                                    PF cmp PF_X, ORIG_W
bavison@13220
    75
    vmull.u8    q0, d1, d3       /* t1 = src1 * alpha */
bavison@13220
    76
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
bavison@13220
    77
    vmlal.u8    q0, d5, d30      /* t1 += dst1 * ~alpha */
bavison@13220
    78
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
bavison@13220
    79
    vmull.u8    q1, d2, d3       /* t2 = src2 * alpha */
bavison@13220
    80
                                    PF subge PF_X, PF_X, ORIG_W
bavison@13220
    81
    vmlal.u8    q1, d6, d30      /* t2 += dst2 * ~alpha */
bavison@13220
    82
                                    PF subges PF_CTL, PF_CTL, #0x10
bavison@13220
    83
    vrshr.u16   q2, q14, #8      /* q2 = (t0 + 128) >> 8 */
bavison@13220
    84
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
bavison@13220
    85
    vrshr.u16   q3, q0, #8       /* q3 = (t1 + 128) >> 8 */
bavison@13220
    86
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
bavison@13220
    87
    vraddhn.u16 d28, q14, q2     /* d28 = (t0 + q2 + 128) >> 8 */
bavison@13220
    88
    vrshr.u16   q2, q1, #8       /* q2 = (t2 + 128) >> 8 */
bavison@13220
    89
    vraddhn.u16 d29, q0, q3      /* d29 = (t1 + q3 + 128) >> 8 */
bavison@13220
    90
    vraddhn.u16 d30, q1, q2      /* d30 = (t2 + q2 + 128) >> 8 */
bavison@13220
    91
.endm
bavison@13220
    92
bavison@13220
    93
/*
 * Emit BlitRGBtoRGBPixelAlphaARMNEONAsm by handing the three
 * RGBtoRGBPixelAlpha_process_pixblock_* macros to generate_composite_function,
 * which is defined outside this file.
 * NOTE(review): "32, 0, 32" look like source / mask / destination bits per
 * pixel (0 = no mask) - confirm against the macro's definition.
 */
generate_composite_function \
bavison@13220
    94
    BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
bavison@13220
    95
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
bavison@13220
    96
    8, /* number of pixels, processed in a single block */ \
bavison@13220
    97
    5, /* prefetch distance */ \
bavison@13220
    98
    default_init, \
bavison@13220
    99
    default_cleanup, \
bavison@13220
   100
    RGBtoRGBPixelAlpha_process_pixblock_head, \
bavison@13220
   101
    RGBtoRGBPixelAlpha_process_pixblock_tail, \
bavison@13220
   102
    RGBtoRGBPixelAlpha_process_pixblock_tail_head
bavison@13221
   103
bavison@13221
   104
 /******************************************************************************/
bavison@13221
   105
bavison@13221
   106
/*
 * First half of blending 8 32-bit source pixels onto 8 16-bit destination
 * pixels: reduce everything to 5/6/5-bit precision and form the weighted sums.
 *
 * In:   d0-d2 = source channels (one byte plane each), d3 = source alpha
 *       q2    = 8 destination pixels, 16 bits each
 * Out:  q14 = d0  * a5 + dest bits 0-4   * ia5
 *       q13 = d1  * a5 + dest bits 5-10  * ia5
 *       q15 = d2  * a5 + dest bits 11-15 * ia5
 *       where a5 = alpha >> 3 and ia5 = (~alpha) >> 3, so a5 + ia5 = 31.
 * Also overwrites d0-d7, d24 and d25 (d24/d25 are reused as halves of q12
 * before q13-q15 receive the sums).
 *
 * The sums are not yet divided by 31; ARGBto565PixelAlpha_process_pixblock_tail
 * does that and packs the result.
 * NOTE(review): with the usual little-endian ARGB8888 / RGB565 layouts d0, d1
 * and d2 would be blue, green and red - confirm against the C caller.
 */
.macro ARGBto565PixelAlpha_process_pixblock_head
bavison@13221
   107
    vmvn        d6, d3           /* d6 = ~alpha */
bavison@13221
   108
    vshr.u8     d1, #2           /* source channel 1: 8 -> 6 bits */
bavison@13221
   109
    vshr.u8     d3, #3           /* a5 = alpha >> 3 */
bavison@13221
   110
    vshr.u8     d0, #3           /* source channel 0: 8 -> 5 bits */
bavison@13221
   111
    vshrn.u16   d7, q2, #3       /* d7 = dest bits 3-10 */
bavison@13221
   112
    vshrn.u16   d25, q2, #8      /* d25 = dest bits 8-15 */
bavison@13221
   113
    vbic.i16    q2, #0xe0        /* clear dest bits 5-7 so the low byte holds only bits 0-4 */
bavison@13221
   114
    vshr.u8     d6, #3           /* ia5 = (~alpha) >> 3 */
bavison@13221
   115
    vshr.u8     d7, #2           /* d7 = dest bits 5-10 (6-bit field) */
bavison@13221
   116
    vshr.u8     d2, #3           /* source channel 2: 8 -> 5 bits */
bavison@13221
   117
    vmovn.u16   d24, q2          /* d24 = dest bits 0-4 (5-bit field) */
bavison@13221
   118
    vshr.u8     d25, #3          /* d25 = dest bits 11-15 (5-bit field) */
bavison@13221
   119
    vmull.u8    q13, d1, d3
bavison@13221
   120
    vmlal.u8    q13, d7, d6
bavison@13221
   121
    vmull.u8    q14, d0, d3
bavison@13221
   122
    vmlal.u8    q14, d24, d6
bavison@13221
   123
    vmull.u8    q15, d2, d3
bavison@13221
   124
    vmlal.u8    q15, d25, d6
bavison@13221
   125
.endm
bavison@13221
   126
bavison@13221
   127
/*
 * Second half of the 16-bit blend: normalise the weighted sums left in
 * q13-q15 by ARGBto565PixelAlpha_process_pixblock_head and pack them.
 *
 * Each sum t becomes (t + (t >> 5) + 16) >> 5, which approximates a rounded
 * division by 31 (the two 5-bit weights add up to 31).
 *
 * Out:  q14 = 8 packed 16-bit pixels: q14 field in bits 0-4, q13 field in
 *       bits 5-10, q15 field in bits 11-15.
 */
.macro ARGBto565PixelAlpha_process_pixblock_tail
bavison@13221
   128
    vsra.u16    q13, #5          /* t += t >> 5 */
bavison@13221
   129
    vsra.u16    q14, #5
bavison@13221
   130
    vsra.u16    q15, #5
bavison@13221
   131
    vrshr.u16   q13, #5          /* t = (t + 16) >> 5 */
bavison@13221
   132
    vrshr.u16   q14, #5
bavison@13221
   133
    vrshr.u16   q15, #5
bavison@13221
   134
    vsli.u16    q14, q13, #5     /* insert the 6-bit field at bit 5, keeping bits 0-4 */
bavison@13221
   135
    vsli.u16    q14, q15, #11    /* insert the upper 5-bit field at bit 11 */
bavison@13221
   136
.endm
bavison@13221
   137
bavison@13221
   138
/*
 * Combined "finish previous block / start next block" step for the 16-bit
 * blend.
 *
 * - The extra-indented instructions are the work of
 *   ARGBto565PixelAlpha_process_pixblock_tail on the previous block
 *   (normalise q13-q15, pack into q14), followed by the store of those 8
 *   pixels through DST_W.
 * - The next 8 source pixels are loaded through SRC into d0-d3 (split into
 *   byte planes by vld4.8) and the next 8 destination pixels through DST_R
 *   into q2.
 * - The remaining instructions repeat
 *   ARGBto565PixelAlpha_process_pixblock_head on the new data.
 *
 * The far-right "PF ..." lines are prefetch bookkeeping woven between the
 * NEON instructions: they advance PF_X, issue pld on PF_SRC/PF_DST and, once
 * PF_X reaches ORIG_W, wrap PF_X and step the prefetch pointers by one stride.
 * PF and the PF_*, ORIG_W, DUMMY and *_STRIDE symbols are not defined in this
 * file (presumably they come from pixman-arm-neon-asm.h - confirm).
 */
.macro ARGBto565PixelAlpha_process_pixblock_tail_head
bavison@13221
   139
    vld4.8      {d0-d3}, [SRC]!
bavison@13221
   140
                                    PF add PF_X, PF_X, #8
bavison@13221
   141
        vsra.u16    q13, #5      /* previous block: t += t >> 5 */
bavison@13221
   142
                                    PF tst PF_CTL, #0xF
bavison@13221
   143
        vsra.u16    q14, #5
bavison@13221
   144
                                    PF addne PF_X, PF_X, #8
bavison@13221
   145
        vsra.u16    q15, #5
bavison@13221
   146
                                    PF subne PF_CTL, PF_CTL, #1
bavison@13221
   147
        vrshr.u16   q13, #5      /* previous block: t = (t + 16) >> 5 */
bavison@13221
   148
                                    PF cmp PF_X, ORIG_W
bavison@13221
   149
        vrshr.u16   q14, #5
bavison@13221
   150
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
bavison@13221
   151
        vrshr.u16   q15, #5
bavison@13221
   152
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
bavison@13221
   153
    vld1.8      {d4-d5}, [DST_R]!
bavison@13221
   154
                                    PF subge PF_X, PF_X, ORIG_W
bavison@13221
   155
        vsli.u16    q14, q13, #5     /* previous block: insert 6-bit field at bit 5 */
bavison@13221
   156
                                    PF subges PF_CTL, PF_CTL, #0x10
bavison@13221
   157
        vsli.u16    q14, q15, #11    /* previous block: insert upper 5-bit field at bit 11 */
bavison@13221
   158
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
bavison@13221
   159
        vst1.8      {q14}, [DST_W :128]!
bavison@13221
   160
    vmvn        d6, d3           /* d6 = ~alpha */
bavison@13221
   161
    vshr.u8     d1, #2           /* source channel 1: 8 -> 6 bits */
bavison@13221
   162
    vshr.u8     d3, #3           /* a5 = alpha >> 3 */
bavison@13221
   163
    vshr.u8     d0, #3           /* source channel 0: 8 -> 5 bits */
bavison@13221
   164
    vshrn.u16   d7, q2, #3       /* d7 = dest bits 3-10 */
bavison@13221
   165
    vshrn.u16   d25, q2, #8      /* d25 = dest bits 8-15 */
bavison@13221
   166
    vbic.i16    q2, #0xe0        /* clear dest bits 5-7 so the low byte holds only bits 0-4 */
bavison@13221
   167
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
bavison@13221
   168
    vshr.u8     d6, #3           /* ia5 = (~alpha) >> 3 */
bavison@13221
   169
    vshr.u8     d7, #2           /* d7 = dest bits 5-10 (6-bit field) */
bavison@13221
   170
    vshr.u8     d2, #3           /* source channel 2: 8 -> 5 bits */
bavison@13221
   171
    vmovn.u16   d24, q2          /* d24 = dest bits 0-4 (5-bit field) */
bavison@13221
   172
    vshr.u8     d25, #3          /* d25 = dest bits 11-15 (5-bit field) */
bavison@13221
   173
    vmull.u8    q13, d1, d3
bavison@13221
   174
    vmlal.u8    q13, d7, d6
bavison@13221
   175
    vmull.u8    q14, d0, d3
bavison@13221
   176
    vmlal.u8    q14, d24, d6
bavison@13221
   177
    vmull.u8    q15, d2, d3
bavison@13221
   178
    vmlal.u8    q15, d25, d6
bavison@13221
   179
.endm
bavison@13221
   180
bavison@13221
   181
/*
 * Emit BlitARGBto565PixelAlphaARMNEONAsm by handing the three
 * ARGBto565PixelAlpha_process_pixblock_* macros to generate_composite_function,
 * which is defined outside this file.
 * NOTE(review): "32, 0, 16" look like source / mask / destination bits per
 * pixel (0 = no mask) - confirm against the macro's definition.
 */
generate_composite_function \
bavison@13221
   182
    BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
bavison@13221
   183
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
bavison@13221
   184
    8, /* number of pixels, processed in a single block */ \
bavison@13221
   185
    6, /* prefetch distance */ \
bavison@13221
   186
    default_init, \
bavison@13221
   187
    default_cleanup, \
bavison@13221
   188
    ARGBto565PixelAlpha_process_pixblock_head, \
bavison@13221
   189
    ARGBto565PixelAlpha_process_pixblock_tail, \
bavison@13221
   190
    ARGBto565PixelAlpha_process_pixblock_tail_head