src/video/arm/pixman-arm-neon-asm.S
author Ben Avison <bavison@riscosopen.org>
Thu, 31 Oct 2019 14:00:28 +0300
branch SDL-1.2
changeset 13221 3705e81df6ff
parent 13220 0ae1ddca5e85
child 13222 c8562ecca3c9
permissions -rw-r--r--
ARM: NEON assembly optimization for function BlitARGBto565PixelAlpha
---
src/video/SDL_blit_A.c | 31 ++++++++--
src/video/arm/pixman-arm-neon-asm.S | 88 +++++++++++++++++++++++++++++
2 files changed, 114 insertions(+), 5 deletions(-)
     1 /*
     2  * Copyright (c) 2018 RISC OS Open Ltd
     3  *
     4  * This software is provided 'as-is', without any express or implied
     5  * warranty.  In no event will the authors be held liable for any damages
     6  * arising from the use of this software.
     7  *
     8  * Permission is granted to anyone to use this software for any purpose,
     9  * including commercial applications, and to alter it and redistribute it
    10  * freely, subject to the following restrictions:
    11  *
    12  * 1. The origin of this software must not be misrepresented; you must not
    13  *    claim that you wrote the original software. If you use this software
    14  *    in a product, an acknowledgment in the product documentation would be
    15  *    appreciated but is not required.
    16  * 2. Altered source versions must be plainly marked as such, and must not be
    17  *    misrepresented as being the original software.
    18  * 3. This notice may not be removed or altered from any source distribution.
    19  */
    20 
    21 /* Prevent the stack from becoming executable for no reason...
        *
        * On Linux ELF targets, emit an empty .note.GNU-stack section. The GNU
        * toolchain reads its presence (with no "x" flag) as a statement that
        * this object does not need an executable stack; hand-written assembly
        * has to say so explicitly.
        */
    22 #if defined(__linux__) && defined(__ELF__)
    23 .section .note.GNU-stack,"",%progbits
    24 #endif
    25 
    26     .text                 /* everything below is code */
    27     .fpu neon             /* let the assembler accept NEON instructions */
    28     .arch armv7a          /* assemble for the ARMv7-A instruction set */
    29     .object_arch armv4    /* ...but record only ARMv4 in the object's attributes */
    30     .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    31     .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    32     .arm                  /* ARM (not Thumb) encoding */
    33     .altmacro             /* alternate macro syntax; presumably needed by the macros in the headers included below -- confirm */
    34     .p2align 2            /* align to 2^2 = 4 bytes */
    35 
    36 #include "pixman-arm-asm.h"
    37 #include "pixman-arm-neon-asm.h"
    38 
    39 /******************************************************************************/
    40 
    41 .macro RGBtoRGBPixelAlpha_process_pixblock_head
           /*
            * Per-pixel alpha blend of one block of pixels held as separate
            * byte planes (one byte lane per pixel). For each of the three
            * colour planes:
            *     x   = src * a + dst * (~a)        (16-bit, a = source alpha)
            *     out = (x + ((x + 128) >> 8) + 128) >> 8   i.e. x / 255, rounded
            * The destination alpha plane is passed through unchanged.
            *
            * Inputs (assumed to be loaded by the code that expands this macro,
            * in the same layout the tail_head variant loads explicitly):
            *     d0-d2 = source colour planes, d3 = source alpha
            *     d4-d6 = dest colour planes,   d7 = dest alpha
            * Outputs: d28-d30 = blended colour planes, d31 = dest alpha.
            * Scratch: q0-q3 are overwritten with 16-bit intermediates.
            */
    42     vmvn        d30, d3  /* get inverted source alpha */
    43     vmov        d31, d7  /* dest alpha is always unchanged */
    44     vmull.u8    q14, d0, d3      /* plane 0: src * a ... */
    45     vmlal.u8    q14, d4, d30     /* ... + dst * ~a */
    46     vmull.u8    q0, d1, d3       /* plane 1 (d0/d1 are dead from here on) */
    47     vmlal.u8    q0, d5, d30
    48     vmull.u8    q1, d2, d3       /* plane 2 (overwrites d2/d3 after reading them) */
    49     vmlal.u8    q1, d6, d30      /* last use of ~a in d30 */
    50     vrshr.u16   q2, q14, #8      /* (x + 128) >> 8 */
    51     vrshr.u16   q3, q0, #8
    52     vraddhn.u16 d28, q14, q2     /* (x + q2 + 128) >> 8, narrowed to bytes */
    53     vrshr.u16   q2, q1, #8
    54     vraddhn.u16 d29, q0, q3
    55     vraddhn.u16 d30, q1, q2      /* d30 is now free to hold the third result plane */
    56 .endm
    57 
    58 .macro RGBtoRGBPixelAlpha_process_pixblock_tail
    59     /* nothing: the head macro already leaves the finished planes in d28-d31 */
    60 .endm
    61 
    62 .macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
           /*
            * Steady-state loop body: finish the previous block and start the
            * next one. It stores the previous result (d28-d31), loads the next
            * source block into d0-d3 and the next destination block into d4-d7,
            * then repeats the arithmetic of the head macro. The store must stay
            * ahead of the vmvn/vmull that reuse d28-d31.
            *
            * The PF-prefixed lines are prefetch bookkeeping interleaved with
            * the NEON work. The PF macro and the PF_*, ORIG_W, DUMMY and
            * *_STRIDE symbols are defined outside this section (presumably in
            * pixman-arm-neon-asm.h -- confirm). As written, those lines:
            *   - advance PF_X by 8, and by a further 8 while the low nibble of
            *     PF_CTL is non-zero (decrementing that nibble each time);
            *   - pld the source and destination at pixel offset PF_X;
            *   - once PF_X >= ORIG_W, subtract ORIG_W from PF_X and 0x10 from
            *     PF_CTL, and if PF_CTL is still >= 0 step PF_SRC / PF_DST
            *     forward by one stride using a byte load with writeback.
            * None of the NEON instructions in between modify the ARM flags the
            * conditional PF instructions rely on.
            */
    63     vld4.8      {d0-d3}, [SRC]!              /* next source block, split into 4 byte planes */
    64                                     PF add PF_X, PF_X, #8
    65         vst4.8      {d28-d31}, [DST_W :128]! /* write out the previous block's result */
    66                                     PF tst PF_CTL, #0xF
    67     vld4.8      {d4-d7}, [DST_R :128]!       /* next destination block, same planar layout */
    68                                     PF addne PF_X, PF_X, #8
    69     vmvn        d30, d3  /* get inverted source alpha */
    70     vmov        d31, d7  /* dest alpha is always unchanged */
    71     vmull.u8    q14, d0, d3      /* plane 0: src * a ... */
    72                                     PF subne PF_CTL, PF_CTL, #1
    73     vmlal.u8    q14, d4, d30     /* ... + dst * ~a */
    74                                     PF cmp PF_X, ORIG_W
    75     vmull.u8    q0, d1, d3       /* plane 1 */
    76                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    77     vmlal.u8    q0, d5, d30
    78                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    79     vmull.u8    q1, d2, d3       /* plane 2 */
    80                                     PF subge PF_X, PF_X, ORIG_W
    81     vmlal.u8    q1, d6, d30
    82                                     PF subges PF_CTL, PF_CTL, #0x10
    83     vrshr.u16   q2, q14, #8      /* (x + 128) >> 8: first step of the rounded divide by 255 */
    84                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    85     vrshr.u16   q3, q0, #8
    86                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    87     vraddhn.u16 d28, q14, q2     /* (x + q2 + 128) >> 8, narrowed to bytes */
    88     vrshr.u16   q2, q1, #8
    89     vraddhn.u16 d29, q0, q3
    90     vraddhn.u16 d30, q1, q2
    91 .endm
    92 
    93 generate_composite_function /* expands to the function named on the next line; the generator is defined outside this section, presumably in pixman-arm-neon-asm.h */ \
    94     BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, /* 32, 0, 32: presumably source, mask and destination bits per pixel (0 = no mask) -- confirm against the generator's definition */ \
    95     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, /* pixblock macros read the destination as well as writing it, and work on per-byte planes */ \
    96     8, /* number of pixels, processed in a single block */ \
    97     5, /* prefetch distance */ \
    98     default_init, \
    99     default_cleanup, \
   100     RGBtoRGBPixelAlpha_process_pixblock_head, \
   101     RGBtoRGBPixelAlpha_process_pixblock_tail, \
   102     RGBtoRGBPixelAlpha_process_pixblock_tail_head
   103 
   104  /******************************************************************************/
   105 
   106 .macro ARGBto565PixelAlpha_process_pixblock_head
           /*
            * First half of blending 32bpp source pixels, using their alpha,
            * onto 16bpp 5:6:5 destination pixels. Source and destination are
            * both reduced to the destination's field widths before multiplying:
            *     acc = src_field * (a >> 3) + dst_field * (~a >> 3)
            * The two 5-bit weights always add up to 31; the matching tail
            * macro divides by 31 and repacks the fields.
            *
            * Inputs (assumed to be loaded by the code that expands this macro,
            * in the same layout the tail_head variant loads explicitly):
            *     d0-d2 = source colour planes (bytes 0..2 of each pixel),
            *     d3    = source alpha plane,
            *     q2    = eight packed 16-bit destination pixels.
            * Given the "ARGB" name, d0/d1/d2 are presumably blue/green/red
            * -- confirm against the caller's pixel format.
            * Outputs (16-bit accumulators, one lane per pixel):
            *     q13 = middle 6-bit field, q14 = low 5-bit field,
            *     q15 = top 5-bit field.
            * Scratch: d0-d3 and q2 are modified in place; d6, d7, d24, d25
            * are overwritten.
            */
   107     vmvn        d6, d3           /* ~a, the destination weight before reduction */
   108     vshr.u8     d1, #2           /* source byte 1 -> 6 bits */
   109     vshr.u8     d3, #3           /* source alpha -> 5 bits */
   110     vshr.u8     d0, #3           /* source byte 0 -> 5 bits */
   111     vshrn.u16   d7, q2, #3       /* dest bits 3..10; reduced to the 6-bit middle field below */
   112     vshrn.u16   d25, q2, #8      /* dest bits 8..15; reduced to the 5-bit top field below */
   113     vbic.i16    q2, #0xe0        /* clear bits 5..7 so the low byte is just the low 5-bit field */
   114     vshr.u8     d6, #3           /* ~a -> 5 bits */
   115     vshr.u8     d7, #2           /* dest middle field (bits 5..10), 6 bits */
   116     vshr.u8     d2, #3           /* source byte 2 -> 5 bits */
   117     vmovn.u16   d24, q2          /* dest low field (bits 0..4) */
   118     vshr.u8     d25, #3          /* dest top field (bits 11..15) */
   119     vmull.u8    q13, d1, d3      /* middle field: src * a ... */
   120     vmlal.u8    q13, d7, d6      /* ... + dst * ~a */
   121     vmull.u8    q14, d0, d3      /* low field */
   122     vmlal.u8    q14, d24, d6
   123     vmull.u8    q15, d2, d3      /* top field */
   124     vmlal.u8    q15, d25, d6
   125 .endm
   126 
   127 .macro ARGBto565PixelAlpha_process_pixblock_tail
           /*
            * Second half of the 565 blend. It scales the three accumulators
            * left in q13-q15 by the head macro back down by the weight total
            * (31), then packs them into eight 16-bit pixels in q14.
            * The division is done as x = x + (x >> 5); x = (x + 16) >> 5.
            */
   128     vsra.u16    q13, #5          /* x += x >> 5 */
   129     vsra.u16    q14, #5
   130     vsra.u16    q15, #5
   131     vrshr.u16   q13, #5          /* x = (x + 16) >> 5 */
   132     vrshr.u16   q14, #5
   133     vrshr.u16   q15, #5
   134     vsli.u16    q14, q13, #5     /* insert middle field at bit 5, keeping bits 0..4 of q14 */
   135     vsli.u16    q14, q15, #11    /* insert top field at bit 11, keeping bits 0..10 */
   136 .endm
   137 
   138 .macro ARGBto565PixelAlpha_process_pixblock_tail_head
           /*
            * Steady-state loop body for the 565 blend. The more deeply
            * indented instructions are the tail for the previous block (scale
            * q13-q15, pack into q14, store); the rest is the head for the next
            * block. The next source block can be loaded into d0-d3 first
            * because the tail only touches q13-q15. The store of q14 must stay
            * ahead of the vmull that reuses q14 as an accumulator.
            *
            * The PF-prefixed lines are prefetch bookkeeping interleaved with
            * the NEON work. The PF macro and the PF_*, ORIG_W, DUMMY and
            * *_STRIDE symbols are defined outside this section (presumably in
            * pixman-arm-neon-asm.h -- confirm). As written, those lines:
            *   - advance PF_X by 8, and by a further 8 while the low nibble of
            *     PF_CTL is non-zero (decrementing that nibble each time);
            *   - pld the source and destination at pixel offset PF_X;
            *   - once PF_X >= ORIG_W, subtract ORIG_W from PF_X and 0x10 from
            *     PF_CTL, and if PF_CTL is still >= 0 step PF_SRC / PF_DST
            *     forward by one stride using a byte load with writeback.
            * None of the NEON instructions in between modify the ARM flags the
            * conditional PF instructions rely on.
            */
   139     vld4.8      {d0-d3}, [SRC]!              /* next source block, split into 4 byte planes */
   140                                     PF add PF_X, PF_X, #8
   141         vsra.u16    q13, #5                  /* previous block: x += x >> 5 */
   142                                     PF tst PF_CTL, #0xF
   143         vsra.u16    q14, #5
   144                                     PF addne PF_X, PF_X, #8
   145         vsra.u16    q15, #5
   146                                     PF subne PF_CTL, PF_CTL, #1
   147         vrshr.u16   q13, #5                  /* previous block: x = (x + 16) >> 5 */
   148                                     PF cmp PF_X, ORIG_W
   149         vrshr.u16   q14, #5
   150                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   151         vrshr.u16   q15, #5
   152                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   153     vld1.8      {d4-d5}, [DST_R]!            /* next eight 16-bit destination pixels into q2 */
   154                                     PF subge PF_X, PF_X, ORIG_W
   155         vsli.u16    q14, q13, #5             /* previous block: pack middle field at bit 5 */
   156                                     PF subges PF_CTL, PF_CTL, #0x10
   157         vsli.u16    q14, q15, #11            /* previous block: pack top field at bit 11 */
   158                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   159         vst1.8      {q14}, [DST_W :128]!     /* write out the previous block's packed pixels */
   160     vmvn        d6, d3           /* ~a, the destination weight before reduction */
   161     vshr.u8     d1, #2           /* source byte 1 -> 6 bits */
   162     vshr.u8     d3, #3           /* source alpha -> 5 bits */
   163     vshr.u8     d0, #3           /* source byte 0 -> 5 bits */
   164     vshrn.u16   d7, q2, #3       /* dest bits 3..10; reduced to the 6-bit middle field below */
   165     vshrn.u16   d25, q2, #8      /* dest bits 8..15; reduced to the 5-bit top field below */
   166     vbic.i16    q2, #0xe0        /* clear bits 5..7 so the low byte is just the low 5-bit field */
   167                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   168     vshr.u8     d6, #3           /* ~a -> 5 bits */
   169     vshr.u8     d7, #2           /* dest middle field (bits 5..10), 6 bits */
   170     vshr.u8     d2, #3           /* source byte 2 -> 5 bits */
   171     vmovn.u16   d24, q2          /* dest low field (bits 0..4) */
   172     vshr.u8     d25, #3          /* dest top field (bits 11..15) */
   173     vmull.u8    q13, d1, d3      /* middle field: src * a ... */
   174     vmlal.u8    q13, d7, d6      /* ... + dst * ~a */
   175     vmull.u8    q14, d0, d3      /* low field */
   176     vmlal.u8    q14, d24, d6
   177     vmull.u8    q15, d2, d3      /* top field */
   178     vmlal.u8    q15, d25, d6
   179 .endm
   180 
   181 generate_composite_function /* expands to the function named on the next line; the generator is defined outside this section, presumably in pixman-arm-neon-asm.h */ \
   182     BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, /* 32, 0, 16: presumably source, mask and destination bits per pixel (0 = no mask) -- confirm against the generator's definition */ \
   183     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, /* pixblock macros read the destination as well as writing it, and take the 32bpp source as per-byte planes */ \
   184     8, /* number of pixels, processed in a single block */ \
   185     6, /* prefetch distance */ \
   186     default_init, \
   187     default_cleanup, \
   188     ARGBto565PixelAlpha_process_pixblock_head, \
   189     ARGBto565PixelAlpha_process_pixblock_tail, \
   190     ARGBto565PixelAlpha_process_pixblock_tail_head