src/video/arm/pixman-arm-neon-asm.S
/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * Copyright (c) 2018 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    .text
    .fpu neon
    .arch armv7a
    .object_arch armv4
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    .arm
    .altmacro
    .p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * the performance of handling leading/trailing pixels for each scanline.
 * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
 * example on Linux, if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for ARM cores where PLD is set to work
 * as a NOP to work around some HW bugs, or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8,
 * which can run ARM and NEON instructions simultaneously, so that the extra ARM
 * instructions do not add (many) extra cycles but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fall back
 *       to the simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/******************************************************************************/

/* We can actually do significantly better than the Pixman macros, at least for
 * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
 * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
 */

.macro generate_fillrect_function name, bpp, log2Bpp
/*
 * void name(int32_t w, int32_t h, uint8_t *dst, int32_t dst_stride, uint8_t src);
 * On entry:
 * a1 = width, pixels
 * a2 = height, rows
 * a3 = pointer to top-left destination pixel
 * a4 = stride, pixels
 * [sp] = pixel value to fill with
 * Within the function:
 * v1 = width remaining
 * v2 = vst offset
 * v3 = alternate pointer
 * ip = data ARM register
 */
pixman_asm_function name
    vld1.\bpp   {d0[],d1[]}, [sp]
    sub         a4, a1
    vld1.\bpp   {d2[],d3[]}, [sp]
    cmp         a1, #(15+64) >> \log2Bpp
    push        {v1-v3,lr}
    vmov        ip, s0
    blo         51f

    /* Long-row case */
    mov         v2, #64
1:  mov         v1, a1
    ands        v3, a3, #15
    beq         2f
    /* Leading pixels */
    rsb         v3, v3, #16  /* number of leading bytes until 16-byte aligned */
    sub         v1, v1, v3, lsr #\log2Bpp
    rbit        v3, v3
.if bpp <= 16
.if bpp == 8
    tst         a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
    strneb      ip, [a3], #1
    tst         v3, #1<<30
.else
    tst         a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
.endif
    strneh      ip, [a3], #2
.endif
    movs        v3, v3, lsl #3  /* reversed count: N = "8 leading bytes" bit, C = "4 bytes" bit */
    vstmcs      a3!, {s0}       /* store 4 leading bytes if needed */
    vstmmi      a3!, {d0}       /* store 8 leading bytes if needed */
2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
    add         v3, a3, #32
    /* Inner loop */
3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
    subs        v1, v1, #64 >> \log2Bpp
    vst1.\bpp   {q0-q1}, [v3 :128], v2
    bhs         3b
    /* Trailing pixels */
4:  movs        v1, v1, lsl #27 + \log2Bpp  /* expose size bits: C = "32 trailing bytes", N = "16 bytes" */
    bcc         5f
    vst1.\bpp   {q0-q1}, [a3 :128]!
5:  bpl         6f
    vst1.\bpp   {q0}, [a3 :128]!
6:  movs        v1, v1, lsl #2  /* next bits: C = "8 bytes", N = "4 bytes" */
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
.if bpp <= 16
    movs        v1, v1, lsl #2  /* last bits: C = "2 bytes", N = "1 byte" */
    strcsh      ip, [a3], #2
.if bpp == 8
    strmib      ip, [a3], #1
.endif
.endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         1b
    pop         {v1-v3,pc}

    /* Short-row case */
51: movs        v1, a1
.if bpp == 8
    tst         a3, #3
    beq         53f
52: subs        v1, v1, #1
    blo         57f
    strb        ip, [a3], #1
    tst         a3, #3
    bne         52b
.elseif bpp == 16
    tstne       a3, #2
    subne       v1, v1, #1
    strneh      ip, [a3], #2
.endif
53: cmp         v1, #32 >> \log2Bpp
    bcc         54f
    vst1.\bpp   {q0-q1}, [a3]!
    sub         v1, v1, #32 >> \log2Bpp
    /* Trailing pixels */
54: movs        v1, v1, lsl #27 + \log2Bpp
    bcc         55f
    vst1.\bpp   {q0-q1}, [a3]!
55: bpl         56f
    vst1.\bpp   {q0}, [a3]!
56: movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
.if bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
.if bpp == 8
    strmib      ip, [a3], #1
.endif
.endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp
    bhi         51b
57: pop         {v1-v3,pc}

.endfunc
.endm

generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
generate_fillrect_function FillRect8ARMNEONAsm,  8,  0
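
/*
 * For reference, a hedged C-side sketch (not part of this file) of how the
 * fill entry points generated above might be declared and called, based on
 * the register/stack layout documented in the macro header. The prototype
 * and the stride-in-pixels convention are assumptions for illustration;
 * SDL's C sources hold the authoritative declarations.
 *
 *   #include <stdint.h>
 *
 *   void FillRect32ARMNEONAsm(int32_t w, int32_t h, uint32_t *dst,
 *                             int32_t dst_stride, uint32_t src);
 *
 *   static void fill_example(uint32_t *pixels, int pitch_bytes)
 *   {
 *       // Fill a 100x50 region with one ARGB8888 value; the stride is in
 *       // pixels, and the fill value is the fifth argument (on the stack).
 *       FillRect32ARMNEONAsm(100, 50, pixels, pitch_bytes >> 2, 0xFF102030u);
 *   }
 */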

/******************************************************************************/

.macro RGBtoRGBPixelAlpha_process_pixblock_head
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d4, d30
    vmull.u8    q0, d1, d3
    vmlal.u8    q0, d5, d30
    vmull.u8    q1, d2, d3
    vmlal.u8    q1, d6, d30
    vrshr.u16   q2, q14, #8
    vrshr.u16   q3, q0, #8
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm
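
/*
 * A scalar C sketch (illustrative only) of what the head macro above computes
 * for each colour channel: the usual "over" blend dst = (src*a + dst*(255-a))/255,
 * with the division by 255 performed to correct rounding by the
 * vrshr.u16/vraddhn.u16 pair:
 *
 *   static inline uint8_t blend_channel(uint8_t s, uint8_t d, uint8_t a)
 *   {
 *       unsigned t = s * a + d * (255u - a);       // vmull.u8 + vmlal.u8
 *       return (t + ((t + 128) >> 8) + 128) >> 8;  // vrshr.u16 + vraddhn.u16
 *   }
 *
 * The destination alpha byte is passed through unchanged (vmov d31, d7).
 */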

.macro RGBtoRGBPixelAlpha_process_pixblock_tail
    /* nothing */
.endm

.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
        vst4.8      {d28-d31}, [DST_W :128]!
                                    PF tst PF_CTL, #0xF
    vld4.8      {d4-d7}, [DST_R :128]!
                                    PF addne PF_X, PF_X, #8
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vmlal.u8    q14, d4, d30
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q0, d1, d3
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmlal.u8    q0, d5, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q1, d2, d3
                                    PF subge PF_X, PF_X, ORIG_W
    vmlal.u8    q1, d6, d30
                                    PF subges PF_CTL, PF_CTL, #0x10
    vrshr.u16   q2, q14, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vrshr.u16   q3, q0, #8
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm

generate_composite_function \
    BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    RGBtoRGBPixelAlpha_process_pixblock_head, \
    RGBtoRGBPixelAlpha_process_pixblock_tail, \
    RGBtoRGBPixelAlpha_process_pixblock_tail_head
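
/*
 * The entry point generated above follows pixman's composite-function calling
 * convention. A hedged C prototype (an assumption for illustration; the
 * authoritative declaration lives in SDL's C sources, and strides are in
 * pixels per the 32bpp src/dst arguments above) would look like:
 *
 *   void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h,
 *                                         uint32_t *dst, int32_t dst_stride,
 *                                         uint32_t *src, int32_t src_stride);
 */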

/******************************************************************************/

.macro ARGBto565PixelAlpha_process_pixblock_head
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm

.macro ARGBto565PixelAlpha_process_pixblock_tail
    vsra.u16    q13, #5
    vsra.u16    q14, #5
    vsra.u16    q15, #5
    vrshr.u16   q13, #5
    vrshr.u16   q14, #5
    vrshr.u16   q15, #5
    vsli.u16    q14, q13, #5
    vsli.u16    q14, q15, #11
.endm
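
/*
 * A scalar C sketch (illustrative only) of the blend performed by the
 * head/tail pair above. Source channels and alpha are first reduced to the
 * destination's 5/6/5 precision, then blended, with (x + (x >> 5) + 16) >> 5
 * (the vsra.u16/vrshr.u16 pair) standing in for division by 31:
 *
 *   static inline uint16_t blend_565(uint32_t argb, uint16_t d)
 *   {
 *       unsigned a  = (argb >> 24) >> 3, ia = (255u - (argb >> 24)) >> 3;
 *       unsigned sr = ((argb >> 16) & 0xFFu) >> 3;
 *       unsigned sg = ((argb >>  8) & 0xFFu) >> 2;
 *       unsigned sb = ( argb        & 0xFFu) >> 3;
 *       unsigned dr = (d >> 11) & 0x1F, dg = (d >> 5) & 0x3F, db = d & 0x1F;
 *       unsigned r = sr * a + dr * ia;  r = (r + (r >> 5) + 16) >> 5;
 *       unsigned g = sg * a + dg * ia;  g = (g + (g >> 5) + 16) >> 5;
 *       unsigned b = sb * a + db * ia;  b = (b + (b >> 5) + 16) >> 5;
 *       return (uint16_t)((r << 11) | (g << 5) | b);
 *   }
 */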

.macro ARGBto565PixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
        vsra.u16    q13, #5
                                    PF tst PF_CTL, #0xF
        vsra.u16    q14, #5
                                    PF addne PF_X, PF_X, #8
        vsra.u16    q15, #5
                                    PF subne PF_CTL, PF_CTL, #1
        vrshr.u16   q13, #5
                                    PF cmp PF_X, ORIG_W
        vrshr.u16   q14, #5
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        vrshr.u16   q15, #5
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vld1.8      {d4-d5}, [DST_R]!
                                    PF subge PF_X, PF_X, ORIG_W
        vsli.u16    q14, q13, #5
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsli.u16    q14, q15, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
        vst1.8      {q14}, [DST_W :128]!
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm

generate_composite_function \
    BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    6, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    ARGBto565PixelAlpha_process_pixblock_head, \
    ARGBto565PixelAlpha_process_pixblock_tail, \
    ARGBto565PixelAlpha_process_pixblock_tail_head