ARM: NEON assembly optimization for function BlitRGBtoRGBPixelAlpha
author      Ben Avison <bavison@riscosopen.org>
date        Thu, 31 Oct 2019 14:00:28 +0300
branch      SDL-1.2
changeset   13220 0ae1ddca5e85
parent      13219 4f88e197acad
child       13221 3705e81df6ff
ARM: NEON assembly optimization for function BlitRGBtoRGBPixelAlpha
---
src/video/SDL_blit_A.c | 20 +
src/video/arm/pixman-arm-neon-asm.S | 102 +++
src/video/arm/pixman-arm-neon-asm.h | 1218 +++++++++++++++++++++++++++
3 files changed, 1340 insertions(+)
create mode 100644 src/video/arm/pixman-arm-neon-asm.S
create mode 100644 src/video/arm/pixman-arm-neon-asm.h
     1.1 --- a/src/video/SDL_blit_A.c	Thu Oct 31 14:00:28 2019 +0300
     1.2 +++ b/src/video/SDL_blit_A.c	Thu Oct 31 14:00:28 2019 +0300
     1.3 @@ -1463,6 +1463,22 @@
     1.4  }
     1.5  #endif
     1.6  
     1.7 +#if SDL_ARM_NEON_BLITTERS
     1.8 +void BlitRGBtoRGBPixelAlphaARMNEONAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);
     1.9 +
    1.10 +static void BlitRGBtoRGBPixelAlphaARMNEON(SDL_BlitInfo *info)
    1.11 +{
    1.12 +	int32_t width = info->d_width;
    1.13 +	int32_t height = info->d_height;
    1.14 +	uint32_t *dstp = (uint32_t *)info->d_pixels;
    1.15 +	int32_t dststride = width + (info->d_skip >> 2);
    1.16 +	uint32_t *srcp = (uint32_t *)info->s_pixels;
    1.17 +	int32_t srcstride = width + (info->s_skip >> 2);
    1.18 +
    1.19 +	BlitRGBtoRGBPixelAlphaARMNEONAsm(width, height, dstp, dststride, srcp, srcstride);
    1.20 +}
    1.21 +#endif
    1.22 +
    1.23  /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
    1.24  static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
    1.25  {
    1.26 @@ -2892,6 +2908,10 @@
    1.27  				&& SDL_HasAltiVec())
    1.28  				return BlitRGBtoRGBPixelAlphaAltivec;
    1.29  #endif
    1.30 +#if SDL_ARM_NEON_BLITTERS
    1.31 +			if (SDL_HasARMNEON())
    1.32 +				return BlitRGBtoRGBPixelAlphaARMNEON;
    1.33 +#endif
    1.34  #if SDL_ARM_SIMD_BLITTERS
    1.35  			if (SDL_HasARMSIMD())
    1.36  				return BlitRGBtoRGBPixelAlphaARMSIMD;
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/src/video/arm/pixman-arm-neon-asm.S	Thu Oct 31 14:00:28 2019 +0300
     2.3 @@ -0,0 +1,102 @@
     2.4 +/*
     2.5 + * Copyright (c) 2018 RISC OS Open Ltd
     2.6 + *
     2.7 + * This software is provided 'as-is', without any express or implied
     2.8 + * warranty.  In no event will the authors be held liable for any damages
     2.9 + * arising from the use of this software.
    2.10 + *
    2.11 + * Permission is granted to anyone to use this software for any purpose,
    2.12 + * including commercial applications, and to alter it and redistribute it
    2.13 + * freely, subject to the following restrictions:
    2.14 + *
    2.15 + * 1. The origin of this software must not be misrepresented; you must not
    2.16 + *    claim that you wrote the original software. If you use this software
    2.17 + *    in a product, an acknowledgment in the product documentation would be
    2.18 + *    appreciated but is not required.
    2.19 + * 2. Altered source versions must be plainly marked as such, and must not be
    2.20 + *    misrepresented as being the original software.
    2.21 + * 3. This notice may not be removed or altered from any source distribution.
    2.22 + */
    2.23 +
    2.24 +/* Prevent the stack from becoming executable for no reason... */
    2.25 +#if defined(__linux__) && defined(__ELF__)
    2.26 +.section .note.GNU-stack,"",%progbits
    2.27 +#endif
    2.28 +
    2.29 +    .text
    2.30 +    .fpu neon
    2.31 +    .arch armv7a
    2.32 +    .object_arch armv4
    2.33 +    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    2.34 +    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    2.35 +    .arm
    2.36 +    .altmacro
    2.37 +    .p2align 2
    2.38 +
    2.39 +#include "pixman-arm-asm.h"
    2.40 +#include "pixman-arm-neon-asm.h"
    2.41 +
    2.42 +/******************************************************************************/
    2.43 +
    2.44 +.macro RGBtoRGBPixelAlpha_process_pixblock_head
    2.45 +    vmvn        d30, d3  /* get inverted source alpha */
    2.46 +    vmov        d31, d7  /* dest alpha is always unchanged */
    2.47 +    vmull.u8    q14, d0, d3
    2.48 +    vmlal.u8    q14, d4, d30
    2.49 +    vmull.u8    q0, d1, d3
    2.50 +    vmlal.u8    q0, d5, d30
    2.51 +    vmull.u8    q1, d2, d3
    2.52 +    vmlal.u8    q1, d6, d30
    2.53 +    vrshr.u16   q2, q14, #8
    2.54 +    vrshr.u16   q3, q0, #8
    2.55 +    vraddhn.u16 d28, q14, q2
    2.56 +    vrshr.u16   q2, q1, #8
    2.57 +    vraddhn.u16 d29, q0, q3
    2.58 +    vraddhn.u16 d30, q1, q2
    2.59 +.endm
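
For reference, the head macro above computes, per colour channel, dst = (src * alpha + dst * (255 - alpha)) / 255 with rounding: the vmull/vmlal pair forms the 16-bit product sum and the vrshr.u16 #8 / vraddhn.u16 pair performs the usual rounded division by 255, while the destination alpha plane (d7) is carried through unchanged. A scalar C sketch of the same per-pixel operation, assuming a 32bpp format with alpha in the top byte (the helper names are illustrative, not part of the patch):

    #include <stdint.h>

    /* (x + 128 + ((x + 128) >> 8)) >> 8 == x / 255 with rounding; this is what
       the vrshr.u16 #8 followed by vraddhn.u16 computes for each channel. */
    static uint8_t div255(uint32_t x)
    {
        return (uint8_t)((x + 128 + ((x + 128) >> 8)) >> 8);
    }

    /* Blend one source pixel over a destination pixel using the per-pixel
       source alpha; the destination alpha byte is left unchanged (d31 = d7). */
    static uint32_t blend_over(uint32_t s, uint32_t d)
    {
        uint32_t a = s >> 24, na = 255u - a;                 /* d3, vmvn -> d30 */
        uint32_t r = div255(((s >> 16) & 0xFF) * a + ((d >> 16) & 0xFF) * na);
        uint32_t g = div255(((s >>  8) & 0xFF) * a + ((d >>  8) & 0xFF) * na);
        uint32_t b = div255(( s        & 0xFF) * a + ( d        & 0xFF) * na);
        return (d & 0xFF000000u) | (r << 16) | (g << 8) | b;
    }
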
    2.60 +
    2.61 +.macro RGBtoRGBPixelAlpha_process_pixblock_tail
    2.62 +    /* nothing */
    2.63 +.endm
    2.64 +
    2.65 +.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
    2.66 +    vld4.8      {d0-d3}, [SRC]!
    2.67 +                                    PF add PF_X, PF_X, #8
    2.68 +        vst4.8      {d28-d31}, [DST_W :128]!
    2.69 +                                    PF tst PF_CTL, #0xF
    2.70 +    vld4.8      {d4-d7}, [DST_R :128]!
    2.71 +                                    PF addne PF_X, PF_X, #8
    2.72 +    vmvn        d30, d3  /* get inverted source alpha */
    2.73 +    vmov        d31, d7  /* dest alpha is always unchanged */
    2.74 +    vmull.u8    q14, d0, d3
    2.75 +                                    PF subne PF_CTL, PF_CTL, #1
    2.76 +    vmlal.u8    q14, d4, d30
    2.77 +                                    PF cmp PF_X, ORIG_W
    2.78 +    vmull.u8    q0, d1, d3
    2.79 +                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    2.80 +    vmlal.u8    q0, d5, d30
    2.81 +                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    2.82 +    vmull.u8    q1, d2, d3
    2.83 +                                    PF subge PF_X, PF_X, ORIG_W
    2.84 +    vmlal.u8    q1, d6, d30
    2.85 +                                    PF subges PF_CTL, PF_CTL, #0x10
    2.86 +    vrshr.u16   q2, q14, #8
    2.87 +                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    2.88 +    vrshr.u16   q3, q0, #8
    2.89 +                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    2.90 +    vraddhn.u16 d28, q14, q2
    2.91 +    vrshr.u16   q2, q1, #8
    2.92 +    vraddhn.u16 d29, q0, q3
    2.93 +    vraddhn.u16 d30, q1, q2
    2.94 +.endm
    2.95 +
    2.96 +generate_composite_function \
    2.97 +    BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
    2.98 +    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    2.99 +    8, /* number of pixels, processed in a single block */ \
   2.100 +    5, /* prefetch distance */ \
   2.101 +    default_init, \
   2.102 +    default_cleanup, \
   2.103 +    RGBtoRGBPixelAlpha_process_pixblock_head, \
   2.104 +    RGBtoRGBPixelAlpha_process_pixblock_tail, \
   2.105 +    RGBtoRGBPixelAlpha_process_pixblock_tail_head
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/src/video/arm/pixman-arm-neon-asm.h	Thu Oct 31 14:00:28 2019 +0300
     3.3 @@ -0,0 +1,1218 @@
     3.4 +/*
     3.5 + * Copyright © 2009 Nokia Corporation
     3.6 + *
     3.7 + * Permission is hereby granted, free of charge, to any person obtaining a
     3.8 + * copy of this software and associated documentation files (the "Software"),
     3.9 + * to deal in the Software without restriction, including without limitation
    3.10 + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
    3.11 + * and/or sell copies of the Software, and to permit persons to whom the
    3.12 + * Software is furnished to do so, subject to the following conditions:
    3.13 + *
    3.14 + * The above copyright notice and this permission notice (including the next
    3.15 + * paragraph) shall be included in all copies or substantial portions of the
    3.16 + * Software.
    3.17 + *
    3.18 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    3.19 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    3.20 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
    3.21 + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    3.22 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    3.23 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    3.24 + * DEALINGS IN THE SOFTWARE.
    3.25 + *
    3.26 + * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
    3.27 + */
    3.28 +
    3.29 +/*
    3.30 + * This file contains a macro ('generate_composite_function') which can
    3.31 + * construct 2D image processing functions, based on a common template.
     3.33 + * Any combination of source, destination and mask images with 8bpp,
     3.34 + * 16bpp, 24bpp or 32bpp color formats is supported.
    3.34 + *
    3.35 + * This macro takes care of:
    3.36 + *  - handling of leading and trailing unaligned pixels
    3.37 + *  - doing most of the work related to L2 cache preload
     3.38 + *  - encouraging the use of software pipelining for better instruction
     3.39 + *    scheduling
    3.40 + *
    3.41 + * The user of this macro has to provide some configuration parameters
    3.42 + * (bit depths for the images, prefetch distance, etc.) and a set of
     3.43 + * macros which should implement the basic code chunks responsible for
     3.44 + * pixel processing. See the 'pixman-arm-neon-asm.S' file for usage
     3.45 + * examples.
    3.46 + *
    3.47 + * TODO:
    3.48 + *  - try overlapped pixel method (from Ian Rickards) when processing
    3.49 + *    exactly two blocks of pixels
    3.50 + *  - maybe add an option to do reverse scanline processing
    3.51 + */
    3.52 +
    3.53 +/*
    3.54 + * Bit flags for 'generate_composite_function' macro which are used
    3.55 + * to tune generated functions behavior.
    3.56 + */
    3.57 +.set FLAG_DST_WRITEONLY,       0
    3.58 +.set FLAG_DST_READWRITE,       1
    3.59 +.set FLAG_DEINTERLEAVE_32BPP,  2
    3.60 +
    3.61 +/*
    3.62 + * Offset in stack where mask and source pointer/stride can be accessed
    3.63 + * from 'init' macro. This is useful for doing special handling for solid mask.
    3.64 + */
    3.65 +.set ARGS_STACK_OFFSET,        40
    3.66 +
    3.67 +/*
    3.68 + * Constants for selecting preferable prefetch type.
    3.69 + */
    3.70 +.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
    3.71 +.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
    3.72 +.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
    3.73 +
    3.74 +/*
    3.75 + * Definitions of supplementary pixld/pixst macros (for partial load/store of
    3.76 + * pixel data).
    3.77 + */
    3.78 +
    3.79 +.macro pixldst1 op, elem_size, reg1, mem_operand, abits
    3.80 +.if abits > 0
    3.81 +    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
    3.82 +.else
    3.83 +    op&.&elem_size {d&reg1}, [&mem_operand&]!
    3.84 +.endif
    3.85 +.endm
    3.86 +
    3.87 +.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
    3.88 +.if abits > 0
    3.89 +    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
    3.90 +.else
    3.91 +    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
    3.92 +.endif
    3.93 +.endm
    3.94 +
    3.95 +.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
    3.96 +.if abits > 0
    3.97 +    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
    3.98 +.else
    3.99 +    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
   3.100 +.endif
   3.101 +.endm
   3.102 +
   3.103 +.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
   3.104 +    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
   3.105 +.endm
   3.106 +
   3.107 +.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
   3.108 +    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
   3.109 +.endm
   3.110 +
   3.111 +.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
   3.112 +    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
   3.113 +.endm
   3.114 +
   3.115 +.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
   3.116 +.if numbytes == 32
   3.117 +    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
   3.118 +                              %(basereg+6), %(basereg+7), mem_operand, abits
   3.119 +.elseif numbytes == 16
   3.120 +    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
   3.121 +.elseif numbytes == 8
   3.122 +    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
   3.123 +.elseif numbytes == 4
   3.124 +    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
   3.125 +        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
   3.126 +    .elseif elem_size == 16
   3.127 +        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
   3.128 +        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
   3.129 +    .else
   3.130 +        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
   3.131 +        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
   3.132 +        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
   3.133 +        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
   3.134 +    .endif
   3.135 +.elseif numbytes == 2
   3.136 +    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
   3.137 +        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
   3.138 +    .else
   3.139 +        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
   3.140 +        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
   3.141 +    .endif
   3.142 +.elseif numbytes == 1
   3.143 +    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
   3.144 +.else
   3.145 +    .error "unsupported size: numbytes"
   3.146 +.endif
   3.147 +.endm
   3.148 +
   3.149 +.macro pixld numpix, bpp, basereg, mem_operand, abits=0
   3.150 +.if bpp > 0
   3.151 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   3.152 +    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
   3.153 +                      %(basereg+6), %(basereg+7), mem_operand, abits
   3.154 +.elseif (bpp == 24) && (numpix == 8)
   3.155 +    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
   3.156 +.elseif (bpp == 24) && (numpix == 4)
   3.157 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
   3.158 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
   3.159 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
   3.160 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
   3.161 +.elseif (bpp == 24) && (numpix == 2)
   3.162 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
   3.163 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
   3.164 +.elseif (bpp == 24) && (numpix == 1)
   3.165 +    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
   3.166 +.else
   3.167 +    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
   3.168 +.endif
   3.169 +.endif
   3.170 +.endm
   3.171 +
   3.172 +.macro pixst numpix, bpp, basereg, mem_operand, abits=0
   3.173 +.if bpp > 0
   3.174 +.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   3.175 +    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
   3.176 +                      %(basereg+6), %(basereg+7), mem_operand, abits
   3.177 +.elseif (bpp == 24) && (numpix == 8)
   3.178 +    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
   3.179 +.elseif (bpp == 24) && (numpix == 4)
   3.180 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
   3.181 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
   3.182 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
   3.183 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
   3.184 +.elseif (bpp == 24) && (numpix == 2)
   3.185 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
   3.186 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
   3.187 +.elseif (bpp == 24) && (numpix == 1)
   3.188 +    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
   3.189 +.else
   3.190 +    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
   3.191 +.endif
   3.192 +.endif
   3.193 +.endm
   3.194 +
   3.195 +.macro pixld_a numpix, bpp, basereg, mem_operand
   3.196 +.if (bpp * numpix) <= 128
   3.197 +    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
   3.198 +.else
   3.199 +    pixld numpix, bpp, basereg, mem_operand, 128
   3.200 +.endif
   3.201 +.endm
   3.202 +
   3.203 +.macro pixst_a numpix, bpp, basereg, mem_operand
   3.204 +.if (bpp * numpix) <= 128
   3.205 +    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
   3.206 +.else
   3.207 +    pixst numpix, bpp, basereg, mem_operand, 128
   3.208 +.endif
   3.209 +.endm
   3.210 +
   3.211 +/*
   3.212 + * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
   3.213 + * aliases to be defined)
   3.214 + */
   3.215 +.macro pixld1_s elem_size, reg1, mem_operand
   3.216 +.if elem_size == 16
   3.217 +    mov     TMP1, VX, asr #16
   3.218 +    adds    VX, VX, UNIT_X
   3.219 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   3.220 +    bpl     5b
   3.221 +    add     TMP1, mem_operand, TMP1, asl #1
   3.222 +    mov     TMP2, VX, asr #16
   3.223 +    adds    VX, VX, UNIT_X
   3.224 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   3.225 +    bpl     5b
   3.226 +    add     TMP2, mem_operand, TMP2, asl #1
   3.227 +    vld1.16 {d&reg1&[0]}, [TMP1, :16]
   3.228 +    mov     TMP1, VX, asr #16
   3.229 +    adds    VX, VX, UNIT_X
   3.230 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   3.231 +    bpl     5b
   3.232 +    add     TMP1, mem_operand, TMP1, asl #1
   3.233 +    vld1.16 {d&reg1&[1]}, [TMP2, :16]
   3.234 +    mov     TMP2, VX, asr #16
   3.235 +    adds    VX, VX, UNIT_X
   3.236 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   3.237 +    bpl     5b
   3.238 +    add     TMP2, mem_operand, TMP2, asl #1
   3.239 +    vld1.16 {d&reg1&[2]}, [TMP1, :16]
   3.240 +    vld1.16 {d&reg1&[3]}, [TMP2, :16]
   3.241 +.elseif elem_size == 32
   3.242 +    mov     TMP1, VX, asr #16
   3.243 +    adds    VX, VX, UNIT_X
   3.244 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   3.245 +    bpl     5b
   3.246 +    add     TMP1, mem_operand, TMP1, asl #2
   3.247 +    mov     TMP2, VX, asr #16
   3.248 +    adds    VX, VX, UNIT_X
   3.249 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   3.250 +    bpl     5b
   3.251 +    add     TMP2, mem_operand, TMP2, asl #2
   3.252 +    vld1.32 {d&reg1&[0]}, [TMP1, :32]
   3.253 +    vld1.32 {d&reg1&[1]}, [TMP2, :32]
   3.254 +.else
   3.255 +    .error "unsupported"
   3.256 +.endif
   3.257 +.endm
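
The '5:' local loops above implement the fixed-point stepping shared by all of the nearest-scaling fetchers: the source index is taken from the top half of the 16.16 coordinate VX, VX is advanced by UNIT_X, and SRC_WIDTH_FIXED is then subtracted for as long as the value stays non-negative, so the coordinate keeps wrapping within one source width. A literal C rendering of one such step for the 32bpp case (a sketch only; the names mirror the register aliases):

    #include <stdint.h>

    /* vx and unit_x are 16.16 fixed point, src_width_fixed is the source width
       in the same format.  Mirrors: mov TMP, VX, asr #16 / adds / subpls / bpl. */
    static const uint32_t *fetch_nearest_32(const uint32_t *src, int32_t *vx,
                                            int32_t unit_x, int32_t src_width_fixed)
    {
        const uint32_t *p = src + (*vx >> 16);  /* arithmetic shift, as in 'asr'     */
        *vx += unit_x;                          /* adds    VX, VX, UNIT_X            */
        while (*vx >= 0)                        /* 5: subpls VX, VX, SRC_WIDTH_FIXED */
            *vx -= src_width_fixed;             /*    bpl  5b                        */
        return p;
    }
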
   3.258 +
   3.259 +.macro pixld2_s elem_size, reg1, reg2, mem_operand
   3.260 +.if 0 /* elem_size == 32 */
   3.261 +    mov     TMP1, VX, asr #16
   3.262 +    add     VX, VX, UNIT_X, asl #1
   3.263 +    add     TMP1, mem_operand, TMP1, asl #2
   3.264 +    mov     TMP2, VX, asr #16
   3.265 +    sub     VX, VX, UNIT_X
   3.266 +    add     TMP2, mem_operand, TMP2, asl #2
   3.267 +    vld1.32 {d&reg1&[0]}, [TMP1, :32]
   3.268 +    mov     TMP1, VX, asr #16
   3.269 +    add     VX, VX, UNIT_X, asl #1
   3.270 +    add     TMP1, mem_operand, TMP1, asl #2
   3.271 +    vld1.32 {d&reg2&[0]}, [TMP2, :32]
   3.272 +    mov     TMP2, VX, asr #16
   3.273 +    add     VX, VX, UNIT_X
   3.274 +    add     TMP2, mem_operand, TMP2, asl #2
   3.275 +    vld1.32 {d&reg1&[1]}, [TMP1, :32]
   3.276 +    vld1.32 {d&reg2&[1]}, [TMP2, :32]
   3.277 +.else
   3.278 +    pixld1_s elem_size, reg1, mem_operand
   3.279 +    pixld1_s elem_size, reg2, mem_operand
   3.280 +.endif
   3.281 +.endm
   3.282 +
   3.283 +.macro pixld0_s elem_size, reg1, idx, mem_operand
   3.284 +.if elem_size == 16
   3.285 +    mov     TMP1, VX, asr #16
   3.286 +    adds    VX, VX, UNIT_X
   3.287 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   3.288 +    bpl     5b
   3.289 +    add     TMP1, mem_operand, TMP1, asl #1
   3.290 +    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
   3.291 +.elseif elem_size == 32
   3.292 +    mov     TMP1, VX, asr #16
   3.293 +    adds    VX, VX, UNIT_X
   3.294 +5:  subpls  VX, VX, SRC_WIDTH_FIXED
   3.295 +    bpl     5b
   3.296 +    add     TMP1, mem_operand, TMP1, asl #2
   3.297 +    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
   3.298 +.endif
   3.299 +.endm
   3.300 +
   3.301 +.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
   3.302 +.if numbytes == 32
   3.303 +    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
   3.304 +    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
   3.305 +    pixdeinterleave elem_size, %(basereg+4)
   3.306 +.elseif numbytes == 16
   3.307 +    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
   3.308 +.elseif numbytes == 8
   3.309 +    pixld1_s elem_size, %(basereg+1), mem_operand
   3.310 +.elseif numbytes == 4
   3.311 +    .if elem_size == 32
   3.312 +        pixld0_s elem_size, %(basereg+0), 1, mem_operand
   3.313 +    .elseif elem_size == 16
   3.314 +        pixld0_s elem_size, %(basereg+0), 2, mem_operand
   3.315 +        pixld0_s elem_size, %(basereg+0), 3, mem_operand
   3.316 +    .else
   3.317 +        pixld0_s elem_size, %(basereg+0), 4, mem_operand
   3.318 +        pixld0_s elem_size, %(basereg+0), 5, mem_operand
   3.319 +        pixld0_s elem_size, %(basereg+0), 6, mem_operand
   3.320 +        pixld0_s elem_size, %(basereg+0), 7, mem_operand
   3.321 +    .endif
   3.322 +.elseif numbytes == 2
   3.323 +    .if elem_size == 16
   3.324 +        pixld0_s elem_size, %(basereg+0), 1, mem_operand
   3.325 +    .else
   3.326 +        pixld0_s elem_size, %(basereg+0), 2, mem_operand
   3.327 +        pixld0_s elem_size, %(basereg+0), 3, mem_operand
   3.328 +    .endif
   3.329 +.elseif numbytes == 1
   3.330 +    pixld0_s elem_size, %(basereg+0), 1, mem_operand
   3.331 +.else
   3.332 +    .error "unsupported size: numbytes"
   3.333 +.endif
   3.334 +.endm
   3.335 +
   3.336 +.macro pixld_s numpix, bpp, basereg, mem_operand
   3.337 +.if bpp > 0
   3.338 +    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
   3.339 +.endif
   3.340 +.endm
   3.341 +
   3.342 +.macro vuzp8 reg1, reg2
   3.343 +    vuzp.8 d&reg1, d&reg2
   3.344 +.endm
   3.345 +
   3.346 +.macro vzip8 reg1, reg2
   3.347 +    vzip.8 d&reg1, d&reg2
   3.348 +.endm
   3.349 +
   3.350 +/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
   3.351 +.macro pixdeinterleave bpp, basereg
   3.352 +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   3.353 +    vuzp8 %(basereg+0), %(basereg+1)
   3.354 +    vuzp8 %(basereg+2), %(basereg+3)
   3.355 +    vuzp8 %(basereg+1), %(basereg+3)
   3.356 +    vuzp8 %(basereg+0), %(basereg+2)
   3.357 +.endif
   3.358 +.endm
   3.359 +
   3.360 +/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
   3.361 +.macro pixinterleave bpp, basereg
   3.362 +.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   3.363 +    vzip8 %(basereg+0), %(basereg+2)
   3.364 +    vzip8 %(basereg+1), %(basereg+3)
   3.365 +    vzip8 %(basereg+2), %(basereg+3)
   3.366 +    vzip8 %(basereg+0), %(basereg+1)
   3.367 +.endif
   3.368 +.endm
   3.369 +
   3.370 +/*
   3.371 + * This is a macro for implementing cache preload. The main idea is that
    3.372 + * cache preload logic is mostly independent of the rest of the pixel
    3.373 + * processing code. It starts at the top-left pixel and moves forward
    3.374 + * across pixels, and can jump across scanlines. Prefetch distance is
    3.375 + * handled in an 'incremental' way: it starts from 0 and advances to the
    3.376 + * optimal distance over time. After reaching the optimal prefetch distance,
    3.377 + * it is kept constant. There are some checks which prevent prefetching
    3.378 + * unneeded pixel lines below the image (though it can still prefetch a bit
    3.379 + * of extra data on the right side of the image - not a big issue, and it
    3.380 + * may actually be helpful when rendering text glyphs). An additional trick
    3.381 + * is the use of an LDR instruction for prefetch instead of PLD when moving
    3.382 + * to the next line; the point is that we have a high chance of taking a
    3.383 + * TLB miss in this case, and PLD would be useless.
   3.384 + *
    3.385 + * This sounds like it may introduce a noticeable overhead (when working with
    3.386 + * fully cached data). But in reality, due to having a separate pipeline and
    3.387 + * instruction queue for the NEON unit in the ARM Cortex-A8, normal ARM code
    3.388 + * can execute simultaneously with NEON and be completely shadowed by it. Thus
    3.389 + * we get no performance overhead at all (*). This looks like a very nice
    3.390 + * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
    3.391 + * but we can still implement some rather advanced prefetch logic in software
    3.392 + * for almost zero cost!
   3.393 + *
    3.394 + * (*) The overhead of the prefetcher is visible when running some trivial
    3.395 + * pixel processing like a simple copy. Anyway, having prefetch is a must
    3.396 + * when working with graphics data.
   3.397 + */
   3.398 +.macro PF a, x:vararg
   3.399 +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
   3.400 +    a x
   3.401 +.endif
   3.402 +.endm
   3.403 +
   3.404 +.macro cache_preload std_increment, boost_increment
   3.405 +.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
   3.406 +.if regs_shortage
   3.407 +    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
   3.408 +.endif
   3.409 +.if std_increment != 0
   3.410 +    PF add PF_X, PF_X, #std_increment
   3.411 +.endif
   3.412 +    PF tst PF_CTL, #0xF
   3.413 +    PF addne PF_X, PF_X, #boost_increment
   3.414 +    PF subne PF_CTL, PF_CTL, #1
   3.415 +    PF cmp PF_X, ORIG_W
   3.416 +.if src_bpp_shift >= 0
   3.417 +    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   3.418 +.endif
   3.419 +.if dst_r_bpp != 0
   3.420 +    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   3.421 +.endif
   3.422 +.if mask_bpp_shift >= 0
   3.423 +    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
   3.424 +.endif
   3.425 +    PF subge PF_X, PF_X, ORIG_W
   3.426 +    PF subges PF_CTL, PF_CTL, #0x10
   3.427 +.if src_bpp_shift >= 0
   3.428 +    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   3.429 +.endif
   3.430 +.if dst_r_bpp != 0
   3.431 +    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   3.432 +.endif
   3.433 +.if mask_bpp_shift >= 0
   3.434 +    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
   3.435 +.endif
   3.436 +.endif
   3.437 +.endm
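
In rough C terms, every invocation of cache_preload advances a look-ahead pixel index PF_X along the scanline, issues PLD at that position for the source and destination, and once PF_X passes the scanline width it wraps to the next line using a dummy LDR (rather than PLD) so that a possible TLB miss is actually serviced. A sketch of that state machine, for illustration only: the structure and function below are not part of the patch, source and destination are shown with one shared bpp shift and stride for brevity, and __builtin_prefetch stands in for the pld instruction.

    #include <stdint.h>
    #include <stddef.h>

    struct prefetch_state {
        uint32_t x;          /* PF_X: look-ahead pixel index within the scanline  */
        int32_t  ctl;        /* PF_CTL: (lines - 1) << 4 | distance still to ramp */
        const uint8_t *src;  /* PF_SRC: start of the scanline being prefetched    */
        const uint8_t *dst;  /* PF_DST */
    };

    static void cache_preload_model(struct prefetch_state *pf,
                                    uint32_t std_inc, uint32_t boost_inc,
                                    uint32_t width, unsigned bpp_shift,
                                    ptrdiff_t stride_pixels)
    {
        pf->x += std_inc;
        if (pf->ctl & 0xF) {                 /* still ramping up the distance */
            pf->x += boost_inc;
            pf->ctl -= 1;
        }
        __builtin_prefetch(pf->src + ((size_t)pf->x << bpp_shift));  /* PF pld */
        __builtin_prefetch(pf->dst + ((size_t)pf->x << bpp_shift));
        if (pf->x >= width) {                /* look-ahead crossed the line end */
            pf->x -= width;
            pf->ctl -= 0x10;                 /* one prefetch scanline consumed  */
            if (pf->ctl >= 0) {
                /* move the prefetch pointers to the next line and touch one
                   byte with a real load, as the ldrgeb with writeback does */
                pf->src += stride_pixels << bpp_shift;
                pf->dst += stride_pixels << bpp_shift;
                (void)*(volatile const uint8_t *)pf->src;
                (void)*(volatile const uint8_t *)pf->dst;
            }
        }
    }
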
   3.438 +
   3.439 +.macro cache_preload_simple
   3.440 +.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
   3.441 +.if src_bpp > 0
   3.442 +    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
   3.443 +.endif
   3.444 +.if dst_r_bpp > 0
   3.445 +    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
   3.446 +.endif
   3.447 +.if mask_bpp > 0
   3.448 +    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
   3.449 +.endif
   3.450 +.endif
   3.451 +.endm
   3.452 +
   3.453 +.macro fetch_mask_pixblock
   3.454 +    pixld       pixblock_size, mask_bpp, \
   3.455 +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
   3.456 +.endm
   3.457 +
   3.458 +/*
    3.459 + * Macro which is used to process leading pixels until the destination
    3.460 + * pointer is properly aligned (at a 16-byte boundary). This is skipped
    3.461 + * entirely for 24bpp destination buffers.
   3.462 + */
   3.463 +.macro ensure_destination_ptr_alignment process_pixblock_head, \
   3.464 +                                        process_pixblock_tail, \
   3.465 +                                        process_pixblock_tail_head
   3.466 +.if dst_w_bpp != 24
   3.467 +    tst         DST_R, #0xF
   3.468 +    beq         2f
   3.469 +
   3.470 +.irp lowbit, 1, 2, 4, 8, 16
   3.471 +local skip1
   3.472 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
   3.473 +.if lowbit < 16 /* we don't need more than 16-byte alignment */
   3.474 +    tst         DST_R, #lowbit
   3.475 +    beq         1f
   3.476 +.endif
   3.477 +    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
   3.478 +    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
   3.479 +.if dst_r_bpp > 0
   3.480 +    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
   3.481 +.else
   3.482 +    add         DST_R, DST_R, #lowbit
   3.483 +.endif
   3.484 +    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
   3.485 +    sub         W, W, #(lowbit * 8 / dst_w_bpp)
   3.486 +1:
   3.487 +.endif
   3.488 +.endr
   3.489 +    pixdeinterleave src_bpp, src_basereg
   3.490 +    pixdeinterleave mask_bpp, mask_basereg
   3.491 +    pixdeinterleave dst_r_bpp, dst_r_basereg
   3.492 +
   3.493 +    process_pixblock_head
   3.494 +    cache_preload 0, pixblock_size
   3.495 +    cache_preload_simple
   3.496 +    process_pixblock_tail
   3.497 +
   3.498 +    pixinterleave dst_w_bpp, dst_w_basereg
   3.499 +.irp lowbit, 1, 2, 4, 8, 16
   3.500 +.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
   3.501 +.if lowbit < 16 /* we don't need more than 16-byte alignment */
   3.502 +    tst         DST_W, #lowbit
   3.503 +    beq         1f
   3.504 +.endif
   3.505 +    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
   3.506 +1:
   3.507 +.endif
   3.508 +.endr
   3.509 +.endif
   3.510 +2:
   3.511 +.endm
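
The idea of the macro above: when the write pointer is not 16-byte aligned, consume leading pixels in power-of-two sized groups (driven by the low bits of the address) until the bulk loop can use :128-aligned loads and stores. As a back-of-the-envelope formula, assuming the pointer is at least pixel-aligned and the pixel size is a power of two (which holds for every format this template accepts except 24bpp, where the whole step is skipped):

    #include <stdint.h>

    /* Number of leading pixels to process before the destination pointer
       reaches a 16-byte boundary.  Illustrative only. */
    static unsigned leading_pixels_to_align(const void *dst, unsigned bytes_per_pixel)
    {
        unsigned misalign = (unsigned)((uintptr_t)dst & 15);
        return misalign ? (16 - misalign) / bytes_per_pixel : 0;
    }
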
   3.512 +
   3.513 +/*
   3.514 + * Special code for processing up to (pixblock_size - 1) remaining
    3.515 + * trailing pixels. As the SIMD processing operates on pixblock_size
    3.516 + * pixels at a time, anything smaller than this has to be loaded
    3.517 + * and stored in a special way. Loading and storing of pixel data is
    3.518 + * performed in such a way that we fill some 'slots' in the NEON
    3.519 + * registers (some slots naturally are unused), then perform the
    3.520 + * compositing operation as usual. In the end, the data is taken from
    3.521 + * these 'slots' and saved to memory.
    3.522 + *
    3.523 + * cache_preload_flag - set to 0 to suppress
    3.524 + *                      prefetch
    3.525 + * dst_aligned_flag   - indicates whether the destination
    3.526 + *                      buffer is aligned
   3.527 + */
   3.528 +.macro process_trailing_pixels cache_preload_flag, \
   3.529 +                               dst_aligned_flag, \
   3.530 +                               process_pixblock_head, \
   3.531 +                               process_pixblock_tail, \
   3.532 +                               process_pixblock_tail_head
   3.533 +    tst         W, #(pixblock_size - 1)
   3.534 +    beq         2f
   3.535 +.irp chunk_size, 16, 8, 4, 2, 1
   3.536 +.if pixblock_size > chunk_size
   3.537 +    tst         W, #chunk_size
   3.538 +    beq         1f
   3.539 +    pixld_src   chunk_size, src_bpp, src_basereg, SRC
   3.540 +    pixld       chunk_size, mask_bpp, mask_basereg, MASK
   3.541 +.if dst_aligned_flag != 0
   3.542 +    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
   3.543 +.else
   3.544 +    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
   3.545 +.endif
   3.546 +.if cache_preload_flag != 0
   3.547 +    PF add      PF_X, PF_X, #chunk_size
   3.548 +.endif
   3.549 +1:
   3.550 +.endif
   3.551 +.endr
   3.552 +    pixdeinterleave src_bpp, src_basereg
   3.553 +    pixdeinterleave mask_bpp, mask_basereg
   3.554 +    pixdeinterleave dst_r_bpp, dst_r_basereg
   3.555 +
   3.556 +    process_pixblock_head
   3.557 +.if cache_preload_flag != 0
   3.558 +    cache_preload 0, pixblock_size
   3.559 +    cache_preload_simple
   3.560 +.endif
   3.561 +    process_pixblock_tail
   3.562 +    pixinterleave dst_w_bpp, dst_w_basereg
   3.563 +.irp chunk_size, 16, 8, 4, 2, 1
   3.564 +.if pixblock_size > chunk_size
   3.565 +    tst         W, #chunk_size
   3.566 +    beq         1f
   3.567 +.if dst_aligned_flag != 0
   3.568 +    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
   3.569 +.else
   3.570 +    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
   3.571 +.endif
   3.572 +1:
   3.573 +.endif
   3.574 +.endr
   3.575 +2:
   3.576 +.endm
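
Put differently: the remainder W mod pixblock_size is decomposed into power-of-two chunks, each present chunk is loaded into the next free register "slots", one ordinary head/tail pass then processes the partially filled registers, and the same chunks are stored back out. A compact C outline of that control flow (illustrative only; the load/store bodies are placeholders):

    /* Outline of process_trailing_pixels for a remaining scanline width w. */
    static void trailing_pixels_outline(unsigned w, unsigned pixblock_size)
    {
        unsigned chunk;
        if ((w & (pixblock_size - 1)) == 0)        /* tst W, #(pixblock_size - 1) */
            return;
        for (chunk = 16; chunk >= 1; chunk >>= 1)  /* .irp chunk_size, 16,8,4,2,1 */
            if (pixblock_size > chunk && (w & chunk)) {
                /* load 'chunk' src/mask/dst pixels into the next free slots */
            }
        /* one normal process_pixblock_head ... process_pixblock_tail pass
           runs over the partially filled registers here */
        for (chunk = 16; chunk >= 1; chunk >>= 1)
            if (pixblock_size > chunk && (w & chunk)) {
                /* store 'chunk' result pixels from the same slots */
            }
    }
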
   3.577 +
   3.578 +/*
    3.579 + * Macro that performs all the operations needed to switch to the next
    3.580 + * scanline and start the next loop iteration, unless all the scanlines
    3.581 + * have already been processed.
   3.582 + */
   3.583 +.macro advance_to_next_scanline start_of_loop_label
   3.584 +.if regs_shortage
   3.585 +    ldrd        W, [sp] /* load W and H (width and height) from stack */
   3.586 +.else
   3.587 +    mov         W, ORIG_W
   3.588 +.endif
   3.589 +    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
   3.590 +.if src_bpp != 0
   3.591 +    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
   3.592 +.endif
   3.593 +.if mask_bpp != 0
   3.594 +    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
   3.595 +.endif
   3.596 +.if (dst_w_bpp != 24)
   3.597 +    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
   3.598 +.endif
   3.599 +.if (src_bpp != 24) && (src_bpp != 0)
   3.600 +    sub         SRC, SRC, W, lsl #src_bpp_shift
   3.601 +.endif
   3.602 +.if (mask_bpp != 24) && (mask_bpp != 0)
   3.603 +    sub         MASK, MASK, W, lsl #mask_bpp_shift
   3.604 +.endif
   3.605 +    subs        H, H, #1
   3.606 +    mov         DST_R, DST_W
   3.607 +.if regs_shortage
   3.608 +    str         H, [sp, #4] /* save updated height to stack */
   3.609 +.endif
   3.610 +    bge         start_of_loop_label
   3.611 +.endm
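
For the 32bpp, no-mask case generated by this patch, the bookkeeping above reduces to simple pointer arithmetic: the strides are passed in pixels (exactly what BlitRGBtoRGBPixelAlphaARMNEON supplies), and since the pointers have just advanced by the full scanline width, adding the stride and subtracting the width lands them at the start of the next line. A sketch in C (illustrative names):

    #include <stdint.h>

    /* Returns non-zero while there are scanlines left, mirroring
       "subs H, H, #1 ... bge start_of_loop_label". */
    static int next_scanline(uint32_t **dst, uint32_t **src,
                             int32_t dst_stride, int32_t src_stride,
                             int32_t orig_w, int32_t *h)
    {
        *dst += dst_stride - orig_w;
        *src += src_stride - orig_w;
        return --*h >= 0;
    }
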
   3.612 +
   3.613 +/*
   3.614 + * Registers are allocated in the following way by default:
   3.615 + * d0, d1, d2, d3     - reserved for loading source pixel data
   3.616 + * d4, d5, d6, d7     - reserved for loading destination pixel data
   3.617 + * d24, d25, d26, d27 - reserved for loading mask pixel data
   3.618 + * d28, d29, d30, d31 - final destination pixel data for writeback to memory
   3.619 + */
   3.620 +.macro generate_composite_function fname, \
   3.621 +                                   src_bpp_, \
   3.622 +                                   mask_bpp_, \
   3.623 +                                   dst_w_bpp_, \
   3.624 +                                   flags, \
   3.625 +                                   pixblock_size_, \
   3.626 +                                   prefetch_distance, \
   3.627 +                                   init, \
   3.628 +                                   cleanup, \
   3.629 +                                   process_pixblock_head, \
   3.630 +                                   process_pixblock_tail, \
   3.631 +                                   process_pixblock_tail_head, \
   3.632 +                                   dst_w_basereg_ = 28, \
   3.633 +                                   dst_r_basereg_ = 4, \
   3.634 +                                   src_basereg_   = 0, \
   3.635 +                                   mask_basereg_  = 24
   3.636 +
   3.637 +    pixman_asm_function fname
   3.638 +
   3.639 +    push        {r4-r12, lr}        /* save all registers */
   3.640 +
   3.641 +/*
    3.642 + * Select the prefetch type for this function. If the prefetch distance is
    3.643 + * set to 0, prefetch is disabled entirely; if one of the color formats is
    3.644 + * 24bpp, SIMPLE prefetch is used instead of ADVANCED.
   3.645 + */
   3.646 +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
   3.647 +.if prefetch_distance == 0
   3.648 +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
   3.649 +.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
   3.650 +        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
   3.651 +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
   3.652 +.endif
   3.653 +
   3.654 +/*
   3.655 + * Make some macro arguments globally visible and accessible
   3.656 + * from other macros
   3.657 + */
   3.658 +    .set src_bpp, src_bpp_
   3.659 +    .set mask_bpp, mask_bpp_
   3.660 +    .set dst_w_bpp, dst_w_bpp_
   3.661 +    .set pixblock_size, pixblock_size_
   3.662 +    .set dst_w_basereg, dst_w_basereg_
   3.663 +    .set dst_r_basereg, dst_r_basereg_
   3.664 +    .set src_basereg, src_basereg_
   3.665 +    .set mask_basereg, mask_basereg_
   3.666 +
   3.667 +    .macro pixld_src x:vararg
   3.668 +        pixld x
   3.669 +    .endm
   3.670 +    .macro fetch_src_pixblock
   3.671 +        pixld_src   pixblock_size, src_bpp, \
   3.672 +                    (src_basereg - pixblock_size * src_bpp / 64), SRC
   3.673 +    .endm
   3.674 +/*
   3.675 + * Assign symbolic names to registers
   3.676 + */
   3.677 +    W           .req        r0      /* width (is updated during processing) */
   3.678 +    H           .req        r1      /* height (is updated during processing) */
   3.679 +    DST_W       .req        r2      /* destination buffer pointer for writes */
   3.680 +    DST_STRIDE  .req        r3      /* destination image stride */
   3.681 +    SRC         .req        r4      /* source buffer pointer */
   3.682 +    SRC_STRIDE  .req        r5      /* source image stride */
   3.683 +    DST_R       .req        r6      /* destination buffer pointer for reads */
   3.684 +
   3.685 +    MASK        .req        r7      /* mask pointer */
   3.686 +    MASK_STRIDE .req        r8      /* mask stride */
   3.687 +
   3.688 +    PF_CTL      .req        r9      /* combined lines counter and prefetch */
   3.689 +                                    /* distance increment counter */
   3.690 +    PF_X        .req        r10     /* pixel index in a scanline for current */
    3.691 +                                    /* prefetch position */
   3.692 +    PF_SRC      .req        r11     /* pointer to source scanline start */
   3.693 +                                    /* for prefetch purposes */
   3.694 +    PF_DST      .req        r12     /* pointer to destination scanline start */
   3.695 +                                    /* for prefetch purposes */
   3.696 +    PF_MASK     .req        r14     /* pointer to mask scanline start */
   3.697 +                                    /* for prefetch purposes */
   3.698 +/*
    3.699 + * Check whether we have enough registers for all the local variables.
    3.700 + * If we don't have enough registers, the original width and height are
    3.701 + * kept on top of the stack (and the 'regs_shortage' variable is set to
    3.702 + * indicate this for the rest of the code). Even if there are enough
    3.703 + * registers, the allocation scheme may be a bit different depending on
    3.704 + * whether the source or mask is unused.
   3.705 + */
   3.706 +.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
   3.707 +    ORIG_W      .req        r10     /* saved original width */
   3.708 +    DUMMY       .req        r12     /* temporary register */
   3.709 +    .set        regs_shortage, 0
   3.710 +.elseif mask_bpp == 0
   3.711 +    ORIG_W      .req        r7      /* saved original width */
   3.712 +    DUMMY       .req        r8      /* temporary register */
   3.713 +    .set        regs_shortage, 0
   3.714 +.elseif src_bpp == 0
   3.715 +    ORIG_W      .req        r4      /* saved original width */
   3.716 +    DUMMY       .req        r5      /* temporary register */
   3.717 +    .set        regs_shortage, 0
   3.718 +.else
   3.719 +    ORIG_W      .req        r1      /* saved original width */
   3.720 +    DUMMY       .req        r1      /* temporary register */
   3.721 +    .set        regs_shortage, 1
   3.722 +.endif
   3.723 +
   3.724 +    .set mask_bpp_shift, -1
   3.725 +.if src_bpp == 32
   3.726 +    .set src_bpp_shift, 2
   3.727 +.elseif src_bpp == 24
   3.728 +    .set src_bpp_shift, 0
   3.729 +.elseif src_bpp == 16
   3.730 +    .set src_bpp_shift, 1
   3.731 +.elseif src_bpp == 8
   3.732 +    .set src_bpp_shift, 0
   3.733 +.elseif src_bpp == 0
   3.734 +    .set src_bpp_shift, -1
   3.735 +.else
   3.736 +    .error "requested src bpp (src_bpp) is not supported"
   3.737 +.endif
   3.738 +.if mask_bpp == 32
   3.739 +    .set mask_bpp_shift, 2
   3.740 +.elseif mask_bpp == 24
   3.741 +    .set mask_bpp_shift, 0
   3.742 +.elseif mask_bpp == 8
   3.743 +    .set mask_bpp_shift, 0
   3.744 +.elseif mask_bpp == 0
   3.745 +    .set mask_bpp_shift, -1
   3.746 +.else
   3.747 +    .error "requested mask bpp (mask_bpp) is not supported"
   3.748 +.endif
   3.749 +.if dst_w_bpp == 32
   3.750 +    .set dst_bpp_shift, 2
   3.751 +.elseif dst_w_bpp == 24
   3.752 +    .set dst_bpp_shift, 0
   3.753 +.elseif dst_w_bpp == 16
   3.754 +    .set dst_bpp_shift, 1
   3.755 +.elseif dst_w_bpp == 8
   3.756 +    .set dst_bpp_shift, 0
   3.757 +.else
   3.758 +    .error "requested dst bpp (dst_w_bpp) is not supported"
   3.759 +.endif
   3.760 +
   3.761 +.if (((flags) & FLAG_DST_READWRITE) != 0)
   3.762 +    .set dst_r_bpp, dst_w_bpp
   3.763 +.else
   3.764 +    .set dst_r_bpp, 0
   3.765 +.endif
   3.766 +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
   3.767 +    .set DEINTERLEAVE_32BPP_ENABLED, 1
   3.768 +.else
   3.769 +    .set DEINTERLEAVE_32BPP_ENABLED, 0
   3.770 +.endif
   3.771 +
   3.772 +.if prefetch_distance < 0 || prefetch_distance > 15
   3.773 +    .error "invalid prefetch distance (prefetch_distance)"
   3.774 +.endif
   3.775 +
   3.776 +.if src_bpp > 0
   3.777 +    ldr         SRC, [sp, #40]
   3.778 +.endif
   3.779 +.if mask_bpp > 0
   3.780 +    ldr         MASK, [sp, #48]
   3.781 +.endif
   3.782 +    PF mov      PF_X, #0
   3.783 +.if src_bpp > 0
   3.784 +    ldr         SRC_STRIDE, [sp, #44]
   3.785 +.endif
   3.786 +.if mask_bpp > 0
   3.787 +    ldr         MASK_STRIDE, [sp, #52]
   3.788 +.endif
   3.789 +    mov         DST_R, DST_W
   3.790 +
   3.791 +.if src_bpp == 24
   3.792 +    sub         SRC_STRIDE, SRC_STRIDE, W
   3.793 +    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
   3.794 +.endif
   3.795 +.if mask_bpp == 24
   3.796 +    sub         MASK_STRIDE, MASK_STRIDE, W
   3.797 +    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
   3.798 +.endif
   3.799 +.if dst_w_bpp == 24
   3.800 +    sub         DST_STRIDE, DST_STRIDE, W
   3.801 +    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
   3.802 +.endif
   3.803 +
   3.804 +/*
   3.805 + * Setup advanced prefetcher initial state
   3.806 + */
   3.807 +    PF mov      PF_SRC, SRC
   3.808 +    PF mov      PF_DST, DST_R
   3.809 +    PF mov      PF_MASK, MASK
   3.810 +    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
   3.811 +    PF mov      PF_CTL, H, lsl #4
   3.812 +    PF add      PF_CTL, #(prefetch_distance - 0x10)
   3.813 +
   3.814 +    init
   3.815 +.if regs_shortage
   3.816 +    push        {r0, r1}
   3.817 +.endif
   3.818 +    subs        H, H, #1
   3.819 +.if regs_shortage
   3.820 +    str         H, [sp, #4] /* save updated height to stack */
   3.821 +.else
   3.822 +    mov         ORIG_W, W
   3.823 +.endif
   3.824 +    blt         9f
   3.825 +    cmp         W, #(pixblock_size * 2)
   3.826 +    blt         8f
   3.827 +/*
    3.828 + * This is the start of the pipelined loop, which is optimized for
   3.829 + * long scanlines
   3.830 + */
   3.831 +0:
   3.832 +    ensure_destination_ptr_alignment process_pixblock_head, \
   3.833 +                                     process_pixblock_tail, \
   3.834 +                                     process_pixblock_tail_head
   3.835 +
   3.836 +    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
   3.837 +    pixld_a     pixblock_size, dst_r_bpp, \
   3.838 +                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
   3.839 +    fetch_src_pixblock
   3.840 +    pixld       pixblock_size, mask_bpp, \
   3.841 +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
   3.842 +    PF add      PF_X, PF_X, #pixblock_size
   3.843 +    process_pixblock_head
   3.844 +    cache_preload 0, pixblock_size
   3.845 +    cache_preload_simple
   3.846 +    subs        W, W, #(pixblock_size * 2)
   3.847 +    blt         2f
   3.848 +1:
   3.849 +    process_pixblock_tail_head
   3.850 +    cache_preload_simple
   3.851 +    subs        W, W, #pixblock_size
   3.852 +    bge         1b
   3.853 +2:
   3.854 +    process_pixblock_tail
   3.855 +    pixst_a     pixblock_size, dst_w_bpp, \
   3.856 +                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
   3.857 +
   3.858 +    /* Process the remaining trailing pixels in the scanline */
   3.859 +    process_trailing_pixels 1, 1, \
   3.860 +                            process_pixblock_head, \
   3.861 +                            process_pixblock_tail, \
   3.862 +                            process_pixblock_tail_head
   3.863 +    advance_to_next_scanline 0b
   3.864 +
   3.865 +.if regs_shortage
   3.866 +    pop         {r0, r1}
   3.867 +.endif
   3.868 +    cleanup
   3.869 +    pop         {r4-r12, pc}  /* exit */
   3.870 +/*
   3.871 + * This is the start of the loop, designed to process images with small width
   3.872 + * (less than pixblock_size * 2 pixels). In this case neither pipelining
    3.873 + * nor prefetch is used.
   3.874 + */
   3.875 +8:
   3.876 +    /* Process exactly pixblock_size pixels if needed */
   3.877 +    tst         W, #pixblock_size
   3.878 +    beq         1f
   3.879 +    pixld       pixblock_size, dst_r_bpp, \
   3.880 +                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
   3.881 +    fetch_src_pixblock
   3.882 +    pixld       pixblock_size, mask_bpp, \
   3.883 +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
   3.884 +    process_pixblock_head
   3.885 +    process_pixblock_tail
   3.886 +    pixst       pixblock_size, dst_w_bpp, \
   3.887 +                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
   3.888 +1:
   3.889 +    /* Process the remaining trailing pixels in the scanline */
   3.890 +    process_trailing_pixels 0, 0, \
   3.891 +                            process_pixblock_head, \
   3.892 +                            process_pixblock_tail, \
   3.893 +                            process_pixblock_tail_head
   3.894 +    advance_to_next_scanline 8b
   3.895 +9:
   3.896 +.if regs_shortage
   3.897 +    pop         {r0, r1}
   3.898 +.endif
   3.899 +    cleanup
   3.900 +    pop         {r4-r12, pc}  /* exit */
   3.901 +
   3.902 +    .purgem     fetch_src_pixblock
   3.903 +    .purgem     pixld_src
   3.904 +
   3.905 +    .unreq      SRC
   3.906 +    .unreq      MASK
   3.907 +    .unreq      DST_R
   3.908 +    .unreq      DST_W
   3.909 +    .unreq      ORIG_W
   3.910 +    .unreq      W
   3.911 +    .unreq      H
   3.912 +    .unreq      SRC_STRIDE
   3.913 +    .unreq      DST_STRIDE
   3.914 +    .unreq      MASK_STRIDE
   3.915 +    .unreq      PF_CTL
   3.916 +    .unreq      PF_X
   3.917 +    .unreq      PF_SRC
   3.918 +    .unreq      PF_DST
   3.919 +    .unreq      PF_MASK
   3.920 +    .unreq      DUMMY
   3.921 +    .endfunc
   3.922 +.endm
   3.923 +
   3.924 +/*
    3.925 + * A simplified variant of the function generation template for single
    3.926 + * scanline processing (for implementing pixman combine functions)
   3.927 + */
   3.928 +.macro generate_composite_function_scanline        use_nearest_scaling, \
   3.929 +                                                   fname, \
   3.930 +                                                   src_bpp_, \
   3.931 +                                                   mask_bpp_, \
   3.932 +                                                   dst_w_bpp_, \
   3.933 +                                                   flags, \
   3.934 +                                                   pixblock_size_, \
   3.935 +                                                   init, \
   3.936 +                                                   cleanup, \
   3.937 +                                                   process_pixblock_head, \
   3.938 +                                                   process_pixblock_tail, \
   3.939 +                                                   process_pixblock_tail_head, \
   3.940 +                                                   dst_w_basereg_ = 28, \
   3.941 +                                                   dst_r_basereg_ = 4, \
   3.942 +                                                   src_basereg_   = 0, \
   3.943 +                                                   mask_basereg_  = 24
   3.944 +
   3.945 +    pixman_asm_function fname
   3.946 +
   3.947 +    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
   3.948 +/*
   3.949 + * Make some macro arguments globally visible and accessible
   3.950 + * from other macros
   3.951 + */
   3.952 +    .set src_bpp, src_bpp_
   3.953 +    .set mask_bpp, mask_bpp_
   3.954 +    .set dst_w_bpp, dst_w_bpp_
   3.955 +    .set pixblock_size, pixblock_size_
   3.956 +    .set dst_w_basereg, dst_w_basereg_
   3.957 +    .set dst_r_basereg, dst_r_basereg_
   3.958 +    .set src_basereg, src_basereg_
   3.959 +    .set mask_basereg, mask_basereg_
   3.960 +
   3.961 +.if use_nearest_scaling != 0
   3.962 +    /*
   3.963 +     * Assign symbolic names to registers for nearest scaling
   3.964 +     */
   3.965 +    W           .req        r0
   3.966 +    DST_W       .req        r1
   3.967 +    SRC         .req        r2
   3.968 +    VX          .req        r3
   3.969 +    UNIT_X      .req        ip
   3.970 +    MASK        .req        lr
   3.971 +    TMP1        .req        r4
   3.972 +    TMP2        .req        r5
   3.973 +    DST_R       .req        r6
   3.974 +    SRC_WIDTH_FIXED .req        r7
   3.975 +
   3.976 +    .macro pixld_src x:vararg
   3.977 +        pixld_s x
   3.978 +    .endm
   3.979 +
   3.980 +    ldr         UNIT_X, [sp]
   3.981 +    push        {r4-r8, lr}
   3.982 +    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
   3.983 +    .if mask_bpp != 0
   3.984 +    ldr         MASK, [sp, #(24 + 8)]
   3.985 +    .endif
   3.986 +.else
   3.987 +    /*
   3.988 +     * Assign symbolic names to registers
   3.989 +     */
   3.990 +    W           .req        r0      /* width (is updated during processing) */
   3.991 +    DST_W       .req        r1      /* destination buffer pointer for writes */
   3.992 +    SRC         .req        r2      /* source buffer pointer */
   3.993 +    DST_R       .req        ip      /* destination buffer pointer for reads */
   3.994 +    MASK        .req        r3      /* mask pointer */
   3.995 +
   3.996 +    .macro pixld_src x:vararg
   3.997 +        pixld x
   3.998 +    .endm
   3.999 +.endif
  3.1000 +
  3.1001 +.if (((flags) & FLAG_DST_READWRITE) != 0)
  3.1002 +    .set dst_r_bpp, dst_w_bpp
  3.1003 +.else
  3.1004 +    .set dst_r_bpp, 0
  3.1005 +.endif
  3.1006 +.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
  3.1007 +    .set DEINTERLEAVE_32BPP_ENABLED, 1
  3.1008 +.else
  3.1009 +    .set DEINTERLEAVE_32BPP_ENABLED, 0
  3.1010 +.endif
  3.1011 +
  3.1012 +    .macro fetch_src_pixblock
  3.1013 +        pixld_src   pixblock_size, src_bpp, \
  3.1014 +                    (src_basereg - pixblock_size * src_bpp / 64), SRC
  3.1015 +    .endm
  3.1016 +
  3.1017 +    init
  3.1018 +    mov         DST_R, DST_W
  3.1019 +
  3.1020 +    cmp         W, #pixblock_size
  3.1021 +    blt         8f
  3.1022 +
  3.1023 +    ensure_destination_ptr_alignment process_pixblock_head, \
  3.1024 +                                     process_pixblock_tail, \
  3.1025 +                                     process_pixblock_tail_head
  3.1026 +
  3.1027 +    subs        W, W, #pixblock_size
  3.1028 +    blt         7f
  3.1029 +
  3.1030 +    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
  3.1031 +    pixld_a     pixblock_size, dst_r_bpp, \
  3.1032 +                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
  3.1033 +    fetch_src_pixblock
  3.1034 +    pixld       pixblock_size, mask_bpp, \
  3.1035 +                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
  3.1036 +    process_pixblock_head
  3.1037 +    subs        W, W, #pixblock_size
  3.1038 +    blt         2f
  3.1039 +1:
  3.1040 +    process_pixblock_tail_head
  3.1041 +    subs        W, W, #pixblock_size
  3.1042 +    bge         1b
  3.1043 +2:
  3.1044 +    process_pixblock_tail
  3.1045 +    pixst_a     pixblock_size, dst_w_bpp, \
  3.1046 +                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
  3.1047 +7:
  3.1048 +    /* Process the remaining trailing pixels in the scanline (dst aligned) */
  3.1049 +    process_trailing_pixels 0, 1, \
  3.1050 +                            process_pixblock_head, \
  3.1051 +                            process_pixblock_tail, \
  3.1052 +                            process_pixblock_tail_head
  3.1053 +
  3.1054 +    cleanup
  3.1055 +.if use_nearest_scaling != 0
  3.1056 +    pop         {r4-r8, pc}  /* exit */
  3.1057 +.else
  3.1058 +    bx          lr  /* exit */
  3.1059 +.endif
  3.1060 +8:
  3.1061 +    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
  3.1062 +    process_trailing_pixels 0, 0, \
  3.1063 +                            process_pixblock_head, \
  3.1064 +                            process_pixblock_tail, \
  3.1065 +                            process_pixblock_tail_head
  3.1066 +
  3.1067 +    cleanup
  3.1068 +
  3.1069 +.if use_nearest_scaling != 0
  3.1070 +    pop         {r4-r8, pc}  /* exit */
  3.1071 +
  3.1072 +    .unreq      DST_R
  3.1073 +    .unreq      SRC
  3.1074 +    .unreq      W
  3.1075 +    .unreq      VX
  3.1076 +    .unreq      UNIT_X
  3.1077 +    .unreq      TMP1
  3.1078 +    .unreq      TMP2
  3.1079 +    .unreq      DST_W
  3.1080 +    .unreq      MASK
  3.1081 +    .unreq      SRC_WIDTH_FIXED
  3.1082 +
  3.1083 +.else
  3.1084 +    bx          lr  /* exit */
  3.1085 +
  3.1086 +    .unreq      SRC
  3.1087 +    .unreq      MASK
  3.1088 +    .unreq      DST_R
  3.1089 +    .unreq      DST_W
  3.1090 +    .unreq      W
  3.1091 +.endif
  3.1092 +
  3.1093 +    .purgem     fetch_src_pixblock
  3.1094 +    .purgem     pixld_src
  3.1095 +
  3.1096 +    .endfunc
  3.1097 +.endm
  3.1098 +
  3.1099 +.macro generate_composite_function_single_scanline x:vararg
  3.1100 +    generate_composite_function_scanline 0, x
  3.1101 +.endm
  3.1102 +
  3.1103 +.macro generate_composite_function_nearest_scanline x:vararg
  3.1104 +    generate_composite_function_scanline 1, x
  3.1105 +.endm
  3.1106 +
  3.1107 +/* Default prologue/epilogue, nothing special needs to be done */
  3.1108 +
  3.1109 +.macro default_init
  3.1110 +.endm
  3.1111 +
  3.1112 +.macro default_cleanup
  3.1113 +.endm
  3.1114 +
  3.1115 +/*
   3.1116 + * Prologue/epilogue variant which additionally saves/restores the d8-d15
   3.1117 + * registers (they need to be saved/restored by the callee according to the ABI).
  3.1118 + * This is required if the code needs to use all the NEON registers.
  3.1119 + */
  3.1120 +
  3.1121 +.macro default_init_need_all_regs
  3.1122 +    vpush       {d8-d15}
  3.1123 +.endm
  3.1124 +
  3.1125 +.macro default_cleanup_need_all_regs
  3.1126 +    vpop        {d8-d15}
  3.1127 +.endm
  3.1128 +
  3.1129 +/******************************************************************************/
  3.1130 +
  3.1131 +/*
   3.1132 + * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
  3.1133 + * into a planar a8r8g8b8 format (with a, r, g, b color components
  3.1134 + * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
  3.1135 + *
  3.1136 + * Warning: the conversion is destructive and the original
  3.1137 + *          value (in) is lost.
  3.1138 + */
  3.1139 +.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
  3.1140 +    vshrn.u16   out_r, in,    #8
  3.1141 +    vshrn.u16   out_g, in,    #3
  3.1142 +    vsli.u16    in,    in,    #5
  3.1143 +    vmov.u8     out_a, #255
  3.1144 +    vsri.u8     out_r, out_r, #5
  3.1145 +    vsri.u8     out_g, out_g, #6
  3.1146 +    vshrn.u16   out_b, in,    #2
  3.1147 +.endm
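
Per pixel, the macro above widens each 5- or 6-bit channel to 8 bits by shifting it up and replicating its top bits into the freed low bits; that is what the vsri/vsli shift-and-insert instructions achieve across a whole vector. The scalar equivalent, for reference (the function name is illustrative):

    #include <stdint.h>

    /* Scalar equivalent of convert_0565_to_8888 for a single pixel. */
    static void rgb565_to_argb8888(uint16_t in,
                                   uint8_t *a, uint8_t *r, uint8_t *g, uint8_t *b)
    {
        uint8_t r5 = (in >> 11) & 0x1F;
        uint8_t g6 = (in >>  5) & 0x3F;
        uint8_t b5 =  in        & 0x1F;
        *r = (uint8_t)((r5 << 3) | (r5 >> 2));  /* vshrn #8 then vsri #5 */
        *g = (uint8_t)((g6 << 2) | (g6 >> 4));  /* vshrn #3 then vsri #6 */
        *b = (uint8_t)((b5 << 3) | (b5 >> 2));  /* vsli #5 then vshrn #2 */
        *a = 0xFF;                              /* vmov.u8 out_a, #255   */
    }
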
  3.1148 +
  3.1149 +.macro convert_0565_to_x888 in, out_r, out_g, out_b
  3.1150 +    vshrn.u16   out_r, in,    #8
  3.1151 +    vshrn.u16   out_g, in,    #3
  3.1152 +    vsli.u16    in,    in,    #5
  3.1153 +    vsri.u8     out_r, out_r, #5
  3.1154 +    vsri.u8     out_g, out_g, #6
  3.1155 +    vshrn.u16   out_b, in,    #2
  3.1156 +.endm
  3.1157 +
  3.1158 +/*
  3.1159 + * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
   3.1160 + * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5
   3.1161 + * pixels packed in a 128-bit register (out). Requires two temporary 128-bit
  3.1162 + * registers (tmp1, tmp2)
  3.1163 + */
  3.1164 +.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
  3.1165 +    vshll.u8    tmp1, in_g, #8
  3.1166 +    vshll.u8    out, in_r, #8
  3.1167 +    vshll.u8    tmp2, in_b, #8
  3.1168 +    vsri.u16    out, tmp1, #5
  3.1169 +    vsri.u16    out, tmp2, #11
  3.1170 +.endm
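
The reverse direction simply keeps the top 5/6/5 bits of each channel: vshll.u8 places every 8-bit channel in the high byte of a 16-bit lane, and the two vsri.u16 instructions slide green and blue in underneath red. Scalar form, for reference (illustrative name):

    #include <stdint.h>

    /* Scalar equivalent of convert_8888_to_0565 for a single pixel. */
    static uint16_t argb8888_to_rgb565(uint8_t r, uint8_t g, uint8_t b)
    {
        return (uint16_t)(((unsigned)(r & 0xF8) << 8) |  /* vshll.u8 out,  in_r, #8 */
                          ((unsigned)(g & 0xFC) << 3) |  /* vsri.u16 out, tmp1, #5  */
                          ((unsigned) b         >> 3));  /* vsri.u16 out, tmp2, #11 */
    }
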
  3.1171 +
  3.1172 +/*
  3.1173 + * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
  3.1174 + * returned in (out0, out1) registers pair. Requires one temporary
  3.1175 + * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
  3.1176 + * value from 'in' is lost
  3.1177 + */
  3.1178 +.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
  3.1179 +    vshl.u16    out0, in,   #5  /* G top 6 bits */
  3.1180 +    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
  3.1181 +    vsri.u16    in,   in,   #5  /* R is ready in top bits */
  3.1182 +    vsri.u16    out0, out0, #6  /* G is ready in top bits */
  3.1183 +    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
  3.1184 +    vshr.u16    out1, in,   #8  /* R is in place */
  3.1185 +    vsri.u16    out0, tmp,  #8  /* G & B is in place */
  3.1186 +    vzip.u16    out0, out1      /* everything is in place */
  3.1187 +.endm
  3.1188 +
  3.1189 +/******************************************************************************/
  3.1190 +
  3.1191 +/* Global configuration options and preferences */
  3.1192 +
  3.1193 +/*
  3.1194 + * The code can optionally make use of unaligned memory accesses to improve
  3.1195 + * performance of handling leading/trailing pixels for each scanline.
   3.1196 + * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
   3.1197 + * example on Linux, if unaligned memory accesses are not configured to
   3.1198 + * generate exceptions.
  3.1199 + */
  3.1200 +.set RESPECT_STRICT_ALIGNMENT, 1
  3.1201 +
  3.1202 +/*
  3.1203 + * Set default prefetch type. There is a choice between the following options:
  3.1204 + *
   3.1205 + * PREFETCH_TYPE_NONE (may be useful for ARM cores where PLD is set to work
   3.1206 + * as a NOP to work around some HW bugs, or for whatever other reason)
   3.1207 + *
   3.1208 + * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
   3.1209 + * advanced prefetch introduces heavy overhead)
   3.1210 + *
   3.1211 + * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
   3.1212 + * which can run ARM and NEON instructions simultaneously, so that extra ARM
   3.1213 + * instructions do not add (many) extra cycles but improve prefetch efficiency)
   3.1214 + *
   3.1215 + * Note: some types of function can't support advanced prefetch and fall back
   3.1216 + * to the simple one (those which handle 24bpp pixels)
  3.1217 + */
  3.1218 +.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
  3.1219 +
  3.1220 +/* Prefetch distance in pixels for simple prefetch */
  3.1221 +.set PREFETCH_DISTANCE_SIMPLE, 64