/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * Copyright (c) 2018 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

    /* Assembler setup for everything that follows in this file. */
    .text
    .fpu neon             /* accept NEON instructions */
    .arch armv7a          /* assemble for the ARMv7-A instruction set */
    .object_arch armv4    /* ...but record only ARMv4 as the architecture in the object file's attributes */
    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
    .arm                  /* ARM (not Thumb) encoding */
    .altmacro             /* GAS alternate macro syntax */
    .p2align 2            /* align to 2^2 = 4 bytes */

#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

/* Global configuration options and preferences */

/*
 * The code can optionally make use of unaligned memory accesses to improve
 * performance of handling leading/trailing pixels for each scanline.
 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0, for
 * example on Linux, if unaligned memory accesses are not configured to
 * generate exceptions.
 */
.set RESPECT_STRICT_ALIGNMENT, 1

/*
 * Set default prefetch type. There is a choice between the following options:
 *
 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
 * as NOP to work around some HW bugs or for whatever other reason)
 *
 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
 *
 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
 * which can run ARM and NEON instructions simultaneously so that extra ARM
 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
 *
 * Note: some types of function can't support advanced prefetch and fall back
 *       to the simple one (those which handle 24bpp pixels)
 */
.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED

/* Prefetch distance in pixels for simple prefetch */
.set PREFETCH_DISTANCE_SIMPLE, 64

/******************************************************************************/

/* We can actually do significantly better than the Pixman macros, at least for
 * the case of fills, by using a carefully scheduled inner loop. Cortex-A53
 * shows an improvement of up to 78% in ideal cases (large fills to L1 cache).
 */

.macro generate_fillrect_function name, bpp, log2Bpp
/*
 * Emits a solid-fill routine for pixels that are bpp bits wide.
 * log2Bpp is log2 of the pixel size in bytes and is used as a shift count
 * to convert between pixel counts and byte counts.
 *
 * C prototype of the generated routine (uintN_t = unsigned bpp-bit type):
 *   void name(int32_t w, int32_t h, uintN_t *dst, int32_t dst_stride, uintN_t src);
 *
 * On entry:
 * a1 = width, pixels
 * a2 = height, rows
 * a3 = pointer to top-left destination pixel (assumed aligned to the pixel size)
 * a4 = stride, pixels
 * [sp] = pixel value to fill with
 * Within the function:
 * a4 = stride minus width, pixels (gap between the end of one row and the
 *      start of the next)
 * v1 = width remaining
 * v2 = vst offset
 * v3 = alternate pointer
 * ip = data ARM register
 * q0-q1 = fill value replicated into every lane
 *
 * Nothing is written when w <= 0 or h <= 0.
 *
 * Rows of at least (15+64) bytes use the "long-row" path: align the pointer
 * to 16 bytes, stream 64 bytes per loop iteration with aligned stores, then
 * mop up the tail. Narrower rows use the "short-row" path, which only aligns
 * to 4 bytes (the alignment vstm needs) and uses unhinted vst1 stores.
 */
pixman_asm_function \name
    /* Empty or negative rectangle: nothing to do. Without this check a zero
     * height would still fill one row, and negative sizes would be treated
     * as huge unsigned counts. */
    cmp         a1, #0
    cmpgt       a2, #0       /* only evaluated when w > 0 */
    bxle        lr

    vld1.\bpp   {d0[],d1[]}, [sp]
    sub         a4, a1
    vld1.\bpp   {d2[],d3[]}, [sp]
    cmp         a1, #(15+64) >> \log2Bpp
    push        {v1-v3,lr}   /* flags from the cmp above survive until the blo */
    vmov        ip, s0
    blo         51f

    /* Long-row case */
    mov         v2, #64
1:  mov         v1, a1
    ands        v3, a3, #15
    beq         2f
    /* Leading pixels */
    rsb         v3, v3, #16  /* number of leading bytes until 16-byte aligned */
    sub         v1, v1, v3, lsr #\log2Bpp
    rbit        v3, v3       /* bits 0..3 of the byte count move to bits 31..28 */
.if \bpp <= 16
.if \bpp == 8
    tst         a3, #1       /* bit 0 unaffected by rsb so can avoid register interlock */
    strneb      ip, [a3], #1
    tst         v3, #1<<30   /* bit 1 of the leading byte count */
.else
    tst         a3, #2       /* bit 1 unaffected by rsb (assuming halfword alignment) so can avoid register interlock */
.endif
    strneh      ip, [a3], #2
.endif
    movs        v3, v3, lsl #3  /* C = "4 bytes" bit, N = "8 bytes" bit */
    vstmcs      a3!, {s0}
    vstmmi      a3!, {d0}
2:  sub         v1, v1, #64 >> \log2Bpp /* simplifies inner loop termination */
    add         v3, a3, #32
    /* Inner loop: 64 bytes per iteration through two interleaved pointers */
3:  vst1.\bpp   {q0-q1}, [a3 :128], v2
    subs        v1, v1, #64 >> \log2Bpp
    vst1.\bpp   {q0-q1}, [v3 :128], v2
    bhs         3b
    /* Trailing pixels: v1 is now negative, but its low bits still hold the
     * number of pixels left (< 64 bytes). Shift those bits into C and N two
     * at a time and store 32/16/8/4/2/1 bytes accordingly. */
4:  movs        v1, v1, lsl #27 + \log2Bpp
    bcc         5f
    vst1.\bpp   {q0-q1}, [a3 :128]!
5:  bpl         6f
    vst1.\bpp   {q0}, [a3 :128]!
6:  movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
.if \bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
.if \bpp == 8
    strmib      ip, [a3], #1
.endif
.endif
    subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp   /* skip the gap to the next row */
    bhi         1b
    pop         {v1-v3,pc}

    /* Short-row case */
51: movs        v1, a1
.if \bpp == 8
    /* Store single bytes until the pointer is word aligned or the row ends */
    tst         a3, #3
    beq         53f
52: subs        v1, v1, #1
    blo         57f          /* row exhausted while aligning: go on to the next row */
    strb        ip, [a3], #1
    tst         a3, #3
    bne         52b
.elseif \bpp == 16
    /* Store one halfword if needed to reach word alignment (skipped when the
     * width is zero: Z from the movs above makes all three conditional) */
    tstne       a3, #2
    subne       v1, v1, #1
    strneh      ip, [a3], #2
.endif
53: cmp         v1, #32 >> \log2Bpp
    bcc         54f
    vst1.\bpp   {q0-q1}, [a3]!
    sub         v1, v1, #32 >> \log2Bpp
    /* Trailing pixels (fewer than 64 bytes remain; same bit tests as above) */
54: movs        v1, v1, lsl #27 + \log2Bpp
    bcc         55f
    vst1.\bpp   {q0-q1}, [a3]!
55: bpl         56f
    vst1.\bpp   {q0}, [a3]!
56: movs        v1, v1, lsl #2
    vstmcs      a3!, {d0}
    vstmmi      a3!, {s0}
.if \bpp <= 16
    movs        v1, v1, lsl #2
    strcsh      ip, [a3], #2
.if \bpp == 8
    strmib      ip, [a3], #1
.endif
.endif
    /* Advance to the next row. The 8bpp alignment loop also lands here when a
     * row is used up before word alignment is reached, so that the remaining
     * rows are still filled. */
57: subs        a2, a2, #1
    add         a3, a3, a4, lsl #\log2Bpp   /* skip the gap to the next row */
    bhi         51b
    pop         {v1-v3,pc}

.endfunc
.endm

generate_fillrect_function FillRect32ARMNEONAsm, 32, 2
generate_fillrect_function FillRect16ARMNEONAsm, 16, 1
generate_fillrect_function FillRect8ARMNEONAsm,  8,  0

/******************************************************************************/

/*
 * Blend one block of pixels held in registers.
 *
 * Inputs (as loaded by the matching tail_head macro in this file):
 *   d0-d2 = three source colour channels, d3 = source alpha
 *   d4-d6 = three destination colour channels, d7 = destination alpha
 * Outputs:
 *   d28-d30 = blended colour channels, d31 = copy of destination alpha
 *
 * For each colour channel the 16-bit sum  src*alpha + dst*(255-alpha)  is
 * formed and then divided by 255 with rounding via the usual
 * t = x + rshr(x, 8); result = raddhn(...) sequence.
 *
 * The instruction order matters: q0, q1, q2 and q3 are overwritten with
 * intermediate results as soon as their original contents have been
 * consumed, and d30 holds the inverted alpha until the very last
 * instruction replaces it with the third output channel.
 *
 * NOTE(review): which of d0/d1/d2 is red, green or blue depends on the
 * memory layout of the pixels and is not visible here.
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_head
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d4, d30
    vmull.u8    q0, d1, d3
    vmlal.u8    q0, d5, d30
    vmull.u8    q1, d2, d3
    vmlal.u8    q1, d6, d30
    vrshr.u16   q2, q14, #8
    vrshr.u16   q3, q0, #8
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm

/*
 * Intentionally empty: RGBtoRGBPixelAlpha_process_pixblock_head already
 * leaves the finished pixels in d28-d31.
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_tail
    /* nothing */
.endm

/*
 * Steady-state loop body: stores the block finished by the previous head
 * (d28-d31 -> DST_W), loads the next 8 source pixels (SRC -> d0-d3) and
 * destination pixels (DST_R -> d4-d7), and then performs the same blend as
 * RGBtoRGBPixelAlpha_process_pixblock_head on the new data.
 *
 * The deeply indented "PF ..." lines are prefetch bookkeeping interleaved
 * with the NEON instructions. PF, PF_X, PF_CTL, PF_SRC, PF_DST, ORIG_W,
 * DUMMY, SRC, DST_R, DST_W and the *_STRIDE / *_bpp_shift symbols are not
 * defined in this file; presumably they come from pixman-arm-neon-asm.h.
 *
 * The store of d28-d31 must stay ahead of the vmvn/vmov/vmull that start
 * overwriting those registers for the next block.
 */
.macro RGBtoRGBPixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
        vst4.8      {d28-d31}, [DST_W :128]!
                                    PF tst PF_CTL, #0xF
    vld4.8      {d4-d7}, [DST_R :128]!
                                    PF addne PF_X, PF_X, #8
    vmvn        d30, d3  /* get inverted source alpha */
    vmov        d31, d7  /* dest alpha is always unchanged */
    vmull.u8    q14, d0, d3
                                    PF subne PF_CTL, PF_CTL, #1
    vmlal.u8    q14, d4, d30
                                    PF cmp PF_X, ORIG_W
    vmull.u8    q0, d1, d3
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmlal.u8    q0, d5, d30
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vmull.u8    q1, d2, d3
                                    PF subge PF_X, PF_X, ORIG_W
    vmlal.u8    q1, d6, d30
                                    PF subges PF_CTL, PF_CTL, #0x10
    vrshr.u16   q2, q14, #8
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vrshr.u16   q3, q0, #8
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d28, q14, q2
    vrshr.u16   q2, q1, #8
    vraddhn.u16 d29, q0, q3
    vraddhn.u16 d30, q1, q2
.endm

/*
 * Instantiate BlitRGBtoRGBPixelAlphaARMNEONAsm from the three pixel-block
 * macros above. generate_composite_function is not defined in this file
 * (presumably it comes from pixman-arm-neon-asm.h); the numeric arguments
 * 32, 0, 32 are presumably source / mask / destination bits per pixel, with
 * 0 meaning no mask - confirm against that header.
 */
generate_composite_function \
    BlitRGBtoRGBPixelAlphaARMNEONAsm, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    RGBtoRGBPixelAlpha_process_pixblock_head, \
    RGBtoRGBPixelAlpha_process_pixblock_tail, \
    RGBtoRGBPixelAlpha_process_pixblock_tail_head

 /******************************************************************************/

/*
 * First half of blending 8 deinterleaved 32bpp source pixels onto 8 packed
 * 16bpp destination pixels.
 *
 * Inputs (as loaded by the matching tail_head macro in this file):
 *   d0-d2 = source colour channels, d3 = source alpha (8 bits each)
 *   q2    = 8 destination pixels, 16 bits each
 * Outputs (consumed by ARGBto565PixelAlpha_process_pixblock_tail):
 *   q13 = weighted sum for the 6-bit middle field  (from d1)
 *   q14 = weighted sum for the 5-bit low field     (from d0)
 *   q15 = weighted sum for the 5-bit high field    (from d2)
 *
 * Steps:
 *  - source channels are truncated to the destination precision
 *    (d0, d2 >> 3 -> 5 bits; d1 >> 2 -> 6 bits);
 *  - alpha and inverted alpha are truncated to 5 bits (d3 and d6), so the two
 *    weights always add up to 31;
 *  - the destination fields are unpacked into d24 (bits 0-4), d7 (bits 5-10)
 *    and d25 (bits 11-15);
 *  - each output is  src*alpha + dst*inverted_alpha  as a 16-bit value.
 *
 * The order matters: d7 and d25 are extracted from q2 before vbic clears
 * bits 5-7 of q2 in place, and d6 is taken from d3 before d3 is shifted.
 */
.macro ARGBto565PixelAlpha_process_pixblock_head
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm

/*
 * Second half of the blend: normalise the weighted sums left in q13-q15 by
 * ARGBto565PixelAlpha_process_pixblock_head and pack them into 16-bit pixels.
 *
 * x += x >> 5 followed by a rounding shift right by 5 approximates x / 31
 * (the weights used by the head add up to 31). The two vsli instructions
 * then insert q13 at bit 5 and q15 at bit 11 of q14, which is the result.
 */
.macro ARGBto565PixelAlpha_process_pixblock_tail
    vsra.u16    q13, #5
    vsra.u16    q14, #5
    vsra.u16    q15, #5
    vrshr.u16   q13, #5
    vrshr.u16   q14, #5
    vrshr.u16   q15, #5
    vsli.u16    q14, q13, #5
    vsli.u16    q14, q15, #11
.endm

/*
 * Steady-state loop body. It contains the same instructions as
 * ARGBto565PixelAlpha_process_pixblock_tail (the lines indented by eight
 * spaces, finishing the previous block and storing q14 to DST_W) followed by
 * those of ARGBto565PixelAlpha_process_pixblock_head (for the next block),
 * with the loads of the next source pixels (SRC -> d0-d3) and destination
 * pixels (DST_R -> q2) hoisted as early as possible.
 *
 * The load into d0-d3 and q2 may precede the tail arithmetic because the
 * tail only touches q13-q15. The store of q14 must come before the head
 * part starts writing q13-q15 again.
 *
 * The deeply indented "PF ..." lines are prefetch bookkeeping interleaved
 * with the NEON instructions. PF and the PF_*, ORIG_W, DUMMY, SRC, DST_R,
 * DST_W, *_STRIDE and *_bpp_shift symbols are not defined in this file;
 * presumably they come from pixman-arm-neon-asm.h.
 */
.macro ARGBto565PixelAlpha_process_pixblock_tail_head
    vld4.8      {d0-d3}, [SRC]!
                                    PF add PF_X, PF_X, #8
        vsra.u16    q13, #5
                                    PF tst PF_CTL, #0xF
        vsra.u16    q14, #5
                                    PF addne PF_X, PF_X, #8
        vsra.u16    q15, #5
                                    PF subne PF_CTL, PF_CTL, #1
        vrshr.u16   q13, #5
                                    PF cmp PF_X, ORIG_W
        vrshr.u16   q14, #5
                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        vrshr.u16   q15, #5
                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vld1.8      {d4-d5}, [DST_R]!
                                    PF subge PF_X, PF_X, ORIG_W
        vsli.u16    q14, q13, #5
                                    PF subges PF_CTL, PF_CTL, #0x10
        vsli.u16    q14, q15, #11
                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
        vst1.8      {q14}, [DST_W :128]!
    vmvn        d6, d3
    vshr.u8     d1, #2
    vshr.u8     d3, #3
    vshr.u8     d0, #3
    vshrn.u16   d7, q2, #3
    vshrn.u16   d25, q2, #8
    vbic.i16    q2, #0xe0
                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vshr.u8     d6, #3
    vshr.u8     d7, #2
    vshr.u8     d2, #3
    vmovn.u16   d24, q2
    vshr.u8     d25, #3
    vmull.u8    q13, d1, d3
    vmlal.u8    q13, d7, d6
    vmull.u8    q14, d0, d3
    vmlal.u8    q14, d24, d6
    vmull.u8    q15, d2, d3
    vmlal.u8    q15, d25, d6
.endm

/*
 * Instantiate BlitARGBto565PixelAlphaARMNEONAsm from the three pixel-block
 * macros above. generate_composite_function is not defined in this file
 * (presumably it comes from pixman-arm-neon-asm.h); the numeric arguments
 * 32, 0, 16 are presumably source / mask / destination bits per pixel, with
 * 0 meaning no mask - confirm against that header.
 */
generate_composite_function \
    BlitARGBto565PixelAlphaARMNEONAsm, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    6, /* prefetch distance */ \
    default_init, \
    default_cleanup, \
    ARGBto565PixelAlpha_process_pixblock_head, \
    ARGBto565PixelAlpha_process_pixblock_tail, \
    ARGBto565PixelAlpha_process_pixblock_tail_head