Skip to content

Commit

Permalink
ARM: SIMD assembly optimization for function BlitRGBtoRGBPixelAlpha
Browse files Browse the repository at this point in the history
Much of the heavy lifting of this optimization is lifted from the Pixman
project, which is distributed under an MIT-style license. As far as possible,
these elements have been relicensed to the zlib license.
  • Loading branch information
bavison committed Oct 25, 2019
1 parent 6a6a052 commit 57723b8
Show file tree
Hide file tree
Showing 4 changed files with 1,259 additions and 0 deletions.
21 changes: 21 additions & 0 deletions src/video/SDL_blit_A.c
Expand Up @@ -389,6 +389,23 @@ BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)

#endif /* __MMX__ */

#if SDL_ARM_SIMD_BLITTERS
void BlitRGBtoRGBPixelAlphaARMSIMDAsm(int32_t w, int32_t h, uint32_t *dst, int32_t dst_stride, uint32_t *src, int32_t src_stride);

static void
BlitRGBtoRGBPixelAlphaARMSIMD(SDL_BlitInfo * info)
{
int32_t width = info->dst_w;
int32_t height = info->dst_h;
uint32_t *dstp = (uint32_t *)info->dst;
int32_t dststride = width + (info->dst_skip >> 2);
uint32_t *srcp = (uint32_t *)info->src;
int32_t srcstride = width + (info->src_skip >> 2);

BlitRGBtoRGBPixelAlphaARMSIMDAsm(width, height, dstp, dststride, srcp, srcstride);
}
#endif

/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
static void
BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
Expand Down Expand Up @@ -1315,6 +1332,10 @@ SDL_CalculateBlitA(SDL_Surface * surface)
}
#endif /* __MMX__ || __3dNOW__ */
if (sf->Amask == 0xff000000) {
#if SDL_ARM_SIMD_BLITTERS
if (SDL_HasARMSIMD())
return BlitRGBtoRGBPixelAlphaARMSIMD;
#endif
return BlitRGBtoRGBPixelAlpha;
}
}
Expand Down
36 changes: 36 additions & 0 deletions src/video/arm/pixman-arm-asm.h
@@ -0,0 +1,36 @@
/*
* Copyright © 2010 Nokia Corporation
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of Mozilla Corporation not be used in
* advertising or publicity pertaining to distribution of the software without
* specific, written prior permission. Mozilla Corporation makes no
* representations about the suitability of this software for any purpose. It
* is provided "as is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*
* Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
*
*/

/* Supplementary macro for setting function attributes */
.macro pixman_asm_function fname
.func fname
.global fname
#ifdef __ELF__
.hidden fname
.type fname, %function
#endif
fname:
.endm
168 changes: 168 additions & 0 deletions src/video/arm/pixman-arm-simd-asm.S
@@ -0,0 +1,168 @@
/*
* Copyright (c) 2016 RISC OS Open Ltd
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.arch armv6
.object_arch armv4
.arm
.altmacro
.p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"

/* A head macro should do all processing which results in an output of up to
* 16 bytes, as far as the final load instruction. The corresponding tail macro
* should complete the processing of the up-to-16 bytes. The calling macro will
* sometimes choose to insert a preload or a decrement of X between them.
* cond ARM condition code for code block
* numbytes Number of output bytes that should be generated this time
* firstreg First WK register in which to place output
* unaligned_src Whether to use non-wordaligned loads of source image
* unaligned_mask Whether to use non-wordaligned loads of mask image
* preload If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
*/

/******************************************************************************/

/* This differs from the over_8888_8888 routine in Pixman in that the destination
* alpha component is always left unchanged, and RGB components are not
* premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that
* renormalisation is done by multiplying by 257/256 (with rounding) rather than
* simply shifting right by 8 bits - removing the need to special-case alpha=0xff.
*/

.macro RGBtoRGBPixelAlpha_init
line_saved_regs STRIDE_S, ORIG_W
mov MASK, #0x80
.endm

.macro RGBtoRGBPixelAlpha_1pixel_translucent s, d, tmp0, tmp1, tmp2, tmp3, half
uxtb tmp3, s
uxtb tmp0, d
sub tmp0, tmp3, tmp0
uxtb tmp3, s, ror #16
uxtb tmp1, d, ror #16
sub tmp1, tmp3, tmp1
uxtb tmp3, s, ror #8
mov s, s, lsr #24
uxtb tmp2, d, ror #8
sub tmp2, tmp3, tmp2
smlabb tmp0, tmp0, s, half
smlabb tmp1, tmp1, s, half
smlabb tmp2, tmp2, s, half
add tmp0, tmp0, asr #8
add tmp1, tmp1, asr #8
add tmp2, tmp2, asr #8
pkhbt tmp0, tmp0, tmp1, lsl #16
and tmp2, tmp2, #0xff00
uxtb16 tmp0, tmp0, ror #8
orr tmp0, tmp0, tmp2
uadd8 d, d, tmp0
.endm

.macro RGBtoRGBPixelAlpha_1pixel_opaque s, d
and d, d, #0xff000000
bic s, s, #0xff000000
orr d, d, s
.endm

.macro RGBtoRGBPixelAlpha_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
.if numbytes == 16
ldm SRC!, {WK0, WK1}
ldm SRC!, {STRIDE_S, STRIDE_M}
ldrd WK2, WK3, [DST], #16
orr SCRATCH, WK0, WK1
and ORIG_W, WK0, WK1
orr SCRATCH, SCRATCH, STRIDE_S
and ORIG_W, ORIG_W, STRIDE_S
orr SCRATCH, SCRATCH, STRIDE_M
and ORIG_W, ORIG_W, STRIDE_M
tst SCRATCH, #0xff000000
.elseif numbytes == 8
ldm SRC!, {WK0, WK1}
ldm DST!, {WK2, WK3}
orr SCRATCH, WK0, WK1
and ORIG_W, WK0, WK1
tst SCRATCH, #0xff000000
.else // numbytes == 4
ldr WK0, [SRC], #4
ldr WK2, [DST], #4
tst WK0, #0xff000000
.endif
.endm

.macro RGBtoRGBPixelAlpha_process_tail cond, numbytes, firstreg
beq 20f @ all transparent
.if numbytes == 16
cmp ORIG_W, #0xff000000
bhs 10f @ all opaque
RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
strd WK2, WK3, [DST, #-16]
ldrd WK0, WK1, [SRC, #-8]
ldrd WK2, WK3, [DST, #-8]
RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
b 19f
10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
strd WK2, WK3, [DST, #-16]
ldrd WK0, WK1, [SRC, #-8]
ldrd WK2, WK3, [DST, #-8]
RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
19: strd WK2, WK3, [DST, #-8]
.elseif numbytes == 8
cmp ORIG_W, #0xff000000
bhs 10f @ all opaque
RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
b 19f
10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
19: strd WK2, WK3, [DST, #-8]
.else // numbytes == 4
cmp WK0, #0xff000000
bhs 10f @ opaque
RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
b 19f
10: RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
19: str WK2, [DST, #-4]
.endif
20:
.endm

generate_composite_function \
BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \
FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
2, /* prefetch distance */ \
RGBtoRGBPixelAlpha_init, \
nop_macro, /* newline */ \
nop_macro, /* cleanup */ \
RGBtoRGBPixelAlpha_process_head, \
RGBtoRGBPixelAlpha_process_tail

/******************************************************************************/

0 comments on commit 57723b8

Please sign in to comment.