src/hermes/mmxp2_32.asm
author Ben Avison <bavison@riscosopen.org>
Thu, 31 Oct 2019 14:00:28 +0300
branch SDL-1.2
changeset 13219 4f88e197acad
parent 5885 bc6043298ccb
permissions -rw-r--r--
ARM: Create configure option --enable-arm-neon to govern assembly optimizations
---
configure.in | 39 +++++++++++++++++++++++++++++++++++++++
include/SDL_config.h.in | 1 +
include/SDL_cpuinfo.h | 3 +++
src/cpuinfo/SDL_cpuinfo.c | 37 +++++++++++++++++++++++++++++++++++++
4 files changed, 80 insertions(+)
     1 ;
     2 ; pII-optimised MMX format converters for HERMES
     3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
     4 ;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
     5 ; This source code is licensed under the GNU LGPL
     6 ; 
     7 ; Please refer to the file COPYING.LIB contained in the distribution for
     8 ; licensing conditions		
     9 ;
    10 ; COPYRIGHT NOTICE
    11 ; 
    12 ; This file partly contains code that is (c) Intel Corporation, specifically
    13 ; the mode detection routine, and the converter to 15 bit (8 pixel
    14 ; conversion routine from the mmx programming tutorial pages).
    15 ;
    16 ;
    17 ; These routines aren't exactly pII optimised - it's just that as they
    18 ; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
    19 ; optimise them for p5 MMXs..
    20 
    21 BITS 32
    22 
    23 %include "common.inc"
    24 	
    25 SDL_FUNC _ConvertMMXpII32_24RGB888
    26 SDL_FUNC _ConvertMMXpII32_16RGB565
    27 SDL_FUNC _ConvertMMXpII32_16BGR565
    28 SDL_FUNC _ConvertMMXpII32_16RGB555
    29 SDL_FUNC _ConvertMMXpII32_16BGR555
    30 
    31 ;; Macros for conversion routines
    32 
    33 %macro _push_immq_mask 1
    34 	push dword %1
    35 	push dword %1
    36 %endmacro
    37 
    38 %macro load_immq 2
    39 	_push_immq_mask %2
    40 	movq %1, [esp]
    41 %endmacro
    42 
    43 %macro pand_immq 2
    44 	_push_immq_mask %2
    45 	pand %1, [esp]
    46 %endmacro
    47 
    48 %define CLEANUP_IMMQ_LOADS(num) \
    49 	add esp, byte 8 * num
    50 
    51 %define mmx32_rgb888_mask 00ffffffh
    52 %define mmx32_rgb565_b 000000f8h
    53 %define mmx32_rgb565_g 0000fc00h
    54 %define mmx32_rgb565_r 00f80000h
    55 
    56 %define mmx32_rgb555_rb 00f800f8h
    57 %define mmx32_rgb555_g 0000f800h
    58 %define mmx32_rgb555_mul 20000008h
    59 %define mmx32_bgr555_mul 00082000h
    60 
    61 SECTION .text
    62 
    63 _ConvertMMXpII32_24RGB888:
    64 
    65         ; set up mm6 as the mask, mm7 as zero
    66         load_immq mm6, mmx32_rgb888_mask
    67         CLEANUP_IMMQ_LOADS(1)
    68         pxor mm7, mm7
    69 
    70         mov edx, ecx                    ; save ecx
    71         and ecx, 0fffffffch             ; clear lower two bits
    72         jnz .L1
    73         jmp .L2
    74 
    75 .L1:
    76 
    77         movq mm0, [esi]                 ; A R G B a r g b
    78         pand mm0, mm6                   ; 0 R G B 0 r g b
    79         movq mm1, [esi+8]               ; A R G B a r g b
    80         pand mm1, mm6                   ; 0 R G B 0 r g b
    81 
    82         movq mm2, mm0                   ; 0 R G B 0 r g b
    83         punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
    84         punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
    85         psllq mm2, 24                   ; 0 0 R G B 0 0 0
    86         por mm0, mm2                    ; 0 0 R G B r g b
    87 
    88         movq mm3, mm1                   ; 0 R G B 0 r g b
    89         psllq mm3, 48                   ; g b 0 0 0 0 0 0
    90         por mm0, mm3                    ; g b R G B r g b
    91 
    92         movq mm4, mm1                   ; 0 R G B 0 r g b
    93         punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
    94         punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
    95         psrlq mm1, 16                   ; 0 0 0 R G B 0 r
    96         psllq mm4, 8                    ; 0 0 0 0 R G B 0
    97         por mm1, mm4                    ; 0 0 0 0 R G B r
    98 
    99         movq [edi], mm0
   100         add esi, BYTE 16
   101         movd [edi+8], mm1
   102         add edi, BYTE 12
   103         sub ecx, BYTE 4
   104         jnz .L1
   105 
   106 .L2:
   107         mov ecx, edx
   108         and ecx, BYTE 3
   109         jz .L4
   110 .L3:
   111         mov al, [esi]
   112         mov bl, [esi+1]
   113         mov dl, [esi+2]
   114         mov [edi], al
   115         mov [edi+1], bl
   116         mov [edi+2], dl
   117         add esi, BYTE 4
   118         add edi, BYTE 3
   119         dec ecx
   120         jnz .L3
   121 .L4:
   122         retn
   123 
   124 
   125 
   126 _ConvertMMXpII32_16RGB565:
   127 
   128         ; set up masks
   129         load_immq mm5, mmx32_rgb565_b
   130         load_immq mm6, mmx32_rgb565_g
   131         load_immq mm7, mmx32_rgb565_r
   132         CLEANUP_IMMQ_LOADS(3)
   133 
   134         mov edx, ecx
   135         shr ecx, 2
   136         jnz .L1
   137         jmp .L2         ; not necessary at the moment, but doesn't hurt (much)
   138 
   139 .L1:
   140         movq mm0, [esi]         ; argb
   141         movq mm1, mm0           ; argb
   142         pand mm0, mm6           ; 00g0
   143         movq mm3, mm1           ; argb
   144         pand mm1, mm5           ; 000b
   145         pand mm3, mm7           ; 0r00
   146         pslld mm1, 2            ; 0 0 000000bb bbb00000
   147         por mm0, mm1            ; 0 0 ggggggbb bbb00000
   148         psrld mm0, 5            ; 0 0 00000ggg gggbbbbb
   149 
   150         movq mm4, [esi+8]       ; argb
   151         movq mm2, mm4           ; argb
   152         pand mm4, mm6           ; 00g0
   153         movq mm1, mm2           ; argb
   154         pand mm2, mm5           ; 000b
   155         pand mm1, mm7           ; 0r00
   156         pslld mm2, 2            ; 0 0 000000bb bbb00000
   157         por mm4, mm2            ; 0 0 ggggggbb bbb00000
   158         psrld mm4, 5            ; 0 0 00000ggg gggbbbbb
   159 
   160         packuswb mm3, mm1       ; R 0 r 0
   161         packssdw mm0, mm4       ; as above.. ish
   162         por mm0, mm3            ; done.
   163         movq [edi], mm0
   164 
   165         add esi, 16
   166         add edi, 8
   167         dec ecx
   168         jnz .L1
   169 
   170 .L2:
   171         mov ecx, edx
   172         and ecx, BYTE 3
   173         jz .L4
   174 .L3:
   175         mov al, [esi]
   176         mov bh, [esi+1]
   177         mov ah, [esi+2]
   178         shr al, 3
   179         and eax, 0F81Fh            ; BYTE?
   180         shr ebx, 5
   181         and ebx, 07E0h             ; BYTE?
   182         add eax, ebx
   183         mov [edi], al
   184         mov [edi+1], ah
   185         add esi, BYTE 4
   186         add edi, BYTE 2
   187         dec ecx
   188         jnz .L3
   189 
   190 .L4:
   191 	retn
   192 
   193 	
   194 _ConvertMMXpII32_16BGR565:
   195 
   196         load_immq mm5, mmx32_rgb565_r
   197         load_immq mm6, mmx32_rgb565_g
   198         load_immq mm7, mmx32_rgb565_b
   199         CLEANUP_IMMQ_LOADS(3)
   200 
   201         mov edx, ecx
   202         shr ecx, 2
   203         jnz .L1
   204         jmp .L2
   205 
   206 .L1:
   207         movq mm0, [esi]                 ; a r g b
   208         movq mm1, mm0                   ; a r g b
   209         pand mm0, mm6                   ; 0 0 g 0
   210         movq mm3, mm1                   ; a r g b
   211         pand mm1, mm5                   ; 0 r 0 0
   212         pand mm3, mm7                   ; 0 0 0 b
   213 
   214         psllq mm3, 16                   ; 0 b 0 0
   215         psrld mm1, 14                   ; 0 0 000000rr rrr00000
   216         por mm0, mm1                    ; 0 0 ggggggrr rrr00000
   217         psrld mm0, 5                    ; 0 0 00000ggg gggrrrrr
   218 
   219         movq mm4, [esi+8]               ; a r g b
   220         movq mm2, mm4                   ; a r g b
   221         pand mm4, mm6                   ; 0 0 g 0
   222         movq mm1, mm2                   ; a r g b
   223         pand mm2, mm5                   ; 0 r 0 0
   224         pand mm1, mm7                   ; 0 0 0 b
   225 
   226         psllq mm1, 16                   ; 0 b 0 0
   227         psrld mm2, 14                   ; 0 0 000000rr rrr00000
   228         por mm4, mm2                    ; 0 0 ggggggrr rrr00000
   229         psrld mm4, 5                    ; 0 0 00000ggg gggrrrrr
   230 
   231         packuswb mm3, mm1               ; BBBBB000 00000000 bbbbb000 00000000
   232         packssdw mm0, mm4               ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
   233         por mm0, mm3                    ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
   234         movq [edi], mm0
   235 
   236         add esi, BYTE 16
   237         add edi, BYTE 8
   238         dec ecx
   239         jnz .L1
   240 
   241 .L2:
   242         and edx, BYTE 3
   243         jz .L4
   244 .L3:
   245         mov al, [esi+2]
   246         mov bh, [esi+1]
   247         mov ah, [esi]
   248         shr al, 3
   249         and eax, 0F81Fh                    ; BYTE ?
   250         shr ebx, 5
   251         and ebx, 07E0h                     ; BYTE ?
   252         add eax, ebx
   253         mov [edi], al
   254         mov [edi+1], ah
   255         add esi, BYTE 4
   256         add edi, BYTE 2
   257         dec edx
   258         jnz .L3
   259 
   260 .L4:
   261         retn
   262 
   263 _ConvertMMXpII32_16BGR555:
   264 
   265         ; the 16BGR555 converter is identical to the RGB555 one,
   266         ; except it uses a different multiplier for the pmaddwd
   267         ; instruction.  cool huh.
   268 
   269         load_immq mm7, mmx32_bgr555_mul
   270         jmp _convert_bgr555_cheat
   271 
   272 ; This is the same as the Intel version.. they obviously went to
   273 ; much more trouble to expand/coil the loop than I did, so theirs
   274 ; would almost certainly be faster, even if only a little.
   275 ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
   276 ; (I think) a more accurate name..
   277 _ConvertMMXpII32_16RGB555:
   278 
   279 	load_immq mm7, mmx32_rgb555_mul
   280 _convert_bgr555_cheat:
   281 	load_immq mm6, mmx32_rgb555_g
   282 	CLEANUP_IMMQ_LOADS(2)
   283         
   284 	mov edx,ecx		           ; Save ecx 
   285 
   286         and ecx,DWORD 0fffffff8h            ; clear lower three bits
   287 	jnz .L_OK
   288         jmp near .L2 
   289 
   290 .L_OK:
   291 	
   292 	movq mm2,[esi+8]
   293 
   294 	movq mm0,[esi]
   295 	movq mm3,mm2
   296 
   297 	pand_immq mm3, mmx32_rgb555_rb
   298 	movq mm1,mm0
   299 
   300 	pand_immq mm1, mmx32_rgb555_rb
   301 	pmaddwd mm3,mm7
   302 
   303 	CLEANUP_IMMQ_LOADS(2)
   304 
   305 	pmaddwd mm1,mm7
   306 	pand mm2,mm6
   307 
   308 .L1:
   309 	movq mm4,[esi+24]
   310 	pand mm0,mm6
   311 
   312 	movq mm5,[esi+16]
   313 	por mm3,mm2
   314 
   315 	psrld mm3,6
   316 	por mm1,mm0
   317 
   318 	movq mm0,mm4
   319 	psrld mm1,6
   320 
   321 	pand_immq mm0, mmx32_rgb555_rb
   322 	packssdw mm1,mm3
   323 
   324 	movq mm3,mm5
   325 	pmaddwd mm0,mm7
   326 
   327 	pand_immq mm3, mmx32_rgb555_rb
   328 	pand mm4,mm6
   329 
   330 	movq [edi],mm1			
   331 	pmaddwd mm3,mm7
   332 
   333         add esi,BYTE 32
   334 	por mm4,mm0
   335 
   336 	pand mm5,mm6
   337 	psrld mm4,6
   338 
   339 	movq mm2,[esi+8]
   340 	por mm5,mm3
   341 
   342 	movq mm0,[esi]
   343 	psrld mm5,6
   344 
   345 	movq mm3,mm2
   346 	movq mm1,mm0
   347 
   348 	pand_immq mm3, mmx32_rgb555_rb
   349 	packssdw mm5,mm4
   350 
   351 	pand_immq mm1, mmx32_rgb555_rb
   352 	pand mm2,mm6
   353 
   354 	CLEANUP_IMMQ_LOADS(4)
   355 
   356 	movq [edi+8],mm5
   357 	pmaddwd mm3,mm7
   358 
   359 	pmaddwd mm1,mm7
   360         add edi,BYTE 16
   361 	
   362         sub ecx,BYTE 8
   363 	jz .L2
   364         jmp .L1
   365 
   366 
   367 .L2:	
   368 	mov ecx,edx
   369 	
   370         and ecx,BYTE 7
   371 	jz .L4
   372 	
   373 .L3:	
   374 	mov ebx,[esi]
   375         add esi,BYTE 4
   376 	
   377         mov eax,ebx
   378         mov edx,ebx
   379 
   380         shr eax,3
   381         shr edx,6
   382 
   383         and eax,BYTE 0000000000011111b
   384         and edx,     0000001111100000b
   385 
   386         shr ebx,9
   387 
   388         or eax,edx
   389 
   390         and ebx,     0111110000000000b
   391 
   392         or eax,ebx
   393 
   394         mov [edi],ax
   395         add edi,BYTE 2
   396 
   397 	dec ecx
   398 	jnz .L3	
   399 
   400 .L4:		
   401 	retn
   402 
   403 %ifidn __OUTPUT_FORMAT__,elf32
   404 section .note.GNU-stack noalloc noexec nowrite progbits
   405 %endif