src/hermes/mmxp2_32.asm
author Ben Avison <bavison@riscosopen.org>
Thu, 31 Oct 2019 14:00:28 +0300
branchSDL-1.2
changeset 13219 4f88e197acad
parent 5885 bc6043298ccb
permissions -rw-r--r--
ARM: Create configure option --enable-arm-neon to govern assembly optimizations
---
configure.in | 39 +++++++++++++++++++++++++++++++++++++++
include/SDL_config.h.in | 1 +
include/SDL_cpuinfo.h | 3 +++
src/cpuinfo/SDL_cpuinfo.c | 37 +++++++++++++++++++++++++++++++++++++
4 files changed, 80 insertions(+)
slouken@0
     1
;
slouken@0
     2
; pII-optimised MMX format converters for HERMES
slouken@0
     3
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
slouken@0
     4
;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
slouken@0
     5
; This source code is licensed under the GNU LGPL
slouken@0
     6
; 
slouken@0
     7
; Please refer to the file COPYING.LIB contained in the distribution for
slouken@0
     8
; licensing conditions		
slouken@0
     9
;
slouken@0
    10
; COPYRIGHT NOTICE
slouken@0
    11
; 
slouken@0
    12
; This file partly contains code that is (c) Intel Corporation, specifically
slouken@0
    13
; the mode detection routine, and the converter to 15 bit (8 pixel
slouken@0
    14
; conversion routine from the mmx programming tutorial pages).
slouken@0
    15
;
slouken@0
    16
;
slouken@0
    17
; These routines aren't exactly pII optimised - it's just that as they
slouken@0
    18
; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
slouken@0
    19
; optimise them for p5 MMXs..
slouken@0
    20
slouken@0
    21
BITS 32                                 ; assemble as 32-bit code

; NOTE(review): common.inc is not visible here; it presumably supplies the
; SDL_FUNC macro used below -- confirm.
%include "common.inc"

; Entry points implemented in this file, one SDL_FUNC declaration each.
; NOTE(review): SDL_FUNC presumably exports the symbol -- confirm in common.inc.
SDL_FUNC _ConvertMMXpII32_24RGB888
SDL_FUNC _ConvertMMXpII32_16RGB565
SDL_FUNC _ConvertMMXpII32_16BGR565
SDL_FUNC _ConvertMMXpII32_16RGB555
SDL_FUNC _ConvertMMXpII32_16BGR555
slouken@0
    30
icculus@1230
    31
;; Macros for conversion routines
slouken@0
    32
icculus@1230
    33
; _push_immq_mask imm32
;   Builds a 64-bit constant on the stack by pushing the same 32-bit
;   immediate twice, so [esp] holds imm32 in both the low and the high
;   dword.  Lowers esp by 8; the user of the macro must release it again.
%macro _push_immq_mask 1
%rep 2
	push dword %1
%endrep
%endmacro
slouken@0
    37
icculus@1230
    38
; load_immq mmreg, imm32
;   Loads an MMX register with imm32 replicated in both dwords.
;   The constant is staged on the stack and is NOT popped here: every use
;   leaves esp 8 bytes lower until the code adds it back.
%macro load_immq 2
	_push_immq_mask %2              ; [esp] = imm32:imm32
	movq %1, [esp]                  ; copy the staged quadword into the register
%endmacro
slouken@0
    42
icculus@1230
    43
; pand_immq mmreg, imm32
;   ANDs an MMX register with imm32 replicated in both dwords.
;   The constant is staged on the stack and is NOT popped here: every use
;   leaves esp 8 bytes lower until the code adds it back.
%macro pand_immq 2
	_push_immq_mask %2              ; [esp] = imm32:imm32
	pand %1, [esp]                  ; mask the register with the staged quadword
%endmacro
icculus@1230
    47
icculus@1230
    48
; CLEANUP_IMMQ_LOADS(num)
;   Releases the stack space left behind by `num` uses of a macro that
;   pushes an 8-byte constant (8 bytes per use).
;   The argument is parenthesised so that an expression argument, e.g.
;   CLEANUP_IMMQ_LOADS(1+1), is scaled as a whole instead of expanding to
;   8 * 1+1.  The total must fit a signed byte immediate (num <= 15).
%define CLEANUP_IMMQ_LOADS(num) \
	add esp, byte 8 * (num)
slouken@0
    50
icculus@1230
    51
; 32-bit immediates used as per-pixel masks / multipliers.  Each describes
; one 32-bit source pixel.

; 32 -> 24 bit: keep the low three bytes of the pixel.
%define mmx32_rgb888_mask 0x00ffffff

; 32 -> 16 bit (5-6-5): top 5 / 6 / 5 bits of bytes 0 / 1 / 2.
%define mmx32_rgb565_b    0x000000f8
%define mmx32_rgb565_g    0x0000fc00
%define mmx32_rgb565_r    0x00f80000

; 32 -> 15 bit (5-5-5): top 5 bits of bytes 0 and 2 together, and of byte 1.
%define mmx32_rgb555_rb   0x00f800f8
%define mmx32_rgb555_g    0x0000f800
; Word pairs (high:low) multiplied in by pmaddwd; the two variants differ
; only in which 16-bit half receives the 2000h factor and which the 0008h.
%define mmx32_rgb555_mul  0x20000008
%define mmx32_bgr555_mul  0x00082000
slouken@0
    60
slouken@0
    61
SECTION .text
slouken@0
    62
slouken@0
    63
; ---------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
;   Packs 32-bit pixels into 24-bit pixels by dropping the top byte of
;   every source dword.
;   In:       esi = source, edi = destination, ecx = pixel count
;   Clobbers: eax, ebx, ecx, edx, esi, edi, mm0-mm4, mm6, mm7, flags
;   Stack:    8 bytes used transiently while the mask is loaded
;   No EMMS is executed here.
;   NOTE(review): presumably the caller issues EMMS and preserves ebx --
;   confirm against the dispatcher that calls this routine.
; ---------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:
        load_immq mm6, mmx32_rgb888_mask    ; mm6 = pixel mask in both dwords
        CLEANUP_IMMQ_LOADS(1)               ; give back the 8 bytes load_immq pushed
        pxor    mm7, mm7                    ; mm7 = 0, zero source for the unpacks

        mov     edx, ecx                    ; edx = full pixel count
        and     ecx, 0xfffffffc             ; ecx = count rounded down to a multiple of 4
        jz      .leftover                   ; fewer than 4 pixels: no MMX pass

        ; Four pixels per pass: 16 bytes read, 12 bytes written.
        ; Byte diagrams list the most significant byte first.
.quad:
        movq    mm0, [esi]                  ; A R G B a r g b   (pixels 1 and 0)
        pand    mm0, mm6                    ; 0 R G B 0 r g b
        movq    mm1, [esi+8]                ; A R G B a r g b   (pixels 3 and 2)
        pand    mm1, mm6                    ; 0 R G B 0 r g b

        movq    mm2, mm0
        punpckhdq mm2, mm7                  ; 0 0 0 0 0 R G B   (pixel 1)
        punpckldq mm0, mm7                  ; 0 0 0 0 0 r g b   (pixel 0)
        psllq   mm2, 24                     ; 0 0 R G B 0 0 0
        por     mm0, mm2                    ; 0 0 R G B r g b

        movq    mm3, mm1
        psllq   mm3, 48                     ; g b 0 0 0 0 0 0   (low two bytes of pixel 2)
        por     mm0, mm3                    ; g b R G B r g b   -> first 8 output bytes

        movq    mm4, mm1
        punpckhdq mm4, mm7                  ; 0 0 0 0 0 R G B   (pixel 3)
        punpckldq mm1, mm7                  ; 0 0 0 0 0 r g b   (pixel 2)
        psrlq   mm1, 16                     ; 0 0 0 0 0 0 0 r
        psllq   mm4, 8                      ; 0 0 0 0 R G B 0
        por     mm1, mm4                    ; 0 0 0 0 R G B r   -> last 4 output bytes

        movq    [edi], mm0
        add     esi, BYTE 16
        movd    [edi+8], mm1
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad

.leftover:
        mov     ecx, edx
        and     ecx, BYTE 3                 ; 0..3 pixels not covered by the MMX pass
        jz      .done

        ; One pixel per pass: copy bytes 0..2, skip byte 3.
        ; dl may be overwritten here because the count already lives in ecx.
.pixel:
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .pixel

.done:
        retn
slouken@0
   123
slouken@0
   124
slouken@0
   125
slouken@0
   126
; ---------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
;   Converts 32-bit pixels to 16-bit 5-6-5 pixels: source byte 2 lands in
;   bits 15..11, byte 1 in bits 10..5 and byte 0 in bits 4..0.
;   In:       esi = source, edi = destination, ecx = pixel count
;   Clobbers: eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
;   Stack:    24 bytes used transiently while the masks are loaded
;   No EMMS is executed here.
;   NOTE(review): presumably the caller issues EMMS and preserves ebx --
;   confirm against the dispatcher that calls this routine.
; ---------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:
        load_immq mm5, mmx32_rgb565_b       ; mm5 = byte-0 mask
        load_immq mm6, mmx32_rgb565_g       ; mm6 = byte-1 mask
        load_immq mm7, mmx32_rgb565_r       ; mm7 = byte-2 mask
        CLEANUP_IMMQ_LOADS(3)               ; give back the 24 bytes pushed above

        mov     edx, ecx                    ; edx = full pixel count
        shr     ecx, 2                      ; ecx = number of 4-pixel groups
        jz      .leftover

        ; Four pixels per pass: 16 bytes read, 8 bytes written.
.quad:
        ; pixels 0 and 1
        movq    mm0, [esi]                  ; a r g b
        movq    mm1, mm0
        pand    mm0, mm6                    ; 0 0 g 0
        movq    mm3, mm1
        pand    mm1, mm5                    ; 0 0 0 b
        pand    mm3, mm7                    ; 0 r 0 0   (combined later)
        pslld   mm1, 2                      ; 0 0 000000bb bbb00000
        por     mm0, mm1                    ; 0 0 ggggggbb bbb00000
        psrld   mm0, 5                      ; 0 0 00000ggg gggbbbbb

        ; pixels 2 and 3, same steps
        movq    mm4, [esi+8]                ; a r g b
        movq    mm2, mm4
        pand    mm4, mm6                    ; 0 0 g 0
        movq    mm1, mm2
        pand    mm2, mm5                    ; 0 0 0 b
        pand    mm1, mm7                    ; 0 r 0 0
        pslld   mm2, 2                      ; 0 0 000000bb bbb00000
        por     mm4, mm2                    ; 0 0 ggggggbb bbb00000
        psrld   mm4, 5                      ; 0 0 00000ggg gggbbbbb

        ; Each dword of mm3/mm1 is 00f80000h-masked, i.e. words (0, 00xxh).
        ; Packing words to bytes therefore yields byte pairs (00, xx): the
        ; 5 kept bits end up in the top of every 16-bit output lane.
        packuswb mm3, mm1                   ; rrrrr000 00000000 per pixel
        packssdw mm0, mm4                   ; 00000ggg gggbbbbb per pixel
        por     mm0, mm3                    ; rrrrrggg gggbbbbb per pixel
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quad

.leftover:
        mov     ecx, edx
        and     ecx, BYTE 3                 ; 0..3 pixels not covered by the MMX pass
        jz      .done

        ; One pixel per pass.  Only bh is loaded into ebx; after the shift
        ; and the 07E0h mask nothing but bits that came from bh survives,
        ; so the stale remainder of ebx is harmless.
.pixel:
        mov     al, [esi]                   ; byte 0
        mov     bh, [esi+1]                 ; byte 1
        mov     ah, [esi+2]                 ; byte 2
        shr     al, 3                       ; top 5 bits of byte 0 -> bits 4..0
        and     eax, 0F81Fh                 ; keep bits 15..11 and 4..0
        shr     ebx, 5                      ; top 6 bits of byte 1 -> bits 10..5
        and     ebx, 07E0h
        add     eax, ebx                    ; fields are disjoint, add == or
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .pixel

.done:
        retn
slouken@0
   192
slouken@0
   193
	
slouken@0
   194
; ---------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
;   Converts 32-bit pixels to 16-bit 5-6-5 pixels with the outer channels
;   swapped: source byte 0 lands in bits 15..11, byte 1 in bits 10..5 and
;   byte 2 in bits 4..0.
;   In:       esi = source, edi = destination, ecx = pixel count
;   Clobbers: eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
;   Stack:    24 bytes used transiently while the masks are loaded
;   No EMMS is executed here.
;   NOTE(review): presumably the caller issues EMMS and preserves ebx --
;   confirm against the dispatcher that calls this routine.
; ---------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:
        load_immq mm5, mmx32_rgb565_r       ; mm5 = byte-2 mask
        load_immq mm6, mmx32_rgb565_g       ; mm6 = byte-1 mask
        load_immq mm7, mmx32_rgb565_b       ; mm7 = byte-0 mask
        CLEANUP_IMMQ_LOADS(3)               ; give back the 24 bytes pushed above

        mov     edx, ecx                    ; edx = full pixel count
        shr     ecx, 2                      ; ecx = number of 4-pixel groups
        jz      .leftover

        ; Four pixels per pass: 16 bytes read, 8 bytes written.
.quad:
        ; pixels 0 and 1
        movq    mm0, [esi]                  ; a r g b
        movq    mm1, mm0
        pand    mm0, mm6                    ; 0 0 g 0
        movq    mm3, mm1
        pand    mm1, mm5                    ; 0 r 0 0
        pand    mm3, mm7                    ; 0 0 0 b

        psllq   mm3, 16                     ; 0 b 0 0   (stays inside its dword)
        psrld   mm1, 14                     ; 0 0 000000rr rrr00000
        por     mm0, mm1                    ; 0 0 ggggggrr rrr00000
        psrld   mm0, 5                      ; 0 0 00000ggg gggrrrrr

        ; pixels 2 and 3, same steps
        movq    mm4, [esi+8]                ; a r g b
        movq    mm2, mm4
        pand    mm4, mm6                    ; 0 0 g 0
        movq    mm1, mm2
        pand    mm2, mm5                    ; 0 r 0 0
        pand    mm1, mm7                    ; 0 0 0 b

        psllq   mm1, 16                     ; 0 b 0 0
        psrld   mm2, 14                     ; 0 0 000000rr rrr00000
        por     mm4, mm2                    ; 0 0 ggggggrr rrr00000
        psrld   mm4, 5                      ; 0 0 00000ggg gggrrrrr

        ; Words of mm3/mm1 are (0, 00xxh) per pixel; packing them to bytes
        ; puts the 5 kept bits in the top of every 16-bit output lane.
        packuswb mm3, mm1                   ; bbbbb000 00000000 per pixel
        packssdw mm0, mm4                   ; 00000ggg gggrrrrr per pixel
        por     mm0, mm3                    ; bbbbbggg gggrrrrr per pixel
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad

.leftover:
        and     edx, BYTE 3                 ; edx = 0..3 pixels still to convert
        jz      .done

        ; One pixel per pass, counted in edx.  Only bh is loaded into ebx;
        ; the shift and the 07E0h mask keep nothing but bits that came
        ; from bh, so the stale remainder of ebx is harmless.
.pixel:
        mov     al, [esi+2]                 ; byte 2
        mov     bh, [esi+1]                 ; byte 1
        mov     ah, [esi]                   ; byte 0
        shr     al, 3                       ; top 5 bits of byte 2 -> bits 4..0
        and     eax, 0F81Fh                 ; keep bits 15..11 and 4..0
        shr     ebx, 5                      ; top 6 bits of byte 1 -> bits 10..5
        and     ebx, 07E0h
        add     eax, ebx                    ; fields are disjoint, add == or
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .pixel

.done:
        retn
slouken@0
   262
slouken@0
   263
; ---------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555
;   Entry point for the 32-bit -> 15-bit conversion with swapped outer
;   channels.  It only selects the BGR multiplier for pmaddwd in mm7 and
;   then continues in the code shared with _ConvertMMXpII32_16RGB555.
;   The 8 bytes load_immq leaves on the stack are not released here; the
;   CLEANUP_IMMQ_LOADS(2) that follows _convert_bgr555_cheat covers them.
; ---------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:
        load_immq mm7, mmx32_bgr555_mul     ; mm7 = BGR multiplier pair
        jmp     _convert_bgr555_cheat       ; shared body starts after the RGB load
slouken@0
   271
slouken@0
   272
; ---------------------------------------------------------------------
; _ConvertMMXpII32_16RGB555 / _convert_bgr555_cheat
;   Converts 32-bit pixels to 15-bit 5-5-5 pixels.  Derived from the Intel
;   routine; the constant Intel called 'mmx32_rgb555_add' is named
;   'mmx32_rgb555_mul' here, which describes its use more accurately.
;
;   The channel order is decided solely by the multiplier in mm7:
;   _ConvertMMXpII32_16RGB555 loads the RGB multiplier and falls through,
;   _ConvertMMXpII32_16BGR555 loads the BGR one and jumps to
;   _convert_bgr555_cheat.  Both arrive there with 8 bytes still on the
;   stack from their load_immq.
;
;   In:       esi = source, edi = destination, ecx = pixel count
;   Clobbers: eax, ecx, edx, esi, edi, mm0-mm7, flags
;   No EMMS is executed here.
;   NOTE(review): presumably the caller issues EMMS -- confirm.
;
;   Changes from the earlier version of this routine:
;   * The leftover pixels (count & 7) are converted with the same
;     mask / pmaddwd / shift sequence as the main loop.  The previous
;     scalar tail hard-coded the RGB555 layout, so pixels handled by the
;     tail came out with the outer channels swapped when the routine was
;     entered through the BGR555 entry point.
;   * The main loop no longer pre-loads the next 16 source bytes at the
;     end of each pass.  On the last pass that pre-load read up to 16
;     bytes beyond the pixels being converted.
; ---------------------------------------------------------------------
_ConvertMMXpII32_16RGB555:
        load_immq mm7, mmx32_rgb555_mul     ; mm7 = RGB multiplier pair

_convert_bgr555_cheat:
        load_immq mm6, mmx32_rgb555_g       ; mm6 = mask for byte 1
        CLEANUP_IMMQ_LOADS(2)               ; releases the mm7 and the mm6 load

        mov     edx, ecx                    ; edx = full pixel count
        and     ecx, DWORD 0fffffff8h       ; ecx = count rounded down to a multiple of 8
        jz      near .tail                  ; fewer than 8 pixels: tail only

        ; Eight pixels per pass: 32 bytes read, 16 bytes written.
        ; Per pixel: (bytes 0 and 2 masked) pmaddwd mm7, OR (byte 1 masked),
        ; then shift right by 6 to obtain the 15-bit value.
.block8:
        movq    mm2, [esi+8]                ; pixels 2,3
        movq    mm0, [esi]                  ; pixels 0,1
        movq    mm3, mm2
        pand_immq mm3, mmx32_rgb555_rb      ; outer channels of pixels 2,3
        movq    mm1, mm0
        pand_immq mm1, mmx32_rgb555_rb      ; outer channels of pixels 0,1
        pmaddwd mm3, mm7                    ; move them to their target bit positions
        CLEANUP_IMMQ_LOADS(2)
        pmaddwd mm1, mm7
        pand    mm2, mm6                    ; middle channel of pixels 2,3

        movq    mm4, [esi+24]               ; pixels 6,7
        pand    mm0, mm6                    ; middle channel of pixels 0,1
        movq    mm5, [esi+16]               ; pixels 4,5
        por     mm3, mm2
        psrld   mm3, 6                      ; pixels 2,3 finished (one per dword)
        por     mm1, mm0
        movq    mm0, mm4
        psrld   mm1, 6                      ; pixels 0,1 finished (one per dword)
        pand_immq mm0, mmx32_rgb555_rb      ; outer channels of pixels 6,7
        packssdw mm1, mm3                   ; pixels 0..3 as four words
        movq    mm3, mm5
        pmaddwd mm0, mm7
        pand_immq mm3, mmx32_rgb555_rb      ; outer channels of pixels 4,5
        pand    mm4, mm6                    ; middle channel of pixels 6,7
        movq    [edi], mm1
        pmaddwd mm3, mm7
        add     esi, BYTE 32
        por     mm4, mm0
        pand    mm5, mm6                    ; middle channel of pixels 4,5
        psrld   mm4, 6                      ; pixels 6,7 finished
        por     mm5, mm3
        psrld   mm5, 6                      ; pixels 4,5 finished
        packssdw mm5, mm4                   ; pixels 4..7 as four words
        CLEANUP_IMMQ_LOADS(2)
        movq    [edi+8], mm5
        add     edi, BYTE 16
        sub     ecx, BYTE 8
        jnz     near .block8                ; body exceeds the short-jump range

.tail:
        mov     ecx, edx
        and     ecx, BYTE 7                 ; 0..7 pixels not covered by the main loop
        jz      .done

        ; mm0-mm5 are free here; keep the outer-channel mask in mm5 so the
        ; per-pixel loop does not have to touch the stack.
        load_immq mm5, mmx32_rgb555_rb
        CLEANUP_IMMQ_LOADS(1)

        ; One pixel per pass, same arithmetic as the main loop, so the
        ; result follows whichever multiplier the entry point put in mm7.
.pixel:
        movd    mm0, [esi]                  ; one source pixel, upper dword zero
        add     esi, BYTE 4
        movq    mm1, mm0
        pand    mm1, mm5                    ; outer channels
        pand    mm0, mm6                    ; middle channel
        pmaddwd mm1, mm7                    ; outer channels to their target bits
        por     mm1, mm0
        psrld   mm1, 6                      ; 15-bit pixel in the low word
        movd    eax, mm1
        mov     [edi], ax
        add     edi, BYTE 2
        dec     ecx
        jnz     .pixel

.done:
        retn
slouken@0
   402
slouken@5392
   403
; For ELF32 output only: emit an empty .note.GNU-stack section flagged
; noexec.
; NOTE(review): presumably this keeps GNU toolchains from marking the
; stack executable for objects built from this file -- confirm.
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif