src/hermes/x86p_16.asm
author Ben Avison <bavison@riscosopen.org>
Thu, 31 Oct 2019 14:00:28 +0300
branchSDL-1.2
changeset 13219 4f88e197acad
parent 5884 d832552bc9e5
permissions -rw-r--r--
ARM: Create configure option --enable-arm-neon to govern assembly optimizations
---
configure.in | 39 +++++++++++++++++++++++++++++++++++++++
include/SDL_config.h.in | 1 +
include/SDL_cpuinfo.h | 3 +++
src/cpuinfo/SDL_cpuinfo.c | 37 +++++++++++++++++++++++++++++++++++++
4 files changed, 80 insertions(+)
;
; x86 format converters for HERMES
; Copyright (c) 1998 Glenn Fielder (gaffer@gaffer.org)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
; Routines adjusted for Hermes by Christian Nentwich (brn@eleet.mcb.at)
; Used with permission.
;

; All code in this file is 32-bit x86, NASM (Intel) syntax.
BITS 32

; Provides the SDL_FUNC macro used for the declarations below.
%include "common.inc"

; Converters defined in this file.
SDL_FUNC _ConvertX86p16_16BGR565
SDL_FUNC _ConvertX86p16_16RGB555
SDL_FUNC _ConvertX86p16_16BGR555
SDL_FUNC _ConvertX86p16_8RGB332

; Declared here but not referenced in this file.
EXTERN _ConvertX86

SECTION .text

; ---------------------------------------------------------------------------
; X86P16_BGR565_PIXEL
;
; Converts one 16-bit pixel: the 5-bit field in bits 11-15 and the 5-bit
; field in bits 0-4 trade places, the 6-bit field in bits 5-10 stays put.
; Reads 2 bytes at [esi], writes 2 bytes at [edi], then advances both
; pointers by 2.
;
; The upper 16 bits of eax are stale on entry; that is harmless because only
; al/ah of the result are stored.
;
; Clobbers: eax, ebx, edx, flags.
; ---------------------------------------------------------------------------
%macro X86P16_BGR565_PIXEL 0
    mov al,[esi]
    mov ah,[esi+1]
    mov ebx,eax
    mov edx,eax
    shr eax,11                  ; bits 11-15 -> bits 0-4
    and eax,BYTE 11111b
    and ebx,11111100000b        ; bits 5-10 unchanged
    shl edx,11                  ; bits 0-4 -> bits 11-15 (rest lands above bit 15)
    add eax,ebx
    add eax,edx
    mov [edi],al
    mov [edi+1],ah
    add esi,BYTE 2
    add edi,BYTE 2
%endmacro

; ---------------------------------------------------------------------------
; _ConvertX86p16_16BGR565
;
; Converts a run of 16-bit pixels to 16-bit pixels with the two outer 5-bit
; fields swapped (presumably RGB565 -> BGR565, going by the name).
;
; Register interface (custom, not a C calling convention):
;   In:  esi = source pixels      (2 bytes per pixel)
;        edi = destination pixels (2 bytes per pixel)
;        ecx = number of pixels; 0 returns immediately
;   Out: esi/edi advanced past the converted pixels
;   Clobbers: eax, ebx, ecx, edx, flags
;
; Runs of 16 pixels or fewer are converted one pixel at a time. Longer runs
; convert one pixel if edi is not 4-byte aligned, then two pixels per
; iteration with 32-bit loads/stores, then at most one leftover pixel.
; NOTE(review): assumes edi is at least 2-byte aligned so that one head pixel
; is enough to reach 4-byte alignment - confirm against the caller.
; ---------------------------------------------------------------------------
_ConvertX86p16_16BGR565:

    ; An empty run must not enter the dec/jnz loop below: ecx would wrap
    ; around and the loop would run 2^32 times.
    test ecx,ecx
    jz .done

    ; check short
    cmp ecx,BYTE 16
    ja .align_head

.short_loop:
    X86P16_BGR565_PIXEL
    dec ecx
    jnz .short_loop
.done:
    retn

.align_head:
    ; convert a single pixel when the destination is not dword aligned
    mov eax,edi
    and eax,BYTE 11b
    jz .bulk_setup
    X86P16_BGR565_PIXEL
    dec ecx

.bulk_setup:
    ; keep the remaining pixel count for the tail
    push ecx

    ; two pixels per iteration
    shr ecx,1

    ; point both arrays at the end of the bulk region ...
    lea esi,[esi+ecx*4]
    lea edi,[edi+ecx*4]

    ; ... and index them with a counter running from -pairs up to 0
    neg ecx
    jmp SHORT .bulk_load

    ; The store of each result is delayed to the start of the next
    ; iteration, hence the -4 displacement (ecx was already incremented).
.bulk_store:
    mov [edi+ecx*4-4],eax
.bulk_load:
    mov eax,[esi+ecx*4]
    mov ebx,[esi+ecx*4]
    and eax,07E007E0h           ; middle 6-bit field of both pixels
    mov edx,[esi+ecx*4]
    and ebx,0F800F800h          ; top 5-bit field of both pixels
    shr ebx,11
    and edx,001F001Fh           ; bottom 5-bit field of both pixels
    shl edx,11
    add eax,ebx
    add eax,edx
    inc ecx
    jnz .bulk_store

    ; store the result of the final iteration (ecx is 0 here)
    mov [edi+ecx*4-4],eax

    ; tail: one pixel is left over when the saved count was odd
    pop ecx
    and ecx,BYTE 1
    jz .exit
    X86P16_BGR565_PIXEL

.exit:
    retn


; ---------------------------------------------------------------------------
; X86P16_RGB555_PIXEL
;
; Converts one 16-bit pixel: bits 6-15 move down by one position (to bits
; 5-14), bits 0-4 stay put, and bit 5 of the input is dropped.
; Reads 2 bytes at [esi], writes 2 bytes at [edi], then advances both
; pointers by 2.
;
; The upper 16 bits of eax are stale on entry; the masks remove them before
; the result is stored.
;
; Clobbers: eax, ebx, flags.
; ---------------------------------------------------------------------------
%macro X86P16_RGB555_PIXEL 0
    mov al,[esi]
    mov ah,[esi+1]
    mov ebx,eax
    shr ebx,1
    and ebx,     0111111111100000b  ; former bits 6-15, now bits 5-14
    and eax,BYTE 0000000000011111b  ; bits 0-4 unchanged
    add eax,ebx
    mov [edi],al
    mov [edi+1],ah
    add esi,BYTE 2
    add edi,BYTE 2
%endmacro

; ---------------------------------------------------------------------------
; _ConvertX86p16_16RGB555
;
; Converts a run of 16-bit pixels to 16-bit pixels whose upper two fields are
; shifted down one bit (presumably RGB565 -> RGB555, going by the name).
;
; Register interface (custom, not a C calling convention):
;   In:  esi = source pixels      (2 bytes per pixel)
;        edi = destination pixels (2 bytes per pixel)
;        ecx = number of pixels; 0 returns immediately
;   Out: esi/edi advanced past the converted pixels
;   Clobbers: eax, ebx, ecx, edx, flags (ebp is saved and restored)
;
; Runs of 32 pixels or fewer are converted one pixel at a time. Longer runs
; convert one pixel if edi is not 4-byte aligned, then four pixels per
; iteration with 32-bit loads/stores, then up to three leftover pixels.
; NOTE(review): assumes edi is at least 2-byte aligned so that one head pixel
; is enough to reach 4-byte alignment - confirm against the caller.
; ---------------------------------------------------------------------------
_ConvertX86p16_16RGB555:

    ; An empty run must not enter the dec/jnz loop below: ecx would wrap
    ; around and the loop would run 2^32 times.
    test ecx,ecx
    jz .done

    ; check short
    cmp ecx,BYTE 32
    ja .align_head

.short_loop:
    X86P16_RGB555_PIXEL
    dec ecx
    jnz .short_loop
.done:
    retn

.align_head:
    ; convert a single pixel when the destination is not dword aligned
    mov eax,edi
    and eax,BYTE 11b
    jz .bulk_setup
    X86P16_RGB555_PIXEL
    dec ecx

.bulk_setup:
    ; ebp becomes the loop index because ecx is needed as a data register
    push ebp

    ; keep the remaining pixel count for the tail
    push ecx

    ; four pixels (8 bytes) per iteration
    shr ecx,2

    ; point both arrays at the end of the bulk region ...
    lea esi,[esi+ecx*8]
    lea edi,[edi+ecx*8]

    ; ... and index them with ebp running from -groups up to 0
    xor ebp,ebp
    sub ebp,ecx

.bulk_loop:
    mov eax,[esi+ebp*8]         ; pixels 0 and 1
    mov ecx,[esi+ebp*8+4]       ; pixels 2 and 3

    mov ebx,eax
    mov edx,ecx

    and eax,0FFC0FFC0h          ; bits 6-15 of each pixel
    and ecx,0FFC0FFC0h

    shr eax,1
    and ebx,001F001Fh           ; bits 0-4 of each pixel

    shr ecx,1
    and edx,001F001Fh

    add eax,ebx
    add ecx,edx

    mov [edi+ebp*8],eax
    mov [edi+ebp*8+4],ecx

    inc ebp
    jnz .bulk_loop

    ; tail: convert the 0-3 pixels left over from the saved count
    pop ecx
.tail:
    and ecx,BYTE 11b
    jz .exit
    X86P16_RGB555_PIXEL
    dec ecx
    jmp SHORT .tail

.exit:
    pop ebp
    retn


; ---------------------------------------------------------------------------
; X86P16_BGR555_PIXEL
;
; Converts one 16-bit pixel:
;   input bits 11-15 -> output bits 0-4
;   input bits  6-10 -> output bits 5-9   (bit 5 of the input is dropped)
;   input bits  0-4  -> output bits 10-14
; Reads 2 bytes at [esi], writes 2 bytes at [edi], then advances both
; pointers by 2.
;
; The upper 16 bits of eax are stale on entry; the masks remove them before
; the result is stored.
;
; Clobbers: eax, ebx, edx, flags.
; ---------------------------------------------------------------------------
%macro X86P16_BGR555_PIXEL 0
    mov al,[esi]
    mov ah,[esi+1]
    mov ebx,eax
    mov edx,eax
    shr eax,11
    and eax,BYTE 11111b
    shr ebx,1
    and ebx,1111100000b
    shl edx,10
    and edx,0111110000000000b
    add eax,ebx
    add eax,edx
    mov [edi],al
    mov [edi+1],ah
    add esi,BYTE 2
    add edi,BYTE 2
%endmacro

; ---------------------------------------------------------------------------
; _ConvertX86p16_16BGR555
;
; Converts a run of 16-bit pixels to 16-bit pixels with the outer fields
; swapped and the middle field reduced to 5 bits (presumably
; RGB565 -> BGR555, going by the name).
;
; Register interface (custom, not a C calling convention):
;   In:  esi = source pixels      (2 bytes per pixel)
;        edi = destination pixels (2 bytes per pixel)
;        ecx = number of pixels; 0 returns immediately
;   Out: esi/edi advanced past the converted pixels
;   Clobbers: eax, ebx, ecx, edx, flags
;
; Runs of 16 pixels or fewer are converted one pixel at a time. Longer runs
; convert one pixel if edi is not 4-byte aligned, then two pixels per
; iteration with 32-bit loads/stores, then at most one leftover pixel.
; NOTE(review): assumes edi is at least 2-byte aligned so that one head pixel
; is enough to reach 4-byte alignment - confirm against the caller.
; ---------------------------------------------------------------------------
_ConvertX86p16_16BGR555:

    ; An empty run must not enter the dec/jnz loop below: ecx would wrap
    ; around and the loop would run 2^32 times.
    test ecx,ecx
    jz .done

    ; check short
    cmp ecx,BYTE 16
    ja .align_head

.short_loop:
    X86P16_BGR555_PIXEL
    dec ecx
    jnz .short_loop
.done:
    retn

.align_head:
    ; convert a single pixel when the destination is not dword aligned
    mov eax,edi
    and eax,BYTE 11b
    jz .bulk_setup
    X86P16_BGR555_PIXEL
    dec ecx

.bulk_setup:
    ; keep the remaining pixel count for the tail
    push ecx

    ; two pixels per iteration
    shr ecx,1

    ; point both arrays at the end of the bulk region ...
    lea esi,[esi+ecx*4]
    lea edi,[edi+ecx*4]

    ; ... and index them with a counter running from -pairs up to 0
    neg ecx
    jmp SHORT .bulk_load

    ; The store of each result is delayed to the start of the next
    ; iteration, hence the -4 displacement (ecx was already incremented).
.bulk_store:
    mov [edi+ecx*4-4],eax
.bulk_load:
    mov eax,[esi+ecx*4]
    shr eax,1
    mov ebx,[esi+ecx*4]
    and eax,03E003E0h           ; former bits 6-10 of both pixels, now bits 5-9
    mov edx,[esi+ecx*4]
    and ebx,0F800F800h          ; bits 11-15 of both pixels
    shr ebx,11
    and edx,001F001Fh           ; bits 0-4 of both pixels
    shl edx,10
    add eax,ebx
    add eax,edx
    inc ecx
    jnz .bulk_store

    ; store the result of the final iteration (ecx is 0 here)
    mov [edi+ecx*4-4],eax

    ; tail: one pixel is left over when the saved count was odd
    pop ecx
    and ecx,BYTE 1
    jz .exit
    X86P16_BGR555_PIXEL

.exit:
    retn


; ---------------------------------------------------------------------------
; X86P16_RGB332_PIXEL
;
; Converts one 16-bit pixel to one 8-bit pixel:
;   input bits 13-15 -> output bits 5-7   (red)
;   input bits  8-10 -> output bits 2-4   (green)
;   input bits  3-4  -> output bits 0-1   (blue)
; Reads 2 bytes at [esi], writes 1 byte at [edi], then advances esi by 2 and
; edi by 1.
;
; The upper 16 bits of eax are stale on entry; the masks remove them before
; the result is stored.
;
; Clobbers: eax, ebx, edx, flags.
; ---------------------------------------------------------------------------
%macro X86P16_RGB332_PIXEL 0
    mov al,[esi+0]
    mov ah,[esi+1]
    mov ebx,eax
    mov edx,eax
    and eax,BYTE 11000b         ; blue
    shr eax,3
    and ebx,11100000000b        ; green
    shr ebx,6
    and edx,1110000000000000b   ; red
    shr edx,8
    add eax,ebx
    add eax,edx
    mov [edi],al
    add esi,BYTE 2
    inc edi
%endmacro

; ---------------------------------------------------------------------------
; _ConvertX86p16_8RGB332
;
; Converts a run of 16-bit pixels to 8-bit pixels holding the top 3/3/2 bits
; of the three colour fields.
;
; Register interface (custom, not a C calling convention):
;   In:  esi = source pixels      (2 bytes per pixel)
;        edi = destination pixels (1 byte per pixel)
;        ecx = number of pixels; 0 returns immediately
;   Out: esi/edi advanced past the converted pixels
;   Clobbers: eax, ebx, ecx, edx, flags
;
; Runs of 16 pixels or fewer are converted one pixel at a time. Longer runs
; convert single pixels until edi is 4-byte aligned, then four pixels per
; iteration (one 32-bit store), then up to three leftover pixels.
;
; The bulk loop loads all eight source bytes at the top of each iteration.
; It never reads beyond the four pixels it is converting, so the routine does
; not touch memory past the end of the source run.
; ---------------------------------------------------------------------------
_ConvertX86p16_8RGB332:

    ; An empty run must not enter the dec/jnz loop below: ecx would wrap
    ; around and the loop would run 2^32 times.
    test ecx,ecx
    jz .done

    ; check short
    cmp ecx,BYTE 16
    ja .align_head

.short_loop:
    X86P16_RGB332_PIXEL
    dec ecx
    jnz .short_loop
.done:
    retn

.align_head:
    ; convert single pixels until the destination is dword aligned
    ; (at most three, and ecx is above 16 here, so ecx cannot reach zero)
    mov eax,edi
    and eax,BYTE 11b
    jz .bulk_setup
    X86P16_RGB332_PIXEL
    dec ecx
    jmp SHORT .align_head

.bulk_setup:
    ; keep the remaining pixel count for the tail
    push ecx

    ; four pixels per iteration
    shr ecx,2

.bulk_loop:
    ; Gather the low bytes of the four pixels in edx and the high bytes in
    ; ebx. Pixels 0/1 go in first and are shifted to the upper half, so both
    ; registers hold the byte order [p2, p3, p0, p1] (lowest byte first).
    mov dl,[esi+0]
    mov bl,[esi+1]
    mov dh,[esi+2]
    mov bh,[esi+3]
    shl edx,16
    shl ebx,16
    mov dl,[esi+4]
    mov bl,[esi+5]
    mov dh,[esi+6]
    mov bh,[esi+7]

    ; blues: bits 3-4 of each low byte; swap halves, move down to bits 0-1
    and edx,00011000000110000001100000011000b
    ror edx,16+3

    mov eax,ebx                                     ; setup eax for reds

    ; greens: bits 0-2 of each high byte; swap halves, move up to bits 2-4
    and ebx,00000111000001110000011100000111b
    ror ebx,16-2

    ; reds: bits 5-7 of each high byte; swap halves, already in position
    and eax,11100000111000001110000011100000b
    ror eax,16

    add eax,ebx
    add eax,edx
    mov [edi],eax

    add esi,BYTE 8
    add edi,BYTE 4

    dec ecx
    jnz .bulk_loop

    ; tail: convert the 0-3 pixels left over from the saved count
    pop ecx
    and ecx,BYTE 11b
    jz .exit

.tail:
    X86P16_RGB332_PIXEL
    dec ecx
    jnz .tail

.exit:
    retn

; For ELF32 output only: emit an empty .note.GNU-stack section, the usual way
; to tell the GNU linker that this object does not need an executable stack.
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif