src/hermes/x86p_32.asm
author Sam Lantinga
Mon, 06 Feb 2006 08:28:51 +0000
changeset 1330 450721ad5436
parent 1199 2d6dc7de1145
child 1697 393092a3ebf6
permissions -rw-r--r--
It's now possible to build SDL without any C runtime at all on Windows,
using Visual C++ 2005
slouken@0
     1
;
slouken@0
     2
; x86 format converters for HERMES
slouken@0
     3
; Some routines Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
slouken@0
     4
; This source code is licensed under the GNU LGPL
slouken@0
     5
; 
slouken@0
     6
; Please refer to the file COPYING.LIB contained in the distribution for
slouken@0
     7
; licensing conditions		
slouken@0
     8
;
slouken@0
     9
; Most routines are (c) Glenn Fiedler (ptc@gaffer.org), used with permission
slouken@0
    10
; 
slouken@0
    11
slouken@0
    12
	
slouken@0
    13
BITS 32
slouken@0
    14
slouken@0
    15
GLOBAL _ConvertX86p32_32BGR888
slouken@0
    16
GLOBAL _ConvertX86p32_32RGBA888
slouken@0
    17
GLOBAL _ConvertX86p32_32BGRA888
slouken@0
    18
GLOBAL _ConvertX86p32_24RGB888	
slouken@0
    19
GLOBAL _ConvertX86p32_24BGR888
slouken@0
    20
GLOBAL _ConvertX86p32_16RGB565
slouken@0
    21
GLOBAL _ConvertX86p32_16BGR565
slouken@0
    22
GLOBAL _ConvertX86p32_16RGB555
slouken@0
    23
GLOBAL _ConvertX86p32_16BGR555
slouken@0
    24
GLOBAL _ConvertX86p32_8RGB332
slouken@0
    25
slouken@0
    26
EXTERN _x86return
slouken@1166
    27
	
slouken@0
    28
SECTION .text
slouken@0
    29
slouken@0
    30
;; _Convert_*
slouken@0
    31
;; Parameters:
slouken@0
    32
;;   ESI = source 
slouken@0
    33
;;   EDI = dest
slouken@0
    34
;;   ECX = amount (NOT 0!!! (the _ConvertX86 routine checks for that though))
slouken@0
    35
;; Destroys:
slouken@0
    36
;;   EAX, EBX, ECX, EDX (ESI and EDI are advanced past the converted pixels)
slouken@0
    37
slouken@0
    38
slouken@0
    39
;-----------------------------------------------------------------------
; _ConvertX86p32_32BGR888
; Copies ECX dwords from [ESI] to [EDI], exchanging byte 0 and byte 2 of
; each dword (bswap followed by ror 8); bytes 1 and 3 keep their place.
; In:    esi = source, edi = dest, ecx = pixel count (must not be 0)
; Out:   esi/edi advanced past the converted pixels
; Exit:  jmp _x86return
; Clobb: eax, ebx, ecx, edx, flags (ebp is saved and restored)
;-----------------------------------------------------------------------
_ConvertX86p32_32BGR888:

    ; check short: counts of 32 or fewer use the one-pixel loop
    cmp ecx,BYTE 32
    ja .L3

.L1: ; short loop
    mov edx,[esi]
    bswap edx
    ror edx,8
    mov [edi],edx
    add esi,BYTE 4
    add edi,BYTE 4
    dec ecx
    jnz .L1
.L2:
    jmp _x86return

.L3: ; save ebp
    push ebp

    ; unroll four times: ebp = number of 4-pixel groups
    mov ebp,ecx
    shr ebp,2

    ; save count (ecx is reused as a data register in the unrolled loop)
    push ecx

.L4:
    mov eax,[esi]
    mov ebx,[esi+4]

    bswap eax

    bswap ebx

    ror eax,8
    mov ecx,[esi+8]

    ror ebx,8
    mov edx,[esi+12]

    bswap ecx

    bswap edx

    ror ecx,8
    mov [edi+0],eax

    ror edx,8
    mov [edi+4],ebx

    mov [edi+8],ecx
    mov [edi+12],edx

    add esi,BYTE 16
    add edi,BYTE 16

    dec ebp
    jnz .L4

    ; check tail: 0..3 pixels left after the 4-pixel groups
    pop ecx
    and ecx,BYTE 11b
    jz .L6

.L5: ; tail loop
    mov edx,[esi]
    bswap edx
    ror edx,8
    mov [edi],edx
    add esi,BYTE 4
    add edi,BYTE 4
    dec ecx
    jnz .L5

.L6:
    pop ebp
    jmp _x86return
slouken@0
   116
	
slouken@0
   117
slouken@0
   118
	
slouken@0
   119
		
slouken@0
   120
;-----------------------------------------------------------------------
; _ConvertX86p32_32RGBA888
; Copies ECX dwords from [ESI] to [EDI], rotating each dword left by
; 8 bits (the top byte moves to the bottom, the other three move up).
; In:    esi = source, edi = dest, ecx = pixel count (must not be 0)
; Out:   esi/edi advanced past the converted pixels
; Exit:  jmp _x86return
; Clobb: eax, ebx, ecx, edx, flags (ebp is saved and restored)
;-----------------------------------------------------------------------
_ConvertX86p32_32RGBA888:

    ; check short: counts of 32 or fewer use the one-pixel loop
    cmp ecx,BYTE 32
    ja .L3

.L1: ; short loop
    mov edx,[esi]
    rol edx,8
    mov [edi],edx
    add esi,BYTE 4
    add edi,BYTE 4
    dec ecx
    jnz .L1
.L2:
    jmp _x86return

.L3: ; save ebp
    push ebp

    ; unroll four times: ebp = number of 4-pixel groups
    mov ebp,ecx
    shr ebp,2

    ; save count (ecx is reused as a data register in the unrolled loop)
    push ecx

.L4:
    mov eax,[esi]
    mov ebx,[esi+4]

    rol eax,8
    mov ecx,[esi+8]

    rol ebx,8
    mov edx,[esi+12]

    rol ecx,8
    mov [edi+0],eax

    rol edx,8
    mov [edi+4],ebx

    mov [edi+8],ecx
    mov [edi+12],edx

    add esi,BYTE 16
    add edi,BYTE 16

    dec ebp
    jnz .L4

    ; check tail: 0..3 pixels left after the 4-pixel groups
    pop ecx
    and ecx,BYTE 11b
    jz .L6

.L5: ; tail loop
    mov edx,[esi]
    rol edx,8
    mov [edi],edx
    add esi,BYTE 4
    add edi,BYTE 4
    dec ecx
    jnz .L5

.L6:
    pop ebp
    jmp _x86return
slouken@0
   187
slouken@0
   188
	
slouken@0
   189
slouken@0
   190
slouken@0
   191
;-----------------------------------------------------------------------
; _ConvertX86p32_32BGRA888
; Copies ECX dwords from [ESI] to [EDI], reversing the byte order of
; each dword (bswap).
; In:    esi = source, edi = dest, ecx = pixel count (must not be 0)
; Out:   esi/edi advanced past the converted pixels
; Exit:  jmp _x86return
; Clobb: eax, ebx, ecx, edx, flags (ebp is saved and restored)
;-----------------------------------------------------------------------
_ConvertX86p32_32BGRA888:

    ; check short: counts of 32 or fewer use the one-pixel loop
    cmp ecx,BYTE 32
    ja .L3

.L1: ; short loop
    mov edx,[esi]
    bswap edx
    mov [edi],edx
    add esi,BYTE 4
    add edi,BYTE 4
    dec ecx
    jnz .L1
.L2:
    jmp _x86return

.L3: ; save ebp
    push ebp

    ; unroll four times: ebp = number of 4-pixel groups
    mov ebp,ecx
    shr ebp,2

    ; save count (ecx is reused as a data register in the unrolled loop)
    push ecx

.L4:
    mov eax,[esi]
    mov ebx,[esi+4]

    mov ecx,[esi+8]
    mov edx,[esi+12]

    bswap eax

    bswap ebx

    bswap ecx

    bswap edx

    mov [edi+0],eax
    mov [edi+4],ebx

    mov [edi+8],ecx
    mov [edi+12],edx

    add esi,BYTE 16
    add edi,BYTE 16

    dec ebp
    jnz .L4

    ; check tail: 0..3 pixels left after the 4-pixel groups
    pop ecx
    and ecx,BYTE 11b
    jz .L6

.L5: ; tail loop
    mov edx,[esi]
    bswap edx
    mov [edi],edx
    add esi,BYTE 4
    add edi,BYTE 4
    dec ecx
    jnz .L5

.L6:
    pop ebp
    jmp _x86return
slouken@0
   260
slouken@0
   261
slouken@0
   262
	
slouken@0
   263
	
slouken@0
   264
;; 32 bit RGB 888 to 24 BIT RGB 888
;-----------------------------------------------------------------------
; _ConvertX86p32_24RGB888
; Packs ECX 4-byte source pixels into 3-byte destination pixels by
; copying source bytes 0,1,2 of every pixel and dropping byte 3.
; In:    esi = source, edi = dest, ecx = pixel count (must not be 0)
; Out:   esi advanced by 4*count, edi advanced by 3*count
; Exit:  jmp _x86return
; Clobb: eax, ebx, ecx, edx, flags (ebp is saved and restored)
;-----------------------------------------------------------------------
_ConvertX86p32_24RGB888:

    ; check short: counts of 32 or fewer use the one-pixel loop
    cmp ecx,BYTE 32
    ja .L3

.L1: ; short loop
    mov al,[esi]
    mov bl,[esi+1]
    mov dl,[esi+2]
    mov [edi],al
    mov [edi+1],bl
    mov [edi+2],dl
    add esi,BYTE 4
    add edi,BYTE 3
    dec ecx
    jnz .L1
.L2:
    jmp _x86return

.L3: ; head: convert single pixels until edi is dword aligned
    mov edx,edi
    and edx,BYTE 11b
    jz .L4
    mov al,[esi]
    mov bl,[esi+1]
    mov dl,[esi+2]
    mov [edi],al
    mov [edi+1],bl
    mov [edi+2],dl
    add esi,BYTE 4
    add edi,BYTE 3
    dec ecx
    jmp SHORT .L3

.L4: ; unroll 4 times: ebp = number of 4-pixel groups
    push ebp
    mov ebp,ecx
    shr ebp,2

    ; save count (ecx is reused as a data register in the unrolled loop)
    push ecx

    ; Each iteration reads pixels 0..3 (16 bytes) and writes three dwords
    ; (12 bytes). Diagrams show registers as [byte3][byte2][byte1][byte0].
.L5:
    mov eax,[esi]                   ; pixel 0                eax = [A0][R0][G0][B0]
    mov ebx,[esi+4]                 ; pixel 1                ebx = [A1][R1][G1][B1]

    shl eax,8                       ;                        eax = [R0][G0][B0][..]
    mov ecx,[esi+12]                ; pixel 3                ecx = [A3][R3][G3][B3]

    shl ebx,8                       ;                        ebx = [R1][G1][B1][..]
    mov al,[esi+4]                  ;                        eax = [R0][G0][B0][B1]

    ror eax,8                       ;                        eax = [B1][R0][G0][B0] (done)
    mov bh,[esi+8+1]                ;                        ebx = [R1][G1][G2][..]

    mov [edi],eax
    add edi,BYTE 3*4

    shl ecx,8                       ;                        ecx = [R3][G3][B3][..]
    mov bl,[esi+8+0]                ;                        ebx = [R1][G1][G2][B2]

    rol ebx,16                      ;                        ebx = [G2][B2][R1][G1] (done)
    mov cl,[esi+8+2]                ;                        ecx = [R3][G3][B3][R2] (done)

    mov [edi+4-3*4],ebx             ; edi was already advanced, hence -3*4
    add esi,BYTE 4*4

    mov [edi+8-3*4],ecx
    dec ebp

    jnz .L5

    ; check tail: 0..3 pixels left after the 4-pixel groups
    pop ecx
    and ecx,BYTE 11b
    jz .L7

.L6: ; tail loop
    mov al,[esi]
    mov bl,[esi+1]
    mov dl,[esi+2]
    mov [edi],al
    mov [edi+1],bl
    mov [edi+2],dl
    add esi,BYTE 4
    add edi,BYTE 3
    dec ecx
    jnz .L6

.L7:
    pop ebp
    jmp _x86return
slouken@0
   357
slouken@0
   358
slouken@0
   359
slouken@0
   360
slouken@0
   361
;; 32 bit RGB 888 to 24 bit BGR 888
;-----------------------------------------------------------------------
; _ConvertX86p32_24BGR888
; Packs ECX 4-byte source pixels into 3-byte destination pixels, writing
; source bytes 2,1,0 (in that order) and dropping byte 3.
; In:    esi = source, edi = dest, ecx = pixel count (must not be 0)
; Out:   esi advanced by 4*count, edi advanced by 3*count
; Exit:  jmp _x86return
; Clobb: eax, ebx, ecx, edx, flags (ebp is saved and restored)
;-----------------------------------------------------------------------
_ConvertX86p32_24BGR888:

    ; check short: counts of 32 or fewer use the one-pixel loop
    cmp ecx,BYTE 32
    ja .L3

.L1: ; short loop
    mov dl,[esi]
    mov bl,[esi+1]
    mov al,[esi+2]
    mov [edi],al
    mov [edi+1],bl
    mov [edi+2],dl
    add esi,BYTE 4
    add edi,BYTE 3
    dec ecx
    jnz .L1
.L2:
    jmp _x86return

.L3: ; head: convert single pixels until edi is dword aligned
    mov edx,edi
    and edx,BYTE 11b
    jz .L4
    mov dl,[esi]
    mov bl,[esi+1]
    mov al,[esi+2]
    mov [edi],al
    mov [edi+1],bl
    mov [edi+2],dl
    add esi,BYTE 4
    add edi,BYTE 3
    dec ecx
    jmp SHORT .L3

.L4: ; unroll 4 times: ebp = number of 4-pixel groups
    push ebp
    mov ebp,ecx
    shr ebp,2

    ; save count (ecx is reused as a data register in the unrolled loop)
    push ecx

    ; Each iteration reads pixels 0..3 (16 bytes) and writes three dwords
    ; (12 bytes). Diagrams show registers as [byte3][byte2][byte1][byte0].
.L5:
    mov eax,[esi]                   ; pixel 0                eax = [A0][R0][G0][B0]
    mov ebx,[esi+4]                 ; pixel 1                ebx = [A1][R1][G1][B1]

    bswap eax                       ;                        eax = [B0][G0][R0][A0]

    bswap ebx                       ;                        ebx = [B1][G1][R1][A1]

    mov al,[esi+4+2]                ;                        eax = [B0][G0][R0][R1]
    mov bh,[esi+4+4+1]              ;                        ebx = [B1][G1][G2][A1]

    ror eax,8                       ;                        eax = [R1][B0][G0][R0] (done)
    mov bl,[esi+4+4+2]              ;                        ebx = [B1][G1][G2][R2]

    ror ebx,16                      ;                        ebx = [G2][R2][B1][G1] (done)
    mov [edi],eax

    mov [edi+4],ebx
    mov ecx,[esi+12]                ; pixel 3                ecx = [A3][R3][G3][B3]

    bswap ecx                       ;                        ecx = [B3][G3][R3][A3]

    mov cl,[esi+8]                  ;                        ecx = [B3][G3][R3][B2] (done)
    add esi,BYTE 4*4

    mov [edi+8],ecx
    add edi,BYTE 3*4

    dec ebp
    jnz .L5

    ; check tail: 0..3 pixels left after the 4-pixel groups
    pop ecx
    and ecx,BYTE 11b
    jz .L7

.L6: ; tail loop
    mov dl,[esi]
    mov bl,[esi+1]
    mov al,[esi+2]
    mov [edi],al
    mov [edi+1],bl
    mov [edi+2],dl
    add esi,BYTE 4
    add edi,BYTE 3
    dec ecx
    jnz .L6

.L7:
    pop ebp
    jmp _x86return
slouken@0
   458
 
slouken@0
   459
slouken@0
   460
	
slouken@0
   461
		
slouken@0
   462
;; 32 bit RGB 888 to 16 BIT RGB 565
;-----------------------------------------------------------------------
; _ConvertX86p32_16RGB565
; Converts ECX 4-byte pixels (byte 0 = blue, 1 = green, 2 = red) to
; 16-bit words laid out as red[15:11] green[10:5] blue[4:0].
; In:    esi = source, edi = dest, ecx = pixel count (must not be 0)
; Out:   esi advanced by 4*count, edi advanced by 2*count
; Exit:  jmp _x86return
; Clobb: eax, ebx, ecx, edx, flags
;-----------------------------------------------------------------------
_ConvertX86p32_16RGB565:
    ; check short: counts of 16 or fewer use the one-pixel loop
    cmp ecx,BYTE 16
    ja .L3

.L1: ; short loop
    mov bl,[esi+0]    ; blue
    mov al,[esi+1]    ; green
    mov ah,[esi+2]    ; red
    shr ah,3          ; ah = red, top 5 bits
    and al,11111100b  ; al = green, top 6 bits kept in place
    shl eax,3         ; ax = red[15:11] green[10:5]
    shr bl,3          ; bl = blue, top 5 bits
    add al,bl
    mov [edi+0],al
    mov [edi+1],ah
    add esi,BYTE 4
    add edi,BYTE 2
    dec ecx
    jnz .L1

.L2: ; End of short loop
    jmp _x86return

.L3: ; head: if edi is not dword aligned, convert one pixel first
    mov ebx,edi
    and ebx,BYTE 11b
    jz .L4

    mov bl,[esi+0]    ; blue
    mov al,[esi+1]    ; green
    mov ah,[esi+2]    ; red
    shr ah,3
    and al,11111100b
    shl eax,3
    shr bl,3
    add al,bl
    mov [edi+0],al
    mov [edi+1],ah
    add esi,BYTE 4
    add edi,BYTE 2
    dec ecx

.L4:
    ; save count
    push ecx

    ; unroll twice: ecx = number of pixel pairs
    shr ecx,1

    ; point arrays to end
    lea esi,[esi+ecx*8]
    lea edi,[edi+ecx*4]

    ; negative counter, counts up towards zero
    neg ecx
    jmp SHORT .L6

.L5: ; store the pair built by the previous iteration (ecx already incremented)
    mov [edi+ecx*4-4],eax
.L6:
    mov eax,[esi+ecx*8]             ; first pixel of the pair

    shr ah,2                        ; green -> 6 bits
    mov ebx,[esi+ecx*8+4]           ; second pixel of the pair

    shr eax,3                       ; green/blue of first pixel into bits 10..0
    mov edx,[esi+ecx*8+4]           ; second pixel again, for its red byte

    shr bh,2                        ; green -> 6 bits
    mov dl,[esi+ecx*8+2]            ; dl = red of first pixel

    shl ebx,13                      ; green/blue of second pixel into bits 26..16
    and eax,000007FFh

    shl edx,8                       ; both reds into the top byte of each word
    and ebx,07FF0000h

    and edx,0F800F800h              ; keep the top 5 bits of each red
    add eax,ebx

    add eax,edx
    inc ecx

    jnz .L5

    ; store the last pair (ecx is 0 here)
    mov [edi+ecx*4-4],eax

    ; tail: one pixel left when the saved count is odd
    pop ecx
    test cl,1
    jz .L7

    mov bl,[esi+0]    ; blue
    mov al,[esi+1]    ; green
    mov ah,[esi+2]    ; red
    shr ah,3
    and al,11111100b
    shl eax,3
    shr bl,3
    add al,bl
    mov [edi+0],al
    mov [edi+1],ah
    add esi,BYTE 4
    add edi,BYTE 2

.L7:
    jmp _x86return
slouken@0
   573
slouken@0
   574
slouken@0
   575
slouken@0
   576
	
slouken@0
   577
;; 32 bit RGB 888 to 16 BIT BGR 565
;-----------------------------------------------------------------------
; _ConvertX86p32_16BGR565
; Converts ECX 4-byte pixels (byte 0 = blue, 1 = green, 2 = red) to
; 16-bit words laid out as blue[15:11] green[10:5] red[4:0].
; In:    esi = source, edi = dest, ecx = pixel count (must not be 0)
; Out:   esi advanced by 4*count, edi advanced by 2*count
; Exit:  jmp _x86return
; Clobb: eax, ebx, ecx, edx, flags
;-----------------------------------------------------------------------
_ConvertX86p32_16BGR565:

    ; check short: counts of 16 or fewer use the one-pixel loop
    cmp ecx,BYTE 16
    ja .L3

.L1: ; short loop
    mov ah,[esi+0]    ; blue
    mov al,[esi+1]    ; green
    mov bl,[esi+2]    ; red
    shr ah,3          ; ah = blue, top 5 bits
    and al,11111100b  ; al = green, top 6 bits kept in place
    shl eax,3         ; ax = blue[15:11] green[10:5]
    shr bl,3          ; bl = red, top 5 bits
    add al,bl
    mov [edi+0],al
    mov [edi+1],ah
    add esi,BYTE 4
    add edi,BYTE 2
    dec ecx
    jnz .L1
.L2:
    jmp _x86return

.L3: ; head: if edi is not dword aligned, convert one pixel first
    mov ebx,edi
    and ebx,BYTE 11b
    jz .L4
    mov ah,[esi+0]    ; blue
    mov al,[esi+1]    ; green
    mov bl,[esi+2]    ; red
    shr ah,3
    and al,11111100b
    shl eax,3
    shr bl,3
    add al,bl
    mov [edi+0],al
    mov [edi+1],ah
    add esi,BYTE 4
    add edi,BYTE 2
    dec ecx

.L4: ; save count
    push ecx

    ; unroll twice: ecx = number of pixel pairs
    shr ecx,1

    ; point arrays to end
    lea esi,[esi+ecx*8]
    lea edi,[edi+ecx*4]

    ; negative count, counts up towards zero
    neg ecx
    jmp SHORT .L6

.L5: ; store the pair built by the previous iteration (ecx already incremented)
    mov [edi+ecx*4-4],eax
.L6:
    mov edx,[esi+ecx*8+4]           ; second pixel of the pair

    mov bh,[esi+ecx*8+4]            ; bh = blue of second pixel
    mov ah,[esi+ecx*8]              ; ah = blue of first pixel

    shr bh,3
    mov al,[esi+ecx*8+1]            ; al = green of first pixel

    shr ah,3
    mov bl,[esi+ecx*8+5]            ; bl = green of second pixel

    shl eax,3                       ; blue/green of first pixel into bits 15..5
    mov dl,[esi+ecx*8+2]            ; dl = red of first pixel

    shl ebx,19                      ; blue/green of second pixel into bits 31..21
    and eax,0000FFE0h

    shr edx,3                       ; both reds down to 5 bits
    and ebx,0FFE00000h

    and edx,001F001Fh
    add eax,ebx

    add eax,edx
    inc ecx

    jnz .L5

    ; store the last pair (ecx is 0 here)
    mov [edi+ecx*4-4],eax

    ; tail: one pixel left when the saved count is odd
    pop ecx
    and ecx,BYTE 1
    jz .L7
    mov ah,[esi+0]    ; blue
    mov al,[esi+1]    ; green
    mov bl,[esi+2]    ; red
    shr ah,3
    and al,11111100b
    shl eax,3
    shr bl,3
    add al,bl
    mov [edi+0],al
    mov [edi+1],ah
    add esi,BYTE 4
    add edi,BYTE 2

.L7:
    jmp _x86return
slouken@0
   687
slouken@0
   688
slouken@0
   689
	
slouken@0
   690
	
slouken@0
   691
;; 32 BIT RGB TO 16 BIT RGB 555
;-----------------------------------------------------------------------
; _ConvertX86p32_16RGB555
; Converts ECX 4-byte pixels (byte 0 = blue, 1 = green, 2 = red) to
; 16-bit words laid out as red[14:10] green[9:5] blue[4:0].
; In:    esi = source, edi = dest, ecx = pixel count (must not be 0)
; Out:   esi advanced by 4*count, edi advanced by 2*count
; Exit:  jmp _x86return
; Clobb: eax, ebx, ecx, edx, flags
;-----------------------------------------------------------------------
_ConvertX86p32_16RGB555:

    ; check short: counts of 16 or fewer use the one-pixel loop
    cmp ecx,BYTE 16
    ja .L3

.L1: ; short loop
    mov bl,[esi+0]    ; blue
    mov al,[esi+1]    ; green
    mov ah,[esi+2]    ; red
    shr ah,3          ; ah = red, top 5 bits
    and al,11111000b  ; al = green, top 5 bits kept in place
    shl eax,2         ; ax = red[14:10] green[9:5]
    shr bl,3          ; bl = blue, top 5 bits
    add al,bl
    mov [edi+0],al
    mov [edi+1],ah
    add esi,BYTE 4
    add edi,BYTE 2
    dec ecx
    jnz .L1
.L2:
    jmp _x86return

.L3: ; head: if edi is not dword aligned, convert one pixel first
    mov ebx,edi
    and ebx,BYTE 11b
    jz .L4
    mov bl,[esi+0]    ; blue
    mov al,[esi+1]    ; green
    mov ah,[esi+2]    ; red
    shr ah,3
    and al,11111000b
    shl eax,2
    shr bl,3
    add al,bl
    mov [edi+0],al
    mov [edi+1],ah
    add esi,BYTE 4
    add edi,BYTE 2
    dec ecx

.L4: ; save count
    push ecx

    ; unroll twice: ecx = number of pixel pairs
    shr ecx,1

    ; point arrays to end
    lea esi,[esi+ecx*8]
    lea edi,[edi+ecx*4]

    ; negative counter, counts up towards zero
    neg ecx
    jmp SHORT .L6

.L5: ; store the pair built by the previous iteration (ecx already incremented)
    mov [edi+ecx*4-4],eax
.L6:
    mov eax,[esi+ecx*8]             ; first pixel of the pair

    shr ah,3                        ; green -> 5 bits
    mov ebx,[esi+ecx*8+4]           ; second pixel of the pair

    shr eax,3                       ; green/blue of first pixel into bits 9..0
    mov edx,[esi+ecx*8+4]           ; second pixel again, for its red byte

    shr bh,3                        ; green -> 5 bits
    mov dl,[esi+ecx*8+2]            ; dl = red of first pixel

    shl ebx,13                      ; green/blue of second pixel into bits 25..16
    and eax,000007FFh

    shl edx,7                       ; both reds up to bits 14..10 of each word
    and ebx,07FF0000h

    and edx,07C007C00h              ; keep the top 5 bits of each red
    add eax,ebx

    add eax,edx
    inc ecx

    jnz .L5

    ; store the last pair (ecx is 0 here)
    mov [edi+ecx*4-4],eax

    ; tail: one pixel left when the saved count is odd
    pop ecx
    and ecx,BYTE 1
    jz .L7
    mov bl,[esi+0]    ; blue
    mov al,[esi+1]    ; green
    mov ah,[esi+2]    ; red
    shr ah,3
    and al,11111000b
    shl eax,2
    shr bl,3
    add al,bl
    mov [edi+0],al
    mov [edi+1],ah
    add esi,BYTE 4
    add edi,BYTE 2

.L7:
    jmp _x86return
slouken@0
   798
slouken@0
   799
slouken@0
   800
slouken@0
   801
slouken@0
   802
;; 32 BIT RGB TO 16 BIT BGR 555
;-----------------------------------------------------------------------
; _ConvertX86p32_16BGR555
; Converts ECX 4-byte pixels (byte 0 = blue, 1 = green, 2 = red) to
; 16-bit words laid out as blue[14:10] green[9:5] red[4:0].
; In:    esi = source, edi = dest, ecx = pixel count (must not be 0)
; Out:   esi advanced by 4*count, edi advanced by 2*count
; Exit:  jmp _x86return
; Clobb: eax, ebx, ecx, edx, flags
;-----------------------------------------------------------------------
_ConvertX86p32_16BGR555:

    ; check short: counts of 16 or fewer use the one-pixel loop
    cmp ecx,BYTE 16
    ja .L3

.L1: ; short loop
    mov ah,[esi+0]    ; blue
    mov al,[esi+1]    ; green
    mov bl,[esi+2]    ; red
    shr ah,3          ; ah = blue, top 5 bits
    and al,11111000b  ; al = green, top 5 bits kept in place
    shl eax,2         ; ax = blue[14:10] green[9:5]
    shr bl,3          ; bl = red, top 5 bits
    add al,bl
    mov [edi+0],al
    mov [edi+1],ah
    add esi,BYTE 4
    add edi,BYTE 2
    dec ecx
    jnz .L1
.L2:
    jmp _x86return

.L3: ; head: if edi is not dword aligned, convert one pixel first
    mov ebx,edi
    and ebx,BYTE 11b
    jz .L4
    mov ah,[esi+0]    ; blue
    mov al,[esi+1]    ; green
    mov bl,[esi+2]    ; red
    shr ah,3
    and al,11111000b
    shl eax,2
    shr bl,3
    add al,bl
    mov [edi+0],al
    mov [edi+1],ah
    add esi,BYTE 4
    add edi,BYTE 2
    dec ecx

.L4: ; save count
    push ecx

    ; unroll twice: ecx = number of pixel pairs
    shr ecx,1

    ; point arrays to end
    lea esi,[esi+ecx*8]
    lea edi,[edi+ecx*4]

    ; negative counter, counts up towards zero
    neg ecx
    jmp SHORT .L6

.L5: ; store the pair built by the previous iteration (ecx already incremented)
    mov [edi+ecx*4-4],eax
.L6:
    mov edx,[esi+ecx*8+4]           ; second pixel of the pair

    mov bh,[esi+ecx*8+4]            ; bh = blue of second pixel
    mov ah,[esi+ecx*8]              ; ah = blue of first pixel

    shr bh,3
    mov al,[esi+ecx*8+1]            ; al = green of first pixel

    shr ah,3
    mov bl,[esi+ecx*8+5]            ; bl = green of second pixel

    shl eax,2                       ; blue/green of first pixel into bits 14..5
    mov dl,[esi+ecx*8+2]            ; dl = red of first pixel

    shl ebx,18                      ; blue/green of second pixel into bits 30..21
    and eax,00007FE0h

    shr edx,3                       ; both reds down to 5 bits
    and ebx,07FE00000h

    and edx,001F001Fh
    add eax,ebx

    add eax,edx
    inc ecx

    jnz .L5

    ; store the last pair (ecx is 0 here)
    mov [edi+ecx*4-4],eax

    ; tail: one pixel left when the saved count is odd
    pop ecx
    and ecx,BYTE 1
    jz .L7
    mov ah,[esi+0]    ; blue
    mov al,[esi+1]    ; green
    mov bl,[esi+2]    ; red
    shr ah,3
    and al,11111000b
    shl eax,2
    shr bl,3
    add al,bl
    mov [edi+0],al
    mov [edi+1],ah
    add esi,BYTE 4
    add edi,BYTE 2

.L7:
    jmp _x86return
slouken@0
   913
slouken@0
   914
slouken@0
   915
slouken@0
   916
slouken@0
   917
	
slouken@0
   918
;; FROM 32 BIT RGB to 8 BIT RGB (rrrgggbb)
;; This routine writes FOUR pixels at once (dword) and then, if they exist,
;; the one to three trailing pixels
;-----------------------------------------------------------------------
; _ConvertX86p32_8RGB332
; Each output byte is built from one source dword as
;   (byte2 & 0E0h) | ((byte1 & 0E0h) >> 3) | (byte0 >> 6).
; In:    esi = source, edi = dest, ecx = pixel count
; Out:   esi advanced by 4*count, edi advanced by count
; Exit:  jmp _x86return
; Clobb: eax, ebx, ecx, edx, flags
;-----------------------------------------------------------------------
_ConvertX86p32_8RGB332:

.L_ALIGNED:
    push ecx                ; keep the full count for the tail

    shr ecx,2               ; We will draw 4 pixels at once
    jnz .L1

    jmp .L2                 ; short jump out of range :(

.L1:
    mov eax,[esi]           ; first pair of pixels
    mov edx,[esi+4]

    shr dl,6
    mov ebx,eax

    shr al,6
    and ah,0e0h

    shr ebx,16
    and dh,0e0h

    shr ah,3
    and bl,0e0h

    shr dh,3

    or al,bl

    mov ebx,edx
    or al,ah                ; al = pixel 0 done

    shr ebx,16
    or dl,dh

    and bl,0e0h

    or dl,bl

    mov ah,dl               ; ah = pixel 1 done

    mov ebx,[esi+8]         ; second pair of pixels

    mov edx,ebx
    and bh,0e0h

    shr bl,6
    and edx,0e00000h

    shr edx,16

    shr bh,3

    ror eax,16              ; park pixels 0/1 in the upper half of eax
    or bl,dl

    mov edx,[esi+12]
    or bl,bh

    mov al,bl               ; al = pixel 2 done

    mov ebx,edx
    and dh,0e0h

    shr dl,6
    and ebx,0e00000h

    shr dh,3
    mov ah,dl

    shr ebx,16
    or ah,dh

    or ah,bl                ; ah = pixel 3 done

    rol eax,16              ; eax bytes, low to high: pixel 0,1,2,3
    add esi,BYTE 16

    mov [edi],eax
    add edi,BYTE 4

    dec ecx
    jz .L2                  ; L1 out of range for short jump :(

    jmp .L1
.L2:

    pop ecx
    and ecx,BYTE 3          ; mask out number of pixels to draw

    jz .L4                  ; Nothing to do anymore

.L3:
    mov eax,[esi]           ; single pixel conversion for trailing pixels

    mov ebx,eax

    shr al,6
    and ah,0e0h

    shr ebx,16

    shr ah,3
    and bl,0e0h

    or al,ah
    or al,bl

    mov [edi],al

    inc edi
    add esi,BYTE 4

    dec ecx
    jnz .L3

.L4:
    jmp _x86return
icculus@1199
  1043
icculus@1199
  1044
; Emit the GNU-stack note for ELF objects. __OUTPUT_FORMAT__ holds the
; output format name, which can be "elf" or "elf32", so both spellings
; are tested (at most one of them matches).
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif