src/hermes/mmxp2_32.asm
author Ryan C. Gordon
Wed, 29 Nov 2006 10:30:05 +0000
branchSDL-1.2
changeset 3900 ce3a2bd11305
parent 1873 eb4d9d99849b
child 2134 180fa05e98e2
permissions -rw-r--r--
Wrapped some macro params in parentheses for alloca wrappers.
Thanks, Suzuki Masahiro.
slouken@0
     1
;
slouken@0
     2
; pII-optimised MMX format converters for HERMES
slouken@0
     3
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
slouken@0
     4
;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
slouken@0
     5
; This source code is licensed under the GNU LGPL
slouken@0
     6
; 
slouken@0
     7
; Please refer to the file COPYING.LIB contained in the distribution for
slouken@0
     8
; licensing conditions		
slouken@0
     9
;
slouken@0
    10
; COPYRIGHT NOTICE
slouken@0
    11
; 
slouken@0
    12
; This file partly contains code that is (c) Intel Corporation, specifically
slouken@0
    13
; the mode detection routine, and the converter to 15 bit (8 pixel
slouken@0
    14
; conversion routine from the mmx programming tutorial pages).
slouken@0
    15
;
slouken@0
    16
;
slouken@0
    17
; These routines aren't exactly pII optimised - it's just that as they
slouken@0
    18
; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
slouken@0
    19
; optimise them for p5 MMXs..
slouken@0
    20
slouken@0
    21
BITS 32
slouken@0
    22
slouken@1873
    23
%include "common.inc"
slouken@0
    24
	
slouken@1871
    25
SDL_FUNC _ConvertMMXpII32_24RGB888
slouken@1871
    26
SDL_FUNC _ConvertMMXpII32_16RGB565
slouken@1871
    27
SDL_FUNC _ConvertMMXpII32_16BGR565
slouken@1871
    28
SDL_FUNC _ConvertMMXpII32_16RGB555
slouken@1871
    29
SDL_FUNC _ConvertMMXpII32_16BGR555
slouken@0
    30
slouken@0
    31
EXTERN _mmxreturn
slouken@0
    32
 
icculus@1230
    33
;; Macros for conversion routines
;;
;; The helpers below build a 64-bit MMX operand on the stack from a 32-bit
;; immediate instead of referencing a constant in memory.  Every use of
;; load_immq / pand_immq leaves 8 bytes on the stack; the caller must
;; release them afterwards with CLEANUP_IMMQ_LOADS(n), where n is the
;; number of load_immq / pand_immq expansions still outstanding.

; _push_immq_mask imm32
;   Pushes imm32 twice, so [esp] holds the qword imm32:imm32.
;   Lowers esp by 8.
%macro _push_immq_mask 1
	push dword %1
	push dword %1
%endmacro

; load_immq mmreg, imm32
;   mmreg = imm32:imm32.  Leaves 8 bytes on the stack.
%macro load_immq 2
	_push_immq_mask %2
	movq %1, [esp]
%endmacro

; pand_immq mmreg, imm32
;   mmreg &= imm32:imm32.  Leaves 8 bytes on the stack.
%macro pand_immq 2
	_push_immq_mask %2
	pand %1, [esp]
%endmacro

; CLEANUP_IMMQ_LOADS(num)
;   Releases the stack space of `num` outstanding load_immq / pand_immq
;   expansions.  The argument is parenthesised so that an expression such
;   as CLEANUP_IMMQ_LOADS(1+1) releases 16 bytes rather than 8*1+1.
;   Note: expands to an `add`, so it modifies the flags.
%define CLEANUP_IMMQ_LOADS(num) add esp, byte 8 * (num)
slouken@0
    52
icculus@1230
    53
; 32-bit immediates used by the converters.  load_immq / pand_immq
; duplicate each value into both halves of an MMX register, so every
; constant applies to two source pixels at once.

; 32 -> 24 bit: keep the low three bytes of each source pixel
%define mmx32_rgb888_mask       0x00ffffff

; 32 -> 16 bit (5-6-5): per-channel masks on the 32-bit source pixel
%define mmx32_rgb565_b          0x000000f8      ; top 5 bits of byte 0
%define mmx32_rgb565_g          0x0000fc00      ; top 6 bits of byte 1
%define mmx32_rgb565_r          0x00f80000      ; top 5 bits of byte 2

; 32 -> 15 bit (5-5-5)
%define mmx32_rgb555_rb         0x00f800f8      ; top 5 bits of bytes 0 and 2
%define mmx32_rgb555_g          0x0000f800      ; top 5 bits of byte 1
; pmaddwd multipliers (high word : low word); they decide which of the two
; masked bytes ends up in the low bits of the result.
%define mmx32_rgb555_mul        0x20000008
%define mmx32_bgr555_mul        0x00082000
slouken@0
    62
slouken@0
    63
SECTION .text
slouken@0
    64
slouken@0
    65
_ConvertMMXpII32_24RGB888:
        ; 32-bit pixels -> packed 24-bit pixels (the top byte of every
        ; source pixel is dropped).
        ;
        ; Registers as used by this routine:
        ;   esi = source (4 bytes per pixel), advanced past the data read
        ;   edi = destination (3 bytes per pixel), advanced past the output
        ;   ecx = pixel count
        ; Clobbers eax(al), ebx(bl), ecx, edx, mm0-mm4, mm6, mm7, flags.
        ; Leaves through the external label _mmxreturn.

        load_immq mm6, mmx32_rgb888_mask        ; mm6 = 24-bit mask, twice
        CLEANUP_IMMQ_LOADS(1)
        pxor    mm7, mm7                        ; mm7 = 0

        mov     edx, ecx                        ; edx = full pixel count
        and     ecx, 0fffffffch                 ; ecx = count rounded down to a multiple of 4
        jz      near .tail                      ; fewer than 4 pixels: scalar path only

.quad:
        ; Four pixels (16 bytes) in, 12 bytes out.
        movq    mm0, [esi]                      ; pixel 1 : pixel 0
        pand    mm0, mm6                        ; clear the top byte of both
        movq    mm1, [esi+8]                    ; pixel 3 : pixel 2
        pand    mm1, mm6

        movq    mm2, mm0
        punpckldq mm0, mm7                      ; mm0 = pixel 0 in bytes 0-2
        punpckhdq mm2, mm7                      ; mm2 = pixel 1 in bytes 0-2
        psllq   mm2, 24                         ; pixel 1 -> bytes 3-5
        por     mm0, mm2                        ; bytes 0-5 = pixel 0, pixel 1

        movq    mm3, mm1
        movq    mm4, mm1
        psllq   mm3, 48                         ; first two bytes of pixel 2 -> bytes 6-7
        por     mm0, mm3                        ; first 8 output bytes complete

        punpckhdq mm4, mm7                      ; mm4 = pixel 3 in bytes 0-2
        punpckldq mm1, mm7                      ; mm1 = pixel 2 in bytes 0-2
        psrlq   mm1, 16                         ; third byte of pixel 2 -> byte 0
        psllq   mm4, 8                          ; pixel 3 -> bytes 1-3
        por     mm1, mm4                        ; low dword = last 4 output bytes

        movq    [edi], mm0
        add     esi, BYTE 16
        movd    [edi+8], mm1
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3                     ; 0-3 leftover pixels
        jz      .done

.single:
        ; Copy the three low bytes of one pixel.
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .single

.done:
        jmp     _mmxreturn
slouken@0
   125
slouken@0
   126
slouken@0
   127
slouken@0
   128
_ConvertMMXpII32_16RGB565:
        ; 32-bit pixels -> 16-bit 5-6-5 pixels, source byte 0 in the low
        ; five bits of each output word and source byte 2 in the top five.
        ;
        ; Registers as used by this routine:
        ;   esi = source (4 bytes per pixel), advanced past the data read
        ;   edi = destination (2 bytes per pixel), advanced past the output
        ;   ecx = pixel count
        ; Clobbers eax, ebx, ecx, edx, mm0-mm7, flags.
        ; Leaves through the external label _mmxreturn.

        load_immq mm5, mmx32_rgb565_b           ; mm5 = byte-0 mask
        load_immq mm6, mmx32_rgb565_g           ; mm6 = byte-1 mask
        load_immq mm7, mmx32_rgb565_r           ; mm7 = byte-2 mask
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                        ; edx = full pixel count
        shr     ecx, 2                          ; ecx = number of 4-pixel groups
        jz      near .tail

.quad:
        ; --- pixels 0 and 1 ---
        movq    mm0, [esi]
        movq    mm1, mm0
        movq    mm3, mm0
        pand    mm0, mm6                        ; byte 1, top 6 bits (bits 10-15)
        pand    mm1, mm5                        ; byte 0, top 5 bits (bits 3-7)
        pand    mm3, mm7                        ; byte 2, top 5 bits (bits 19-23)
        pslld   mm1, 2                          ; byte-0 field -> bits 5-9
        por     mm0, mm1                        ; bits 5-15 hold both fields
        psrld   mm0, 5                          ; -> bits 0-10 of each dword

        ; --- pixels 2 and 3 ---
        movq    mm4, [esi+8]
        movq    mm2, mm4
        movq    mm1, mm4
        pand    mm4, mm6
        pand    mm2, mm5
        pand    mm1, mm7
        pslld   mm2, 2
        por     mm4, mm2
        psrld   mm4, 5

        ; --- merge four pixels into four words ---
        packuswb mm3, mm1                       ; byte-2 field -> top 5 bits of each word
        packssdw mm0, mm4                       ; low 11 bits of each word
        por     mm0, mm3
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3                     ; 0-3 leftover pixels
        jz      .done

.single:
        mov     al, [esi]                       ; al = source byte 0
        mov     bh, [esi+1]                     ; bh = source byte 1
        mov     ah, [esi+2]                     ; ah = source byte 2
        shr     al, 3                           ; byte 0 -> 5 bits
        and     eax, 0F81Fh                     ; keep top 5 bits of ah, low 5 of al
        shr     ebx, 5                          ; byte-1 field -> bits 5-10
        and     ebx, 07E0h                      ; discard everything else in ebx
        add     eax, ebx                        ; fields do not overlap, so add == or
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .single

.done:
        jmp     _mmxreturn
slouken@0
   194
slouken@0
   195
	
slouken@0
   196
_ConvertMMXpII32_16BGR565:
        ; 32-bit pixels -> 16-bit 5-6-5 pixels with the outer channels
        ; swapped: source byte 2 lands in the low five bits of each output
        ; word and source byte 0 in the top five.
        ;
        ; Registers as used by this routine:
        ;   esi = source (4 bytes per pixel), advanced past the data read
        ;   edi = destination (2 bytes per pixel), advanced past the output
        ;   ecx = pixel count
        ; Clobbers eax, ebx, ecx, edx, mm0-mm7, flags.
        ; Leaves through the external label _mmxreturn.

        load_immq mm5, mmx32_rgb565_r           ; mm5 = byte-2 mask
        load_immq mm6, mmx32_rgb565_g           ; mm6 = byte-1 mask
        load_immq mm7, mmx32_rgb565_b           ; mm7 = byte-0 mask
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                        ; edx = full pixel count
        shr     ecx, 2                          ; ecx = number of 4-pixel groups
        jz      near .tail

.quad:
        ; --- pixels 0 and 1 ---
        movq    mm0, [esi]
        movq    mm1, mm0
        movq    mm3, mm0
        pand    mm0, mm6                        ; byte 1, top 6 bits (bits 10-15)
        pand    mm1, mm5                        ; byte 2, top 5 bits (bits 19-23)
        pand    mm3, mm7                        ; byte 0, top 5 bits (bits 3-7)

        psllq   mm3, 16                         ; byte-0 field -> bits 19-23 of each dword
        psrld   mm1, 14                         ; byte-2 field -> bits 5-9
        por     mm0, mm1                        ; bits 5-15 hold both fields
        psrld   mm0, 5                          ; -> bits 0-10 of each dword

        ; --- pixels 2 and 3 ---
        movq    mm4, [esi+8]
        movq    mm2, mm4
        movq    mm1, mm4
        pand    mm4, mm6
        pand    mm2, mm5
        pand    mm1, mm7

        psllq   mm1, 16
        psrld   mm2, 14
        por     mm4, mm2
        psrld   mm4, 5

        ; --- merge four pixels into four words ---
        packuswb mm3, mm1                       ; byte-0 field -> top 5 bits of each word
        packssdw mm0, mm4                       ; low 11 bits of each word
        por     mm0, mm3
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad

.tail:
        and     edx, BYTE 3                     ; edx = 0-3 leftover pixels
        jz      .done

.single:
        mov     al, [esi+2]                     ; al = source byte 2
        mov     bh, [esi+1]                     ; bh = source byte 1
        mov     ah, [esi]                       ; ah = source byte 0
        shr     al, 3                           ; byte 2 -> 5 bits
        and     eax, 0F81Fh                     ; keep top 5 bits of ah, low 5 of al
        shr     ebx, 5                          ; byte-1 field -> bits 5-10
        and     ebx, 07E0h                      ; discard everything else in ebx
        add     eax, ebx                        ; fields do not overlap, so add == or
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .single

.done:
        jmp     _mmxreturn
slouken@0
   264
slouken@0
   265
_ConvertMMXpII32_16BGR555:

        ; The 16BGR555 converter is identical to the RGB555 one, except
        ; that it uses a different multiplier for the pmaddwd instruction
        ; (which swaps where source bytes 0 and 2 end up).
        load_immq mm7, mmx32_bgr555_mul
        jmp     _convert_bgr555_cheat

; 32-bit pixels -> 16-bit 5-5-5 pixels.
;
; Registers as used by this code:
;   esi = source (4 bytes per pixel), advanced past the data read
;   edi = destination (2 bytes per pixel), advanced past the output
;   ecx = pixel count
; Clobbers eax, ecx, edx, mm0-mm7, flags.
; Leaves through the external label _mmxreturn.
;
; How one pixel is converted (two pixels per MMX register):
;   (pixel & rb_mask) pmaddwd mul   -> bytes 0 and 2 shifted to bits 6-10
;                                      and 16-20 (which one goes where is
;                                      decided by the multiplier in mm7)
;   | (pixel & g_mask)              -> byte 1 field in bits 11-15
;   >> 6                            -> 15-bit result in bits 0-14
;
; Differences from the earlier software-pipelined version of this loop:
;   * The leftover (count & 7) pixels go through the same pmaddwd path as
;     the main loop.  The old scalar tail always produced the RGB555
;     layout, so the BGR555 entry point converted its last pixels with
;     bytes 0 and 2 swapped relative to the main loop.
;   * The loop no longer preloads the next group of pixels, so it never
;     reads beyond the pixels it converts.
;   * The red/blue mask is loaded once into mm5 instead of being pushed on
;     the stack four times per iteration.
_ConvertMMXpII32_16RGB555:

        load_immq mm7, mmx32_rgb555_mul

_convert_bgr555_cheat:
        ; Entered with the multiplier already in mm7 and 8 bytes from that
        ; load_immq still on the stack.
        load_immq mm6, mmx32_rgb555_g           ; mm6 = byte-1 mask
        load_immq mm5, mmx32_rgb555_rb          ; mm5 = byte-0 / byte-2 mask
        CLEANUP_IMMQ_LOADS(3)                   ; mm7 + mm6 + mm5 loads

        mov     edx, ecx                        ; edx = full pixel count
        and     ecx, BYTE -8                    ; ecx = count rounded down to a multiple of 8
        jz      near .tail

.oct:
        ; --- pixels 0-3 ---
        movq    mm0, [esi]
        movq    mm2, [esi+8]
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5
        pand    mm3, mm5
        pmaddwd mm1, mm7
        pmaddwd mm3, mm7
        pand    mm0, mm6
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6
        psrld   mm3, 6
        packssdw mm1, mm3                       ; values are <= 7fffh, so no saturation
        movq    [edi], mm1

        ; --- pixels 4-7 ---
        movq    mm0, [esi+16]
        movq    mm2, [esi+24]
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5
        pand    mm3, mm5
        pmaddwd mm1, mm7
        pmaddwd mm3, mm7
        pand    mm0, mm6
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6
        psrld   mm3, 6
        packssdw mm1, mm3
        movq    [edi+8], mm1

        add     esi, BYTE 32
        add     edi, BYTE 16
        sub     ecx, BYTE 8
        jnz     near .oct

.tail:
        mov     ecx, edx
        and     ecx, BYTE 7                     ; 0-7 leftover pixels
        jz      .done

.single:
        ; One pixel through the same sequence as above; mm5-mm7 still hold
        ; the masks and the entry point's multiplier.
        movd    mm0, [esi]
        add     esi, BYTE 4
        movq    mm1, mm0
        pand    mm1, mm5
        pmaddwd mm1, mm7
        pand    mm0, mm6
        por     mm1, mm0
        psrld   mm1, 6
        movd    eax, mm1
        mov     [edi], ax
        add     edi, BYTE 2

        dec     ecx
        jnz     .single

.done:
        jmp     _mmxreturn
slouken@0
   404
icculus@1199
   405
%ifidn __OUTPUT_FORMAT__,elf
icculus@1199
   406
section .note.GNU-stack noalloc noexec nowrite progbits
icculus@1199
   407
%endif