src/hermes/mmxp2_32.asm
author Sam Lantinga
Mon, 06 Feb 2006 08:28:51 +0000
changeset 1330 450721ad5436
parent 1230 88c2d6aed428
child 1697 393092a3ebf6
permissions -rw-r--r--
It's now possible to build SDL without any C runtime at all on Windows,
using Visual C++ 2005
slouken@0
     1
;
slouken@0
     2
; pII-optimised MMX format converters for HERMES
slouken@0
     3
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
slouken@0
     4
;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
slouken@0
     5
; This source code is licensed under the GNU LGPL
slouken@0
     6
; 
slouken@0
     7
; Please refer to the file COPYING.LIB contained in the distribution for
slouken@0
     8
; licensing conditions		
slouken@0
     9
;
slouken@0
    10
; COPYRIGHT NOTICE
slouken@0
    11
; 
slouken@0
    12
; This file partly contains code that is (c) Intel Corporation, specifically
slouken@0
    13
; the mode detection routine, and the converter to 15 bit (8 pixel
slouken@0
    14
; conversion routine from the mmx programming tutorial pages).
slouken@0
    15
;
slouken@0
    16
;
slouken@0
    17
; These routines aren't exactly pII optimised - it's just that as they
slouken@0
    18
; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
slouken@0
    19
; optimise them for p5 MMXs..
slouken@0
    20
slouken@0
    21
; Assemble for 32-bit protected mode.
bits 32

; Converter entry points exported by this file.
global _ConvertMMXpII32_24RGB888
global _ConvertMMXpII32_16RGB565
global _ConvertMMXpII32_16BGR565
global _ConvertMMXpII32_16RGB555
global _ConvertMMXpII32_16BGR555

; Every converter in this file finishes with a jump to this external label.
extern _mmxreturn
slouken@0
    31
 
icculus@1230
    32
;; Macros for conversion routines
;;
;; These build 64-bit MMX constants on the stack from a 32-bit immediate,
;; so the file does not need any constants in a data section.  Every use of
;; load_immq / pand_immq leaves 8 bytes on the stack; the caller must pop
;; them again with CLEANUP_IMMQ_LOADS(n), n = number of outstanding uses.

; _push_immq_mask imm32
;   Push imm32 twice, leaving the quadword imm32:imm32 at [esp].
%macro _push_immq_mask 1
        push    dword %1
        push    dword %1
%endmacro

; load_immq mmreg, imm32
;   mmreg = imm32:imm32.  Leaves 8 bytes on the stack.
%macro load_immq 2
        _push_immq_mask %2
        movq    %1, [esp]
%endmacro

; pand_immq mmreg, imm32
;   mmreg &= imm32:imm32.  Leaves 8 bytes on the stack.
%macro pand_immq 2
        _push_immq_mask %2
        pand    %1, [esp]
%endmacro

; CLEANUP_IMMQ_LOADS(num)
;   Release the stack space of `num` load_immq / pand_immq uses.  The
;   argument is parenthesised so that an expression such as (1+1) scales
;   correctly.  `byte` selects the sign-extended imm8 encoding, so 8*num
;   must not exceed 127.
%define CLEANUP_IMMQ_LOADS(num) add esp, byte 8 * (num)
slouken@0
    51
icculus@1230
    52
; 32-bit immediates consumed by load_immq / pand_immq, which duplicate
; them into both halves of an MMX register.

; 32 -> 24 bit: keep the three low bytes of each pixel.
%define mmx32_rgb888_mask       0x00ffffff

; 32 -> 16 bit (5-6-5): per-channel field masks.
%define mmx32_rgb565_b          0x000000f8
%define mmx32_rgb565_g          0x0000fc00
%define mmx32_rgb565_r          0x00f80000

; 32 -> 15 bit (5-5-5): field masks and the pmaddwd multipliers.
%define mmx32_rgb555_rb         0x00f800f8
%define mmx32_rgb555_g          0x0000f800
%define mmx32_rgb555_mul        0x20000008
%define mmx32_bgr555_mul        0x00082000

section .text
slouken@0
    63
slouken@0
    64
;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
;   Copies the three low bytes of every 32-bit source pixel to a packed
;   24-bit destination, dropping the top byte.
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   esi / edi advanced past the converted pixels; leaves through
;        a jump to _mmxreturn
; Clobb: eax, ebx, ecx, edx (byte parts in the tail), mm0-mm4, mm6, mm7
; Byte diagrams below list the most significant byte first.
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        ; mm6 = 00ffffff:00ffffff (keeps the three low bytes), mm7 = 0
        load_immq mm6, mmx32_rgb888_mask
        CLEANUP_IMMQ_LOADS(1)
        pxor    mm7, mm7

        mov     edx, ecx                ; edx = full pixel count
        and     ecx, 0fffffffch         ; ecx = pixels handled four at a time
        jz      .tail

.quad_loop:
        ; 16 source bytes (pixels 0..3) become 12 destination bytes
        movq    mm0, [esi]              ; A1 R1 G1 B1 a0 r0 g0 b0
        movq    mm1, [esi+8]            ; A3 R3 G3 B3 a2 r2 g2 b2
        pand    mm0, mm6                ; 0  R1 G1 B1 0  r0 g0 b0
        pand    mm1, mm6                ; 0  R3 G3 B3 0  r2 g2 b2

        ; first quadword out: g2 b2 R1 G1 B1 r0 g0 b0
        movq    mm2, mm0
        movq    mm3, mm1
        punpckhdq mm2, mm7              ; 0  0  0  0  0  R1 G1 B1
        punpckldq mm0, mm7              ; 0  0  0  0  0  r0 g0 b0
        psllq   mm2, 24                 ; 0  0  R1 G1 B1 0  0  0
        psllq   mm3, 48                 ; g2 b2 0  0  0  0  0  0
        por     mm0, mm2                ; 0  0  R1 G1 B1 r0 g0 b0
        por     mm0, mm3                ; g2 b2 R1 G1 B1 r0 g0 b0

        ; final doubleword out: R3 G3 B3 r2
        movq    mm4, mm1
        punpckldq mm1, mm7              ; 0  0  0  0  0  r2 g2 b2
        punpckhdq mm4, mm7              ; 0  0  0  0  0  R3 G3 B3
        psrlq   mm1, 16                 ; 0  0  0  0  0  0  0  r2
        psllq   mm4, 8                  ; 0  0  0  0  R3 G3 B3 0
        por     mm1, mm4                ; 0  0  0  0  R3 G3 B3 r2

        movq    [edi], mm0
        movd    [edi+8], mm1
        add     esi, BYTE 16
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad_loop

.tail:
        ; remaining 0..3 pixels, one at a time
        mov     ecx, edx
        and     ecx, BYTE 3
        jz      .done

.pixel_loop:
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .pixel_loop

.done:
        jmp     _mmxreturn
slouken@0
   124
slouken@0
   125
slouken@0
   126
slouken@0
   127
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
;   Packs 32-bit pixels into 16-bit 5-6-5 pixels: source byte 2 supplies
;   bits 11-15, byte 1 bits 5-10 and byte 0 bits 0-4.
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   esi / edi advanced past the converted pixels; leaves through
;        a jump to _mmxreturn
; Clobb: eax, ebx, ecx, edx, mm0-mm7
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        ; mm5 / mm6 / mm7 = b / g / r field masks, one copy per dword
        load_immq mm5, mmx32_rgb565_b
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_r
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; edx = full pixel count
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad_loop:
        movq    mm0, [esi]              ; pixels 0,1
        movq    mm4, [esi+8]            ; pixels 2,3
        movq    mm1, mm0
        movq    mm3, mm0
        movq    mm2, mm4

        ; pixels 0,1
        pand    mm1, mm5                ; 000b
        pand    mm0, mm6                ; 00g0
        pand    mm3, mm7                ; 0r00
        pslld   mm1, 2                  ; 0 0 000000bb bbb00000
        por     mm0, mm1                ; 0 0 ggggggbb bbb00000

        ; pixels 2,3 (mm1 is free again from here on)
        movq    mm1, mm4
        pand    mm2, mm5                ; 000b
        pand    mm4, mm6                ; 00g0
        pand    mm1, mm7                ; 0r00
        pslld   mm2, 2                  ; 0 0 000000bb bbb00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggbbbbb
        por     mm4, mm2                ; 0 0 ggggggbb bbb00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggbbbbb

        ; merge: packuswb drops each r byte into the high byte of its
        ; 16-bit lane, packssdw narrows the four g/b dwords to words
        packuswb mm3, mm1
        packssdw mm0, mm4
        por     mm0, mm3
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quad_loop

.tail:
        ; remaining 0..3 pixels, one at a time
        mov     ecx, edx
        and     ecx, BYTE 3
        jz      .done

.pixel_loop:
        mov     al, [esi]               ; al = b
        mov     bh, [esi+1]             ; bh = g
        mov     ah, [esi+2]             ; ah = r
        shr     al, 3
        and     eax, 0F81Fh             ; rrrrr000 000bbbbb
        shr     ebx, 5                  ; g slides down to bits 3-10
        and     ebx, 07E0h              ; 00000ggg ggg00000 (stale bits of ebx removed)
        add     eax, ebx
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .pixel_loop

.done:
        jmp     _mmxreturn
slouken@0
   193
slouken@0
   194
	
slouken@0
   195
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
;   Packs 32-bit pixels into 16-bit 5-6-5 pixels with the outer channels
;   swapped relative to the RGB565 converter: source byte 0 supplies
;   bits 11-15, byte 1 bits 5-10 and byte 2 bits 0-4.
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   esi / edi advanced past the converted pixels; leaves through
;        a jump to _mmxreturn
; Clobb: eax, ebx, ecx, edx, mm0-mm7
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        ; mm5 / mm6 / mm7 = r / g / b field masks, one copy per dword
        load_immq mm5, mmx32_rgb565_r
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_b
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; edx = full pixel count
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad_loop:
        movq    mm0, [esi]              ; pixels 0,1
        movq    mm4, [esi+8]            ; pixels 2,3
        movq    mm1, mm0
        movq    mm3, mm0
        movq    mm2, mm4

        ; pixels 0,1
        pand    mm1, mm5                ; 0 r 0 0
        pand    mm0, mm6                ; 0 0 g 0
        pand    mm3, mm7                ; 0 0 0 b
        psrld   mm1, 14                 ; 0 0 000000rr rrr00000
        psllq   mm3, 16                 ; 0 b 0 0
        por     mm0, mm1                ; 0 0 ggggggrr rrr00000

        ; pixels 2,3 (mm1 is free again from here on)
        movq    mm1, mm4
        pand    mm2, mm5                ; 0 r 0 0
        pand    mm4, mm6                ; 0 0 g 0
        pand    mm1, mm7                ; 0 0 0 b
        psrld   mm2, 14                 ; 0 0 000000rr rrr00000
        psllq   mm1, 16                 ; 0 b 0 0
        psrld   mm0, 5                  ; 0 0 00000ggg gggrrrrr
        por     mm4, mm2                ; 0 0 ggggggrr rrr00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggrrrrr

        packuswb mm3, mm1               ; BBBBB000 00000000 bbbbb000 00000000
        packssdw mm0, mm4               ; 00000GGG GGGRRRRR 00000ggg gggrrrrr
        por     mm0, mm3                ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad_loop

.tail:
        ; remaining 0..3 pixels, one at a time (edx is the counter here)
        and     edx, BYTE 3
        jz      .done

.pixel_loop:
        mov     al, [esi+2]             ; al = r
        mov     bh, [esi+1]             ; bh = g
        mov     ah, [esi]               ; ah = b
        shr     al, 3
        and     eax, 0F81Fh             ; bbbbb000 000rrrrr
        shr     ebx, 5                  ; g slides down to bits 3-10
        and     ebx, 07E0h              ; 00000ggg ggg00000 (stale bits of ebx removed)
        add     eax, ebx
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .pixel_loop

.done:
        jmp     _mmxreturn
slouken@0
   263
slouken@0
   264
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
;   Pack 32-bit pixels into 15-bit 5-5-5 pixels.  Both entry points share
;   one body; they differ only in the pmaddwd multiplier held in mm7,
;   which decides whether source byte 0 or source byte 2 ends up in the
;   low five bits of the result.
;
;   Per pixel:  t = pmaddwd(pixel & rb_mask, mul) | (pixel & g_mask)
;               out = t >> 6
;     rgb555 mul (20000008h): byte0 -> bits 0-4,   byte2 -> bits 10-14
;     bgr555 mul (00082000h): byte0 -> bits 10-14, byte2 -> bits 0-4
;
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   esi / edi advanced past the converted pixels; leaves through
;        a jump to _mmxreturn
; Clobb: eax, ecx, edx, mm0-mm3, mm5-mm7
;
; Changes against the earlier software-pipelined version:
;   * the leftover (count mod 8) pixels go through the same pmaddwd path
;     as the main loop, so they honour the BGR multiplier; the old scalar
;     tail always produced RGB order, swapping red and blue on the last
;     1..7 pixels of a BGR555 row;
;   * the main loop no longer preloads the next 16 source bytes, so it
;     never reads past the pixels it was asked to convert;
;   * the r/b mask stays in mm5 rather than being rebuilt on the stack
;     four times per iteration.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        ; identical to the RGB555 converter apart from the multiplier
        load_immq mm7, mmx32_bgr555_mul
        jmp     _convert_bgr555_cheat

_ConvertMMXpII32_16RGB555:

        load_immq mm7, mmx32_rgb555_mul

_convert_bgr555_cheat:
        ; one load_immq is already outstanding from the entry point, hence 3
        load_immq mm6, mmx32_rgb555_g           ; mm6 = g field mask
        load_immq mm5, mmx32_rgb555_rb          ; mm5 = r and b field masks
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; edx = full pixel count
        shr     ecx, 3                  ; ecx = number of 8-pixel groups
        jz      .tail

.group_loop:
        ; pixels 0..3 -> [edi]
        movq    mm0, [esi]              ; pixels 0,1
        movq    mm2, [esi+8]            ; pixels 2,3
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5                ; r and b fields
        pand    mm3, mm5
        pmaddwd mm1, mm7                ; move r and b to their target positions (<<6)
        pmaddwd mm3, mm7
        pand    mm0, mm6                ; g field, already in position (<<6)
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6                  ; 15-bit pixel in each dword
        psrld   mm3, 6
        packssdw mm1, mm3               ; values fit in 15 bits, no saturation
        movq    [edi], mm1

        ; pixels 4..7 -> [edi+8]
        movq    mm0, [esi+16]           ; pixels 4,5
        movq    mm2, [esi+24]           ; pixels 6,7
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5
        pand    mm3, mm5
        pmaddwd mm1, mm7
        pmaddwd mm3, mm7
        pand    mm0, mm6
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6
        psrld   mm3, 6
        packssdw mm1, mm3
        movq    [edi+8], mm1

        add     esi, BYTE 32
        add     edi, BYTE 16
        dec     ecx
        jnz     .group_loop

.tail:
        ; remaining 0..7 pixels, one at a time, same arithmetic as above
        mov     ecx, edx
        and     ecx, BYTE 7
        jz      .done

.pixel_loop:
        movd    mm0, [esi]              ; one pixel, upper half of mm0 zeroed
        movq    mm1, mm0
        pand    mm1, mm5
        pmaddwd mm1, mm7
        pand    mm0, mm6
        por     mm1, mm0
        psrld   mm1, 6
        movd    eax, mm1
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .pixel_loop

.done:
        jmp     _mmxreturn
slouken@0
   403
icculus@1199
   404
; Emit an empty .note.GNU-stack section for ELF objects so the linker does
; not have to assume this object needs an executable stack.  NASM 2.x is
; understood to report the format name as "elf32" even when invoked with
; "-f elf", so both spellings are accepted.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif