src/hermes/mmxp2_32.asm
author Sam Lantinga <slouken@libsdl.org>
Sun, 21 Sep 2003 18:32:04 +0000
changeset 720 f90d80d68071
parent 289 77b6110c797d
child 1166 da33b7e6d181
permissions -rw-r--r--
N Sep 17 8791 Sam Lantinga Re: tks source released
Date: Sun, 07 Sep 2003 02:51:58 +0200
From: Stephane Marchesin
Subject: [SDL] Two little patches

Compiling SDL with a recent gcc (gcc 3.3.1, 3.3 doesn't have this
behaviour) gives some nasty warnings :

SDL_blit_A.c: In function `BlitRGBtoRGBSurfaceAlpha128MMX':
SDL_blit_A.c:223: warning: integer constant is too large for "long" type
SDL_blit_A.c:225: warning: integer constant is too large for "long" type
SDL_blit_A.c:227: warning: integer constant is too large for "long" type
[...]

The first attached patch (longlongfix.patch) tells gcc to really treat
those constants as unsigned long long and not long.

The second patch (nasinclude.patch) fixes an include problem I had while
compiling nas audio : when the <audio/audiolib.h> file lies in
/usr/X11R6/include, a -I/usr/X11R6/include option is needed or the file
isn't found.
slouken@0
     1
;
slouken@0
     2
; pII-optimised MMX format converters for HERMES
slouken@0
     3
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
slouken@0
     4
;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
slouken@0
     5
; This source code is licensed under the GNU LGPL
slouken@0
     6
; 
slouken@0
     7
; Please refer to the file COPYING.LIB contained in the distribution for
slouken@0
     8
; licensing conditions		
slouken@0
     9
;
slouken@0
    10
; COPYRIGHT NOTICE
slouken@0
    11
; 
slouken@0
    12
; This file partly contains code that is (c) Intel Corporation, specifically
slouken@0
    13
; the mode detection routine, and the converter to 15 bit (8 pixel
slouken@0
    14
; conversion routine from the mmx programming tutorial pages).
slouken@0
    15
;
slouken@0
    16
;
slouken@0
    17
; These routines aren't exactly pII optimised - it's just that as they
slouken@0
    18
; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
slouken@0
    19
; optimise them for p5 MMXs..
slouken@0
    20
slouken@0
    21
BITS 32
slouken@0
    22
slouken@0
    23
	
slouken@0
    24
GLOBAL _ConvertMMXpII32_24RGB888
slouken@0
    25
GLOBAL _ConvertMMXpII32_16RGB565
slouken@0
    26
GLOBAL _ConvertMMXpII32_16BGR565
slouken@0
    27
GLOBAL _ConvertMMXpII32_16RGB555
slouken@0
    28
GLOBAL _ConvertMMXpII32_16BGR555
slouken@0
    29
slouken@0
    30
EXTERN _mmxreturn
slouken@0
    31
 
slouken@0
    32
SECTION .data

ALIGN 8

;; ---------------------------------------------------------------------
;; 64-bit constants for the converters in this file.  Each one is the
;; same 32-bit pattern stored twice, so a single movq applies it to two
;; source pixels at once.
;; ---------------------------------------------------------------------

; Keeps the low three bytes of each 32-bit pixel, clears the top byte.
mmx32_rgb888_mask       times 2 dd 00ffffffh

; Per-channel masks for the 565 converters: top 5 bits of byte 0,
; top 6 bits of byte 1, top 5 bits of byte 2.
mmx32_rgb565_b          times 2 dd 000000f8h
mmx32_rgb565_g          times 2 dd 0000fc00h
mmx32_rgb565_r          times 2 dd 00f80000h

; Masks for the 555 converters: top 5 bits of bytes 0 and 2 together,
; and top 5 bits of byte 1.
mmx32_rgb555_rb         times 2 dd 00f800f8h
mmx32_rgb555_g          times 2 dd 0000f800h

; pmaddwd multipliers for the 555 converters.  The two 16-bit factors
; in each dword scale the byte-0 and byte-2 fields so that, after the
; common right shift by 6, they land in the RGB555 (first constant) or
; BGR555 (second constant) bit positions.
mmx32_rgb555_mul        times 2 dd 20000008h
mmx32_bgr555_mul        times 2 dd 00082000h
slouken@0
    48
slouken@0
    49
slouken@0
    50
			
slouken@0
    51
SECTION .text
slouken@0
    52
slouken@0
    53
_ConvertMMXpII32_24RGB888:
        ; ---------------------------------------------------------------
        ; 32-bit pixels -> packed 24-bit pixels (top byte of each source
        ; pixel is dropped, the other three bytes are copied in order).
        ;
        ; In:    esi = source, edi = destination, ecx = pixel count
        ; Out:   esi/edi advanced past the converted pixels, then control
        ;        is handed to the external _mmxreturn label
        ; Uses:  eax, ebx, ecx, edx, mm0-mm4, mm6, mm7
        ; NOTE(review): bl is overwritten in the tail loop - confirm that
        ;        _mmxreturn's caller does not rely on ebx being preserved.
        ; ---------------------------------------------------------------

        pxor    mm7, mm7                        ; mm7 = 0, used to zero-extend dwords
        movq    mm6, qword [mmx32_rgb888_mask]  ; mm6 = mask that clears the top byte

        mov     edx, ecx                        ; edx = original pixel count
        and     ecx, 0fffffffch                 ; ecx = count rounded down to a multiple of 4
        jz      .tail                           ; fewer than 4 pixels: scalar path only

.quad:
        ; Lane comments list bytes from most to least significant;
        ; p0..p3 are the four source pixels, each 3 bytes "R G B".
        movq    mm0, [esi]                      ; mm0 = A  p1  | a  p0
        movq    mm1, [esi+8]                    ; mm1 = A  p3  | a  p2
        pand    mm0, mm6                        ; mm0 = 0  p1  | 0  p0
        pand    mm1, mm6                        ; mm1 = 0  p3  | 0  p2

        movq    mm2, mm0
        movq    mm3, mm1
        movq    mm4, mm1

        ; first 8 output bytes: p0, p1 and the two low bytes of p2
        punpckldq mm0, mm7                      ; mm0 = 0 0 0 0 0 [p0]
        punpckhdq mm2, mm7                      ; mm2 = 0 0 0 0 0 [p1]
        psllq   mm3, 48                         ; mm3 = g2 b2 0 0 0 0 0 0
        psllq   mm2, 24                         ; mm2 = 0 0 [p1] 0 0 0
        por     mm0, mm2                        ; mm0 = 0 0 [p1] [p0]
        por     mm0, mm3                        ; mm0 = g2 b2 [p1] [p0]

        ; last 4 output bytes: top byte of p2 followed by p3
        punpckldq mm1, mm7                      ; mm1 = 0 0 0 0 0 [p2]
        punpckhdq mm4, mm7                      ; mm4 = 0 0 0 0 0 [p3]
        psrlq   mm1, 16                         ; mm1 = 0 0 0 0 0 0 0 r2
        psllq   mm4, 8                          ; mm4 = 0 0 0 0 [p3] 0
        por     mm1, mm4                        ; mm1 = 0 0 0 0 [p3] r2

        movq    [edi], mm0                      ; 8 bytes
        add     esi, BYTE 16                    ; consumed 4 source pixels
        movd    [edi+8], mm1                    ; + 4 bytes = 12 bytes = 4 output pixels
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3                     ; ecx = 0..3 leftover pixels
        jz      .done

.single:
        ; copy bytes 0..2 of the source pixel, skip byte 3
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .single

.done:
        jmp     _mmxreturn
slouken@0
   112
slouken@0
   113
slouken@0
   114
slouken@0
   115
_ConvertMMXpII32_16RGB565:
        ; ---------------------------------------------------------------
        ; 32-bit pixels -> 16-bit 5:6:5 pixels.
        ; Output word = (byte2 & F8h) << 8 | (byte1 & FCh) << 3 | byte0 >> 3
        ;
        ; In:    esi = source, edi = destination, ecx = pixel count
        ; Out:   esi/edi advanced past the converted pixels, then control
        ;        is handed to the external _mmxreturn label
        ; Uses:  eax, ebx, ecx, edx, mm0-mm7
        ; NOTE(review): ebx is modified in the tail loop - confirm that
        ;        _mmxreturn's caller does not rely on it being preserved.
        ; ---------------------------------------------------------------

        movq    mm5, [mmx32_rgb565_b]   ; mm5 = byte-0 (blue) mask
        movq    mm6, [mmx32_rgb565_g]   ; mm6 = byte-1 (green) mask
        movq    mm7, [mmx32_rgb565_r]   ; mm7 = byte-2 (red) mask

        mov     edx, ecx                ; edx = original pixel count
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad:
        ; ---- pixels 0 and 1 ----
        movq    mm0, [esi]
        movq    mm1, mm0
        movq    mm3, mm0
        pand    mm1, mm5                ; blue:   000000f8
        pand    mm0, mm6                ; green:  0000fc00
        pand    mm3, mm7                ; red:    00f80000
        pslld   mm1, 2                  ; blue  -> bits 5..9
        por     mm0, mm1                ; green | blue in bits 5..15
        psrld   mm0, 5                  ; green -> bits 5..10, blue -> bits 0..4

        ; ---- pixels 2 and 3 ----
        movq    mm4, [esi+8]
        movq    mm2, mm4
        movq    mm1, mm4                ; mm1 is reused for the red of pixels 2,3
        pand    mm2, mm5
        pand    mm4, mm6
        pand    mm1, mm7
        pslld   mm2, 2
        por     mm4, mm2
        psrld   mm4, 5

        ; ---- combine ----
        ; packuswb squeezes each red dword (00 RR 00 00) into the word RR00h,
        ; i.e. red ends up in bits 11..15 of its 16-bit lane.
        packuswb mm3, mm1
        packssdw mm0, mm4               ; four green|blue words
        por     mm0, mm3                ; four finished 5:6:5 words
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; ecx = 0..3 leftover pixels
        jz      .done

.single:
        mov     al, [esi]               ; blue
        mov     bh, [esi+1]             ; green (bl and the upper half of ebx are masked off below)
        mov     ah, [esi+2]             ; red
        shr     al, 3                   ; blue -> low 5 bits
        and     eax, 0F81Fh             ; keep red (bits 11..15) and blue (bits 0..4)
        shr     ebx, 5                  ; green's top 6 bits -> bits 5..10
        and     ebx, 07E0h
        or      eax, ebx                ; fields are disjoint, so OR merges them
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .single

.done:
        jmp     _mmxreturn
slouken@0
   180
slouken@0
   181
	
slouken@0
   182
_ConvertMMXpII32_16BGR565:
        ; ---------------------------------------------------------------
        ; 32-bit pixels -> 16-bit 5:6:5 pixels with the outer channels
        ; swapped relative to _ConvertMMXpII32_16RGB565.
        ; Output word = (byte0 & F8h) << 8 | (byte1 & FCh) << 3 | byte2 >> 3
        ;
        ; In:    esi = source, edi = destination, ecx = pixel count
        ; Out:   esi/edi advanced past the converted pixels, then control
        ;        is handed to the external _mmxreturn label
        ; Uses:  eax, ebx, ecx, edx, mm0-mm7
        ; NOTE(review): ebx is modified in the tail loop - confirm that
        ;        _mmxreturn's caller does not rely on it being preserved.
        ; ---------------------------------------------------------------

        movq    mm5, [mmx32_rgb565_r]   ; mm5 = byte-2 (red) mask
        movq    mm6, [mmx32_rgb565_g]   ; mm6 = byte-1 (green) mask
        movq    mm7, [mmx32_rgb565_b]   ; mm7 = byte-0 (blue) mask

        mov     edx, ecx                ; edx = original pixel count
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad:
        ; ---- pixels 0 and 1 ----
        movq    mm0, [esi]
        movq    mm1, mm0
        movq    mm3, mm0
        pand    mm0, mm6                ; green:  0000fc00
        pand    mm1, mm5                ; red:    00f80000
        pand    mm3, mm7                ; blue:   000000f8
        psllq   mm3, 16                 ; blue -> 00f80000 in each dword
        psrld   mm1, 14                 ; red  -> bits 5..9
        por     mm0, mm1                ; green | red in bits 5..15
        psrld   mm0, 5                  ; green -> bits 5..10, red -> bits 0..4

        ; ---- pixels 2 and 3 ----
        movq    mm4, [esi+8]
        movq    mm2, mm4
        movq    mm1, mm4                ; mm1 is reused for the blue of pixels 2,3
        pand    mm4, mm6
        pand    mm2, mm5
        pand    mm1, mm7
        psllq   mm1, 16
        psrld   mm2, 14
        por     mm4, mm2
        psrld   mm4, 5

        ; ---- combine ----
        ; packuswb squeezes each blue dword (00 BB 00 00) into the word BB00h,
        ; i.e. blue ends up in bits 11..15 of its 16-bit lane.
        packuswb mm3, mm1
        packssdw mm0, mm4               ; four green|red words
        por     mm0, mm3                ; four finished words
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad

.tail:
        and     edx, BYTE 3             ; edx = 0..3 leftover pixels (doubles as the counter)
        jz      .done

.single:
        mov     al, [esi+2]             ; red
        mov     bh, [esi+1]             ; green (bl and the upper half of ebx are masked off below)
        mov     ah, [esi]               ; blue
        shr     al, 3                   ; red -> low 5 bits
        and     eax, 0F81Fh             ; keep blue (bits 11..15) and red (bits 0..4)
        shr     ebx, 5                  ; green's top 6 bits -> bits 5..10
        and     ebx, 07E0h
        or      eax, ebx                ; fields are disjoint, so OR merges them
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .single

.done:
        jmp     _mmxreturn
slouken@0
   249
slouken@0
   250
; -----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
;
; 32-bit pixels -> 16-bit 5:5:5 pixels.  Both entry points share one body;
; they differ only in the pmaddwd multiplier loaded into mm7, which decides
; whether byte 0 or byte 2 of the source pixel ends up in the low 5 bits.
;
; In:    esi = source, edi = destination, ecx = pixel count
; Out:   esi/edi advanced past the converted pixels, then control is
;        handed to the external _mmxreturn label
; Uses:  eax, ecx, edx, mm0-mm7
;
; Per pixel (one dword lane):
;     t   = pmaddwd(pixel & mmx32_rgb555_rb, mm7)   ; scale bytes 0 and 2
;     out = (t | (pixel & mmx32_rgb555_g)) >> 6     ; low 16 bits are stored
;
; Two properties of this implementation are deliberate:
;   * The leftover (count & 7) pixels go through the same mm6/mm7 formula
;     as the main loop, so they honour the multiplier chosen by the entry
;     point.  A scalar tail with hard-coded RGB555 shifts would swap the
;     outer channels of the last pixels for the BGR555 entry.
;   * The main loop only loads pixels it is about to convert.  A
;     software-pipelined variant that preloads the next group at the end
;     of each iteration reads 16 bytes past the converted data on its
;     final pass.
; -----------------------------------------------------------------------

_ConvertMMXpII32_16BGR555:

        movq    mm7, qword [mmx32_bgr555_mul]
        jmp     _convert_bgr555_cheat

_ConvertMMXpII32_16RGB555:

        movq    mm7, qword [mmx32_rgb555_mul]

_convert_bgr555_cheat:
        movq    mm6, qword [mmx32_rgb555_g]     ; mm6 = byte-1 (green) mask
        movq    mm5, qword [mmx32_rgb555_rb]    ; mm5 = byte-0 / byte-2 mask

        mov     edx, ecx                        ; edx = original pixel count
        and     ecx, 0fffffff8h                 ; ecx = count rounded down to a multiple of 8
        jz      near .L2                        ; fewer than 8 pixels: tail only

.L1:
        ; ---- pixels 0..3 ----
        movq    mm0, [esi]
        movq    mm2, [esi+8]
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5                        ; isolate the two outer channels
        pand    mm3, mm5
        pmaddwd mm1, mm7                        ; scale and sum them into one dword
        pmaddwd mm3, mm7
        pand    mm0, mm6                        ; isolate green
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6                          ; drop to the final 15-bit position
        psrld   mm3, 6
        packssdw mm1, mm3                       ; values are <= 7FFFh, so no saturation
        movq    [edi], mm1

        ; ---- pixels 4..7 ----
        movq    mm0, [esi+16]
        movq    mm2, [esi+24]
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5
        pand    mm3, mm5
        pmaddwd mm1, mm7
        pmaddwd mm3, mm7
        pand    mm0, mm6
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6
        psrld   mm3, 6
        packssdw mm1, mm3
        movq    [edi+8], mm1

        add     esi, BYTE 32
        add     edi, BYTE 16
        sub     ecx, BYTE 8
        jnz     near .L1

.L2:
        mov     ecx, edx
        and     ecx, BYTE 7                     ; ecx = 0..7 leftover pixels
        jz      .L4

.L3:
        ; One pixel at a time, same arithmetic as the main loop so the
        ; channel order follows mm7.
        movd    mm0, [esi]                      ; 32-bit load, upper half of mm0 = 0
        add     esi, BYTE 4
        movq    mm1, mm0
        pand    mm1, mm5
        pmaddwd mm1, mm7
        pand    mm0, mm6
        por     mm0, mm1
        psrld   mm0, 6
        movd    eax, mm0
        mov     [edi], ax
        add     edi, BYTE 2

        dec     ecx
        jnz     .L3

.L4:
        jmp     _mmxreturn
slouken@0
   384
slouken@0
   385
slouken@0
   386