src/hermes/mmxp2_32.asm
author Sam Lantinga <slouken@libsdl.org>
Sun, 21 Sep 2003 18:32:04 +0000
changeset 720 f90d80d68071
parent 289 77b6110c797d
child 1166 da33b7e6d181
permissions -rw-r--r--
N Sep 17 8791 Sam Lantinga Re: tks source released
Date: Sun, 07 Sep 2003 02:51:58 +0200
From: Stephane Marchesin
Subject: [SDL] Two little patches

Compiling SDL with a recent gcc (gcc 3.3.1, 3.3 doesn't have this
behaviour) gives some nasty warnings :

SDL_blit_A.c: In function `BlitRGBtoRGBSurfaceAlpha128MMX':
SDL_blit_A.c:223: warning: integer constant is too large for "long" type
SDL_blit_A.c:225: warning: integer constant is too large for "long" type
SDL_blit_A.c:227: warning: integer constant is too large for "long" type
[...]

The first attached patch (longlongfix.patch) tells gcc to really treat
those constants as unsigned long long and not long.

The second patch (nasinclude.patch) fixes an include problem I had while
compiling nas audio : when the <audio/audiolib.h> file lies in
/usr/X11R6/include, a -I/usr/X11R6/include option is needed or the file
isn't found.
     1 ;
     2 ; pII-optimised MMX format converters for HERMES
     3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
     4 ;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
     5 ; This source code is licensed under the GNU LGPL
     6 ; 
     7 ; Please refer to the file COPYING.LIB contained in the distribution for
     8 ; licensing conditions		
     9 ;
    10 ; COPYRIGHT NOTICE
    11 ; 
    12 ; This file partly contains code that is (c) Intel Corporation, specifically
    13 ; the mode detection routine, and the converter to 15 bit (8 pixel
    14 ; conversion routine from the mmx programming tutorial pages).
    15 ;
    16 ;
    17 ; These routines aren't exactly pII optimised - it's just that as they
    18 ; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
    19 ; optimise them for p5 MMXs..
    20 
    ; Assembler mode, exported entry points and the 64-bit MMX constants
    ; used by the converters in this file.
BITS 32

; Converter entry points exported to the rest of the library.
GLOBAL _ConvertMMXpII32_24RGB888
GLOBAL _ConvertMMXpII32_16RGB565
GLOBAL _ConvertMMXpII32_16BGR565
GLOBAL _ConvertMMXpII32_16RGB555
GLOBAL _ConvertMMXpII32_16BGR555

; Every converter leaves through this label, which is defined elsewhere.
EXTERN _mmxreturn

SECTION .data

; Each constant below is one qword (the same dword twice, one per pixel),
; so all of them stay 8-byte aligned after this directive.
ALIGN 8

; Keeps the low three bytes of each 32-bit pixel.
mmx32_rgb888_mask   dd 0x00ffffff, 0x00ffffff

; 565 channel masks: top 5 bits of byte 0, top 6 bits of byte 1,
; top 5 bits of byte 2.
mmx32_rgb565_b      dd 0x000000f8, 0x000000f8
mmx32_rgb565_g      dd 0x0000fc00, 0x0000fc00
mmx32_rgb565_r      dd 0x00f80000, 0x00f80000

; 555 masks: top 5 bits of bytes 0 and 2 together, and top 5 bits of byte 1.
mmx32_rgb555_rb     dd 0x00f800f8, 0x00f800f8
mmx32_rgb555_g      dd 0x0000f800, 0x0000f800

; pmaddwd multipliers (low word, high word of each dword):
;   rgb555: low word * 0x0008, high word * 0x2000
;   bgr555: low word * 0x2000, high word * 0x0008
mmx32_rgb555_mul    dd 0x20000008, 0x20000008
mmx32_bgr555_mul    dd 0x00082000, 0x00082000


SECTION .text
    52 
    ;-----------------------------------------------------------------------
    ; _ConvertMMXpII32_24RGB888
    ; Packs 32-bit pixels into 24-bit pixels by dropping byte 3 of every
    ; source dword.
    ; In:    esi = source, edi = destination, ecx = pixel count
    ; Out:   esi advanced 4 bytes and edi 3 bytes per pixel;
    ;        leaves via jmp _mmxreturn
    ; Clobb: al, bl, ecx, edx, mm0-mm4, mm6, mm7, flags
    ;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        movq      mm6, qword [mmx32_rgb888_mask]   ; mm6 = 00ffffff 00ffffff
        pxor      mm7, mm7                         ; mm7 = 0

        mov       edx, ecx                  ; keep the full count for the tail
        and       ecx, 0fffffffch           ; ecx = count rounded down to 4
        jz        near .tail                ; fewer than 4 pixels: tail only

        ; Four pixels (16 bytes in, 12 bytes out) per pass.  Byte diagrams
        ; read high byte -> low byte; capitals are the odd-numbered pixel.
.quad:
        movq      mm0, [esi]                ; A R G B a r g b   (pixels 1,0)
        pand      mm0, mm6                  ; 0 R G B 0 r g b
        movq      mm1, [esi+8]              ; A R G B a r g b   (pixels 3,2)
        pand      mm1, mm6                  ; 0 R G B 0 r g b

        movq      mm2, mm0
        punpckhdq mm2, mm7                  ; 0 0 0 0 0 R G B   (pixel 1)
        punpckldq mm0, mm7                  ; 0 0 0 0 0 r g b   (pixel 0)
        psllq     mm2, 24                   ; 0 0 R G B 0 0 0
        por       mm0, mm2                  ; 0 0 R G B r g b

        movq      mm3, mm1
        psllq     mm3, 48                   ; g b 0 0 0 0 0 0   (pixel 2, low 2 bytes)
        por       mm0, mm3                  ; g b R G B r g b   = output bytes 0..7

        movq      mm4, mm1
        punpckhdq mm4, mm7                  ; 0 0 0 0 0 R G B   (pixel 3)
        punpckldq mm1, mm7                  ; 0 0 0 0 0 r g b   (pixel 2)
        psrlq     mm1, 16                   ; 0 0 0 0 0 0 0 r
        psllq     mm4, 8                    ; 0 0 0 0 R G B 0
        por       mm1, mm4                  ; 0 0 0 0 R G B r   = output bytes 8..11

        movq      [edi], mm0
        add       esi, BYTE 16
        movd      [edi+8], mm1
        add       edi, BYTE 12
        sub       ecx, BYTE 4
        jnz       .quad

.tail:
        mov       ecx, edx
        and       ecx, BYTE 3               ; 0..3 pixels left over
        jz        .done

        ; One pixel per pass: copy bytes 0..2 and skip byte 3.
.single:
        mov       al, [esi]
        mov       bl, [esi+1]
        mov       dl, [esi+2]
        mov       [edi], al
        mov       [edi+1], bl
        mov       [edi+2], dl
        add       esi, BYTE 4
        add       edi, BYTE 3
        dec       ecx
        jnz       .single

.done:
        jmp       _mmxreturn
   112 
   113 
   114 
   ;-----------------------------------------------------------------------
   ; _ConvertMMXpII32_16RGB565
   ; Converts 32-bit pixels to 16-bit words laid out as
   ;   bits 15..11 = top 5 bits of source byte 2
   ;   bits 10..5  = top 6 bits of source byte 1
   ;   bits  4..0  = top 5 bits of source byte 0
   ; In:    esi = source, edi = destination, ecx = pixel count
   ; Out:   esi advanced 4 bytes and edi 2 bytes per pixel;
   ;        leaves via jmp _mmxreturn
   ; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
   ;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        movq      mm5, [mmx32_rgb565_b]     ; mm5 = byte-0 mask
        movq      mm6, [mmx32_rgb565_g]     ; mm6 = byte-1 mask
        movq      mm7, [mmx32_rgb565_r]     ; mm7 = byte-2 mask

        mov       edx, ecx                  ; keep the full count for the tail
        shr       ecx, 2                    ; ecx = number of 4-pixel groups
        jz        near .tail

        ; Four pixels per pass, handled as two pairs.
.quad:
        ; -- pixels 0,1 --
        movq      mm0, [esi]
        movq      mm1, mm0
        pand      mm0, mm6                  ; byte-1 field at bits 15..10
        movq      mm3, mm1
        pand      mm1, mm5                  ; byte-0 field at bits 7..3
        pand      mm3, mm7                  ; byte-2 field at bits 23..19
        pslld     mm1, 2                    ; byte-0 field -> bits 9..5
        por       mm0, mm1                  ; bits 15..5 = byte 1 | byte 0
        psrld     mm0, 5                    ; -> bits 10..0

        ; -- pixels 2,3 --
        movq      mm4, [esi+8]
        movq      mm2, mm4
        pand      mm4, mm6
        movq      mm1, mm2
        pand      mm2, mm5
        pand      mm1, mm7                  ; byte-2 field of pixels 2,3
        pslld     mm2, 2
        por       mm4, mm2
        psrld     mm4, 5

        ; The byte-2 fields sit in byte 2 of each dword, so packing words
        ; to bytes puts them in the high byte of each 16-bit result.
        packuswb  mm3, mm1
        packssdw  mm0, mm4                  ; four 11-bit values, one per word
        por       mm0, mm3
        movq      [edi], mm0

        add       esi, BYTE 16
        add       edi, BYTE 8
        dec       ecx
        jnz       .quad

.tail:
        mov       ecx, edx
        and       ecx, BYTE 3               ; 0..3 pixels left over
        jz        .done

        ; One pixel per pass in integer registers.
.single:
        mov       al, [esi]                 ; al = byte 0
        mov       bh, [esi+1]               ; bh = byte 1
        mov       ah, [esi+2]               ; ah = byte 2
        shr       al, 3
        and       eax, 0F81Fh               ; keep bits 15..11 and 4..0
        shr       ebx, 5                    ; byte 1 -> bits 10..3
        and       ebx, 07E0h                ; keep bits 10..5
        add       eax, ebx
        mov       [edi], al
        mov       [edi+1], ah
        add       esi, BYTE 4
        add       edi, BYTE 2
        dec       ecx
        jnz       .single

.done:
        jmp       _mmxreturn
   180 
   181 	
   ;-----------------------------------------------------------------------
   ; _ConvertMMXpII32_16BGR565
   ; Converts 32-bit pixels to 16-bit words laid out as
   ;   bits 15..11 = top 5 bits of source byte 0
   ;   bits 10..5  = top 6 bits of source byte 1
   ;   bits  4..0  = top 5 bits of source byte 2
   ; In:    esi = source, edi = destination, ecx = pixel count
   ; Out:   esi advanced 4 bytes and edi 2 bytes per pixel;
   ;        leaves via jmp _mmxreturn
   ; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
   ;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        movq      mm5, [mmx32_rgb565_r]     ; mm5 = byte-2 mask
        movq      mm6, [mmx32_rgb565_g]     ; mm6 = byte-1 mask
        movq      mm7, [mmx32_rgb565_b]     ; mm7 = byte-0 mask

        mov       edx, ecx                  ; keep the full count for the tail
        shr       ecx, 2                    ; ecx = number of 4-pixel groups
        jz        near .tail

        ; Four pixels per pass, handled as two pairs.
.quad:
        ; -- pixels 0,1 --
        movq      mm0, [esi]
        movq      mm1, mm0
        pand      mm0, mm6                  ; byte-1 field at bits 15..10
        movq      mm3, mm1
        pand      mm1, mm5                  ; byte-2 field at bits 23..19
        pand      mm3, mm7                  ; byte-0 field at bits 7..3

        psllq     mm3, 16                   ; byte-0 field -> bits 23..19
        psrld     mm1, 14                   ; byte-2 field -> bits 9..5
        por       mm0, mm1                  ; bits 15..5 = byte 1 | byte 2
        psrld     mm0, 5                    ; -> bits 10..0

        ; -- pixels 2,3 --
        movq      mm4, [esi+8]
        movq      mm2, mm4
        pand      mm4, mm6
        movq      mm1, mm2
        pand      mm2, mm5
        pand      mm1, mm7

        psllq     mm1, 16
        psrld     mm2, 14
        por       mm4, mm2
        psrld     mm4, 5

        ; The shifted byte-0 fields sit in byte 2 of each dword, so packing
        ; words to bytes puts them in the high byte of each 16-bit result.
        packuswb  mm3, mm1
        packssdw  mm0, mm4                  ; four 11-bit values, one per word
        por       mm0, mm3
        movq      [edi], mm0

        add       esi, BYTE 16
        add       edi, BYTE 8
        dec       ecx
        jnz       .quad

.tail:
        and       edx, BYTE 3               ; edx itself counts the leftovers
        jz        .done

        ; One pixel per pass in integer registers.
.single:
        mov       al, [esi+2]               ; al = byte 2
        mov       bh, [esi+1]               ; bh = byte 1
        mov       ah, [esi]                 ; ah = byte 0
        shr       al, 3
        and       eax, 0F81Fh               ; keep bits 15..11 and 4..0
        shr       ebx, 5                    ; byte 1 -> bits 10..3
        and       ebx, 07E0h                ; keep bits 10..5
        add       eax, ebx
        mov       [edi], al
        mov       [edi+1], ah
        add       esi, BYTE 4
        add       edi, BYTE 2
        dec       edx
        jnz       .single

.done:
        jmp       _mmxreturn
   249 
   ;-----------------------------------------------------------------------
   ; _ConvertMMXpII32_16BGR555
   ; Entry stub: loads the BGR pmaddwd multiplier into mm7 and jumps into
   ; the body shared with the RGB555 converter at _convert_bgr555_cheat.
   ; The multiplier is the only thing this entry point sets up differently.
   ;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        movq      mm7, qword [mmx32_bgr555_mul]
        jmp       _convert_bgr555_cheat
   258 
   ;-----------------------------------------------------------------------
   ; _ConvertMMXpII32_16RGB555 / _convert_bgr555_cheat
   ; Converts 32-bit pixels to 15-bit pixels stored in 16-bit words.
   ;
   ; _ConvertMMXpII32_16RGB555 loads the RGB multiplier into mm7 and falls
   ; through to _convert_bgr555_cheat.  _ConvertMMXpII32_16BGR555 jumps to
   ; _convert_bgr555_cheat with mmx32_bgr555_mul already in mm7.
   ;
   ; Per pixel: (pixel & 00f800f8h) is multiplied word-wise by mm7 and summed
   ; (pmaddwd), OR-ed with (pixel & 0000f800h), then shifted right by 6.
   ;   rgb555_mul: byte 2 -> bits 14..10, byte 1 -> bits 9..5, byte 0 -> bits 4..0
   ;   bgr555_mul: byte 0 -> bits 14..10, byte 1 -> bits 9..5, byte 2 -> bits 4..0
   ;
   ; In:    esi = source, edi = destination, ecx = pixel count
   ;        (mm7 = multiplier when entered at _convert_bgr555_cheat)
   ; Out:   esi advanced 4 bytes and edi 2 bytes per pixel;
   ;        leaves via jmp _mmxreturn
   ; Clobb: eax, ecx, edx, mm0-mm7, flags
   ;
   ; The 8-pixel loop follows the Intel version.  The original author noted
   ; that Intel's more heavily unrolled loop is probably a little faster,
   ; and that the constant was renamed from 'mmx32_rgb555_add' to
   ; 'mmx32_rgb555_mul'.
   ;
   ; FIX: the leftover (count & 7) pixels used to be converted by an
   ; integer loop that hard-coded the RGB555 bit placement, so a BGR555
   ; conversion wrote its last 1..7 pixels with the byte-0 and byte-2
   ; fields swapped relative to the MMX loop.  The tail now uses the same
   ; pand/pmaddwd/psrld sequence with the multiplier in mm7, so both entry
   ; points get a consistent layout.  RGB555 output is bit-for-bit
   ; unchanged.
   ;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB555:

        movq      mm7, qword [mmx32_rgb555_mul]
_convert_bgr555_cheat:
        movq      mm6, qword [mmx32_rgb555_g]

        mov       edx, ecx                  ; keep the full count for the tail

        and       ecx, BYTE -8              ; = 0fffffff8h: count rounded down to 8
        jz        near .tail                ; fewer than 8 pixels: tail only

        ; Pipeline prologue: start work on pixels 0..3 of the first group.
        movq      mm2, [esi+8]

        movq      mm0, [esi]
        movq      mm3, mm2

        pand      mm3, qword [mmx32_rgb555_rb]
        movq      mm1, mm0

        pand      mm1, qword [mmx32_rgb555_rb]
        pmaddwd   mm3, mm7

        pmaddwd   mm1, mm7
        pand      mm2, mm6

        ; Eight pixels (32 bytes in, 16 bytes out) per pass.  Instructions
        ; are kept in their original interleaved order.
        ;
        ; NOTE(review): the loads of [esi] and [esi+8] after 'add esi,32'
        ; fetch the next group before ecx is tested, so the final pass
        ; reads 16 bytes beyond the last 8-pixel group it converts.  When
        ; fewer than 4 tail pixels follow, that is past the pixels counted
        ; by ecx - confirm that callers' source buffers tolerate this.
.oct:
        movq      mm4, [esi+24]
        pand      mm0, mm6

        movq      mm5, [esi+16]
        por       mm3, mm2

        psrld     mm3, 6
        por       mm1, mm0

        movq      mm0, mm4
        psrld     mm1, 6

        pand      mm0, qword [mmx32_rgb555_rb]
        packssdw  mm1, mm3                  ; pixels 0..3 as four words

        movq      mm3, mm5
        pmaddwd   mm0, mm7

        pand      mm3, qword [mmx32_rgb555_rb]
        pand      mm4, mm6

        movq      [edi], mm1
        pmaddwd   mm3, mm7

        add       esi, BYTE 32
        por       mm4, mm0

        pand      mm5, mm6
        psrld     mm4, 6

        movq      mm2, [esi+8]              ; read-ahead for the next pass
        por       mm5, mm3

        movq      mm0, [esi]                ; read-ahead for the next pass
        psrld     mm5, 6

        movq      mm3, mm2
        movq      mm1, mm0

        pand      mm3, qword [mmx32_rgb555_rb]
        packssdw  mm5, mm4                  ; pixels 4..7 as four words

        pand      mm1, qword [mmx32_rgb555_rb]
        pand      mm2, mm6

        movq      [edi+8], mm5
        pmaddwd   mm3, mm7

        pmaddwd   mm1, mm7
        add       edi, BYTE 16

        sub       ecx, BYTE 8
        jz        .tail
        jmp       .oct


.tail:
        mov       ecx, edx

        and       ecx, BYTE 7               ; 0..7 pixels left over
        jz        .done

        ; One pixel per pass, using the same arithmetic as the MMX loop so
        ; that the channel order follows the multiplier in mm7.
.single:
        movd      mm0, [esi]                ; mm0 = pixel, upper dword zero
        add       esi, BYTE 4

        movq      mm1, mm0
        pand      mm0, qword [mmx32_rgb555_rb]   ; top 5 bits of bytes 0 and 2

        pand      mm1, mm6                  ; top 5 bits of byte 1, bits 15..11
        pmaddwd   mm0, mm7                  ; move bytes 0/2 to bits 10..6 / 20..16

        por       mm0, mm1
        psrld     mm0, 6                    ; 15-bit result in bits 14..0

        movd      eax, mm0
        mov       [edi], ax
        add       edi, BYTE 2

        dec       ecx
        jnz       .single

.done:
        jmp       _mmxreturn
   384 
   385 
   386