src/hermes/mmxp2_32.asm
author Ryan C. Gordon
Wed, 29 Nov 2006 10:30:05 +0000
branchSDL-1.2
changeset 3900 ce3a2bd11305
parent 1873 eb4d9d99849b
child 2134 180fa05e98e2
permissions -rw-r--r--
Wrapped some macro params in parentheses for alloca wrappers.
Thanks, Suzuki Masahiro.
     1 ;
     2 ; pII-optimised MMX format converters for HERMES
     3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
     4 ;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
     5 ; This source code is licensed under the GNU LGPL
     6 ; 
     7 ; Please refer to the file COPYING.LIB contained in the distribution for
     8 ; licensing conditions		
     9 ;
    10 ; COPYRIGHT NOTICE
    11 ; 
    12 ; This file partly contains code that is (c) Intel Corporation, specifically
    13 ; the mode detection routine, and the converter to 15 bit (8 pixel
    14 ; conversion routine from the mmx programming tutorial pages).
    15 ;
    16 ;
    17 ; These routines aren't exactly pII optimised - it's just that as they
    18 ; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
    19 ; optimise them for p5 MMXs..
    20 
    21 BITS 32
    22 
    23 %include "common.inc"
    24 	
    25 SDL_FUNC _ConvertMMXpII32_24RGB888
    26 SDL_FUNC _ConvertMMXpII32_16RGB565
    27 SDL_FUNC _ConvertMMXpII32_16BGR565
    28 SDL_FUNC _ConvertMMXpII32_16RGB555
    29 SDL_FUNC _ConvertMMXpII32_16BGR555
    30 
    31 EXTERN _mmxreturn
    32  
    33 ;; Macros for conversion routines
    34 
    35 %macro _push_immq_mask 1
    36 	push dword %1
    37 	push dword %1
    38 %endmacro
    39 
    40 %macro load_immq 2
    41 	_push_immq_mask %2
    42 	movq %1, [esp]
    43 %endmacro
    44 
    45 %macro pand_immq 2
    46 	_push_immq_mask %2
    47 	pand %1, [esp]
    48 %endmacro
    49 
    50 %define CLEANUP_IMMQ_LOADS(num) \
    51 	add esp, byte 8 * num
    52 
    53 %define mmx32_rgb888_mask 00ffffffh
    54 %define mmx32_rgb565_b 000000f8h
    55 %define mmx32_rgb565_g 0000fc00h
    56 %define mmx32_rgb565_r 00f80000h
    57 
    58 %define mmx32_rgb555_rb 00f800f8h
    59 %define mmx32_rgb555_g 0000f800h
    60 %define mmx32_rgb555_mul 20000008h
    61 %define mmx32_bgr555_mul 00082000h
    62 
    63 SECTION .text
    64 
    65 _ConvertMMXpII32_24RGB888:
    66 
    67         ; set up mm6 as the mask, mm7 as zero
    68         load_immq mm6, mmx32_rgb888_mask
    69         CLEANUP_IMMQ_LOADS(1)
    70         pxor mm7, mm7
    71 
    72         mov edx, ecx                    ; save ecx
    73         and ecx, 0fffffffch             ; clear lower two bits
    74         jnz .L1
    75         jmp .L2
    76 
    77 .L1:
    78 
    79         movq mm0, [esi]                 ; A R G B a r g b
    80         pand mm0, mm6                   ; 0 R G B 0 r g b
    81         movq mm1, [esi+8]               ; A R G B a r g b
    82         pand mm1, mm6                   ; 0 R G B 0 r g b
    83 
    84         movq mm2, mm0                   ; 0 R G B 0 r g b
    85         punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
    86         punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
    87         psllq mm2, 24                   ; 0 0 R G B 0 0 0
    88         por mm0, mm2                    ; 0 0 R G B r g b
    89 
    90         movq mm3, mm1                   ; 0 R G B 0 r g b
    91         psllq mm3, 48                   ; g b 0 0 0 0 0 0
    92         por mm0, mm3                    ; g b R G B r g b
    93 
    94         movq mm4, mm1                   ; 0 R G B 0 r g b
    95         punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
    96         punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
    97         psrlq mm1, 16                   ; 0 0 0 R G B 0 r
    98         psllq mm4, 8                    ; 0 0 0 0 R G B 0
    99         por mm1, mm4                    ; 0 0 0 0 R G B r
   100 
   101         movq [edi], mm0
   102         add esi, BYTE 16
   103         movd [edi+8], mm1
   104         add edi, BYTE 12
   105         sub ecx, BYTE 4
   106         jnz .L1
   107 
   108 .L2:
   109         mov ecx, edx
   110         and ecx, BYTE 3
   111         jz .L4
   112 .L3:
   113         mov al, [esi]
   114         mov bl, [esi+1]
   115         mov dl, [esi+2]
   116         mov [edi], al
   117         mov [edi+1], bl
   118         mov [edi+2], dl
   119         add esi, BYTE 4
   120         add edi, BYTE 3
   121         dec ecx
   122         jnz .L3
   123 .L4:
   124         jmp _mmxreturn
   125 
   126 
   127 
   128 _ConvertMMXpII32_16RGB565:
   129 
   130         ; set up masks
   131         load_immq mm5, mmx32_rgb565_b
   132         load_immq mm6, mmx32_rgb565_g
   133         load_immq mm7, mmx32_rgb565_r
   134         CLEANUP_IMMQ_LOADS(3)
   135 
   136         mov edx, ecx
   137         shr ecx, 2
   138         jnz .L1
   139         jmp .L2         ; not necessary at the moment, but doesn't hurt (much)
   140 
   141 .L1:
   142         movq mm0, [esi]         ; argb
   143         movq mm1, mm0           ; argb
   144         pand mm0, mm6           ; 00g0
   145         movq mm3, mm1           ; argb
   146         pand mm1, mm5           ; 000b
   147         pand mm3, mm7           ; 0r00
   148         pslld mm1, 2            ; 0 0 000000bb bbb00000
   149         por mm0, mm1            ; 0 0 ggggggbb bbb00000
   150         psrld mm0, 5            ; 0 0 00000ggg gggbbbbb
   151 
   152         movq mm4, [esi+8]       ; argb
   153         movq mm2, mm4           ; argb
   154         pand mm4, mm6           ; 00g0
   155         movq mm1, mm2           ; argb
   156         pand mm2, mm5           ; 000b
   157         pand mm1, mm7           ; 0r00
   158         pslld mm2, 2            ; 0 0 000000bb bbb00000
   159         por mm4, mm2            ; 0 0 ggggggbb bbb00000
   160         psrld mm4, 5            ; 0 0 00000ggg gggbbbbb
   161 
   162         packuswb mm3, mm1       ; R 0 r 0
   163         packssdw mm0, mm4       ; as above.. ish
   164         por mm0, mm3            ; done.
   165         movq [edi], mm0
   166 
   167         add esi, 16
   168         add edi, 8
   169         dec ecx
   170         jnz .L1
   171 
   172 .L2:
   173         mov ecx, edx
   174         and ecx, BYTE 3
   175         jz .L4
   176 .L3:
   177         mov al, [esi]
   178         mov bh, [esi+1]
   179         mov ah, [esi+2]
   180         shr al, 3
   181         and eax, 0F81Fh            ; BYTE?
   182         shr ebx, 5
   183         and ebx, 07E0h             ; BYTE?
   184         add eax, ebx
   185         mov [edi], al
   186         mov [edi+1], ah
   187         add esi, BYTE 4
   188         add edi, BYTE 2
   189         dec ecx
   190         jnz .L3
   191 
   192 .L4:
   193 	jmp _mmxreturn
   194 
   195 	
   196 _ConvertMMXpII32_16BGR565:
   197 
   198         load_immq mm5, mmx32_rgb565_r
   199         load_immq mm6, mmx32_rgb565_g
   200         load_immq mm7, mmx32_rgb565_b
   201         CLEANUP_IMMQ_LOADS(3)
   202 
   203         mov edx, ecx
   204         shr ecx, 2
   205         jnz .L1
   206         jmp .L2
   207 
   208 .L1:
   209         movq mm0, [esi]                 ; a r g b
   210         movq mm1, mm0                   ; a r g b
   211         pand mm0, mm6                   ; 0 0 g 0
   212         movq mm3, mm1                   ; a r g b
   213         pand mm1, mm5                   ; 0 r 0 0
   214         pand mm3, mm7                   ; 0 0 0 b
   215 
   216         psllq mm3, 16                   ; 0 b 0 0
   217         psrld mm1, 14                   ; 0 0 000000rr rrr00000
   218         por mm0, mm1                    ; 0 0 ggggggrr rrr00000
   219         psrld mm0, 5                    ; 0 0 00000ggg gggrrrrr
   220 
   221         movq mm4, [esi+8]               ; a r g b
   222         movq mm2, mm4                   ; a r g b
   223         pand mm4, mm6                   ; 0 0 g 0
   224         movq mm1, mm2                   ; a r g b
   225         pand mm2, mm5                   ; 0 r 0 0
   226         pand mm1, mm7                   ; 0 0 0 b
   227 
   228         psllq mm1, 16                   ; 0 b 0 0
   229         psrld mm2, 14                   ; 0 0 000000rr rrr00000
   230         por mm4, mm2                    ; 0 0 ggggggrr rrr00000
   231         psrld mm4, 5                    ; 0 0 00000ggg gggrrrrr
   232 
   233         packuswb mm3, mm1               ; BBBBB000 00000000 bbbbb000 00000000
   234         packssdw mm0, mm4               ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
   235         por mm0, mm3                    ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
   236         movq [edi], mm0
   237 
   238         add esi, BYTE 16
   239         add edi, BYTE 8
   240         dec ecx
   241         jnz .L1
   242 
   243 .L2:
   244         and edx, BYTE 3
   245         jz .L4
   246 .L3:
   247         mov al, [esi+2]
   248         mov bh, [esi+1]
   249         mov ah, [esi]
   250         shr al, 3
   251         and eax, 0F81Fh                    ; BYTE ?
   252         shr ebx, 5
   253         and ebx, 07E0h                     ; BYTE ?
   254         add eax, ebx
   255         mov [edi], al
   256         mov [edi+1], ah
   257         add esi, BYTE 4
   258         add edi, BYTE 2
   259         dec edx
   260         jnz .L3
   261 
   262 .L4:
   263         jmp _mmxreturn
   264 
   265 _ConvertMMXpII32_16BGR555:
   266 
   267         ; the 16BGR555 converter is identical to the RGB555 one,
   268         ; except it uses a different multiplier for the pmaddwd
   269         ; instruction.  cool huh.
   270 
   271         load_immq mm7, mmx32_bgr555_mul
   272         jmp _convert_bgr555_cheat
   273 
   274 ; This is the same as the Intel version.. they obviously went to
   275 ; much more trouble to expand/coil the loop than I did, so theirs
   276 ; would almost certainly be faster, even if only a little.
   277 ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
   278 ; (I think) a more accurate name..
   279 _ConvertMMXpII32_16RGB555:
   280 
   281 	load_immq mm7, mmx32_rgb555_mul
   282 _convert_bgr555_cheat:
   283 	load_immq mm6, mmx32_rgb555_g
   284 	CLEANUP_IMMQ_LOADS(2)
   285         
   286 	mov edx,ecx		           ; Save ecx 
   287 
   288         and ecx,BYTE 0fffffff8h            ; clear lower three bits
   289 	jnz .L_OK
   290         jmp near .L2 
   291 
   292 .L_OK:
   293 	
   294 	movq mm2,[esi+8]
   295 
   296 	movq mm0,[esi]
   297 	movq mm3,mm2
   298 
   299 	pand_immq mm3, mmx32_rgb555_rb
   300 	movq mm1,mm0
   301 
   302 	pand_immq mm1, mmx32_rgb555_rb
   303 	pmaddwd mm3,mm7
   304 
   305 	CLEANUP_IMMQ_LOADS(2)
   306 
   307 	pmaddwd mm1,mm7
   308 	pand mm2,mm6
   309 
   310 .L1:
   311 	movq mm4,[esi+24]
   312 	pand mm0,mm6
   313 
   314 	movq mm5,[esi+16]
   315 	por mm3,mm2
   316 
   317 	psrld mm3,6
   318 	por mm1,mm0
   319 
   320 	movq mm0,mm4
   321 	psrld mm1,6
   322 
   323 	pand_immq mm0, mmx32_rgb555_rb
   324 	packssdw mm1,mm3
   325 
   326 	movq mm3,mm5
   327 	pmaddwd mm0,mm7
   328 
   329 	pand_immq mm3, mmx32_rgb555_rb
   330 	pand mm4,mm6
   331 
   332 	movq [edi],mm1			
   333 	pmaddwd mm3,mm7
   334 
   335         add esi,BYTE 32
   336 	por mm4,mm0
   337 
   338 	pand mm5,mm6
   339 	psrld mm4,6
   340 
   341 	movq mm2,[esi+8]
   342 	por mm5,mm3
   343 
   344 	movq mm0,[esi]
   345 	psrld mm5,6
   346 
   347 	movq mm3,mm2
   348 	movq mm1,mm0
   349 
   350 	pand_immq mm3, mmx32_rgb555_rb
   351 	packssdw mm5,mm4
   352 
   353 	pand_immq mm1, mmx32_rgb555_rb
   354 	pand mm2,mm6
   355 
   356 	CLEANUP_IMMQ_LOADS(4)
   357 
   358 	movq [edi+8],mm5
   359 	pmaddwd mm3,mm7
   360 
   361 	pmaddwd mm1,mm7
   362         add edi,BYTE 16
   363 	
   364         sub ecx,BYTE 8
   365 	jz .L2
   366         jmp .L1
   367 
   368 
   369 .L2:	
   370 	mov ecx,edx
   371 	
   372         and ecx,BYTE 7
   373 	jz .L4
   374 	
   375 .L3:	
   376 	mov ebx,[esi]
   377         add esi,BYTE 4
   378 	
   379         mov eax,ebx
   380         mov edx,ebx
   381 
   382         shr eax,3
   383         shr edx,6
   384 
   385         and eax,BYTE 0000000000011111b
   386         and edx,     0000001111100000b
   387 
   388         shr ebx,9
   389 
   390         or eax,edx
   391 
   392         and ebx,     0111110000000000b
   393 
   394         or eax,ebx
   395 
   396         mov [edi],ax
   397         add edi,BYTE 2
   398 
   399 	dec ecx
   400 	jnz .L3	
   401 
   402 .L4:		
   403 	jmp _mmxreturn
   404 
   405 %ifidn __OUTPUT_FORMAT__,elf
   406 section .note.GNU-stack noalloc noexec nowrite progbits
   407 %endif