src/hermes/mmxp2_32.asm
author Sam Lantinga
Mon, 06 Feb 2006 08:28:51 +0000
changeset 1330 450721ad5436
parent 1230 88c2d6aed428
child 1697 393092a3ebf6
permissions -rw-r--r--
It's now possible to build SDL without any C runtime at all on Windows,
using Visual C++ 2005
     1 ;
     2 ; pII-optimised MMX format converters for HERMES
     3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
     4 ;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
     5 ; This source code is licensed under the GNU LGPL
     6 ; 
     7 ; Please refer to the file COPYING.LIB contained in the distribution for
     8 ; licensing conditions		
     9 ;
    10 ; COPYRIGHT NOTICE
    11 ; 
    12 ; This file partly contains code that is (c) Intel Corporation, specifically
    13 ; the mode detection routine, and the converter to 15 bit (8 pixel
    14 ; conversion routine from the mmx programming tutorial pages).
    15 ;
    16 ;
    17 ; These routines aren't exactly pII optimised - it's just that as they
    18 ; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
    19 ; optimise them for p5 MMXs..
    20 
    21 BITS 32
    22 
    23 	
    24 GLOBAL _ConvertMMXpII32_24RGB888
    25 GLOBAL _ConvertMMXpII32_16RGB565
    26 GLOBAL _ConvertMMXpII32_16BGR565
    27 GLOBAL _ConvertMMXpII32_16RGB555
    28 GLOBAL _ConvertMMXpII32_16BGR555
    29 
    30 EXTERN _mmxreturn
    31  
    32 ;; Macros for conversion routines
    33 ; _push_immq_mask imm32 -- push imm32 twice so [esp] holds it as a 64-bit pair (esp drops by 8).
    34 %macro _push_immq_mask 1
    35 	push dword %1                   ; becomes the upper dword of the quadword
    36 	push dword %1                   ; lower dword; the quadword now starts at [esp]
    37 %endmacro
    38 ; load_immq mmreg, imm32 -- mmreg = imm32 in both dwords. Leaves 8 bytes on the stack.
    39 %macro load_immq 2
    40 	_push_immq_mask %2
    41 	movq %1, [esp]                  ; read back the quadword just pushed
    42 %endmacro
    43 ; pand_immq mmreg, imm32 -- mmreg &= imm32 in both dwords. Leaves 8 bytes on the stack.
    44 %macro pand_immq 2
    45 	_push_immq_mask %2
    46 	pand %1, [esp]                  ; AND with the quadword just pushed
    47 %endmacro
    48 ; CLEANUP_IMMQ_LOADS(n) -- pop the 8*n bytes left by n load_immq/pand_immq uses; n may be an expression.
    49 %define CLEANUP_IMMQ_LOADS(num) \
    50 	add esp, byte 8 * (num)
    51 ; Per-pixel dword constants; load_immq/pand_immq replicate them across two pixels.
    52 %define mmx32_rgb888_mask 00ffffffh   ; low three bytes of a pixel
    53 %define mmx32_rgb565_b 000000f8h      ; top 5 bits of byte 0
    54 %define mmx32_rgb565_g 0000fc00h      ; top 6 bits of byte 1
    55 %define mmx32_rgb565_r 00f80000h      ; top 5 bits of byte 2
    56 ; 555 conversion: masks plus the pmaddwd word multipliers (high word : low word).
    57 %define mmx32_rgb555_rb 00f800f8h     ; top 5 bits of bytes 0 and 2
    58 %define mmx32_rgb555_g 0000f800h      ; top 5 bits of byte 1
    59 %define mmx32_rgb555_mul 20000008h    ; 2000h : 0008h
    60 %define mmx32_bgr555_mul 00082000h    ; 0008h : 2000h (the two factors swapped)
    61 
    62 SECTION .text
    63 
    64 _ConvertMMXpII32_24RGB888:
    65 ; Copy ecx pixels from [esi] (4 bytes each) to [edi] (3 bytes each), dropping byte 3.
    66 ; Four pixels per MMX pass, then up to three singly; leaves through _mmxreturn.
    67         pxor      mm7, mm7                  ; mm7 = 0, feeds zeros to the unpacks
    68         load_immq mm6, mmx32_rgb888_mask    ; mm6 = low-3-bytes mask for two pixels
    69         CLEANUP_IMMQ_LOADS(1)
    70
    71         mov       edx, ecx                  ; keep the full count for the tail
    72         and       ecx, 0fffffffch           ; ecx = count rounded down to a multiple of 4
    73         jz near   .tail                     ; nothing for the MMX loop to do
    74
    75         ; Byte diagrams list the most significant byte first.
    76 .quad:
    77         movq      mm0, [esi]                ; A R G B a r g b   pixels 1:0
    78         movq      mm1, [esi+8]              ; A R G B a r g b   pixels 3:2
    79         pand      mm0, mm6                  ; 0 R G B 0 r g b
    80         pand      mm1, mm6                  ; 0 R G B 0 r g b
    81
    82         movq      mm2, mm0                  ; will become pixel 1
    83         movq      mm3, mm1                  ; will supply the low two bytes of pixel 2
    84         movq      mm4, mm1                  ; will become pixel 3
    85
    86         punpckldq mm0, mm7                  ; 0 0 0 0 0 r g b   pixel 0
    87         punpckhdq mm2, mm7                  ; 0 0 0 0 0 R G B   pixel 1
    88         punpckldq mm1, mm7                  ; 0 0 0 0 0 r g b   pixel 2
    89         punpckhdq mm4, mm7                  ; 0 0 0 0 0 R G B   pixel 3
    90
    91         psllq     mm2, 24                   ; 0 0 R G B 0 0 0
    92         psllq     mm3, 48                   ; g b 0 0 0 0 0 0
    93         psrlq     mm1, 16                   ; 0 0 0 0 0 0 0 r
    94         psllq     mm4, 8                    ; 0 0 0 0 R G B 0
    95
    96         por       mm0, mm2                  ; 0 0 R G B r g b
    97         por       mm0, mm3                  ; g b R G B r g b   output bytes 0-7
    98         por       mm1, mm4                  ; 0 0 0 0 R G B r   output bytes 8-11
    99
   100         movq      [edi], mm0
   101         movd      [edi+8], mm1
   102         add       esi, BYTE 16
   103         add       edi, BYTE 12
   104         sub       ecx, BYTE 4
   105         jnz       .quad
   106
   107 .tail:
   108         mov       ecx, edx
   109         and       ecx, BYTE 3               ; 0-3 pixels left over
   110         jz        .done
   111 .single:
   112         mov       al, [esi]                 ; read all three bytes before storing any
   113         mov       bl, [esi+1]
   114         mov       dl, [esi+2]
   115         mov       [edi], al
   116         mov       [edi+1], bl
   117         mov       [edi+2], dl
   118         lea       esi, [esi+4]              ; step over the unused fourth source byte
   119         lea       edi, [edi+3]
   120         dec       ecx
   121         jnz       .single
   122 .done:
   123         jmp       _mmxreturn
   124 
   125 
   126 
   127 _ConvertMMXpII32_16RGB565:
   128 ; Pack ecx 4-byte pixels from [esi] into 16-bit words at [edi]:
   129 ; byte 2 -> bits 15-11, byte 1 -> bits 10-5, byte 0 -> bits 4-0. Leaves through _mmxreturn.
   130         load_immq mm5, mmx32_rgb565_b
   131         load_immq mm6, mmx32_rgb565_g
   132         load_immq mm7, mmx32_rgb565_r
   133         CLEANUP_IMMQ_LOADS(3)
   134
   135         mov       edx, ecx                  ; keep the full count for the tail
   136         shr       ecx, 2                    ; ecx = number of 4-pixel groups
   137         jz near   .tail
   138
   139         ; Each pass packs pixels 0-1 into mm0 and pixels 2-3 into mm4.
   140 .quad:
   141         movq      mm0, [esi]                ; a r g b   pixels 1:0
   142         movq      mm4, [esi+8]              ; a r g b   pixels 3:2
   143         movq      mm1, mm0
   144         movq      mm3, mm0
   145         movq      mm2, mm4
   146
   147         pand      mm1, mm5                  ; 0 0 0 b
   148         pand      mm0, mm6                  ; 0 0 g 0
   149         pand      mm3, mm7                  ; 0 r 0 0
   150         pslld     mm1, 2                    ; 0 0 000000bb bbb00000
   151         por       mm0, mm1                  ; 0 0 ggggggbb bbb00000
   152         psrld     mm0, 5                    ; 0 0 00000ggg gggbbbbb
   153
   154         movq      mm1, mm4                  ; mm1 is free again; mm4 is still unmasked
   155         pand      mm2, mm5                  ; 0 0 0 b
   156         pand      mm4, mm6                  ; 0 0 g 0
   157         pand      mm1, mm7                  ; 0 r 0 0
   158         pslld     mm2, 2                    ; 0 0 000000bb bbb00000
   159         por       mm4, mm2                  ; 0 0 ggggggbb bbb00000
   160         psrld     mm4, 5                    ; 0 0 00000ggg gggbbbbb
   161
   162         packuswb  mm3, mm1                  ; each word: rrrrr000 00000000
   163         packssdw  mm0, mm4                  ; each word: 00000ggg gggbbbbb
   164         por       mm0, mm3                  ; four finished pixels
   165         movq      [edi], mm0
   166         add       esi, 16
   167         add       edi, 8
   168         dec       ecx
   169         jnz       .quad
   170
   171 .tail:
   172         mov       ecx, edx
   173         and       ecx, BYTE 3               ; 0-3 pixels left over
   174         jz        .done
   175 .single:
   176         mov       al, [esi]                 ; byte 0
   177         mov       bh, [esi+1]               ; byte 1 into bits 15-8 of ebx
   178         mov       ah, [esi+2]               ; byte 2
   179         shr       al, 3
   180         and       eax, 0F81Fh               ; rrrrr000 000bbbbb
   181         shr       ebx, 5
   182         and       ebx, 07E0h                ; 00000ggg ggg00000
   183         or        eax, ebx                  ; the two fields share no bits
   184         mov       [edi], ax
   185
   186         add       esi, BYTE 4
   187         add       edi, BYTE 2
   188         dec       ecx
   189         jnz       .single
   190
   191 .done:
   192         jmp       _mmxreturn
   193 
   194 	
   195 _ConvertMMXpII32_16BGR565:
   196 ; Pack ecx 4-byte pixels from [esi] into 16-bit words at [edi]:
   197 ; byte 0 -> bits 15-11, byte 1 -> bits 10-5, byte 2 -> bits 4-0. Leaves through _mmxreturn.
   198         load_immq mm5, mmx32_rgb565_r
   199         load_immq mm6, mmx32_rgb565_g
   200         load_immq mm7, mmx32_rgb565_b
   201         CLEANUP_IMMQ_LOADS(3)
   202         mov       edx, ecx                  ; keep the full count for the tail
   203         shr       ecx, 2                    ; ecx = number of 4-pixel groups
   204         jz near   .tail
   205
   206         ; Each pass packs pixels 0-1 into mm0 and pixels 2-3 into mm4.
   207 .quad:
   208         movq      mm0, [esi]                ; a r g b   pixels 1:0
   209         movq      mm4, [esi+8]              ; a r g b   pixels 3:2
   210         movq      mm1, mm0
   211         movq      mm3, mm0
   212         movq      mm2, mm4
   213
   214         pand      mm1, mm5                  ; 0 r 0 0
   215         pand      mm0, mm6                  ; 0 0 g 0
   216         pand      mm3, mm7                  ; 0 0 0 b
   217         psllq     mm3, 16                   ; 0 b 0 0
   218         psrld     mm1, 14                   ; 0 0 000000rr rrr00000
   219         por       mm0, mm1                  ; 0 0 ggggggrr rrr00000
   220         psrld     mm0, 5                    ; 0 0 00000ggg gggrrrrr
   221
   222         movq      mm1, mm4                  ; mm1 is free again; mm4 is still unmasked
   223         pand      mm2, mm5                  ; 0 r 0 0
   224         pand      mm4, mm6                  ; 0 0 g 0
   225         pand      mm1, mm7                  ; 0 0 0 b
   226         psllq     mm1, 16                   ; 0 b 0 0
   227         psrld     mm2, 14                   ; 0 0 000000rr rrr00000
   228         por       mm4, mm2                  ; 0 0 ggggggrr rrr00000
   229         psrld     mm4, 5                    ; 0 0 00000ggg gggrrrrr
   230
   231         packuswb  mm3, mm1                  ; each word: bbbbb000 00000000
   232         packssdw  mm0, mm4                  ; each word: 00000ggg gggrrrrr
   233         por       mm0, mm3                  ; four finished pixels
   234         movq      [edi], mm0
   235
   236         add       esi, BYTE 16
   237         add       edi, BYTE 8
   238         dec       ecx
   239         jnz       .quad
   240
   241
   242 .tail:
   243         and       edx, BYTE 3               ; edx itself counts the 0-3 leftover pixels
   244         jz        .done
   245 .single:
   246         mov       al, [esi+2]               ; byte 2
   247         mov       bh, [esi+1]               ; byte 1 into bits 15-8 of ebx
   248         mov       ah, [esi]                 ; byte 0
   249         shr       al, 3
   250         and       eax, 0F81Fh               ; bbbbb000 000rrrrr
   251         shr       ebx, 5
   252         and       ebx, 07E0h                ; 00000ggg ggg00000
   253         or        eax, ebx                  ; the two fields share no bits
   254         mov       [edi], ax
   255
   256         add       esi, BYTE 4
   257         add       edi, BYTE 2
   258         dec       edx
   259         jnz       .single
   260
   261 .done:
   262         jmp       _mmxreturn
   263 
   264 _ConvertMMXpII32_16BGR555:
   265 ; Pack ecx 4-byte pixels from [esi] into 15-bit words at [edi]; leaves through _mmxreturn.
   266 ; Shares all code with the RGB555 converter below; only the pmaddwd multiplier
   267 ; in mm7 differs, which swaps where bytes 0 and 2 of the pixel end up.
   268 ; Here: byte 0 -> bits 14-10, byte 1 -> bits 9-5, byte 2 -> bits 4-0.
   269
   270         load_immq mm7, mmx32_bgr555_mul
   271         jmp _convert_bgr555_cheat          ; the shared code releases this load's stack bytes
   272
   273 ; This is the same as the Intel version.. they obviously went to
   274 ; much more trouble to expand/coil the loop than I did, so theirs
   275 ; would almost certainly be faster, even if only a little.
   276 ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
   277 ; (I think) a more accurate name..
   278 _ConvertMMXpII32_16RGB555:                 ; byte 2 -> bits 14-10, byte 1 -> bits 9-5, byte 0 -> bits 4-0
   279
   280 	load_immq mm7, mmx32_rgb555_mul
   281 _convert_bgr555_cheat:
   282 	load_immq mm6, mmx32_rgb555_g
   283 	CLEANUP_IMMQ_LOADS(2)              ; one for the mm7 load, one for the mm6 load
   284
   285 	mov edx,ecx		           ; Save ecx
   286
   287         and ecx,BYTE 0fffffff8h            ; clear lower three bits
   288 	jnz .L_OK
   289         jmp near .L2
   290
   291 .L_OK:
   292 	; Prime the loop: begin converting the first four pixels.
   293 	movq mm2,[esi+8]
   294
   295 	movq mm0,[esi]
   296 	movq mm3,mm2
   297
   298 	pand_immq mm3, mmx32_rgb555_rb
   299 	movq mm1,mm0
   300
   301 	pand_immq mm1, mmx32_rgb555_rb
   302 	pmaddwd mm3,mm7
   303
   304 	CLEANUP_IMMQ_LOADS(2)
   305
   306 	pmaddwd mm1,mm7
   307 	pand mm2,mm6
   308 ; Per pass: finish the 4 pixels already in flight, convert 4 more, start on the next 4.
   309 .L1:
   310 	movq mm4,[esi+24]
   311 	pand mm0,mm6
   312
   313 	movq mm5,[esi+16]
   314 	por mm3,mm2
   315
   316 	psrld mm3,6
   317 	por mm1,mm0
   318
   319 	movq mm0,mm4
   320 	psrld mm1,6
   321
   322 	pand_immq mm0, mmx32_rgb555_rb
   323 	packssdw mm1,mm3
   324
   325 	movq mm3,mm5
   326 	pmaddwd mm0,mm7
   327
   328 	pand_immq mm3, mmx32_rgb555_rb
   329 	pand mm4,mm6
   330
   331 	movq [edi],mm1
   332 	pmaddwd mm3,mm7
   333
   334         add esi,BYTE 32
   335 	por mm4,mm0
   336
   337 	pand mm5,mm6
   338 	psrld mm4,6
   339 ; NOTE(review): the two loads from [esi+8] and [esi] below also run on the final pass, reading 16 bytes past the last pixel this loop converts.
   340 	movq mm2,[esi+8]
   341 	por mm5,mm3
   342
   343 	movq mm0,[esi]
   344 	psrld mm5,6
   345
   346 	movq mm3,mm2
   347 	movq mm1,mm0
   348
   349 	pand_immq mm3, mmx32_rgb555_rb
   350 	packssdw mm5,mm4
   351
   352 	pand_immq mm1, mmx32_rgb555_rb
   353 	pand mm2,mm6
   354
   355 	CLEANUP_IMMQ_LOADS(4)              ; the four pand_immq uses in this pass
   356
   357 	movq [edi+8],mm5
   358 	pmaddwd mm3,mm7
   359
   360 	pmaddwd mm1,mm7
   361         add edi,BYTE 16
   362
   363         sub ecx,BYTE 8
   364 	jz .L2
   365         jmp .L1
   366
   367 ; Leftover pixels go through the same mask/multiply/shift as the loop, one at a time.
   368 .L2:
   369 	mov ecx,edx
   370
   371         and ecx,BYTE 7
   372 	jz .L4
   373
   374 .L3:
   375 	movd mm0,[esi]                     ; one pixel in the low dword
   376         add esi,BYTE 4
   377
   378         movq mm1,mm0
   379         pand_immq mm1, mmx32_rgb555_rb     ; bytes 0 and 2, top 5 bits each
   380
   381         CLEANUP_IMMQ_LOADS(1)
   382         pmaddwd mm1,mm7                    ; mm7 decides which byte goes high and which low
   383
   384         pand mm0,mm6                       ; byte 1, top 5 bits
   385         por mm1,mm0
   386
   387         psrld mm1,6                        ; finished pixel in bits 14-0
   388
   389         movd eax,mm1
   390
   391         ; Done with MMX rather than fixed integer shifts: a fixed shift
   392         ; sequence could only produce one of the two channel orders, while
   393         ; mm7 carries the order chosen by the entry point.
   394
   395         mov [edi],ax
   396         add edi,BYTE 2
   397
   398 	dec ecx
   399 	jnz .L3
   400
   401 .L4:
   402 	jmp _mmxreturn
   403 ; ELF output only: declare a .note.GNU-stack section carrying the noexec attribute.
   404 %ifidn __OUTPUT_FORMAT__,elf
   405 section .note.GNU-stack noalloc noexec nowrite progbits ; NOTE(review): the test above matches only the exact name "elf" -- confirm elf32 builds are covered
   406 %endif