src/hermes/x86p_32.asm
author Sam Lantinga
Mon, 06 Feb 2006 08:28:51 +0000
changeset 1330 450721ad5436
parent 1199 2d6dc7de1145
child 1697 393092a3ebf6
permissions -rw-r--r--
It's now possible to build SDL without any C runtime at all on Windows,
using Visual C++ 2005
     1 ;
     2 ; x86 format converters for HERMES
     3 ; Some routines Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
     4 ; This source code is licensed under the GNU LGPL
     5 ; 
     6 ; Please refer to the file COPYING.LIB contained in the distribution for
     7 ; licensing conditions		
     8 ;
     9 ; Most routines are (c) Glenn Fiedler (ptc@gaffer.org), used with permission
    10 ; 
    11 
    12 	
    13 BITS 32
    14 
    15 GLOBAL _ConvertX86p32_32BGR888
    16 GLOBAL _ConvertX86p32_32RGBA888
    17 GLOBAL _ConvertX86p32_32BGRA888
    18 GLOBAL _ConvertX86p32_24RGB888	
    19 GLOBAL _ConvertX86p32_24BGR888
    20 GLOBAL _ConvertX86p32_16RGB565
    21 GLOBAL _ConvertX86p32_16BGR565
    22 GLOBAL _ConvertX86p32_16RGB555
    23 GLOBAL _ConvertX86p32_16BGR555
    24 GLOBAL _ConvertX86p32_8RGB332
    25 
    26 EXTERN _x86return
    27 	
    28 SECTION .text
    29 
    30 ;; _Convert_*
    31 ;; Parameters:
    32 ;;   ESI = source 
    33 ;;   EDI = dest
    34 ;;   ECX = amount (NOT 0!!! (the _ConvertX86 routine checks for that though))
    35 ;; Destroys:
    36 ;;   EAX, EBX, EDX
    37 
    38 
    39 _ConvertX86p32_32BGR888:
    40 
    41     ; check short
    42     cmp ecx,BYTE 32
    43     ja .L3
    44 
    45 .L1 ; short loop
    46     mov edx,[esi]
    47     bswap edx
    48     ror edx,8
    49     mov [edi],edx
    50     add esi,BYTE 4
    51     add edi,BYTE 4
    52     dec ecx
    53     jnz .L1
    54 .L2
    55     jmp _x86return
    56 
    57 .L3 ; save ebp
    58     push ebp
    59 
    60     ; unroll four times
    61     mov ebp,ecx
    62     shr ebp,2
    63     
    64     ; save count
    65     push ecx
    66 
    67 .L4     mov eax,[esi]
    68         mov ebx,[esi+4]
    69 
    70         bswap eax
    71 
    72         bswap ebx
    73 
    74         ror eax,8
    75         mov ecx,[esi+8]
    76 
    77         ror ebx,8
    78         mov edx,[esi+12]
    79 
    80         bswap ecx
    81 
    82         bswap edx
    83 
    84         ror ecx,8
    85         mov [edi+0],eax
    86 
    87         ror edx,8
    88         mov [edi+4],ebx
    89 
    90         mov [edi+8],ecx
    91         mov [edi+12],edx
    92 
    93         add esi,BYTE 16
    94         add edi,BYTE 16
    95 
    96         dec ebp
    97         jnz .L4                 
    98 
    99     ; check tail
   100     pop ecx
   101     and ecx,BYTE 11b
   102     jz .L6
   103 
   104 .L5 ; tail loop
   105     mov edx,[esi]
   106     bswap edx
   107     ror edx,8
   108     mov [edi],edx
   109     add esi,BYTE 4
   110     add edi,BYTE 4
   111     dec ecx
   112     jnz .L5
   113 
   114 .L6 pop ebp
   115     jmp _x86return
   116 	
   117 
   118 	
   119 		
   120 _ConvertX86p32_32RGBA888:
   121 	
   122     ; check short
   123     cmp ecx,BYTE 32
   124     ja .L3
   125 
   126 .L1 ; short loop
   127     mov edx,[esi]
   128     rol edx,8
   129     mov [edi],edx
   130     add esi,BYTE 4
   131     add edi,BYTE 4
   132     dec ecx
   133     jnz .L1
   134 .L2
   135     jmp _x86return
   136 
   137 .L3 ; save ebp
   138     push ebp
   139 
   140     ; unroll four times
   141     mov ebp,ecx
   142     shr ebp,2
   143     
   144     ; save count
   145     push ecx
   146 
   147 .L4     mov eax,[esi]
   148         mov ebx,[esi+4]
   149 
   150         rol eax,8
   151         mov ecx,[esi+8]
   152 
   153         rol ebx,8
   154         mov edx,[esi+12]
   155 
   156         rol ecx,8
   157         mov [edi+0],eax
   158 
   159         rol edx,8
   160         mov [edi+4],ebx
   161 
   162         mov [edi+8],ecx
   163         mov [edi+12],edx
   164 
   165         add esi,BYTE 16
   166         add edi,BYTE 16
   167 
   168         dec ebp
   169         jnz .L4                 
   170 
   171     ; check tail
   172     pop ecx
   173     and ecx,BYTE 11b
   174     jz .L6
   175 
   176 .L5 ; tail loop
   177     mov edx,[esi]
   178     rol edx,8
   179     mov [edi],edx
   180     add esi,BYTE 4
   181     add edi,BYTE 4
   182     dec ecx
   183     jnz .L5
   184 
   185 .L6 pop ebp
   186     jmp _x86return
   187 
   188 	
   189 
   190 
   191 _ConvertX86p32_32BGRA888:
   192 
   193     ; check short
   194     cmp ecx,BYTE 32
   195     ja .L3
   196 
   197 .L1 ; short loop
   198     mov edx,[esi]
   199     bswap edx
   200     mov [edi],edx
   201     add esi,BYTE 4
   202     add edi,BYTE 4
   203     dec ecx
   204     jnz .L1
   205 .L2
   206     jmp _x86return
   207 
   208 .L3 ; save ebp
   209     push ebp
   210 
   211     ; unroll four times
   212     mov ebp,ecx
   213     shr ebp,2
   214     
   215     ; save count
   216     push ecx
   217 
   218 .L4     mov eax,[esi]
   219         mov ebx,[esi+4]
   220 
   221         mov ecx,[esi+8]
   222         mov edx,[esi+12]
   223 
   224         bswap eax
   225 
   226         bswap ebx
   227 
   228         bswap ecx
   229 
   230         bswap edx
   231 
   232         mov [edi+0],eax
   233         mov [edi+4],ebx
   234 
   235         mov [edi+8],ecx
   236         mov [edi+12],edx
   237 
   238         add esi,BYTE 16
   239         add edi,BYTE 16
   240 
   241         dec ebp
   242         jnz .L4                 
   243 
   244     ; check tail
   245     pop ecx
   246     and ecx,BYTE 11b
   247     jz .L6
   248 
   249 .L5 ; tail loop
   250     mov edx,[esi]
   251     bswap edx
   252     mov [edi],edx
   253     add esi,BYTE 4
   254     add edi,BYTE 4
   255     dec ecx
   256     jnz .L5
   257 
   258 .L6 pop ebp
   259     jmp _x86return
   260 
   261 
   262 	
   263 	
   264 ;; 32 bit RGB 888 to 24 BIT RGB 888
   265 
   266 _ConvertX86p32_24RGB888:
   267 
   268 	; check short
   269 	cmp ecx,BYTE 32
   270 	ja .L3
   271 
   272 .L1	; short loop
   273 	mov al,[esi]
   274 	mov bl,[esi+1]
   275 	mov dl,[esi+2]
   276 	mov [edi],al
   277 	mov [edi+1],bl
   278 	mov [edi+2],dl
   279 	add esi,BYTE 4
   280 	add edi,BYTE 3
   281 	dec ecx
   282 	jnz .L1
   283 .L2 
   284 	jmp _x86return
   285 
   286 .L3	;	 head
   287 	mov edx,edi
   288 	and edx,BYTE 11b
   289 	jz .L4
   290 	mov al,[esi]
   291 	mov bl,[esi+1]
   292 	mov dl,[esi+2]
   293 	mov [edi],al
   294 	mov [edi+1],bl
   295 	mov [edi+2],dl
   296 	add esi,BYTE 4
   297 	add edi,BYTE 3
   298 	dec ecx
   299 	jmp SHORT .L3
   300 
   301 .L4 ; unroll 4 times
   302 	push ebp
   303 	mov ebp,ecx
   304 	shr ebp,2
   305 
   306     ; save count
   307 	push ecx
   308 
   309 .L5     mov eax,[esi]                   ; first dword            eax = [A][R][G][B]
   310         mov ebx,[esi+4]                 ; second dword           ebx = [a][r][g][b]
   311 
   312         shl eax,8                       ;                        eax = [R][G][B][.]
   313         mov ecx,[esi+12]                ; third dword            ecx = [a][r][g][b]
   314 
   315         shl ebx,8                       ;                        ebx = [r][g][b][.]
   316         mov al,[esi+4]                  ;                        eax = [R][G][B][b]
   317 
   318         ror eax,8                       ;                        eax = [b][R][G][B] (done)
   319         mov bh,[esi+8+1]                ;                        ebx = [r][g][G][.]
   320 
   321         mov [edi],eax
   322         add edi,BYTE 3*4
   323 
   324         shl ecx,8                       ;                        ecx = [r][g][b][.]
   325         mov bl,[esi+8+0]                ;                        ebx = [r][g][G][B]
   326 
   327         rol ebx,16                      ;                        ebx = [G][B][r][g] (done)
   328         mov cl,[esi+8+2]                ;                        ecx = [r][g][b][R] (done)
   329 
   330         mov [edi+4-3*4],ebx
   331         add esi,BYTE 4*4
   332         
   333         mov [edi+8-3*4],ecx
   334         dec ebp
   335 
   336         jnz .L5
   337 
   338     ; check tail
   339 	pop ecx
   340 	and ecx,BYTE 11b
   341 	jz .L7
   342 
   343 .L6 ; tail loop
   344 	mov al,[esi]
   345 	mov bl,[esi+1]
   346 	mov dl,[esi+2]
   347 	mov [edi],al
   348 	mov [edi+1],bl
   349 	mov [edi+2],dl
   350 	add esi,BYTE 4
   351 	add edi,BYTE 3
   352 	dec ecx
   353 	jnz .L6
   354 
   355 .L7	pop ebp
   356 	jmp _x86return
   357 
   358 
   359 
   360 
   361 ;; 32 bit RGB 888 to 24 bit BGR 888
   362 
   363 _ConvertX86p32_24BGR888:
   364 
   365 	; check short
   366 	cmp ecx,BYTE 32
   367 	ja .L3
   368 
   369 	
   370 .L1	; short loop
   371 	mov dl,[esi]
   372 	mov bl,[esi+1]
   373 	mov al,[esi+2]
   374 	mov [edi],al
   375 	mov [edi+1],bl
   376 	mov [edi+2],dl
   377 	add esi,BYTE 4
   378 	add edi,BYTE 3
   379 	dec ecx
   380 	jnz .L1
   381 .L2
   382 	jmp _x86return
   383 
   384 .L3 ; head
   385 	mov edx,edi
   386 	and edx,BYTE 11b
   387 	jz .L4
   388 	mov dl,[esi]
   389 	mov bl,[esi+1]
   390 	mov al,[esi+2]
   391 	mov [edi],al
   392 	mov [edi+1],bl
   393 	mov [edi+2],dl
   394 	add esi,BYTE 4
   395 	add edi,BYTE 3
   396 	dec ecx
   397 	jmp SHORT .L3
   398 
   399 .L4	; unroll 4 times
   400 	push ebp
   401 	mov ebp,ecx
   402 	shr ebp,2
   403 
   404 	; save count
   405 	push ecx
   406 
   407 .L5     
   408 	mov eax,[esi]                   ; first dword            eax = [A][R][G][B]
   409         mov ebx,[esi+4]                 ; second dword           ebx = [a][r][g][b]
   410         
   411         bswap eax                       ;                        eax = [B][G][R][A]
   412 
   413         bswap ebx                       ;                        ebx = [b][g][r][a]
   414 
   415         mov al,[esi+4+2]                ;                        eax = [B][G][R][r] 
   416         mov bh,[esi+4+4+1]              ;                        ebx = [b][g][G][a]
   417 
   418         ror eax,8                       ;                        eax = [r][B][G][R] (done)
   419         mov bl,[esi+4+4+2]              ;                        ebx = [b][g][G][R]
   420 
   421         ror ebx,16                      ;                        ebx = [G][R][b][g] (done)
   422         mov [edi],eax
   423     
   424         mov [edi+4],ebx
   425         mov ecx,[esi+12]                ; third dword            ecx = [a][r][g][b]
   426         
   427         bswap ecx                       ;                        ecx = [b][g][r][a]
   428         
   429         mov cl,[esi+8]                  ;                        ecx = [b][g][r][B] (done)
   430         add esi,BYTE 4*4
   431 
   432         mov [edi+8],ecx
   433         add edi,BYTE 3*4
   434 
   435         dec ebp
   436         jnz .L5
   437 
   438 	; check tail
   439 	pop ecx
   440 	and ecx,BYTE 11b
   441 	jz .L7
   442 
   443 .L6	; tail loop
   444 	mov dl,[esi]
   445 	mov bl,[esi+1]
   446 	mov al,[esi+2]
   447 	mov [edi],al
   448 	mov [edi+1],bl
   449 	mov [edi+2],dl
   450 	add esi,BYTE 4
   451 	add edi,BYTE 3
   452 	dec ecx
   453 	jnz .L6
   454 
   455 .L7 
   456 	pop ebp
   457 	jmp _x86return
   458  
   459 
   460 	
   461 		
   462 ;; 32 bit RGB 888 to 16 BIT RGB 565 
   463 
   464 _ConvertX86p32_16RGB565:
   465 	; check short
   466 	cmp ecx,BYTE 16
   467 	ja .L3
   468 
   469 .L1 ; short loop
   470 	mov bl,[esi+0]    ; blue
   471 	mov al,[esi+1]    ; green
   472 	mov ah,[esi+2]    ; red
   473 	shr ah,3
   474         and al,11111100b
   475 	shl eax,3
   476 	shr bl,3
   477 	add al,bl
   478 	mov [edi+0],al
   479 	mov [edi+1],ah
   480 	add esi,BYTE 4
   481 	add edi,BYTE 2
   482 	dec ecx
   483 	jnz .L1
   484 
   485 .L2:				; End of short loop
   486 	jmp _x86return
   487 
   488 	
   489 .L3	; head
   490 	mov ebx,edi
   491 	and ebx,BYTE 11b
   492 	jz .L4
   493 	
   494 	mov bl,[esi+0]    ; blue
   495 	mov al,[esi+1]    ; green
   496 	mov ah,[esi+2]    ; red
   497 	shr ah,3
   498 	and al,11111100b
   499 	shl eax,3
   500 	shr bl,3
   501 	add al,bl
   502 	mov [edi+0],al
   503 	mov [edi+1],ah
   504 	add esi,BYTE 4
   505 	add edi,BYTE 2
   506 	dec ecx
   507 
   508 .L4:	 
   509     ; save count
   510 	push ecx
   511 
   512     ; unroll twice
   513 	shr ecx,1
   514     
   515     ; point arrays to end
   516 	lea esi,[esi+ecx*8]
   517 	lea edi,[edi+ecx*4]
   518 
   519     ; negative counter 
   520 	neg ecx
   521 	jmp SHORT .L6
   522 
   523 .L5:	    
   524 	mov [edi+ecx*4-4],eax
   525 .L6:	
   526 	mov eax,[esi+ecx*8]
   527 
   528         shr ah,2
   529         mov ebx,[esi+ecx*8+4]
   530 
   531         shr eax,3
   532         mov edx,[esi+ecx*8+4]
   533 
   534         shr bh,2
   535         mov dl,[esi+ecx*8+2]
   536 
   537         shl ebx,13
   538         and eax,000007FFh
   539         
   540         shl edx,8
   541         and ebx,07FF0000h
   542 
   543         and edx,0F800F800h
   544         add eax,ebx
   545 
   546         add eax,edx
   547         inc ecx
   548 
   549         jnz .L5                 
   550 
   551 	mov [edi+ecx*4-4],eax
   552 
   553     ; tail
   554 	pop ecx
   555 	test cl,1
   556 	jz .L7
   557 	
   558 	mov bl,[esi+0]    ; blue
   559 	mov al,[esi+1]    ; green
   560 	mov ah,[esi+2]    ; red
   561 	shr ah,3
   562 	and al,11111100b
   563 	shl eax,3
   564 	shr bl,3
   565 	add al,bl
   566 	mov [edi+0],al
   567 	mov [edi+1],ah
   568 	add esi,BYTE 4
   569 	add edi,BYTE 2
   570 
   571 .L7:	
   572 	jmp _x86return
   573 
   574 
   575 
   576 	
   577 ;; 32 bit RGB 888 to 16 BIT BGR 565 
   578 
   579 _ConvertX86p32_16BGR565:
   580 	
   581 	; check short
   582 	cmp ecx,BYTE 16
   583 	ja .L3
   584 
   585 .L1	; short loop
   586 	mov ah,[esi+0]    ; blue
   587 	mov al,[esi+1]    ; green
   588 	mov bl,[esi+2]    ; red
   589 	shr ah,3
   590 	and al,11111100b
   591 	shl eax,3
   592 	shr bl,3
   593 	add al,bl
   594 	mov [edi+0],al
   595 	mov [edi+1],ah
   596 	add esi,BYTE 4
   597 	add edi,BYTE 2
   598 	dec ecx
   599 	jnz .L1
   600 .L2
   601 	jmp _x86return
   602 
   603 .L3	; head
   604 	mov ebx,edi
   605 	and ebx,BYTE 11b
   606 	jz .L4   
   607 	mov ah,[esi+0]    ; blue
   608 	mov al,[esi+1]    ; green
   609 	mov bl,[esi+2]    ; red
   610 	shr ah,3
   611 	and al,11111100b
   612 	shl eax,3
   613 	shr bl,3
   614 	add al,bl
   615 	mov [edi+0],al
   616 	mov [edi+1],ah
   617 	add esi,BYTE 4
   618 	add edi,BYTE 2
   619 	dec ecx
   620 
   621 .L4	; save count
   622 	push ecx
   623 
   624 	; unroll twice
   625 	shr ecx,1
   626     
   627 	; point arrays to end
   628 	lea esi,[esi+ecx*8]
   629 	lea edi,[edi+ecx*4]
   630 
   631 	; negative count
   632 	neg ecx
   633 	jmp SHORT .L6
   634 
   635 .L5     
   636 	mov [edi+ecx*4-4],eax            
   637 .L6     
   638 	mov edx,[esi+ecx*8+4]
   639 
   640         mov bh,[esi+ecx*8+4]                       
   641         mov ah,[esi+ecx*8]                       
   642 
   643         shr bh,3
   644         mov al,[esi+ecx*8+1]             
   645 
   646         shr ah,3
   647         mov bl,[esi+ecx*8+5]           
   648 
   649         shl eax,3
   650         mov dl,[esi+ecx*8+2]
   651 
   652         shl ebx,19
   653         and eax,0000FFE0h              
   654                 
   655         shr edx,3
   656         and ebx,0FFE00000h             
   657         
   658         and edx,001F001Fh               
   659         add eax,ebx
   660 
   661         add eax,edx
   662         inc ecx
   663 
   664         jnz .L5                 
   665 
   666 	mov [edi+ecx*4-4],eax            
   667 
   668 	; tail
   669 	pop ecx
   670 	and ecx,BYTE 1
   671 	jz .L7
   672 	mov ah,[esi+0]    ; blue
   673 	mov al,[esi+1]    ; green
   674 	mov bl,[esi+2]    ; red
   675 	shr ah,3
   676 	and al,11111100b
   677 	shl eax,3
   678 	shr bl,3
   679 	add al,bl
   680 	mov [edi+0],al
   681 	mov [edi+1],ah
   682 	add esi,BYTE 4
   683 	add edi,BYTE 2
   684 
   685 .L7 
   686 	jmp _x86return
   687 
   688 
   689 	
   690 	
   691 ;; 32 BIT RGB TO 16 BIT RGB 555
   692 
   693 _ConvertX86p32_16RGB555:
   694 
   695 	; check short
   696 	cmp ecx,BYTE 16
   697 	ja .L3
   698 
   699 .L1	; short loop
   700 	mov bl,[esi+0]    ; blue
   701 	mov al,[esi+1]    ; green
   702 	mov ah,[esi+2]    ; red
   703 	shr ah,3
   704 	and al,11111000b
   705 	shl eax,2
   706 	shr bl,3
   707 	add al,bl
   708 	mov [edi+0],al
   709 	mov [edi+1],ah
   710 	add esi,BYTE 4
   711 	add edi,BYTE 2
   712 	dec ecx
   713 	jnz .L1
   714 .L2
   715 	jmp _x86return
   716 
   717 .L3	; head
   718 	mov ebx,edi
   719         and ebx,BYTE 11b
   720 	jz .L4   
   721 	mov bl,[esi+0]    ; blue
   722 	mov al,[esi+1]    ; green
   723 	mov ah,[esi+2]    ; red
   724 	shr ah,3
   725 	and al,11111000b
   726 	shl eax,2
   727 	shr bl,3
   728 	add al,bl
   729 	mov [edi+0],al
   730 	mov [edi+1],ah
   731 	add esi,BYTE 4
   732 	add edi,BYTE 2
   733 	dec ecx
   734 
   735 .L4	; save count
   736 	push ecx
   737 
   738 	; unroll twice
   739 	shr ecx,1
   740     
   741 	; point arrays to end
   742 	lea esi,[esi+ecx*8]
   743 	lea edi,[edi+ecx*4]
   744 
   745 	; negative counter 
   746 	neg ecx
   747 	jmp SHORT .L6
   748 
   749 .L5     
   750 	mov [edi+ecx*4-4],eax
   751 .L6     
   752 	mov eax,[esi+ecx*8]
   753 
   754         shr ah,3
   755         mov ebx,[esi+ecx*8+4]
   756 
   757         shr eax,3
   758         mov edx,[esi+ecx*8+4]
   759 
   760         shr bh,3
   761         mov dl,[esi+ecx*8+2]
   762 
   763         shl ebx,13
   764         and eax,000007FFh
   765         
   766         shl edx,7
   767         and ebx,07FF0000h
   768 
   769         and edx,07C007C00h
   770         add eax,ebx
   771 
   772         add eax,edx
   773         inc ecx
   774 
   775         jnz .L5                 
   776 
   777 	mov [edi+ecx*4-4],eax
   778 
   779 	; tail
   780 	pop ecx
   781 	and ecx,BYTE 1
   782 	jz .L7
   783 	mov bl,[esi+0]    ; blue
   784 	mov al,[esi+1]    ; green
   785 	mov ah,[esi+2]    ; red
   786 	shr ah,3
   787 	and al,11111000b
   788 	shl eax,2
   789 	shr bl,3
   790 	add al,bl
   791 	mov [edi+0],al
   792 	mov [edi+1],ah
   793 	add esi,BYTE 4
   794 	add edi,BYTE 2
   795 
   796 .L7
   797 	jmp _x86return
   798 
   799 
   800 
   801 
   802 ;; 32 BIT RGB TO 16 BIT BGR 555
   803 	
   804 _ConvertX86p32_16BGR555:
   805 	
   806 	; check short
   807 	cmp ecx,BYTE 16
   808 	ja .L3
   809 
   810 
   811 .L1	; short loop
   812 	mov ah,[esi+0]    ; blue
   813 	mov al,[esi+1]    ; green
   814 	mov bl,[esi+2]    ; red
   815 	shr ah,3
   816 	and al,11111000b
   817 	shl eax,2
   818 	shr bl,3
   819 	add al,bl
   820 	mov [edi+0],al
   821 	mov [edi+1],ah
   822 	add esi,BYTE 4
   823 	add edi,BYTE 2
   824 	dec ecx
   825 	jnz .L1
   826 .L2 
   827 	jmp _x86return
   828 
   829 .L3	; head
   830 	mov ebx,edi
   831         and ebx,BYTE 11b
   832 	jz .L4   
   833 	mov ah,[esi+0]    ; blue
   834 	mov al,[esi+1]    ; green
   835 	mov bl,[esi+2]    ; red
   836 	shr ah,3
   837 	and al,11111000b
   838 	shl eax,2
   839 	shr bl,3
   840 	add al,bl
   841 	mov [edi+0],al
   842 	mov [edi+1],ah
   843 	add esi,BYTE 4
   844 	add edi,BYTE 2
   845 	dec ecx
   846 
   847 .L4	; save count
   848 	push ecx
   849 
   850 	; unroll twice
   851 	shr ecx,1
   852     
   853 	; point arrays to end
   854 	lea esi,[esi+ecx*8]
   855 	lea edi,[edi+ecx*4]
   856 
   857 	; negative counter 
   858 	neg ecx
   859 	jmp SHORT .L6
   860 
   861 .L5     
   862 	mov [edi+ecx*4-4],eax            
   863 .L6     
   864 	mov edx,[esi+ecx*8+4]
   865 
   866         mov bh,[esi+ecx*8+4]                       
   867         mov ah,[esi+ecx*8]                       
   868 
   869         shr bh,3
   870         mov al,[esi+ecx*8+1]             
   871 
   872         shr ah,3
   873         mov bl,[esi+ecx*8+5]           
   874 
   875         shl eax,2
   876         mov dl,[esi+ecx*8+2]
   877 
   878         shl ebx,18
   879         and eax,00007FE0h              
   880                 
   881         shr edx,3
   882         and ebx,07FE00000h             
   883         
   884         and edx,001F001Fh               
   885         add eax,ebx
   886 
   887         add eax,edx
   888         inc ecx
   889 
   890         jnz .L5                 
   891 
   892 	mov [edi+ecx*4-4],eax            
   893 
   894 	; tail
   895 	pop ecx
   896 	and ecx,BYTE 1
   897 	jz .L7
   898 	mov ah,[esi+0]    ; blue
   899 	mov al,[esi+1]    ; green
   900 	mov bl,[esi+2]    ; red
   901 	shr ah,3
   902 	and al,11111000b
   903 	shl eax,2
   904 	shr bl,3
   905 	add al,bl
   906 	mov [edi+0],al
   907 	mov [edi+1],ah
   908 	add esi,BYTE 4
   909 	add edi,BYTE 2
   910 
   911 .L7
   912 	jmp _x86return
   913 
   914 
   915 
   916 
   917 	
   918 ;; FROM 32 BIT RGB to 8 BIT RGB (rrrgggbbb)
   919 ;; This routine writes FOUR pixels at once (dword) and then, if they exist
   920 ;; the trailing three pixels
   921 _ConvertX86p32_8RGB332:
   922 
   923 	
   924 .L_ALIGNED
   925 	push ecx
   926 
   927 	shr ecx,2		; We will draw 4 pixels at once
   928 	jnz .L1
   929 	
   930 	jmp .L2			; short jump out of range :(
   931 	
   932 .L1:
   933 	mov eax,[esi]		; first pair of pixels
   934 	mov edx,[esi+4]
   935 
   936 	shr dl,6
   937 	mov ebx,eax
   938 
   939 	shr al,6
   940 	and ah,0e0h
   941 
   942 	shr ebx,16
   943 	and dh,0e0h
   944 	
   945 	shr ah,3
   946 	and bl,0e0h
   947 
   948 	shr dh,3
   949 	
   950 	or al,bl
   951 	
   952 	mov ebx,edx	
   953 	or al,ah
   954 	
   955 	shr ebx,16
   956 	or dl,dh
   957 
   958 	and bl,0e0h
   959 	
   960 	or dl,bl
   961 
   962 	mov ah,dl
   963 
   964 	
   965 		
   966 	mov ebx,[esi+8]		; second pair of pixels
   967 
   968 	mov edx,ebx
   969 	and bh,0e0h
   970 
   971 	shr bl,6
   972 	and edx,0e00000h
   973 
   974 	shr edx,16
   975 
   976 	shr bh,3
   977 
   978 	ror eax,16
   979 	or bl,dl
   980 
   981 	mov edx,[esi+12]
   982 	or bl,bh
   983 	
   984 	mov al,bl
   985 
   986 	mov ebx,edx
   987 	and dh,0e0h
   988 
   989 	shr dl,6
   990 	and ebx,0e00000h
   991 	
   992 	shr dh,3
   993 	mov ah,dl
   994 
   995 	shr ebx,16
   996 	or ah,dh
   997 
   998 	or ah,bl
   999 
  1000 	rol eax,16
  1001 	add esi,BYTE 16
  1002 			
  1003 	mov [edi],eax	
  1004 	add edi,BYTE 4
  1005 	
  1006 	dec ecx
  1007 	jz .L2			; L1 out of range for short jump :(
  1008 	
  1009 	jmp .L1
  1010 .L2:
  1011 	
  1012 	pop ecx
  1013 	and ecx,BYTE 3		; mask out number of pixels to draw
  1014 	
  1015 	jz .L4			; Nothing to do anymore
  1016 
  1017 .L3:
  1018 	mov eax,[esi]		; single pixel conversion for trailing pixels
  1019 
  1020         mov ebx,eax
  1021 
  1022         shr al,6
  1023         and ah,0e0h
  1024 
  1025         shr ebx,16
  1026 
  1027         shr ah,3
  1028         and bl,0e0h
  1029 
  1030         or al,ah
  1031         or al,bl
  1032 
  1033         mov [edi],al
  1034 
  1035         inc edi
  1036         add esi,BYTE 4
  1037 
  1038 	dec ecx
  1039 	jnz .L3
  1040 	
  1041 .L4:	
  1042 	jmp _x86return
  1043 
  1044 %ifidn __OUTPUT_FORMAT__,elf
  1045 section .note.GNU-stack noalloc noexec nowrite progbits
  1046 %endif