Skip to content

Latest commit

 

History

History
268 lines (255 loc) · 5.58 KB

synth_sse3d.h

File metadata and controls

268 lines (255 loc) · 5.58 KB
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
/*
decode_sse3d: Synth for SSE and extended 3DNow (yeah, the name is a relic)
copyright 2006-2007 by Zuxy Meng/the mpg123 project - free software under the terms of the LGPL 2.1
see COPYING and AUTHORS files in distribution or http://mpg123.org
initially written by the mysterious higway for MMX (apparently)
then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
Both have agreed to distribution under LGPL 2.1 .
Transformed back into standalone asm, with help of
gcc -S -DHAVE_CONFIG_H -I. -march=pentium -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o decode_mmxsse.{S,c}
The difference between SSE and 3DNowExt is the dct64 function and the synth function name.
This template here uses the SYNTH_NAME and MPL_DCT64 macros for this - see decode_sse.S and decode_3dnowext.S...
That's not memory efficient since there's doubled code, but it's easier than giving another function pointer.
Maybe I'll change it in future, but now I need something that works.
Original comment from MPlayer source follows. Regarding the license history see
synth_mmx.S, which the original comment about this being licensed under GPL is
relating to.
*/
/*
* This code was taken from http://www.mpg123.org
* See ChangeLog of mpg123-0.59s-pre.1 for detail
* Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
*
* Local ChangeLog:
* - Partial loops unrolling and removing MOVW insn from loops
*/
#include "mangle.h"
.data
ALIGN8
one_null:
.long -65536
.long -65536
ALIGN8
null_one:
.long 65535
.long 65535
.text
ALIGN16
/* void SYNTH_NAME(real *bandPtr, int channel, short *samples, short *buffs, int *bo, float *decwins) */
.globl SYNTH_NAME
SYNTH_NAME:
pushl %ebp
/* stack:0=ebp 4=back 8=bandptr 12=channel 16=samples 20=buffs 24=bo 28=decwins */
movl %esp, %ebp
/* Now the old stack addresses are preserved via %epb. */
#ifdef PIC
subl $8,%esp /* What has been called temp before. */
#else
subl $4,%esp /* What has been called temp before. */
#endif
pushl %edi
pushl %esi
pushl %ebx
#ifdef PIC
#undef _EBX_
#define _EBX_ %eax
GET_GOT
#define EBXSAVE -4(%ebp)
movl _EBX_, EBXSAVE /* save PIC register */
#endif
#define TEMP 12(%esp)
/* APP */
movl 12(%ebp),%ecx
movl 16(%ebp),%edi
movl $15,%ebx
movl 24(%ebp),%edx
leal (%edi,%ecx,2),%edi
decl %ecx
movl 20(%ebp),%esi
movl (%edx),%eax
jecxz 1f
decl %eax
andl %ebx,%eax
leal 1088(%esi),%esi
movl %eax,(%edx)
1:
leal (%esi,%eax,2),%edx
movl %eax,TEMP
incl %eax
andl %ebx,%eax
leal 544(%esi,%eax,2),%ecx
incl %ebx
testl $1, %eax
jnz 2f
xchgl %edx,%ecx
incl TEMP
leal 544(%esi),%esi
2:
pushl 8(%ebp)
pushl %edx
pushl %ecx
call MPL_DCT64
addl $12, %esp
leal 1(%ebx), %ecx
subl TEMP,%ebx
pushl %ecx
/* leal ASM_NAME(decwins)(%ebx,%ebx,1), %edx */
movl 28(%ebp),%ecx
leal (%ecx,%ebx,2), %edx
movl (%esp),%ecx /* restore, but leave value on stack */
shrl $1, %ecx
#ifdef PIC
movl EBXSAVE, _EBX_
#endif
ALIGN16
3:
movq (%edx),%mm0
movq 64(%edx),%mm4
pmaddwd (%esi),%mm0
pmaddwd 32(%esi),%mm4
movq 8(%edx),%mm1
movq 72(%edx),%mm5
pmaddwd 8(%esi),%mm1
pmaddwd 40(%esi),%mm5
movq 16(%edx),%mm2
movq 80(%edx),%mm6
pmaddwd 16(%esi),%mm2
pmaddwd 48(%esi),%mm6
movq 24(%edx),%mm3
movq 88(%edx),%mm7
pmaddwd 24(%esi),%mm3
pmaddwd 56(%esi),%mm7
paddd %mm1,%mm0
paddd %mm5,%mm4
paddd %mm2,%mm0
paddd %mm6,%mm4
paddd %mm3,%mm0
paddd %mm7,%mm4
movq %mm0,%mm1
movq %mm4,%mm5
psrlq $32,%mm1
psrlq $32,%mm5
paddd %mm1,%mm0
paddd %mm5,%mm4
psrad $13,%mm0
psrad $13,%mm4
packssdw %mm0,%mm0
packssdw %mm4,%mm4
movq (%edi), %mm1
punpckldq %mm4, %mm0
pand LOCAL_VAR(one_null), %mm1
pand LOCAL_VAR(null_one), %mm0
por %mm0, %mm1
movq %mm1,(%edi)
leal 64(%esi),%esi
leal 128(%edx),%edx
leal 8(%edi),%edi
decl %ecx
jnz 3b
popl %ecx
andl $1, %ecx
jecxz 4f
movq (%edx),%mm0
pmaddwd (%esi),%mm0
movq 8(%edx),%mm1
pmaddwd 8(%esi),%mm1
movq 16(%edx),%mm2
pmaddwd 16(%esi),%mm2
movq 24(%edx),%mm3
pmaddwd 24(%esi),%mm3
paddd %mm1,%mm0
paddd %mm2,%mm0
paddd %mm3,%mm0
movq %mm0,%mm1
psrlq $32,%mm1
paddd %mm1,%mm0
psrad $13,%mm0
packssdw %mm0,%mm0
movd %mm0,%eax
movw %ax, (%edi)
leal 32(%esi),%esi
leal 64(%edx),%edx
leal 4(%edi),%edi
4:
subl $64,%esi
movl $7,%ecx
#ifdef PIC
movl EBXSAVE, _EBX_
#endif
ALIGN16
5:
movq (%edx),%mm0
movq 64(%edx),%mm4
pmaddwd (%esi),%mm0
pmaddwd -32(%esi),%mm4
movq 8(%edx),%mm1
movq 72(%edx),%mm5
pmaddwd 8(%esi),%mm1
pmaddwd -24(%esi),%mm5
movq 16(%edx),%mm2
movq 80(%edx),%mm6
pmaddwd 16(%esi),%mm2
pmaddwd -16(%esi),%mm6
movq 24(%edx),%mm3
movq 88(%edx),%mm7
pmaddwd 24(%esi),%mm3
pmaddwd -8(%esi),%mm7
paddd %mm1,%mm0
paddd %mm5,%mm4
paddd %mm2,%mm0
paddd %mm6,%mm4
paddd %mm3,%mm0
paddd %mm7,%mm4
movq %mm0,%mm1
movq %mm4,%mm5
psrlq $32,%mm1
psrlq $32,%mm5
paddd %mm0,%mm1
paddd %mm4,%mm5
psrad $13,%mm1
psrad $13,%mm5
packssdw %mm1,%mm1
packssdw %mm5,%mm5
psubd %mm0,%mm0
psubd %mm4,%mm4
psubsw %mm1,%mm0
psubsw %mm5,%mm4
movq (%edi), %mm1
punpckldq %mm4, %mm0
pand LOCAL_VAR(one_null), %mm1
pand LOCAL_VAR(null_one), %mm0
por %mm0, %mm1
movq %mm1,(%edi)
subl $64,%esi
addl $128,%edx
leal 8(%edi),%edi
decl %ecx
jnz 5b
movq (%edx),%mm0
pmaddwd (%esi),%mm0
movq 8(%edx),%mm1
pmaddwd 8(%esi),%mm1
movq 16(%edx),%mm2
pmaddwd 16(%esi),%mm2
movq 24(%edx),%mm3
pmaddwd 24(%esi),%mm3
paddd %mm1,%mm0
paddd %mm2,%mm0
paddd %mm3,%mm0
movq %mm0,%mm1
psrlq $32,%mm1
paddd %mm0,%mm1
psrad $13,%mm1
packssdw %mm1,%mm1
psubd %mm0,%mm0
psubsw %mm1,%mm0
movd %mm0,%eax
movw %ax,(%edi)
emms
/* NO_APP */
popl %ebx
popl %esi
popl %edi
mov %ebp, %esp
popl %ebp
ret