src/video/mmx.h
author Sam Lantinga <slouken@libsdl.org>
Fri, 22 Aug 2003 05:51:19 +0000
changeset 689 5bb080d35049
child 738 82b85b731fe3
permissions -rw-r--r--
Date: Tue, 19 Aug 2003 17:57:00 +0200
From: Stephane Marchesin
Subject: Re: [SDL] [patch] MMX alpha blit patches with MMX detection

I think everything is correct now. I've done as much testing as I could,
but some real-world testing wouldn't hurt, I think.
The patch is here: http://icps.u-strasbg.fr/~marchesin/sdl_mmxblit.patch

If you do a byte-by-byte comparison of the output between the C and MMX
functions, you'll notice that the results for 555 and 565 RGB alpha
blits aren't exactly the same. This is because the MMX functions for 555
and 565 RGB have higher accuracy. If you want exactly the same behaviour,
that's possible by masking the three lower alpha bits in the MMX
functions. Just ask!
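
(As a rough illustration only, masking the three lower alpha bits would amount
to something like

    alpha &= ~0x07;  /* drop the 3 low bits the 555/565 C path effectively ignores */

applied to the 8-bit source alpha before the MMX blend; the variable name
alpha is just a placeholder here.)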

I removed one MMX function because, after I fixed it to match its C
equivalent, it turned out to be slower than the C version on a PIII
(although a bit faster on an Athlon XP).

I've also added MMX and PIII replacements for SDL_memcpy. Those provide
some speedup in testvidinfo -benchmark (at least for me, under linux &
X11).
/*	mmx.h

	MultiMedia eXtensions GCC interface library for IA32.

	To use this library, simply include this header file
	and compile with GCC.  You MUST have inlining enabled
	in order for mmx_ok() to work; this can be done by
	simply using -O on the GCC command line.

	Compiling with -DMMX_TRACE will cause detailed trace
	output to be sent to stderr for each mmx operation.
	This adds lots of code, and obviously slows execution to
	a crawl, but can be very useful for debugging.

	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
	AND FITNESS FOR ANY PARTICULAR PURPOSE.

	1997-99 by H. Dietz and R. Fisher

 Notes:
	It appears that the latest gas has the pand problem fixed, therefore
	  I'll undefine BROKEN_PAND by default.
*/

#ifndef _MMX_H
#define _MMX_H


/*	Warning:  at this writing, the version of GAS packaged
	with most Linux distributions does not handle the
	parallel AND operation mnemonic correctly.  If the
	symbol BROKEN_PAND is defined, a slower alternative
	coding will be used.  If execution of mmxtest results
	in an illegal instruction fault, define this symbol.
*/
#undef	BROKEN_PAND


/*	The type of a value that fits in an MMX register
	(note that long long constant values MUST be suffixed
	 by LL and unsigned long long values by ULL, lest
	 they be truncated by the compiler)
*/
typedef	union {
	long long		q;	/* Quadword (64-bit) value */
	unsigned long long	uq;	/* Unsigned Quadword */
	int			d[2];	/* 2 Doubleword (32-bit) values */
	unsigned int		ud[2];	/* 2 Unsigned Doubleword */
	short			w[4];	/* 4 Word (16-bit) values */
	unsigned short		uw[4];	/* 4 Unsigned Word */
	char			b[8];	/* 8 Byte (8-bit) values */
	unsigned char		ub[8];	/* 8 Unsigned Byte */
	float			s[2];	/* Single-precision (32-bit) value */
} __attribute__ ((aligned (8))) mmx_t;	/* On an 8-byte (64-bit) boundary */
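
/*	For example (an illustrative sketch; mask5 is a hypothetical constant),
	a 64-bit value intended for the *_m2r macros below can be declared as

		static mmx_t mask5 = { 0x001f001f001f001fLL };

	and passed by name, e.g. pand_m2r(mask5, mm1); note the LL suffix,
	as warned above.
*/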


#if 0
/*	Function to test if multimedia instructions are supported...
*/
inline extern int
mm_support(void)
{
	/* Returns 1 if MMX instructions are supported,
	   3 if Cyrix MMX and Extended MMX instructions are supported
	   5 if AMD MMX and 3DNow! instructions are supported
	   0 if hardware does not support any of these
	*/
	register int rval = 0;

	__asm__ __volatile__ (
		/* See if CPUID instruction is supported ... */
		/* ... Get copies of EFLAGS into eax and ecx */
		"pushf\n\t"
		"popl %%eax\n\t"
		"movl %%eax, %%ecx\n\t"

		/* ... Toggle the ID bit in one copy and store */
		/*     to the EFLAGS reg */
		"xorl $0x200000, %%eax\n\t"
		"push %%eax\n\t"
		"popf\n\t"

		/* ... Get the (hopefully modified) EFLAGS */
		"pushf\n\t"
		"popl %%eax\n\t"

		/* ... Compare and test result */
		"xorl %%eax, %%ecx\n\t"
		"testl $0x200000, %%ecx\n\t"
		"jz NotSupported1\n\t"		/* CPUID not supported */


		/* Get standard CPUID information, and
		       go to a specific vendor section */
		"movl $0, %%eax\n\t"
		"cpuid\n\t"

		/* Check for Intel */
		"cmpl $0x756e6547, %%ebx\n\t"
		"jne TryAMD\n\t"
		"cmpl $0x49656e69, %%edx\n\t"
		"jne TryAMD\n\t"
		"cmpl $0x6c65746e, %%ecx\n"
		"jne TryAMD\n\t"
		"jmp Intel\n\t"

		/* Check for AMD */
		"\nTryAMD:\n\t"
		"cmpl $0x68747541, %%ebx\n\t"
		"jne TryCyrix\n\t"
		"cmpl $0x69746e65, %%edx\n\t"
		"jne TryCyrix\n\t"
		"cmpl $0x444d4163, %%ecx\n"
		"jne TryCyrix\n\t"
		"jmp AMD\n\t"

		/* Check for Cyrix */
		"\nTryCyrix:\n\t"
		"cmpl $0x69727943, %%ebx\n\t"
		"jne NotSupported2\n\t"
		"cmpl $0x736e4978, %%edx\n\t"
		"jne NotSupported3\n\t"
		"cmpl $0x64616574, %%ecx\n\t"
		"jne NotSupported4\n\t"
		/* Drop through to Cyrix... */


		/* Cyrix Section */
		/* See if extended CPUID level 80000001 is supported */
		/* The value of CPUID/80000001 for the 6x86MX is undefined
		   according to the Cyrix CPU Detection Guide (Preliminary
		   Rev. 1.01 table 1), so we'll check the value of eax for
		   CPUID/0 to see if standard CPUID level 2 is supported.
		   According to the table, the only CPU which supports level
		   2 is also the only one which supports extended CPUID levels.
		*/
		"cmpl $0x2, %%eax\n\t"
		"jne MMXtest\n\t"	/* Use standard CPUID instead */

		/* Extended CPUID supported (in theory), so get extended
		   features */
		"movl $0x80000001, %%eax\n\t"
		"cpuid\n\t"
		"testl $0x00800000, %%eax\n\t"	/* Test for MMX */
		"jz NotSupported5\n\t"		/* MMX not supported */
		"testl $0x01000000, %%eax\n\t"	/* Test for Ext'd MMX */
		"jnz EMMXSupported\n\t"
		"movl $1, %0:\n\n\t"		/* MMX Supported */
		"jmp Return\n\n"
		"EMMXSupported:\n\t"
		"movl $3, %0:\n\n\t"		/* EMMX and MMX Supported */
		"jmp Return\n\t"


		/* AMD Section */
		"AMD:\n\t"

		/* See if extended CPUID is supported */
		"movl $0x80000000, %%eax\n\t"
		"cpuid\n\t"
		"cmpl $0x80000000, %%eax\n\t"
		"jl MMXtest\n\t"	/* Use standard CPUID instead */

		/* Extended CPUID supported, so get extended features */
		"movl $0x80000001, %%eax\n\t"
		"cpuid\n\t"
		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
		"jz NotSupported6\n\t"		/* MMX not supported */
		"testl $0x80000000, %%edx\n\t"	/* Test for 3DNow! */
		"jnz ThreeDNowSupported\n\t"
		"movl $1, %0:\n\n\t"		/* MMX Supported */
		"jmp Return\n\n"
		"ThreeDNowSupported:\n\t"
		"movl $5, %0:\n\n\t"		/* 3DNow! and MMX Supported */
		"jmp Return\n\t"


		/* Intel Section */
		"Intel:\n\t"

		/* Check for MMX */
		"MMXtest:\n\t"
		"movl $1, %%eax\n\t"
		"cpuid\n\t"
		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
		"jz NotSupported7\n\t"		/* MMX Not supported */
		"movl $1, %0:\n\n\t"		/* MMX Supported */
		"jmp Return\n\t"

		/* Nothing supported */
		"\nNotSupported1:\n\t"
		"#movl $101, %0:\n\n\t"
		"\nNotSupported2:\n\t"
		"#movl $102, %0:\n\n\t"
		"\nNotSupported3:\n\t"
		"#movl $103, %0:\n\n\t"
		"\nNotSupported4:\n\t"
		"#movl $104, %0:\n\n\t"
		"\nNotSupported5:\n\t"
		"#movl $105, %0:\n\n\t"
		"\nNotSupported6:\n\t"
		"#movl $106, %0:\n\n\t"
		"\nNotSupported7:\n\t"
		"#movl $107, %0:\n\n\t"
		"movl $0, %0:\n\n\t"

		"Return:\n\t"
		: "=a" (rval)
		: /* no input */
		: "eax", "ebx", "ecx", "edx"
	);

	/* Return */
	return(rval);
}

/*	Function to test if mmx instructions are supported...
*/
inline extern int
mmx_ok(void)
{
	/* Returns 1 if MMX instructions are supported, 0 otherwise */
	return ( mm_support() & 0x1 );
}
#endif
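
/*	Typical use would be a run-time guard around an MMX code path, e.g.

		if (mmx_ok()) {
			use the MMX macros below
		} else {
			fall back to plain C code
		}

	(illustrative only: the detection block above is compiled out by the
	#if 0, and the patch described in the log message supplies its own
	MMX detection).
*/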

/*	Helper functions for the instruction macros that follow...
	(note that memory-to-register, m2r, instructions are nearly
	 as efficient as register-to-register, r2r, instructions;
	 however, memory-to-memory instructions are really simulated
	 as a convenience, and are only 1/3 as efficient)
*/
#ifdef	MMX_TRACE

/*	Include the stuff for printing a trace to stderr...
*/

#include <stdio.h>

#define	mmx_i2r(op, imm, reg) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace.uq = (imm); \
		printf(#op "_i2r(" #imm "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#reg "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "X" (imm)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#reg "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#define	mmx_m2r(op, mem, reg) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace = (mem); \
		printf(#op "_m2r(" #mem "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#reg "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "X" (mem)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#reg "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#define	mmx_r2m(op, reg, mem) \
	{ \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#op "_r2m(" #reg "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		mmx_trace = (mem); \
		printf(#mem "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %%" #reg ", %0" \
				      : "=X" (mem) \
				      : /* nothing */ ); \
		mmx_trace = (mem); \
		printf(#mem "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#define	mmx_r2r(op, regs, regd) \
	{ \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #regs ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#op "_r2r(" #regs "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#regd "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#regd "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#define	mmx_m2m(op, mems, memd) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace = (mems); \
		printf(#op "_m2m(" #mems "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		mmx_trace = (memd); \
		printf(#memd "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
				      #op " %1, %%mm0\n\t" \
				      "movq %%mm0, %0" \
				      : "=X" (memd) \
				      : "X" (mems)); \
		mmx_trace = (memd); \
		printf(#memd "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#else

/*	These macros are a lot simpler without the tracing...
*/

#define	mmx_i2r(op, imm, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "X" (imm) )

#define	mmx_m2r(op, mem, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "X" (mem))

#define	mmx_r2m(op, reg, mem) \
	__asm__ __volatile__ (#op " %%" #reg ", %0" \
			      : "=X" (mem) \
			      : /* nothing */ )

#define	mmx_r2r(op, regs, regd) \
	__asm__ __volatile__ (#op " %" #regs ", %" #regd)

#define	mmx_m2m(op, mems, memd) \
	__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
			      #op " %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=X" (memd) \
			      : "X" (mems))

#endif


/*	1x64 MOVe Quadword
	(this is both a load and a store...
	 in fact, it is the only way to store)
*/
#define	movq_m2r(var, reg)	mmx_m2r(movq, var, reg)
#define	movq_r2m(reg, var)	mmx_r2m(movq, reg, var)
#define	movq_r2r(regs, regd)	mmx_r2r(movq, regs, regd)
#define	movq(vars, vard) \
	__asm__ __volatile__ ("movq %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=X" (vard) \
			      : "X" (vars))


/*	1x32 MOVe Doubleword
	(like movq, this is both load and store...
	 but is most useful for moving things between
	 mmx registers and ordinary registers)
*/
#define	movd_m2r(var, reg)	mmx_m2r(movd, var, reg)
#define	movd_r2m(reg, var)	mmx_r2m(movd, reg, var)
#define	movd_r2r(regs, regd)	mmx_r2r(movd, regs, regd)
#define	movd(vars, vard) \
	__asm__ __volatile__ ("movd %1, %%mm0\n\t" \
			      "movd %%mm0, %0" \
			      : "=X" (vard) \
			      : "X" (vars))


/*	2x32, 4x16, and 8x8 Parallel ADDs
*/
#define	paddd_m2r(var, reg)	mmx_m2r(paddd, var, reg)
#define	paddd_r2r(regs, regd)	mmx_r2r(paddd, regs, regd)
#define	paddd(vars, vard)	mmx_m2m(paddd, vars, vard)

#define	paddw_m2r(var, reg)	mmx_m2r(paddw, var, reg)
#define	paddw_r2r(regs, regd)	mmx_r2r(paddw, regs, regd)
#define	paddw(vars, vard)	mmx_m2m(paddw, vars, vard)

#define	paddb_m2r(var, reg)	mmx_m2r(paddb, var, reg)
#define	paddb_r2r(regs, regd)	mmx_r2r(paddb, regs, regd)
#define	paddb(vars, vard)	mmx_m2m(paddb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Saturation arithmetic
*/
#define	paddsw_m2r(var, reg)	mmx_m2r(paddsw, var, reg)
#define	paddsw_r2r(regs, regd)	mmx_r2r(paddsw, regs, regd)
#define	paddsw(vars, vard)	mmx_m2m(paddsw, vars, vard)

#define	paddsb_m2r(var, reg)	mmx_m2r(paddsb, var, reg)
#define	paddsb_r2r(regs, regd)	mmx_r2r(paddsb, regs, regd)
#define	paddsb(vars, vard)	mmx_m2m(paddsb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic
*/
#define	paddusw_m2r(var, reg)	mmx_m2r(paddusw, var, reg)
#define	paddusw_r2r(regs, regd)	mmx_r2r(paddusw, regs, regd)
#define	paddusw(vars, vard)	mmx_m2m(paddusw, vars, vard)

#define	paddusb_m2r(var, reg)	mmx_m2r(paddusb, var, reg)
#define	paddusb_r2r(regs, regd)	mmx_r2r(paddusb, regs, regd)
#define	paddusb(vars, vard)	mmx_m2m(paddusb, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel SUBs
*/
#define	psubd_m2r(var, reg)	mmx_m2r(psubd, var, reg)
#define	psubd_r2r(regs, regd)	mmx_r2r(psubd, regs, regd)
#define	psubd(vars, vard)	mmx_m2m(psubd, vars, vard)

#define	psubw_m2r(var, reg)	mmx_m2r(psubw, var, reg)
#define	psubw_r2r(regs, regd)	mmx_r2r(psubw, regs, regd)
#define	psubw(vars, vard)	mmx_m2m(psubw, vars, vard)

#define	psubb_m2r(var, reg)	mmx_m2r(psubb, var, reg)
#define	psubb_r2r(regs, regd)	mmx_r2r(psubb, regs, regd)
#define	psubb(vars, vard)	mmx_m2m(psubb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Saturation arithmetic
*/
#define	psubsw_m2r(var, reg)	mmx_m2r(psubsw, var, reg)
#define	psubsw_r2r(regs, regd)	mmx_r2r(psubsw, regs, regd)
#define	psubsw(vars, vard)	mmx_m2m(psubsw, vars, vard)

#define	psubsb_m2r(var, reg)	mmx_m2r(psubsb, var, reg)
#define	psubsb_r2r(regs, regd)	mmx_r2r(psubsb, regs, regd)
#define	psubsb(vars, vard)	mmx_m2m(psubsb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic
*/
#define	psubusw_m2r(var, reg)	mmx_m2r(psubusw, var, reg)
#define	psubusw_r2r(regs, regd)	mmx_r2r(psubusw, regs, regd)
#define	psubusw(vars, vard)	mmx_m2m(psubusw, vars, vard)

#define	psubusb_m2r(var, reg)	mmx_m2r(psubusb, var, reg)
#define	psubusb_r2r(regs, regd)	mmx_r2r(psubusb, regs, regd)
#define	psubusb(vars, vard)	mmx_m2m(psubusb, vars, vard)


/*	4x16 Parallel MULs giving Low 4x16 portions of results
*/
#define	pmullw_m2r(var, reg)	mmx_m2r(pmullw, var, reg)
#define	pmullw_r2r(regs, regd)	mmx_r2r(pmullw, regs, regd)
#define	pmullw(vars, vard)	mmx_m2m(pmullw, vars, vard)


/*	4x16 Parallel MULs giving High 4x16 portions of results
*/
#define	pmulhw_m2r(var, reg)	mmx_m2r(pmulhw, var, reg)
#define	pmulhw_r2r(regs, regd)	mmx_r2r(pmulhw, regs, regd)
#define	pmulhw(vars, vard)	mmx_m2m(pmulhw, vars, vard)


/*	4x16->2x32 Parallel Mul-ADD
	(muls like pmullw, then adds adjacent 16-bit fields
	 in the multiply result to make the final 2x32 result)
*/
#define	pmaddwd_m2r(var, reg)	mmx_m2r(pmaddwd, var, reg)
#define	pmaddwd_r2r(regs, regd)	mmx_r2r(pmaddwd, regs, regd)
slouken@689
   500
slouken@689
   501
slouken@689
   502
/*	1x64 bitwise AND
slouken@689
   503
*/
slouken@689
   504
#ifdef	BROKEN_PAND
slouken@689
   505
#define	pand_m2r(var, reg) \
slouken@689
   506
	{ \
slouken@689
   507
		mmx_m2r(pandn, (mmx_t) -1LL, reg); \
slouken@689
   508
		mmx_m2r(pandn, var, reg); \
slouken@689
   509
	}
slouken@689
   510
#define	pand_r2r(regs, regd) \
slouken@689
   511
	{ \
slouken@689
   512
		mmx_m2r(pandn, (mmx_t) -1LL, regd); \
slouken@689
   513
		mmx_r2r(pandn, regs, regd) \
slouken@689
   514
	}
slouken@689
   515
#define	pand(vars, vard) \
slouken@689
   516
	{ \
slouken@689
   517
		movq_m2r(vard, mm0); \
slouken@689
   518
		mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
slouken@689
   519
		mmx_m2r(pandn, vars, mm0); \
slouken@689
   520
		movq_r2m(mm0, vard); \
slouken@689
   521
	}
slouken@689
   522
#else
slouken@689
   523
#define	pand_m2r(var, reg)	mmx_m2r(pand, var, reg)
slouken@689
   524
#define	pand_r2r(regs, regd)	mmx_r2r(pand, regs, regd)
slouken@689
   525
#define	pand(vars, vard)	mmx_m2m(pand, vars, vard)
slouken@689
   526
#endif
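
/*	The BROKEN_PAND fallback above works because pandn computes
	dst = ~dst & src: the first pandn with an all-ones operand leaves
	~dst in the register, and the second pandn then yields
	~(~dst) & operand, i.e. the ordinary AND that pand would produce.
*/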


/*	1x64 bitwise AND with Not the destination
*/
#define	pandn_m2r(var, reg)	mmx_m2r(pandn, var, reg)
#define	pandn_r2r(regs, regd)	mmx_r2r(pandn, regs, regd)
#define	pandn(vars, vard)	mmx_m2m(pandn, vars, vard)


/*	1x64 bitwise OR
*/
#define	por_m2r(var, reg)	mmx_m2r(por, var, reg)
#define	por_r2r(regs, regd)	mmx_r2r(por, regs, regd)
#define	por(vars, vard)	mmx_m2m(por, vars, vard)


/*	1x64 bitwise eXclusive OR
*/
#define	pxor_m2r(var, reg)	mmx_m2r(pxor, var, reg)
#define	pxor_r2r(regs, regd)	mmx_r2r(pxor, regs, regd)
#define	pxor(vars, vard)	mmx_m2m(pxor, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel CoMPare for EQuality
	(resulting fields are either 0 or -1)
*/
#define	pcmpeqd_m2r(var, reg)	mmx_m2r(pcmpeqd, var, reg)
#define	pcmpeqd_r2r(regs, regd)	mmx_r2r(pcmpeqd, regs, regd)
#define	pcmpeqd(vars, vard)	mmx_m2m(pcmpeqd, vars, vard)

#define	pcmpeqw_m2r(var, reg)	mmx_m2r(pcmpeqw, var, reg)
#define	pcmpeqw_r2r(regs, regd)	mmx_r2r(pcmpeqw, regs, regd)
#define	pcmpeqw(vars, vard)	mmx_m2m(pcmpeqw, vars, vard)

#define	pcmpeqb_m2r(var, reg)	mmx_m2r(pcmpeqb, var, reg)
#define	pcmpeqb_r2r(regs, regd)	mmx_r2r(pcmpeqb, regs, regd)
#define	pcmpeqb(vars, vard)	mmx_m2m(pcmpeqb, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than
	(resulting fields are either 0 or -1)
*/
#define	pcmpgtd_m2r(var, reg)	mmx_m2r(pcmpgtd, var, reg)
#define	pcmpgtd_r2r(regs, regd)	mmx_r2r(pcmpgtd, regs, regd)
#define	pcmpgtd(vars, vard)	mmx_m2m(pcmpgtd, vars, vard)

#define	pcmpgtw_m2r(var, reg)	mmx_m2r(pcmpgtw, var, reg)
#define	pcmpgtw_r2r(regs, regd)	mmx_r2r(pcmpgtw, regs, regd)
#define	pcmpgtw(vars, vard)	mmx_m2m(pcmpgtw, vars, vard)

#define	pcmpgtb_m2r(var, reg)	mmx_m2r(pcmpgtb, var, reg)
#define	pcmpgtb_r2r(regs, regd)	mmx_r2r(pcmpgtb, regs, regd)
#define	pcmpgtb(vars, vard)	mmx_m2m(pcmpgtb, vars, vard)


/*	1x64, 2x32, and 4x16 Parallel Shift Left Logical
*/
#define	psllq_i2r(imm, reg)	mmx_i2r(psllq, imm, reg)
#define	psllq_m2r(var, reg)	mmx_m2r(psllq, var, reg)
#define	psllq_r2r(regs, regd)	mmx_r2r(psllq, regs, regd)
#define	psllq(vars, vard)	mmx_m2m(psllq, vars, vard)

#define	pslld_i2r(imm, reg)	mmx_i2r(pslld, imm, reg)
#define	pslld_m2r(var, reg)	mmx_m2r(pslld, var, reg)
#define	pslld_r2r(regs, regd)	mmx_r2r(pslld, regs, regd)
#define	pslld(vars, vard)	mmx_m2m(pslld, vars, vard)

#define	psllw_i2r(imm, reg)	mmx_i2r(psllw, imm, reg)
#define	psllw_m2r(var, reg)	mmx_m2r(psllw, var, reg)
#define	psllw_r2r(regs, regd)	mmx_r2r(psllw, regs, regd)
#define	psllw(vars, vard)	mmx_m2m(psllw, vars, vard)


/*	1x64, 2x32, and 4x16 Parallel Shift Right Logical
*/
#define	psrlq_i2r(imm, reg)	mmx_i2r(psrlq, imm, reg)
#define	psrlq_m2r(var, reg)	mmx_m2r(psrlq, var, reg)
#define	psrlq_r2r(regs, regd)	mmx_r2r(psrlq, regs, regd)
#define	psrlq(vars, vard)	mmx_m2m(psrlq, vars, vard)

#define	psrld_i2r(imm, reg)	mmx_i2r(psrld, imm, reg)
#define	psrld_m2r(var, reg)	mmx_m2r(psrld, var, reg)
#define	psrld_r2r(regs, regd)	mmx_r2r(psrld, regs, regd)
#define	psrld(vars, vard)	mmx_m2m(psrld, vars, vard)

#define	psrlw_i2r(imm, reg)	mmx_i2r(psrlw, imm, reg)
#define	psrlw_m2r(var, reg)	mmx_m2r(psrlw, var, reg)
#define	psrlw_r2r(regs, regd)	mmx_r2r(psrlw, regs, regd)
#define	psrlw(vars, vard)	mmx_m2m(psrlw, vars, vard)


/*	2x32 and 4x16 Parallel Shift Right Arithmetic
*/
#define	psrad_i2r(imm, reg)	mmx_i2r(psrad, imm, reg)
#define	psrad_m2r(var, reg)	mmx_m2r(psrad, var, reg)
#define	psrad_r2r(regs, regd)	mmx_r2r(psrad, regs, regd)
#define	psrad(vars, vard)	mmx_m2m(psrad, vars, vard)

#define	psraw_i2r(imm, reg)	mmx_i2r(psraw, imm, reg)
#define	psraw_m2r(var, reg)	mmx_m2r(psraw, var, reg)
#define	psraw_r2r(regs, regd)	mmx_r2r(psraw, regs, regd)
#define	psraw(vars, vard)	mmx_m2m(psraw, vars, vard)


/*	2x32->4x16 and 4x16->8x8 PACK and Signed Saturate
	(packs source and dest fields into dest in that order)
*/
#define	packssdw_m2r(var, reg)	mmx_m2r(packssdw, var, reg)
#define	packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd)
#define	packssdw(vars, vard)	mmx_m2m(packssdw, vars, vard)

#define	packsswb_m2r(var, reg)	mmx_m2r(packsswb, var, reg)
#define	packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd)
#define	packsswb(vars, vard)	mmx_m2m(packsswb, vars, vard)


/*	4x16->8x8 PACK and Unsigned Saturate
	(packs source and dest fields into dest in that order)
*/
#define	packuswb_m2r(var, reg)	mmx_m2r(packuswb, var, reg)
#define	packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd)
#define	packuswb(vars, vard)	mmx_m2m(packuswb, vars, vard)


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
	(interleaves low half of dest with low half of source
	 as padding in each result field)
*/
#define	punpckldq_m2r(var, reg)	mmx_m2r(punpckldq, var, reg)
#define	punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd)
#define	punpckldq(vars, vard)	mmx_m2m(punpckldq, vars, vard)

#define	punpcklwd_m2r(var, reg)	mmx_m2r(punpcklwd, var, reg)
#define	punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd)
#define	punpcklwd(vars, vard)	mmx_m2m(punpcklwd, vars, vard)

#define	punpcklbw_m2r(var, reg)	mmx_m2r(punpcklbw, var, reg)
#define	punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd)
#define	punpcklbw(vars, vard)	mmx_m2m(punpcklbw, vars, vard)
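
/*	A common pattern (illustrative only; src is a hypothetical 8-byte
	buffer): expanding unsigned bytes to 16-bit words by interleaving
	with a zeroed register, e.g.

		pxor_r2r(mm7, mm7);
		movq_m2r(*(mmx_t *) src, mm0);
		punpcklbw_r2r(mm7, mm0);

	after which the four low bytes of mm0 sit zero-extended in its four
	16-bit words, since they are interleaved with the zero bytes of mm7.
*/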


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
	(interleaves high half of dest with high half of source
	 as padding in each result field)
*/
#define	punpckhdq_m2r(var, reg)	mmx_m2r(punpckhdq, var, reg)
#define	punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd)
#define	punpckhdq(vars, vard)	mmx_m2m(punpckhdq, vars, vard)

#define	punpckhwd_m2r(var, reg)	mmx_m2r(punpckhwd, var, reg)
#define	punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd)
#define	punpckhwd(vars, vard)	mmx_m2m(punpckhwd, vars, vard)

#define	punpckhbw_m2r(var, reg)	mmx_m2r(punpckhbw, var, reg)
#define	punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd)
#define	punpckhbw(vars, vard)	mmx_m2m(punpckhbw, vars, vard)


/*	Empty MMx State
	(used to clean-up when going from mmx to float use
	 of the registers that are shared by both; note that
	 there is no float-to-mmx operation needed, because
	 only the float tag word info is corruptible)
*/
#ifdef	MMX_TRACE

#define	emms() \
	{ \
		printf("emms()\n"); \
		__asm__ __volatile__ ("emms"); \
	}

#else

#define	emms()			__asm__ __volatile__ ("emms")

#endif
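
/*	Illustrative only: an MMX sequence built from these macros should end
	with emms() before any floating-point code runs again, e.g.

		paddusb_r2r(mm1, mm0);
		movq_r2m(mm0, *(mmx_t *) dst);
		emms();

	where dst is a hypothetical destination buffer.
*/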

#endif