src/video/mmx.h
author Sam Lantinga <slouken@libsdl.org>
Sun, 30 Jan 2011 13:38:57 -0800
branchSDL-1.2
changeset 5127 32f0f603a0c8
parent 1330 450721ad5436
child 5872 819e85555d4d
permissions -rw-r--r--
Fixed bug #1111

kwm@rainbow-runner.nl 2011-01-30 06:28:27 PST

When building sdl 1.2.14 with the Clang compiler http://clang.llvm.org .
The build fails in src/video/mmx.h with the following error:

--------------------------------------------------
./src/video/SDL_RLEaccel.c:831:5: error: invalid operand for instruction
CHOOSE_BLIT(RLECLIPBLIT, alpha, fmt);
^
./src/video/SDL_RLEaccel.c:831:17: note: instantiated from:
CHOOSE_BLIT(RLECLIPBLIT, alpha, fmt);
^
./src/video/SDL_RLEaccel.c:831:5: note: instantiated from:
CHOOSE_BLIT(RLECLIPBLIT, alpha, fmt);
^
./src/video/SDL_RLEaccel.c:647:23: note: instantiated from:
blitter(2, Uint8, ALPHA_BLIT16_565MMX); \
^
./src/video/SDL_RLEaccel.c:282:4: note: instantiated from:
movq_r2m(mm3, *dstp); \
^
In file included from ./src/video/SDL_RLEaccel.c:99:
./src/video/mmx.h:379:28: note: instantiated from:
#define movq_r2m(reg, var) mmx_r2m(movq, reg, var)
^
<scratch space>:192:1: note: instantiated from:
"movq"
^
<inline asm>:1:2: note: instantiated into assembly here
movq %mm3, %dx
^
--------------------------------------------------

According to the clang developers this is invalid inline assembly.
Using the attached patch from the last commit in the below bug report fixes the
compile.
More details from: http://llvm.org/bugs/show_bug.cgi?id=6730
slouken@689
     1
/*	mmx.h
slouken@689
     2
slouken@689
     3
	MultiMedia eXtensions GCC interface library for IA32.
slouken@689
     4
slouken@689
     5
	To use this library, simply include this header file
slouken@689
     6
	and compile with GCC.  You MUST have inlining enabled
slouken@689
     7
	in order for mmx_ok() to work; this can be done by
slouken@689
     8
	simply using -O on the GCC command line.
slouken@689
     9
slouken@689
    10
	Compiling with -DMMX_TRACE will cause detailed trace
slouken@689
    11
	output to be sent to stderr for each mmx operation.
slouken@689
    12
	This adds lots of code, and obviously slows execution to
slouken@689
    13
	a crawl, but can be very useful for debugging.
slouken@689
    14
slouken@689
    15
	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
slouken@689
    16
	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
slouken@689
    17
	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
slouken@689
    18
	AND FITNESS FOR ANY PARTICULAR PURPOSE.
slouken@689
    19
slouken@689
    20
	1997-99 by H. Dietz and R. Fisher
slouken@689
    21
slouken@689
    22
 Notes:
slouken@689
    23
	It appears that the latest gas has the pand problem fixed, therefore
slouken@689
    24
	  I'll undefine BROKEN_PAND by default.
slouken@689
    25
*/
slouken@689
    26
slouken@689
    27
#ifndef _MMX_H
slouken@689
    28
#define _MMX_H
slouken@689
    29
slouken@689
    30
slouken@689
    31
/*	Warning:  at this writing, the version of GAS packaged
slouken@689
    32
	with most Linux distributions does not handle the
slouken@689
    33
	parallel AND operation mnemonic correctly.  If the
slouken@689
    34
	symbol BROKEN_PAND is defined, a slower alternative
slouken@689
    35
	coding will be used.  If execution of mmxtest results
slouken@689
    36
	in an illegal instruction fault, define this symbol.
slouken@689
    37
*/
slouken@689
    38
#undef	BROKEN_PAND
slouken@689
    39
slouken@689
    40
slouken@689
    41
/*	The type of an value that fits in an MMX register
slouken@689
    42
	(note that long long constant values MUST be suffixed
slouken@689
    43
	 by LL and unsigned long long values by ULL, lest
slouken@689
    44
	 they be truncated by the compiler)
slouken@689
    45
*/
slouken@689
    46
typedef	union {
slouken@689
    47
	long long		q;	/* Quadword (64-bit) value */
slouken@689
    48
	unsigned long long	uq;	/* Unsigned Quadword */
slouken@689
    49
	int			d[2];	/* 2 Doubleword (32-bit) values */
slouken@689
    50
	unsigned int		ud[2];	/* 2 Unsigned Doubleword */
slouken@689
    51
	short			w[4];	/* 4 Word (16-bit) values */
slouken@689
    52
	unsigned short		uw[4];	/* 4 Unsigned Word */
slouken@689
    53
	char			b[8];	/* 8 Byte (8-bit) values */
slouken@689
    54
	unsigned char		ub[8];	/* 8 Unsigned Byte */
slouken@689
    55
	float			s[2];	/* Single-precision (32-bit) value */
slouken@689
    56
} __attribute__ ((aligned (8))) mmx_t;	/* On an 8-byte (64-bit) boundary */
slouken@689
    57
slouken@689
    58
slouken@689
    59
#if 0
slouken@689
    60
/*	Function to test if multimedia instructions are supported...
slouken@689
    61
*/
slouken@689
    62
inline extern int
slouken@689
    63
mm_support(void)
slouken@689
    64
{
slouken@689
    65
	/* Returns 1 if MMX instructions are supported,
slouken@689
    66
	   3 if Cyrix MMX and Extended MMX instructions are supported
slouken@689
    67
	   5 if AMD MMX and 3DNow! instructions are supported
slouken@689
    68
	   0 if hardware does not support any of these
slouken@689
    69
	*/
slouken@689
    70
	register int rval = 0;
slouken@689
    71
slouken@689
    72
	__asm__ __volatile__ (
slouken@689
    73
		/* See if CPUID instruction is supported ... */
slouken@689
    74
		/* ... Get copies of EFLAGS into eax and ecx */
slouken@689
    75
		"pushf\n\t"
slouken@689
    76
		"popl %%eax\n\t"
slouken@689
    77
		"movl %%eax, %%ecx\n\t"
slouken@689
    78
slouken@689
    79
		/* ... Toggle the ID bit in one copy and store */
slouken@689
    80
		/*     to the EFLAGS reg */
slouken@689
    81
		"xorl $0x200000, %%eax\n\t"
slouken@689
    82
		"push %%eax\n\t"
slouken@689
    83
		"popf\n\t"
slouken@689
    84
slouken@689
    85
		/* ... Get the (hopefully modified) EFLAGS */
slouken@689
    86
		"pushf\n\t"
slouken@689
    87
		"popl %%eax\n\t"
slouken@689
    88
slouken@689
    89
		/* ... Compare and test result */
slouken@689
    90
		"xorl %%eax, %%ecx\n\t"
slouken@689
    91
		"testl $0x200000, %%ecx\n\t"
slouken@689
    92
		"jz NotSupported1\n\t"		/* CPUID not supported */
slouken@689
    93
slouken@689
    94
slouken@689
    95
		/* Get standard CPUID information, and
slouken@689
    96
		       go to a specific vendor section */
slouken@689
    97
		"movl $0, %%eax\n\t"
slouken@689
    98
		"cpuid\n\t"
slouken@689
    99
slouken@689
   100
		/* Check for Intel */
slouken@689
   101
		"cmpl $0x756e6547, %%ebx\n\t"
slouken@689
   102
		"jne TryAMD\n\t"
slouken@689
   103
		"cmpl $0x49656e69, %%edx\n\t"
slouken@689
   104
		"jne TryAMD\n\t"
slouken@689
   105
		"cmpl $0x6c65746e, %%ecx\n"
slouken@689
   106
		"jne TryAMD\n\t"
slouken@689
   107
		"jmp Intel\n\t"
slouken@689
   108
slouken@689
   109
		/* Check for AMD */
slouken@689
   110
		"\nTryAMD:\n\t"
slouken@689
   111
		"cmpl $0x68747541, %%ebx\n\t"
slouken@689
   112
		"jne TryCyrix\n\t"
slouken@689
   113
		"cmpl $0x69746e65, %%edx\n\t"
slouken@689
   114
		"jne TryCyrix\n\t"
slouken@689
   115
		"cmpl $0x444d4163, %%ecx\n"
slouken@689
   116
		"jne TryCyrix\n\t"
slouken@689
   117
		"jmp AMD\n\t"
slouken@689
   118
slouken@689
   119
		/* Check for Cyrix */
slouken@689
   120
		"\nTryCyrix:\n\t"
slouken@689
   121
		"cmpl $0x69727943, %%ebx\n\t"
slouken@689
   122
		"jne NotSupported2\n\t"
slouken@689
   123
		"cmpl $0x736e4978, %%edx\n\t"
slouken@689
   124
		"jne NotSupported3\n\t"
slouken@689
   125
		"cmpl $0x64616574, %%ecx\n\t"
slouken@689
   126
		"jne NotSupported4\n\t"
slouken@689
   127
		/* Drop through to Cyrix... */
slouken@689
   128
slouken@689
   129
slouken@689
   130
		/* Cyrix Section */
slouken@689
   131
		/* See if extended CPUID level 80000001 is supported */
slouken@689
   132
		/* The value of CPUID/80000001 for the 6x86MX is undefined
slouken@689
   133
		   according to the Cyrix CPU Detection Guide (Preliminary
slouken@689
   134
		   Rev. 1.01 table 1), so we'll check the value of eax for
slouken@689
   135
		   CPUID/0 to see if standard CPUID level 2 is supported.
slouken@689
   136
		   According to the table, the only CPU which supports level
slouken@689
   137
		   2 is also the only one which supports extended CPUID levels.
slouken@689
   138
		*/
slouken@689
   139
		"cmpl $0x2, %%eax\n\t"
slouken@689
   140
		"jne MMXtest\n\t"	/* Use standard CPUID instead */
slouken@689
   141
slouken@689
   142
		/* Extended CPUID supported (in theory), so get extended
slouken@689
   143
		   features */
slouken@689
   144
		"movl $0x80000001, %%eax\n\t"
slouken@689
   145
		"cpuid\n\t"
slouken@689
   146
		"testl $0x00800000, %%eax\n\t"	/* Test for MMX */
slouken@689
   147
		"jz NotSupported5\n\t"		/* MMX not supported */
slouken@689
   148
		"testl $0x01000000, %%eax\n\t"	/* Test for Ext'd MMX */
slouken@689
   149
		"jnz EMMXSupported\n\t"
slouken@689
   150
		"movl $1, %0:\n\n\t"		/* MMX Supported */
slouken@689
   151
		"jmp Return\n\n"
slouken@689
   152
		"EMMXSupported:\n\t"
slouken@689
   153
		"movl $3, %0:\n\n\t"		/* EMMX and MMX Supported */
slouken@689
   154
		"jmp Return\n\t"
slouken@689
   155
slouken@689
   156
slouken@689
   157
		/* AMD Section */
slouken@689
   158
		"AMD:\n\t"
slouken@689
   159
slouken@689
   160
		/* See if extended CPUID is supported */
slouken@689
   161
		"movl $0x80000000, %%eax\n\t"
slouken@689
   162
		"cpuid\n\t"
slouken@689
   163
		"cmpl $0x80000000, %%eax\n\t"
slouken@689
   164
		"jl MMXtest\n\t"	/* Use standard CPUID instead */
slouken@689
   165
slouken@689
   166
		/* Extended CPUID supported, so get extended features */
slouken@689
   167
		"movl $0x80000001, %%eax\n\t"
slouken@689
   168
		"cpuid\n\t"
slouken@689
   169
		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
slouken@689
   170
		"jz NotSupported6\n\t"		/* MMX not supported */
slouken@689
   171
		"testl $0x80000000, %%edx\n\t"	/* Test for 3DNow! */
slouken@689
   172
		"jnz ThreeDNowSupported\n\t"
slouken@689
   173
		"movl $1, %0:\n\n\t"		/* MMX Supported */
slouken@689
   174
		"jmp Return\n\n"
slouken@689
   175
		"ThreeDNowSupported:\n\t"
slouken@689
   176
		"movl $5, %0:\n\n\t"		/* 3DNow! and MMX Supported */
slouken@689
   177
		"jmp Return\n\t"
slouken@689
   178
slouken@689
   179
slouken@689
   180
		/* Intel Section */
slouken@689
   181
		"Intel:\n\t"
slouken@689
   182
slouken@689
   183
		/* Check for MMX */
slouken@689
   184
		"MMXtest:\n\t"
slouken@689
   185
		"movl $1, %%eax\n\t"
slouken@689
   186
		"cpuid\n\t"
slouken@689
   187
		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
slouken@689
   188
		"jz NotSupported7\n\t"		/* MMX Not supported */
slouken@689
   189
		"movl $1, %0:\n\n\t"		/* MMX Supported */
slouken@689
   190
		"jmp Return\n\t"
slouken@689
   191
slouken@689
   192
		/* Nothing supported */
slouken@689
   193
		"\nNotSupported1:\n\t"
slouken@689
   194
		"#movl $101, %0:\n\n\t"
slouken@689
   195
		"\nNotSupported2:\n\t"
slouken@689
   196
		"#movl $102, %0:\n\n\t"
slouken@689
   197
		"\nNotSupported3:\n\t"
slouken@689
   198
		"#movl $103, %0:\n\n\t"
slouken@689
   199
		"\nNotSupported4:\n\t"
slouken@689
   200
		"#movl $104, %0:\n\n\t"
slouken@689
   201
		"\nNotSupported5:\n\t"
slouken@689
   202
		"#movl $105, %0:\n\n\t"
slouken@689
   203
		"\nNotSupported6:\n\t"
slouken@689
   204
		"#movl $106, %0:\n\n\t"
slouken@689
   205
		"\nNotSupported7:\n\t"
slouken@689
   206
		"#movl $107, %0:\n\n\t"
slouken@689
   207
		"movl $0, %0:\n\n\t"
slouken@689
   208
slouken@689
   209
		"Return:\n\t"
slouken@689
   210
		: "=a" (rval)
slouken@689
   211
		: /* no input */
slouken@689
   212
		: "eax", "ebx", "ecx", "edx"
slouken@689
   213
	);
slouken@689
   214
slouken@689
   215
	/* Return */
slouken@689
   216
	return(rval);
slouken@689
   217
}
slouken@689
   218
slouken@689
   219
/*	Function to test if mmx instructions are supported...
slouken@689
   220
*/
slouken@689
   221
inline extern int
slouken@689
   222
mmx_ok(void)
slouken@689
   223
{
slouken@689
   224
	/* Returns 1 if MMX instructions are supported, 0 otherwise */
slouken@689
   225
	return ( mm_support() & 0x1 );
slouken@689
   226
}
slouken@689
   227
#endif
slouken@689
   228
slouken@689
   229
/*	Helper functions for the instruction macros that follow...
slouken@689
   230
	(note that memory-to-register, m2r, instructions are nearly
slouken@689
   231
	 as efficient as register-to-register, r2r, instructions;
slouken@689
   232
	 however, memory-to-memory instructions are really simulated
slouken@689
   233
	 as a convenience, and are only 1/3 as efficient)
slouken@689
   234
*/
slouken@689
   235
#ifdef	MMX_TRACE
slouken@689
   236
slouken@689
   237
/*	Include the stuff for printing a trace to stderr...
slouken@689
   238
*/
slouken@689
   239
slouken@689
   240
#define	mmx_i2r(op, imm, reg) \
slouken@689
   241
	{ \
slouken@689
   242
		mmx_t mmx_trace; \
slouken@689
   243
		mmx_trace.uq = (imm); \
slouken@689
   244
		printf(#op "_i2r(" #imm "=0x%08x%08x, ", \
slouken@689
   245
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   246
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
slouken@689
   247
				      : "=X" (mmx_trace) \
slouken@689
   248
				      : /* nothing */ ); \
slouken@689
   249
		printf(#reg "=0x%08x%08x) => ", \
slouken@689
   250
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   251
		__asm__ __volatile__ (#op " %0, %%" #reg \
slouken@689
   252
				      : /* nothing */ \
slouken@689
   253
				      : "X" (imm)); \
slouken@689
   254
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
slouken@689
   255
				      : "=X" (mmx_trace) \
slouken@689
   256
				      : /* nothing */ ); \
slouken@689
   257
		printf(#reg "=0x%08x%08x\n", \
slouken@689
   258
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   259
	}
slouken@689
   260
slouken@689
   261
#define	mmx_m2r(op, mem, reg) \
slouken@689
   262
	{ \
slouken@689
   263
		mmx_t mmx_trace; \
slouken@689
   264
		mmx_trace = (mem); \
slouken@689
   265
		printf(#op "_m2r(" #mem "=0x%08x%08x, ", \
slouken@689
   266
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   267
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
slouken@689
   268
				      : "=X" (mmx_trace) \
slouken@689
   269
				      : /* nothing */ ); \
slouken@689
   270
		printf(#reg "=0x%08x%08x) => ", \
slouken@689
   271
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   272
		__asm__ __volatile__ (#op " %0, %%" #reg \
slouken@689
   273
				      : /* nothing */ \
slouken@689
   274
				      : "X" (mem)); \
slouken@689
   275
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
slouken@689
   276
				      : "=X" (mmx_trace) \
slouken@689
   277
				      : /* nothing */ ); \
slouken@689
   278
		printf(#reg "=0x%08x%08x\n", \
slouken@689
   279
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   280
	}
slouken@689
   281
slouken@689
   282
#define	mmx_r2m(op, reg, mem) \
slouken@689
   283
	{ \
slouken@689
   284
		mmx_t mmx_trace; \
slouken@689
   285
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
slouken@689
   286
				      : "=X" (mmx_trace) \
slouken@689
   287
				      : /* nothing */ ); \
slouken@689
   288
		printf(#op "_r2m(" #reg "=0x%08x%08x, ", \
slouken@689
   289
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   290
		mmx_trace = (mem); \
slouken@689
   291
		printf(#mem "=0x%08x%08x) => ", \
slouken@689
   292
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   293
		__asm__ __volatile__ (#op " %%" #reg ", %0" \
slouken@689
   294
				      : "=X" (mem) \
slouken@689
   295
				      : /* nothing */ ); \
slouken@689
   296
		mmx_trace = (mem); \
slouken@689
   297
		printf(#mem "=0x%08x%08x\n", \
slouken@689
   298
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   299
	}
slouken@689
   300
slouken@689
   301
#define	mmx_r2r(op, regs, regd) \
slouken@689
   302
	{ \
slouken@689
   303
		mmx_t mmx_trace; \
slouken@689
   304
		__asm__ __volatile__ ("movq %%" #regs ", %0" \
slouken@689
   305
				      : "=X" (mmx_trace) \
slouken@689
   306
				      : /* nothing */ ); \
slouken@689
   307
		printf(#op "_r2r(" #regs "=0x%08x%08x, ", \
slouken@689
   308
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   309
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
slouken@689
   310
				      : "=X" (mmx_trace) \
slouken@689
   311
				      : /* nothing */ ); \
slouken@689
   312
		printf(#regd "=0x%08x%08x) => ", \
slouken@689
   313
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   314
		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
slouken@689
   315
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
slouken@689
   316
				      : "=X" (mmx_trace) \
slouken@689
   317
				      : /* nothing */ ); \
slouken@689
   318
		printf(#regd "=0x%08x%08x\n", \
slouken@689
   319
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   320
	}
slouken@689
   321
slouken@689
   322
#define	mmx_m2m(op, mems, memd) \
slouken@689
   323
	{ \
slouken@689
   324
		mmx_t mmx_trace; \
slouken@689
   325
		mmx_trace = (mems); \
slouken@689
   326
		printf(#op "_m2m(" #mems "=0x%08x%08x, ", \
slouken@689
   327
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   328
		mmx_trace = (memd); \
slouken@689
   329
		printf(#memd "=0x%08x%08x) => ", \
slouken@689
   330
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   331
		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
slouken@689
   332
				      #op " %1, %%mm0\n\t" \
slouken@689
   333
				      "movq %%mm0, %0" \
slouken@689
   334
				      : "=X" (memd) \
slouken@689
   335
				      : "X" (mems)); \
slouken@689
   336
		mmx_trace = (memd); \
slouken@689
   337
		printf(#memd "=0x%08x%08x\n", \
slouken@689
   338
			mmx_trace.d[1], mmx_trace.d[0]); \
slouken@689
   339
	}
slouken@689
   340
slouken@689
   341
#else
slouken@689
   342
slouken@689
   343
/*	These macros are a lot simpler without the tracing...
slouken@689
   344
*/
slouken@689
   345
slouken@689
   346
#define	mmx_i2r(op, imm, reg) \
slouken@689
   347
	__asm__ __volatile__ (#op " %0, %%" #reg \
slouken@689
   348
			      : /* nothing */ \
slouken@689
   349
			      : "X" (imm) )
slouken@689
   350
slouken@689
   351
#define	mmx_m2r(op, mem, reg) \
slouken@689
   352
	__asm__ __volatile__ (#op " %0, %%" #reg \
slouken@689
   353
			      : /* nothing */ \
slouken@738
   354
			      : "m" (mem))
slouken@689
   355
slouken@689
   356
#define	mmx_r2m(op, reg, mem) \
slouken@689
   357
	__asm__ __volatile__ (#op " %%" #reg ", %0" \
slouken@5127
   358
			      : "=m" (mem) \
slouken@689
   359
			      : /* nothing */ )
slouken@689
   360
slouken@689
   361
#define	mmx_r2r(op, regs, regd) \
slouken@689
   362
	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
slouken@689
   363
slouken@689
   364
#define	mmx_m2m(op, mems, memd) \
slouken@689
   365
	__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
slouken@689
   366
			      #op " %1, %%mm0\n\t" \
slouken@689
   367
			      "movq %%mm0, %0" \
slouken@689
   368
			      : "=X" (memd) \
slouken@689
   369
			      : "X" (mems))
slouken@689
   370
slouken@689
   371
#endif
slouken@689
   372
slouken@689
   373
slouken@689
   374
/*	1x64 MOVe Quadword
slouken@689
   375
	(this is both a load and a store...
slouken@689
   376
	 in fact, it is the only way to store)
slouken@689
   377
*/
slouken@689
   378
#define	movq_m2r(var, reg)	mmx_m2r(movq, var, reg)
slouken@689
   379
#define	movq_r2m(reg, var)	mmx_r2m(movq, reg, var)
slouken@689
   380
#define	movq_r2r(regs, regd)	mmx_r2r(movq, regs, regd)
slouken@689
   381
#define	movq(vars, vard) \
slouken@689
   382
	__asm__ __volatile__ ("movq %1, %%mm0\n\t" \
slouken@689
   383
			      "movq %%mm0, %0" \
slouken@689
   384
			      : "=X" (vard) \
slouken@689
   385
			      : "X" (vars))
slouken@689
   386
slouken@689
   387
slouken@689
   388
/*	1x32 MOVe Doubleword
slouken@689
   389
	(like movq, this is both load and store...
slouken@689
   390
	 but is most useful for moving things between
slouken@689
   391
	 mmx registers and ordinary registers)
slouken@689
   392
*/
slouken@689
   393
#define	movd_m2r(var, reg)	mmx_m2r(movd, var, reg)
slouken@689
   394
#define	movd_r2m(reg, var)	mmx_r2m(movd, reg, var)
slouken@689
   395
#define	movd_r2r(regs, regd)	mmx_r2r(movd, regs, regd)
slouken@689
   396
#define	movd(vars, vard) \
slouken@689
   397
	__asm__ __volatile__ ("movd %1, %%mm0\n\t" \
slouken@689
   398
			      "movd %%mm0, %0" \
slouken@689
   399
			      : "=X" (vard) \
slouken@689
   400
			      : "X" (vars))
slouken@689
   401
slouken@689
   402
slouken@689
   403
/*	2x32, 4x16, and 8x8 Parallel ADDs
slouken@689
   404
*/
slouken@689
   405
#define	paddd_m2r(var, reg)	mmx_m2r(paddd, var, reg)
slouken@689
   406
#define	paddd_r2r(regs, regd)	mmx_r2r(paddd, regs, regd)
slouken@689
   407
#define	paddd(vars, vard)	mmx_m2m(paddd, vars, vard)
slouken@689
   408
slouken@689
   409
#define	paddw_m2r(var, reg)	mmx_m2r(paddw, var, reg)
slouken@689
   410
#define	paddw_r2r(regs, regd)	mmx_r2r(paddw, regs, regd)
slouken@689
   411
#define	paddw(vars, vard)	mmx_m2m(paddw, vars, vard)
slouken@689
   412
slouken@689
   413
#define	paddb_m2r(var, reg)	mmx_m2r(paddb, var, reg)
slouken@689
   414
#define	paddb_r2r(regs, regd)	mmx_r2r(paddb, regs, regd)
slouken@689
   415
#define	paddb(vars, vard)	mmx_m2m(paddb, vars, vard)
slouken@689
   416
slouken@689
   417
slouken@689
   418
/*	4x16 and 8x8 Parallel ADDs using Saturation arithmetic
slouken@689
   419
*/
slouken@689
   420
#define	paddsw_m2r(var, reg)	mmx_m2r(paddsw, var, reg)
slouken@689
   421
#define	paddsw_r2r(regs, regd)	mmx_r2r(paddsw, regs, regd)
slouken@689
   422
#define	paddsw(vars, vard)	mmx_m2m(paddsw, vars, vard)
slouken@689
   423
slouken@689
   424
#define	paddsb_m2r(var, reg)	mmx_m2r(paddsb, var, reg)
slouken@689
   425
#define	paddsb_r2r(regs, regd)	mmx_r2r(paddsb, regs, regd)
slouken@689
   426
#define	paddsb(vars, vard)	mmx_m2m(paddsb, vars, vard)
slouken@689
   427
slouken@689
   428
slouken@689
   429
/*	4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic
slouken@689
   430
*/
slouken@689
   431
#define	paddusw_m2r(var, reg)	mmx_m2r(paddusw, var, reg)
slouken@689
   432
#define	paddusw_r2r(regs, regd)	mmx_r2r(paddusw, regs, regd)
slouken@689
   433
#define	paddusw(vars, vard)	mmx_m2m(paddusw, vars, vard)
slouken@689
   434
slouken@689
   435
#define	paddusb_m2r(var, reg)	mmx_m2r(paddusb, var, reg)
slouken@689
   436
#define	paddusb_r2r(regs, regd)	mmx_r2r(paddusb, regs, regd)
slouken@689
   437
#define	paddusb(vars, vard)	mmx_m2m(paddusb, vars, vard)
slouken@689
   438
slouken@689
   439
slouken@689
   440
/*	2x32, 4x16, and 8x8 Parallel SUBs
slouken@689
   441
*/
slouken@689
   442
#define	psubd_m2r(var, reg)	mmx_m2r(psubd, var, reg)
slouken@689
   443
#define	psubd_r2r(regs, regd)	mmx_r2r(psubd, regs, regd)
slouken@689
   444
#define	psubd(vars, vard)	mmx_m2m(psubd, vars, vard)
slouken@689
   445
slouken@689
   446
#define	psubw_m2r(var, reg)	mmx_m2r(psubw, var, reg)
slouken@689
   447
#define	psubw_r2r(regs, regd)	mmx_r2r(psubw, regs, regd)
slouken@689
   448
#define	psubw(vars, vard)	mmx_m2m(psubw, vars, vard)
slouken@689
   449
slouken@689
   450
#define	psubb_m2r(var, reg)	mmx_m2r(psubb, var, reg)
slouken@689
   451
#define	psubb_r2r(regs, regd)	mmx_r2r(psubb, regs, regd)
slouken@689
   452
#define	psubb(vars, vard)	mmx_m2m(psubb, vars, vard)
slouken@689
   453
slouken@689
   454
slouken@689
   455
/*	4x16 and 8x8 Parallel SUBs using Saturation arithmetic
slouken@689
   456
*/
slouken@689
   457
#define	psubsw_m2r(var, reg)	mmx_m2r(psubsw, var, reg)
slouken@689
   458
#define	psubsw_r2r(regs, regd)	mmx_r2r(psubsw, regs, regd)
slouken@689
   459
#define	psubsw(vars, vard)	mmx_m2m(psubsw, vars, vard)
slouken@689
   460
slouken@689
   461
#define	psubsb_m2r(var, reg)	mmx_m2r(psubsb, var, reg)
slouken@689
   462
#define	psubsb_r2r(regs, regd)	mmx_r2r(psubsb, regs, regd)
slouken@689
   463
#define	psubsb(vars, vard)	mmx_m2m(psubsb, vars, vard)
slouken@689
   464
slouken@689
   465
slouken@689
   466
/*	4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic
slouken@689
   467
*/
slouken@689
   468
#define	psubusw_m2r(var, reg)	mmx_m2r(psubusw, var, reg)
slouken@689
   469
#define	psubusw_r2r(regs, regd)	mmx_r2r(psubusw, regs, regd)
slouken@689
   470
#define	psubusw(vars, vard)	mmx_m2m(psubusw, vars, vard)
slouken@689
   471
slouken@689
   472
#define	psubusb_m2r(var, reg)	mmx_m2r(psubusb, var, reg)
slouken@689
   473
#define	psubusb_r2r(regs, regd)	mmx_r2r(psubusb, regs, regd)
slouken@689
   474
#define	psubusb(vars, vard)	mmx_m2m(psubusb, vars, vard)
slouken@689
   475
slouken@689
   476
slouken@689
   477
/*	4x16 Parallel MULs giving Low 4x16 portions of results
slouken@689
   478
*/
slouken@689
   479
#define	pmullw_m2r(var, reg)	mmx_m2r(pmullw, var, reg)
slouken@689
   480
#define	pmullw_r2r(regs, regd)	mmx_r2r(pmullw, regs, regd)
slouken@689
   481
#define	pmullw(vars, vard)	mmx_m2m(pmullw, vars, vard)
slouken@689
   482
slouken@689
   483
slouken@689
   484
/*	4x16 Parallel MULs giving High 4x16 portions of results
slouken@689
   485
*/
slouken@689
   486
#define	pmulhw_m2r(var, reg)	mmx_m2r(pmulhw, var, reg)
slouken@689
   487
#define	pmulhw_r2r(regs, regd)	mmx_r2r(pmulhw, regs, regd)
slouken@689
   488
#define	pmulhw(vars, vard)	mmx_m2m(pmulhw, vars, vard)
slouken@689
   489
slouken@689
   490
slouken@689
   491
/*	4x16->2x32 Parallel Mul-ADD
slouken@689
   492
	(muls like pmullw, then adds adjacent 16-bit fields
slouken@689
   493
	 in the multiply result to make the final 2x32 result)
slouken@689
   494
*/
slouken@689
   495
#define	pmaddwd_m2r(var, reg)	mmx_m2r(pmaddwd, var, reg)
slouken@689
   496
#define	pmaddwd_r2r(regs, regd)	mmx_r2r(pmaddwd, regs, regd)
slouken@689
   497
#define	pmaddwd(vars, vard)	mmx_m2m(pmaddwd, vars, vard)
slouken@689
   498
slouken@689
   499
slouken@689
   500
/*	1x64 bitwise AND
slouken@689
   501
*/
slouken@689
   502
#ifdef	BROKEN_PAND
slouken@689
   503
#define	pand_m2r(var, reg) \
slouken@689
   504
	{ \
slouken@689
   505
		mmx_m2r(pandn, (mmx_t) -1LL, reg); \
slouken@689
   506
		mmx_m2r(pandn, var, reg); \
slouken@689
   507
	}
slouken@689
   508
#define	pand_r2r(regs, regd) \
slouken@689
   509
	{ \
slouken@689
   510
		mmx_m2r(pandn, (mmx_t) -1LL, regd); \
slouken@689
   511
		mmx_r2r(pandn, regs, regd) \
slouken@689
   512
	}
slouken@689
   513
#define	pand(vars, vard) \
slouken@689
   514
	{ \
slouken@689
   515
		movq_m2r(vard, mm0); \
slouken@689
   516
		mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
slouken@689
   517
		mmx_m2r(pandn, vars, mm0); \
slouken@689
   518
		movq_r2m(mm0, vard); \
slouken@689
   519
	}
slouken@689
   520
#else
slouken@689
   521
#define	pand_m2r(var, reg)	mmx_m2r(pand, var, reg)
slouken@689
   522
#define	pand_r2r(regs, regd)	mmx_r2r(pand, regs, regd)
slouken@689
   523
#define	pand(vars, vard)	mmx_m2m(pand, vars, vard)
slouken@689
   524
#endif
slouken@689
   525
slouken@689
   526
slouken@689
   527
/*	1x64 bitwise AND with Not the destination
slouken@689
   528
*/
slouken@689
   529
#define	pandn_m2r(var, reg)	mmx_m2r(pandn, var, reg)
slouken@689
   530
#define	pandn_r2r(regs, regd)	mmx_r2r(pandn, regs, regd)
slouken@689
   531
#define	pandn(vars, vard)	mmx_m2m(pandn, vars, vard)
slouken@689
   532
slouken@689
   533
slouken@689
   534
/*	1x64 bitwise OR
slouken@689
   535
*/
slouken@689
   536
#define	por_m2r(var, reg)	mmx_m2r(por, var, reg)
slouken@689
   537
#define	por_r2r(regs, regd)	mmx_r2r(por, regs, regd)
slouken@689
   538
#define	por(vars, vard)	mmx_m2m(por, vars, vard)
slouken@689
   539
slouken@689
   540
slouken@689
   541
/*	1x64 bitwise eXclusive OR
slouken@689
   542
*/
slouken@689
   543
#define	pxor_m2r(var, reg)	mmx_m2r(pxor, var, reg)
slouken@689
   544
#define	pxor_r2r(regs, regd)	mmx_r2r(pxor, regs, regd)
slouken@689
   545
#define	pxor(vars, vard)	mmx_m2m(pxor, vars, vard)
slouken@689
   546
slouken@689
   547
slouken@689
   548
/*	2x32, 4x16, and 8x8 Parallel CoMPare for EQuality
slouken@689
   549
	(resulting fields are either 0 or -1)
slouken@689
   550
*/
slouken@689
   551
#define	pcmpeqd_m2r(var, reg)	mmx_m2r(pcmpeqd, var, reg)
slouken@689
   552
#define	pcmpeqd_r2r(regs, regd)	mmx_r2r(pcmpeqd, regs, regd)
slouken@689
   553
#define	pcmpeqd(vars, vard)	mmx_m2m(pcmpeqd, vars, vard)
slouken@689
   554
slouken@689
   555
#define	pcmpeqw_m2r(var, reg)	mmx_m2r(pcmpeqw, var, reg)
slouken@689
   556
#define	pcmpeqw_r2r(regs, regd)	mmx_r2r(pcmpeqw, regs, regd)
slouken@689
   557
#define	pcmpeqw(vars, vard)	mmx_m2m(pcmpeqw, vars, vard)
slouken@689
   558
slouken@689
   559
#define	pcmpeqb_m2r(var, reg)	mmx_m2r(pcmpeqb, var, reg)
slouken@689
   560
#define	pcmpeqb_r2r(regs, regd)	mmx_r2r(pcmpeqb, regs, regd)
slouken@689
   561
#define	pcmpeqb(vars, vard)	mmx_m2m(pcmpeqb, vars, vard)
slouken@689
   562
slouken@689
   563
slouken@689
   564
/*	2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than
slouken@689
   565
	(resulting fields are either 0 or -1)
slouken@689
   566
*/
slouken@689
   567
#define	pcmpgtd_m2r(var, reg)	mmx_m2r(pcmpgtd, var, reg)
slouken@689
   568
#define	pcmpgtd_r2r(regs, regd)	mmx_r2r(pcmpgtd, regs, regd)
slouken@689
   569
#define	pcmpgtd(vars, vard)	mmx_m2m(pcmpgtd, vars, vard)
slouken@689
   570
slouken@689
   571
#define	pcmpgtw_m2r(var, reg)	mmx_m2r(pcmpgtw, var, reg)
slouken@689
   572
#define	pcmpgtw_r2r(regs, regd)	mmx_r2r(pcmpgtw, regs, regd)
slouken@689
   573
#define	pcmpgtw(vars, vard)	mmx_m2m(pcmpgtw, vars, vard)
slouken@689
   574
slouken@689
   575
#define	pcmpgtb_m2r(var, reg)	mmx_m2r(pcmpgtb, var, reg)
slouken@689
   576
#define	pcmpgtb_r2r(regs, regd)	mmx_r2r(pcmpgtb, regs, regd)
slouken@689
   577
#define	pcmpgtb(vars, vard)	mmx_m2m(pcmpgtb, vars, vard)
slouken@689
   578
slouken@689
   579
slouken@689
   580
/*	1x64, 2x32, and 4x16 Parallel Shift Left Logical
slouken@689
   581
*/
slouken@689
   582
#define	psllq_i2r(imm, reg)	mmx_i2r(psllq, imm, reg)
slouken@689
   583
#define	psllq_m2r(var, reg)	mmx_m2r(psllq, var, reg)
slouken@689
   584
#define	psllq_r2r(regs, regd)	mmx_r2r(psllq, regs, regd)
slouken@689
   585
#define	psllq(vars, vard)	mmx_m2m(psllq, vars, vard)
slouken@689
   586
slouken@689
   587
#define	pslld_i2r(imm, reg)	mmx_i2r(pslld, imm, reg)
slouken@689
   588
#define	pslld_m2r(var, reg)	mmx_m2r(pslld, var, reg)
slouken@689
   589
#define	pslld_r2r(regs, regd)	mmx_r2r(pslld, regs, regd)
slouken@689
   590
#define	pslld(vars, vard)	mmx_m2m(pslld, vars, vard)
slouken@689
   591
slouken@689
   592
#define	psllw_i2r(imm, reg)	mmx_i2r(psllw, imm, reg)
slouken@689
   593
#define	psllw_m2r(var, reg)	mmx_m2r(psllw, var, reg)
slouken@689
   594
#define	psllw_r2r(regs, regd)	mmx_r2r(psllw, regs, regd)
slouken@689
   595
#define	psllw(vars, vard)	mmx_m2m(psllw, vars, vard)
slouken@689
   596
slouken@689
   597
slouken@689
   598
/*	1x64, 2x32, and 4x16 Parallel Shift Right Logical
slouken@689
   599
*/
slouken@689
   600
#define	psrlq_i2r(imm, reg)	mmx_i2r(psrlq, imm, reg)
slouken@689
   601
#define	psrlq_m2r(var, reg)	mmx_m2r(psrlq, var, reg)
slouken@689
   602
#define	psrlq_r2r(regs, regd)	mmx_r2r(psrlq, regs, regd)
slouken@689
   603
#define	psrlq(vars, vard)	mmx_m2m(psrlq, vars, vard)
slouken@689
   604
slouken@689
   605
#define	psrld_i2r(imm, reg)	mmx_i2r(psrld, imm, reg)
slouken@689
   606
#define	psrld_m2r(var, reg)	mmx_m2r(psrld, var, reg)
slouken@689
   607
#define	psrld_r2r(regs, regd)	mmx_r2r(psrld, regs, regd)
slouken@689
   608
#define	psrld(vars, vard)	mmx_m2m(psrld, vars, vard)
slouken@689
   609
slouken@689
   610
#define	psrlw_i2r(imm, reg)	mmx_i2r(psrlw, imm, reg)
slouken@689
   611
#define	psrlw_m2r(var, reg)	mmx_m2r(psrlw, var, reg)
slouken@689
   612
#define	psrlw_r2r(regs, regd)	mmx_r2r(psrlw, regs, regd)
slouken@689
   613
#define	psrlw(vars, vard)	mmx_m2m(psrlw, vars, vard)
slouken@689
   614
slouken@689
   615
slouken@689
   616
/*	2x32 and 4x16 Parallel Shift Right Arithmetic
	(sign-extending; no 64-bit arithmetic shift exists in MMX)
*/
#define	psrad_i2r(imm, reg)	mmx_i2r(psrad, imm, reg)
#define	psrad_m2r(var, reg)	mmx_m2r(psrad, var, reg)
#define	psrad_r2r(regs, regd)	mmx_r2r(psrad, regs, regd)
#define	psrad(vars, vard)	mmx_m2m(psrad, vars, vard)

#define	psraw_i2r(imm, reg)	mmx_i2r(psraw, imm, reg)
#define	psraw_m2r(var, reg)	mmx_m2r(psraw, var, reg)
#define	psraw_r2r(regs, regd)	mmx_r2r(psraw, regs, regd)
#define	psraw(vars, vard)	mmx_m2m(psraw, vars, vard)

/*	2x32->4x16 and 4x16->8x8 PACK and Signed Saturate
	(packs source and dest fields into dest in that order)
*/
#define	packssdw_m2r(var, reg)	mmx_m2r(packssdw, var, reg)
#define	packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd)
#define	packssdw(vars, vard)	mmx_m2m(packssdw, vars, vard)

#define	packsswb_m2r(var, reg)	mmx_m2r(packsswb, var, reg)
#define	packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd)
#define	packsswb(vars, vard)	mmx_m2m(packsswb, vars, vard)

/*	4x16->8x8 PACK and Unsigned Saturate
	(packs source and dest fields into dest in that order)
*/
#define	packuswb_m2r(var, reg)	mmx_m2r(packuswb, var, reg)
#define	packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd)
#define	packuswb(vars, vard)	mmx_m2m(packuswb, vars, vard)

/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
	(interleaves low half of dest with low half of source
	 as padding in each result field)
*/
#define	punpckldq_m2r(var, reg)	mmx_m2r(punpckldq, var, reg)
#define	punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd)
#define	punpckldq(vars, vard)	mmx_m2m(punpckldq, vars, vard)

#define	punpcklwd_m2r(var, reg)	mmx_m2r(punpcklwd, var, reg)
#define	punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd)
#define	punpcklwd(vars, vard)	mmx_m2m(punpcklwd, vars, vard)

#define	punpcklbw_m2r(var, reg)	mmx_m2r(punpcklbw, var, reg)
#define	punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd)
#define	punpcklbw(vars, vard)	mmx_m2m(punpcklbw, vars, vard)

/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
	(interleaves high half of dest with high half of source
	 as padding in each result field)
*/
#define	punpckhdq_m2r(var, reg)	mmx_m2r(punpckhdq, var, reg)
#define	punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd)
#define	punpckhdq(vars, vard)	mmx_m2m(punpckhdq, vars, vard)

#define	punpckhwd_m2r(var, reg)	mmx_m2r(punpckhwd, var, reg)
#define	punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd)
#define	punpckhwd(vars, vard)	mmx_m2m(punpckhwd, vars, vard)

#define	punpckhbw_m2r(var, reg)	mmx_m2r(punpckhbw, var, reg)
#define	punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd)
#define	punpckhbw(vars, vard)	mmx_m2m(punpckhbw, vars, vard)

/*	Empty MMx State
	(used to clean-up when going from mmx to float use
	 of the registers that are shared by both; note that
	 there is no float-to-mmx operation needed, because
	 only the float tag word info is corruptible)
*/
#ifdef	MMX_TRACE

#define	emms() \
	{ \
		printf("emms()\n"); \
		__asm__ __volatile__ ("emms"); \
	}

#else

#define	emms()			__asm__ __volatile__ ("emms")

#endif

#endif