src/video/mmx.h
author Sam Lantinga <slouken@libsdl.org>
Fri, 14 Nov 2003 20:21:22 +0000
changeset 738 82b85b731fe3
parent 689 5bb080d35049
child 1330 450721ad5436
permissions -rw-r--r--
Date: Mon, 10 Nov 2003 19:57:56 +0100
From: Stephane Marchesin <stephane.marchesin@wanadoo.fr>
Subject: [SDL] SDL and gcc >= 3.3.1 fix

Lately, I upgraded my gcc compiler and was able to trigger the error
described in this thread:
http://www.libsdl.org/pipermail/sdl/2003-September/056163.html

SDL_RLEaccel.c: In function `RLEClipBlit':
SDL_RLEaccel.c:845: error: invalid `asm': invalid expression as operand
SDL_RLEaccel.c:845: error: invalid `asm': invalid expression as operand
SDL_RLEaccel.c:845: error: invalid `asm': invalid expression as operand


The attached patch fixes this issue. It is also smaller than the previous
one: since I was able to trigger the issue myself, I could narrow it down.
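
For reference, the line that the listing below attributes to changeset 738 is
the input constraint of the mmx_m2r() helper macro, which now names a memory
operand explicitly:

#define	mmx_m2r(op, mem, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "m" (mem))

The sibling macros (mmx_i2r, mmx_r2m, mmx_m2m) still use the catch-all "X"
constraint; presumably the old mmx_m2r did too, and gcc >= 3.3.1 refuses to
substitute such an operand into the asm, which produces the "invalid
expression as operand" error quoted above. The full mmx.h after the change
follows.
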
/*	mmx.h

	MultiMedia eXtensions GCC interface library for IA32.

	To use this library, simply include this header file
	and compile with GCC.  You MUST have inlining enabled
	in order for mmx_ok() to work; this can be done by
	simply using -O on the GCC command line.

	Compiling with -DMMX_TRACE will cause detailed trace
	output to be sent to stderr for each mmx operation.
	This adds lots of code, and obviously slows execution to
	a crawl, but can be very useful for debugging.

	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
	AND FITNESS FOR ANY PARTICULAR PURPOSE.

	1997-99 by H. Dietz and R. Fisher

 Notes:
	It appears that the latest gas has the pand problem fixed, therefore
	  I'll undefine BROKEN_PAND by default.
*/
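
/*	Usage sketch (illustrative only, not part of the original header; the
	function and variable names below are made up).  mmx_ok() is compiled
	out further down (see the "#if 0" block), so callers such as SDL do
	their own CPU detection.  A typical sequence loads a register with
	movq_m2r(), operates on it with the macros below, stores it back with
	movq_r2m(), and issues emms() before any floating-point code runs:

		#include "mmx.h"

		static void add_bytes(mmx_t *dst, const mmx_t *src)
		{
			movq_m2r(*src, mm0);
			paddb_m2r(*dst, mm0);
			movq_r2m(mm0, *dst);
			emms();
		}
*/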

#ifndef _MMX_H
#define _MMX_H


/*	Warning:  at this writing, the version of GAS packaged
	with most Linux distributions does not handle the
	parallel AND operation mnemonic correctly.  If the
	symbol BROKEN_PAND is defined, a slower alternative
	coding will be used.  If execution of mmxtest results
	in an illegal instruction fault, define this symbol.
*/
#undef	BROKEN_PAND


/*	The type of a value that fits in an MMX register
	(note that long long constant values MUST be suffixed
	 by LL and unsigned long long values by ULL, lest
	 they be truncated by the compiler)
*/
typedef	union {
	long long		q;	/* Quadword (64-bit) value */
	unsigned long long	uq;	/* Unsigned Quadword */
	int			d[2];	/* 2 Doubleword (32-bit) values */
	unsigned int		ud[2];	/* 2 Unsigned Doubleword */
	short			w[4];	/* 4 Word (16-bit) values */
	unsigned short		uw[4];	/* 4 Unsigned Word */
	char			b[8];	/* 8 Byte (8-bit) values */
	unsigned char		ub[8];	/* 8 Unsigned Byte */
	float			s[2];	/* Single-precision (32-bit) value */
} __attribute__ ((aligned (8))) mmx_t;	/* On an 8-byte (64-bit) boundary */
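
/*	Example (illustrative, not part of the original header): a 64-bit
	constant is usually built through the union, and needs the LL/ULL
	suffix described above, e.g.

		mmx_t half_mask = { 0x00ff00ff00ff00ffLL };

	(the name half_mask is made up; the initializer fills the q member.)
*/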


#if 0
/*	Function to test if multimedia instructions are supported...
*/
inline extern int
mm_support(void)
{
	/* Returns 1 if MMX instructions are supported,
	   3 if Cyrix MMX and Extended MMX instructions are supported
	   5 if AMD MMX and 3DNow! instructions are supported
	   0 if hardware does not support any of these
	*/
	register int rval = 0;

	__asm__ __volatile__ (
		/* See if CPUID instruction is supported ... */
		/* ... Get copies of EFLAGS into eax and ecx */
		"pushf\n\t"
		"popl %%eax\n\t"
		"movl %%eax, %%ecx\n\t"

		/* ... Toggle the ID bit in one copy and store */
		/*     to the EFLAGS reg */
		"xorl $0x200000, %%eax\n\t"
		"push %%eax\n\t"
		"popf\n\t"

		/* ... Get the (hopefully modified) EFLAGS */
		"pushf\n\t"
		"popl %%eax\n\t"

		/* ... Compare and test result */
		"xorl %%eax, %%ecx\n\t"
		"testl $0x200000, %%ecx\n\t"
		"jz NotSupported1\n\t"		/* CPUID not supported */


		/* Get standard CPUID information, and
		       go to a specific vendor section */
		"movl $0, %%eax\n\t"
		"cpuid\n\t"

		/* Check for Intel */
		"cmpl $0x756e6547, %%ebx\n\t"
		"jne TryAMD\n\t"
		"cmpl $0x49656e69, %%edx\n\t"
		"jne TryAMD\n\t"
		"cmpl $0x6c65746e, %%ecx\n"
		"jne TryAMD\n\t"
		"jmp Intel\n\t"

		/* Check for AMD */
		"\nTryAMD:\n\t"
		"cmpl $0x68747541, %%ebx\n\t"
		"jne TryCyrix\n\t"
		"cmpl $0x69746e65, %%edx\n\t"
		"jne TryCyrix\n\t"
		"cmpl $0x444d4163, %%ecx\n"
		"jne TryCyrix\n\t"
		"jmp AMD\n\t"

		/* Check for Cyrix */
		"\nTryCyrix:\n\t"
		"cmpl $0x69727943, %%ebx\n\t"
		"jne NotSupported2\n\t"
		"cmpl $0x736e4978, %%edx\n\t"
		"jne NotSupported3\n\t"
		"cmpl $0x64616574, %%ecx\n\t"
		"jne NotSupported4\n\t"
		/* Drop through to Cyrix... */


		/* Cyrix Section */
		/* See if extended CPUID level 80000001 is supported */
		/* The value of CPUID/80000001 for the 6x86MX is undefined
		   according to the Cyrix CPU Detection Guide (Preliminary
		   Rev. 1.01 table 1), so we'll check the value of eax for
		   CPUID/0 to see if standard CPUID level 2 is supported.
		   According to the table, the only CPU which supports level
		   2 is also the only one which supports extended CPUID levels.
		*/
		"cmpl $0x2, %%eax\n\t"
		"jne MMXtest\n\t"	/* Use standard CPUID instead */

		/* Extended CPUID supported (in theory), so get extended
		   features */
		"movl $0x80000001, %%eax\n\t"
		"cpuid\n\t"
		"testl $0x00800000, %%eax\n\t"	/* Test for MMX */
		"jz NotSupported5\n\t"		/* MMX not supported */
		"testl $0x01000000, %%eax\n\t"	/* Test for Ext'd MMX */
		"jnz EMMXSupported\n\t"
		"movl $1, %0:\n\n\t"		/* MMX Supported */
		"jmp Return\n\n"
		"EMMXSupported:\n\t"
		"movl $3, %0:\n\n\t"		/* EMMX and MMX Supported */
		"jmp Return\n\t"


		/* AMD Section */
		"AMD:\n\t"

		/* See if extended CPUID is supported */
		"movl $0x80000000, %%eax\n\t"
		"cpuid\n\t"
		"cmpl $0x80000000, %%eax\n\t"
		"jl MMXtest\n\t"	/* Use standard CPUID instead */

		/* Extended CPUID supported, so get extended features */
		"movl $0x80000001, %%eax\n\t"
		"cpuid\n\t"
		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
		"jz NotSupported6\n\t"		/* MMX not supported */
		"testl $0x80000000, %%edx\n\t"	/* Test for 3DNow! */
		"jnz ThreeDNowSupported\n\t"
		"movl $1, %0:\n\n\t"		/* MMX Supported */
		"jmp Return\n\n"
		"ThreeDNowSupported:\n\t"
		"movl $5, %0:\n\n\t"		/* 3DNow! and MMX Supported */
		"jmp Return\n\t"


		/* Intel Section */
		"Intel:\n\t"

		/* Check for MMX */
		"MMXtest:\n\t"
		"movl $1, %%eax\n\t"
		"cpuid\n\t"
		"testl $0x00800000, %%edx\n\t"	/* Test for MMX */
		"jz NotSupported7\n\t"		/* MMX Not supported */
		"movl $1, %0:\n\n\t"		/* MMX Supported */
		"jmp Return\n\t"

		/* Nothing supported */
		"\nNotSupported1:\n\t"
		"#movl $101, %0:\n\n\t"
		"\nNotSupported2:\n\t"
		"#movl $102, %0:\n\n\t"
		"\nNotSupported3:\n\t"
		"#movl $103, %0:\n\n\t"
		"\nNotSupported4:\n\t"
		"#movl $104, %0:\n\n\t"
		"\nNotSupported5:\n\t"
		"#movl $105, %0:\n\n\t"
		"\nNotSupported6:\n\t"
		"#movl $106, %0:\n\n\t"
		"\nNotSupported7:\n\t"
		"#movl $107, %0:\n\n\t"
		"movl $0, %0:\n\n\t"

		"Return:\n\t"
		: "=a" (rval)
		: /* no input */
		: "eax", "ebx", "ecx", "edx"
	);

	/* Return */
	return(rval);
}

/*	Function to test if mmx instructions are supported...
*/
inline extern int
mmx_ok(void)
{
	/* Returns 1 if MMX instructions are supported, 0 otherwise */
	return ( mm_support() & 0x1 );
}
#endif

/*	Helper functions for the instruction macros that follow...
	(note that memory-to-register, m2r, instructions are nearly
	 as efficient as register-to-register, r2r, instructions;
	 however, memory-to-memory instructions are really simulated
	 as a convenience, and are only 1/3 as efficient)
*/
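
/*	For reference (not part of the original header), a call such as

		paddw_m2r(*srcp, mm1);

	expands through mmx_m2r() to roughly

		__asm__ __volatile__ ("paddw %0, %%mm1" : : "m" (*srcp));

	i.e. a single instruction with a memory source operand, which is why
	the m2r forms are nearly as cheap as the r2r forms, while the m2m
	forms cost a load, the operation, and a store (srcp is a hypothetical
	pointer to an mmx_t).
*/
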
#ifdef	MMX_TRACE

/*	Include the stuff for printing a trace to stderr...
*/

#include <stdio.h>

#define	mmx_i2r(op, imm, reg) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace.uq = (imm); \
		printf(#op "_i2r(" #imm "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#reg "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "X" (imm)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#reg "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#define	mmx_m2r(op, mem, reg) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace = (mem); \
		printf(#op "_m2r(" #mem "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#reg "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "X" (mem)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#reg "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#define	mmx_r2m(op, reg, mem) \
	{ \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#op "_r2m(" #reg "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		mmx_trace = (mem); \
		printf(#mem "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %%" #reg ", %0" \
				      : "=X" (mem) \
				      : /* nothing */ ); \
		mmx_trace = (mem); \
		printf(#mem "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#define	mmx_r2r(op, regs, regd) \
	{ \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #regs ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#op "_r2r(" #regs "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#regd "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#regd "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#define	mmx_m2m(op, mems, memd) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace = (mems); \
		printf(#op "_m2m(" #mems "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		mmx_trace = (memd); \
		printf(#memd "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
				      #op " %1, %%mm0\n\t" \
				      "movq %%mm0, %0" \
				      : "=X" (memd) \
				      : "X" (mems)); \
		mmx_trace = (memd); \
		printf(#memd "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#else

/*	These macros are a lot simpler without the tracing...
*/

#define	mmx_i2r(op, imm, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "X" (imm) )

#define	mmx_m2r(op, mem, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "m" (mem))

#define	mmx_r2m(op, reg, mem) \
	__asm__ __volatile__ (#op " %%" #reg ", %0" \
			      : "=X" (mem) \
			      : /* nothing */ )

#define	mmx_r2r(op, regs, regd) \
	__asm__ __volatile__ (#op " %" #regs ", %" #regd)

#define	mmx_m2m(op, mems, memd) \
	__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
			      #op " %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=X" (memd) \
			      : "X" (mems))

#endif


/*	1x64 MOVe Quadword
	(this is both a load and a store...
	 in fact, it is the only way to store)
*/
#define	movq_m2r(var, reg)	mmx_m2r(movq, var, reg)
#define	movq_r2m(reg, var)	mmx_r2m(movq, reg, var)
#define	movq_r2r(regs, regd)	mmx_r2r(movq, regs, regd)
#define	movq(vars, vard) \
	__asm__ __volatile__ ("movq %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=X" (vard) \
			      : "X" (vars))


/*	1x32 MOVe Doubleword
	(like movq, this is both load and store...
	 but is most useful for moving things between
	 mmx registers and ordinary registers)
*/
#define	movd_m2r(var, reg)	mmx_m2r(movd, var, reg)
#define	movd_r2m(reg, var)	mmx_r2m(movd, reg, var)
#define	movd_r2r(regs, regd)	mmx_r2r(movd, regs, regd)
#define	movd(vars, vard) \
	__asm__ __volatile__ ("movd %1, %%mm0\n\t" \
			      "movd %%mm0, %0" \
			      : "=X" (vard) \
			      : "X" (vars))


/*	2x32, 4x16, and 8x8 Parallel ADDs
*/
#define	paddd_m2r(var, reg)	mmx_m2r(paddd, var, reg)
#define	paddd_r2r(regs, regd)	mmx_r2r(paddd, regs, regd)
#define	paddd(vars, vard)	mmx_m2m(paddd, vars, vard)

#define	paddw_m2r(var, reg)	mmx_m2r(paddw, var, reg)
#define	paddw_r2r(regs, regd)	mmx_r2r(paddw, regs, regd)
#define	paddw(vars, vard)	mmx_m2m(paddw, vars, vard)

#define	paddb_m2r(var, reg)	mmx_m2r(paddb, var, reg)
#define	paddb_r2r(regs, regd)	mmx_r2r(paddb, regs, regd)
#define	paddb(vars, vard)	mmx_m2m(paddb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Saturation arithmetic
*/
#define	paddsw_m2r(var, reg)	mmx_m2r(paddsw, var, reg)
#define	paddsw_r2r(regs, regd)	mmx_r2r(paddsw, regs, regd)
#define	paddsw(vars, vard)	mmx_m2m(paddsw, vars, vard)

#define	paddsb_m2r(var, reg)	mmx_m2r(paddsb, var, reg)
#define	paddsb_r2r(regs, regd)	mmx_r2r(paddsb, regs, regd)
#define	paddsb(vars, vard)	mmx_m2m(paddsb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic
*/
#define	paddusw_m2r(var, reg)	mmx_m2r(paddusw, var, reg)
#define	paddusw_r2r(regs, regd)	mmx_r2r(paddusw, regs, regd)
#define	paddusw(vars, vard)	mmx_m2m(paddusw, vars, vard)

#define	paddusb_m2r(var, reg)	mmx_m2r(paddusb, var, reg)
#define	paddusb_r2r(regs, regd)	mmx_r2r(paddusb, regs, regd)
#define	paddusb(vars, vard)	mmx_m2m(paddusb, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel SUBs
*/
#define	psubd_m2r(var, reg)	mmx_m2r(psubd, var, reg)
#define	psubd_r2r(regs, regd)	mmx_r2r(psubd, regs, regd)
#define	psubd(vars, vard)	mmx_m2m(psubd, vars, vard)

#define	psubw_m2r(var, reg)	mmx_m2r(psubw, var, reg)
#define	psubw_r2r(regs, regd)	mmx_r2r(psubw, regs, regd)
#define	psubw(vars, vard)	mmx_m2m(psubw, vars, vard)

#define	psubb_m2r(var, reg)	mmx_m2r(psubb, var, reg)
#define	psubb_r2r(regs, regd)	mmx_r2r(psubb, regs, regd)
#define	psubb(vars, vard)	mmx_m2m(psubb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Saturation arithmetic
*/
#define	psubsw_m2r(var, reg)	mmx_m2r(psubsw, var, reg)
#define	psubsw_r2r(regs, regd)	mmx_r2r(psubsw, regs, regd)
#define	psubsw(vars, vard)	mmx_m2m(psubsw, vars, vard)

#define	psubsb_m2r(var, reg)	mmx_m2r(psubsb, var, reg)
#define	psubsb_r2r(regs, regd)	mmx_r2r(psubsb, regs, regd)
#define	psubsb(vars, vard)	mmx_m2m(psubsb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic
*/
#define	psubusw_m2r(var, reg)	mmx_m2r(psubusw, var, reg)
#define	psubusw_r2r(regs, regd)	mmx_r2r(psubusw, regs, regd)
#define	psubusw(vars, vard)	mmx_m2m(psubusw, vars, vard)

#define	psubusb_m2r(var, reg)	mmx_m2r(psubusb, var, reg)
#define	psubusb_r2r(regs, regd)	mmx_r2r(psubusb, regs, regd)
#define	psubusb(vars, vard)	mmx_m2m(psubusb, vars, vard)


/*	4x16 Parallel MULs giving Low 4x16 portions of results
*/
#define	pmullw_m2r(var, reg)	mmx_m2r(pmullw, var, reg)
#define	pmullw_r2r(regs, regd)	mmx_r2r(pmullw, regs, regd)
#define	pmullw(vars, vard)	mmx_m2m(pmullw, vars, vard)


/*	4x16 Parallel MULs giving High 4x16 portions of results
*/
#define	pmulhw_m2r(var, reg)	mmx_m2r(pmulhw, var, reg)
#define	pmulhw_r2r(regs, regd)	mmx_r2r(pmulhw, regs, regd)
#define	pmulhw(vars, vard)	mmx_m2m(pmulhw, vars, vard)


/*	4x16->2x32 Parallel Mul-ADD
	(muls like pmullw, then adds adjacent 16-bit fields
	 in the multiply result to make the final 2x32 result)
*/
#define	pmaddwd_m2r(var, reg)	mmx_m2r(pmaddwd, var, reg)
#define	pmaddwd_r2r(regs, regd)	mmx_r2r(pmaddwd, regs, regd)
#define	pmaddwd(vars, vard)	mmx_m2m(pmaddwd, vars, vard)
   500
slouken@689
   501
slouken@689
   502
/*	1x64 bitwise AND
slouken@689
   503
*/
slouken@689
   504
#ifdef	BROKEN_PAND
slouken@689
   505
#define	pand_m2r(var, reg) \
slouken@689
   506
	{ \
slouken@689
   507
		mmx_m2r(pandn, (mmx_t) -1LL, reg); \
slouken@689
   508
		mmx_m2r(pandn, var, reg); \
slouken@689
   509
	}
slouken@689
   510
#define	pand_r2r(regs, regd) \
slouken@689
   511
	{ \
slouken@689
   512
		mmx_m2r(pandn, (mmx_t) -1LL, regd); \
slouken@689
   513
		mmx_r2r(pandn, regs, regd) \
slouken@689
   514
	}
slouken@689
   515
#define	pand(vars, vard) \
slouken@689
   516
	{ \
slouken@689
   517
		movq_m2r(vard, mm0); \
slouken@689
   518
		mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
slouken@689
   519
		mmx_m2r(pandn, vars, mm0); \
slouken@689
   520
		movq_r2m(mm0, vard); \
slouken@689
   521
	}
slouken@689
   522
#else
slouken@689
   523
#define	pand_m2r(var, reg)	mmx_m2r(pand, var, reg)
slouken@689
   524
#define	pand_r2r(regs, regd)	mmx_r2r(pand, regs, regd)
slouken@689
   525
#define	pand(vars, vard)	mmx_m2m(pand, vars, vard)
slouken@689
   526
#endif
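
/*	Explanatory note (not part of the original header): the BROKEN_PAND
	fallback above works because pandn computes dest = (~dest) & src.
	The first mmx_m2r(pandn, (mmx_t) -1LL, reg) therefore leaves ~reg in
	reg, and the second mmx_m2r(pandn, var, reg) computes
	~(~reg) & var = reg & var, the same result as a single pand, at the
	cost of one extra instruction.
*/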


/*	1x64 bitwise AND with Not the destination
*/
#define	pandn_m2r(var, reg)	mmx_m2r(pandn, var, reg)
#define	pandn_r2r(regs, regd)	mmx_r2r(pandn, regs, regd)
#define	pandn(vars, vard)	mmx_m2m(pandn, vars, vard)


/*	1x64 bitwise OR
*/
#define	por_m2r(var, reg)	mmx_m2r(por, var, reg)
#define	por_r2r(regs, regd)	mmx_r2r(por, regs, regd)
#define	por(vars, vard)	mmx_m2m(por, vars, vard)


/*	1x64 bitwise eXclusive OR
*/
#define	pxor_m2r(var, reg)	mmx_m2r(pxor, var, reg)
#define	pxor_r2r(regs, regd)	mmx_r2r(pxor, regs, regd)
#define	pxor(vars, vard)	mmx_m2m(pxor, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel CoMPare for EQuality
	(resulting fields are either 0 or -1)
*/
#define	pcmpeqd_m2r(var, reg)	mmx_m2r(pcmpeqd, var, reg)
#define	pcmpeqd_r2r(regs, regd)	mmx_r2r(pcmpeqd, regs, regd)
#define	pcmpeqd(vars, vard)	mmx_m2m(pcmpeqd, vars, vard)

#define	pcmpeqw_m2r(var, reg)	mmx_m2r(pcmpeqw, var, reg)
#define	pcmpeqw_r2r(regs, regd)	mmx_r2r(pcmpeqw, regs, regd)
#define	pcmpeqw(vars, vard)	mmx_m2m(pcmpeqw, vars, vard)

#define	pcmpeqb_m2r(var, reg)	mmx_m2r(pcmpeqb, var, reg)
#define	pcmpeqb_r2r(regs, regd)	mmx_r2r(pcmpeqb, regs, regd)
#define	pcmpeqb(vars, vard)	mmx_m2m(pcmpeqb, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than
	(resulting fields are either 0 or -1)
*/
#define	pcmpgtd_m2r(var, reg)	mmx_m2r(pcmpgtd, var, reg)
#define	pcmpgtd_r2r(regs, regd)	mmx_r2r(pcmpgtd, regs, regd)
#define	pcmpgtd(vars, vard)	mmx_m2m(pcmpgtd, vars, vard)

#define	pcmpgtw_m2r(var, reg)	mmx_m2r(pcmpgtw, var, reg)
#define	pcmpgtw_r2r(regs, regd)	mmx_r2r(pcmpgtw, regs, regd)
#define	pcmpgtw(vars, vard)	mmx_m2m(pcmpgtw, vars, vard)

#define	pcmpgtb_m2r(var, reg)	mmx_m2r(pcmpgtb, var, reg)
#define	pcmpgtb_r2r(regs, regd)	mmx_r2r(pcmpgtb, regs, regd)
#define	pcmpgtb(vars, vard)	mmx_m2m(pcmpgtb, vars, vard)


/*	1x64, 2x32, and 4x16 Parallel Shift Left Logical
*/
#define	psllq_i2r(imm, reg)	mmx_i2r(psllq, imm, reg)
#define	psllq_m2r(var, reg)	mmx_m2r(psllq, var, reg)
#define	psllq_r2r(regs, regd)	mmx_r2r(psllq, regs, regd)
#define	psllq(vars, vard)	mmx_m2m(psllq, vars, vard)

#define	pslld_i2r(imm, reg)	mmx_i2r(pslld, imm, reg)
#define	pslld_m2r(var, reg)	mmx_m2r(pslld, var, reg)
#define	pslld_r2r(regs, regd)	mmx_r2r(pslld, regs, regd)
#define	pslld(vars, vard)	mmx_m2m(pslld, vars, vard)

#define	psllw_i2r(imm, reg)	mmx_i2r(psllw, imm, reg)
#define	psllw_m2r(var, reg)	mmx_m2r(psllw, var, reg)
#define	psllw_r2r(regs, regd)	mmx_r2r(psllw, regs, regd)
#define	psllw(vars, vard)	mmx_m2m(psllw, vars, vard)


/*	1x64, 2x32, and 4x16 Parallel Shift Right Logical
*/
#define	psrlq_i2r(imm, reg)	mmx_i2r(psrlq, imm, reg)
#define	psrlq_m2r(var, reg)	mmx_m2r(psrlq, var, reg)
#define	psrlq_r2r(regs, regd)	mmx_r2r(psrlq, regs, regd)
#define	psrlq(vars, vard)	mmx_m2m(psrlq, vars, vard)

#define	psrld_i2r(imm, reg)	mmx_i2r(psrld, imm, reg)
#define	psrld_m2r(var, reg)	mmx_m2r(psrld, var, reg)
#define	psrld_r2r(regs, regd)	mmx_r2r(psrld, regs, regd)
#define	psrld(vars, vard)	mmx_m2m(psrld, vars, vard)

#define	psrlw_i2r(imm, reg)	mmx_i2r(psrlw, imm, reg)
#define	psrlw_m2r(var, reg)	mmx_m2r(psrlw, var, reg)
#define	psrlw_r2r(regs, regd)	mmx_r2r(psrlw, regs, regd)
#define	psrlw(vars, vard)	mmx_m2m(psrlw, vars, vard)


/*	2x32 and 4x16 Parallel Shift Right Arithmetic
*/
#define	psrad_i2r(imm, reg)	mmx_i2r(psrad, imm, reg)
#define	psrad_m2r(var, reg)	mmx_m2r(psrad, var, reg)
#define	psrad_r2r(regs, regd)	mmx_r2r(psrad, regs, regd)
#define	psrad(vars, vard)	mmx_m2m(psrad, vars, vard)

#define	psraw_i2r(imm, reg)	mmx_i2r(psraw, imm, reg)
#define	psraw_m2r(var, reg)	mmx_m2r(psraw, var, reg)
#define	psraw_r2r(regs, regd)	mmx_r2r(psraw, regs, regd)
#define	psraw(vars, vard)	mmx_m2m(psraw, vars, vard)


/*	2x32->4x16 and 4x16->8x8 PACK and Signed Saturate
	(packs source and dest fields into dest in that order)
*/
#define	packssdw_m2r(var, reg)	mmx_m2r(packssdw, var, reg)
#define	packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd)
#define	packssdw(vars, vard)	mmx_m2m(packssdw, vars, vard)

#define	packsswb_m2r(var, reg)	mmx_m2r(packsswb, var, reg)
#define	packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd)
#define	packsswb(vars, vard)	mmx_m2m(packsswb, vars, vard)


/*	4x16->8x8 PACK and Unsigned Saturate
	(packs source and dest fields into dest in that order)
*/
#define	packuswb_m2r(var, reg)	mmx_m2r(packuswb, var, reg)
#define	packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd)
#define	packuswb(vars, vard)	mmx_m2m(packuswb, vars, vard)


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
	(interleaves low half of dest with low half of source
	 as padding in each result field)
*/
#define	punpckldq_m2r(var, reg)	mmx_m2r(punpckldq, var, reg)
#define	punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd)
#define	punpckldq(vars, vard)	mmx_m2m(punpckldq, vars, vard)

#define	punpcklwd_m2r(var, reg)	mmx_m2r(punpcklwd, var, reg)
#define	punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd)
#define	punpcklwd(vars, vard)	mmx_m2m(punpcklwd, vars, vard)

#define	punpcklbw_m2r(var, reg)	mmx_m2r(punpcklbw, var, reg)
#define	punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd)
#define	punpcklbw(vars, vard)	mmx_m2m(punpcklbw, vars, vard)
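
/*	Example (illustrative, not part of the original header): a common use
	of punpcklbw is widening four bytes to 16-bit words by interleaving
	them with a zeroed register:

		pxor_r2r(mm7, mm7);
		movd_m2r(pixel, mm0);
		punpcklbw_r2r(mm7, mm0);

	where pixel is a hypothetical 32-bit variable; afterwards mm0 holds
	its four bytes as four zero-extended 16-bit words.
*/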


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
	(interleaves high half of dest with high half of source
	 as padding in each result field)
*/
#define	punpckhdq_m2r(var, reg)	mmx_m2r(punpckhdq, var, reg)
#define	punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd)
#define	punpckhdq(vars, vard)	mmx_m2m(punpckhdq, vars, vard)

#define	punpckhwd_m2r(var, reg)	mmx_m2r(punpckhwd, var, reg)
#define	punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd)
#define	punpckhwd(vars, vard)	mmx_m2m(punpckhwd, vars, vard)

#define	punpckhbw_m2r(var, reg)	mmx_m2r(punpckhbw, var, reg)
#define	punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd)
#define	punpckhbw(vars, vard)	mmx_m2m(punpckhbw, vars, vard)


/*	Empty MMx State
	(used to clean-up when going from mmx to float use
	 of the registers that are shared by both; note that
	 there is no float-to-mmx operation needed, because
	 only the float tag word info is corruptible)
*/
#ifdef	MMX_TRACE

#define	emms() \
	{ \
		printf("emms()\n"); \
		__asm__ __volatile__ ("emms"); \
	}

#else

#define	emms()			__asm__ __volatile__ ("emms")

#endif
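
/*	Usage note (illustrative, not part of the original header): emms()
	belongs at the end of a block of MMX work, before any floating-point
	code runs again, e.g.

		movq_m2r(src, mm0);
		paddusb_m2r(dst, mm0);
		movq_r2m(mm0, dst);
		emms();

	where src and dst are hypothetical mmx_t variables; skipping emms()
	leaves the FPU tag word marked in-use and later float operations can
	produce wrong results.
*/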

#endif