src/video/mmx.h
author Sam Lantinga <slouken@libsdl.org>
Sun, 28 May 2006 13:04:16 +0000
branchSDL-1.3
changeset 1662 782fd950bd46
parent 1330 450721ad5436
child 1668 4da1ee79c9af
permissions -rw-r--r--
Revamp of the video system in progress - adding support for multiple displays, multiple windows, and a full video mode selection API.

WARNING: None of the video drivers have been updated for the new API yet! The API is still under design and very fluid.

The code is now run through a consistent indent format:
indent -i4 -nut -nsc -br -ce

The headers are being converted to automatically generate doxygen documentation.
slouken@689
     1
/*	mmx.h

	MultiMedia eXtensions GCC interface library for IA32.

	To use this library, simply include this header file
	and compile with GCC.  You MUST have inlining enabled
	in order for mmx_ok() to work; this can be done by
	simply using -O on the GCC command line.

	Compiling with -DMMX_TRACE will cause detailed trace
	output to be sent to stderr for each mmx operation.
	This adds lots of code, and obviously slows execution to
	a crawl, but can be very useful for debugging.

	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
	AND FITNESS FOR ANY PARTICULAR PURPOSE.

	1997-99 by H. Dietz and R. Fisher

 Notes:
	It appears that the latest gas has the pand problem fixed, therefore
	  I'll undefine BROKEN_PAND by default.
*/
slouken@689
    26
slouken@689
    27
/* Include guard.
   NOTE(review): "_MMX_H" (leading underscore + uppercase) is an identifier
   reserved for the implementation by the C standard; a rename to MMX_H
   would be cleaner, but is left alone here in case other files test it. */
#ifndef _MMX_H
#define _MMX_H


/*	Warning:  at this writing, the version of GAS packaged
	with most Linux distributions does not handle the
	parallel AND operation mnemonic correctly.  If the
	symbol BROKEN_PAND is defined, a slower alternative
	coding will be used.  If execution of mmxtest results
	in an illegal instruction fault, define this symbol.
*/
#undef	BROKEN_PAND
slouken@689
    39
slouken@689
    40
slouken@689
    41
/*	The type of an value that fits in an MMX register
	(note that long long constant values MUST be suffixed
	 by LL and unsigned long long values by ULL, lest
	 they be truncated by the compiler)

	All members alias the same 8 bytes; the union lets callers view
	an MMX operand at any of the packed element widths.  The first
	member (q) is the one initialized by a plain "mmx_t x = {...}".
*/
typedef union
{
    long long q;                /* Quadword (64-bit) value */
    unsigned long long uq;      /* Unsigned Quadword */
    int d[2];                   /* 2 Doubleword (32-bit) values */
    unsigned int ud[2];         /* 2 Unsigned Doubleword */
    short w[4];                 /* 4 Word (16-bit) values */
    unsigned short uw[4];       /* 4 Unsigned Word */
    char b[8];                  /* 8 Byte (8-bit) values */
    unsigned char ub[8];        /* 8 Unsigned Byte */
    float s[2];                 /* Single-precision (32-bit) value */
} __attribute__ ((aligned (8))) mmx_t;  /* On an 8-byte (64-bit) boundary */
slouken@689
    58
slouken@689
    59
slouken@689
    60
#if 0
slouken@689
    61
/*	Function to test if multimedia instructions are supported...

	NOTE(review): this routine is IA32-only — the pushf/popl pairs assume
	a 32-bit stack, so it will not assemble for x86-64.  The asm defines
	non-local labels (TryAMD, Return, ...), which collide if the compiler
	emits the inlined body more than once per translation unit —
	presumably part of why the whole region sits under "#if 0".
	The "movl $1, %0:" / "#movl $101, %0:" operands carry a trailing ':'
	that looks like a typo; verify against a known-good revision of this
	library before re-enabling.
*/
inline extern int
mm_support (void)
{
    /* Returns 1 if MMX instructions are supported,
       3 if Cyrix MMX and Extended MMX instructions are supported
       5 if AMD MMX and 3DNow! instructions are supported
       0 if hardware does not support any of these
     */
    register int rval = 0;

    __asm__ __volatile__ (
                             /* See if CPUID instruction is supported ... */
                             /* ... Get copies of EFLAGS into eax and ecx */
                             "pushf\n\t"
                             "popl %%eax\n\t" "movl %%eax, %%ecx\n\t"
                             /* ... Toggle the ID bit in one copy and store */
                             /*     to the EFLAGS reg */
                             "xorl $0x200000, %%eax\n\t"
                             "push %%eax\n\t" "popf\n\t"
                             /* ... Get the (hopefully modified) EFLAGS */
                             "pushf\n\t" "popl %%eax\n\t"
                             /* ... Compare and test result */
                             "xorl %%eax, %%ecx\n\t" "testl $0x200000, %%ecx\n\t" "jz NotSupported1\n\t"        /* CPUID not supported */
                             /* Get standard CPUID information, and
                                go to a specific vendor section */
                             "movl $0, %%eax\n\t" "cpuid\n\t"
                             /* Check for Intel ("GenuineIntel" in ebx:edx:ecx) */
                             "cmpl $0x756e6547, %%ebx\n\t"
                             "jne TryAMD\n\t"
                             "cmpl $0x49656e69, %%edx\n\t"
                             "jne TryAMD\n\t"
                             "cmpl $0x6c65746e, %%ecx\n"
                             "jne TryAMD\n\t" "jmp Intel\n\t"
                             /* Check for AMD ("AuthenticAMD") */
                             "\nTryAMD:\n\t"
                             "cmpl $0x68747541, %%ebx\n\t"
                             "jne TryCyrix\n\t"
                             "cmpl $0x69746e65, %%edx\n\t"
                             "jne TryCyrix\n\t"
                             "cmpl $0x444d4163, %%ecx\n"
                             "jne TryCyrix\n\t" "jmp AMD\n\t"
                             /* Check for Cyrix ("CyrixInstead") */
                             "\nTryCyrix:\n\t"
                             "cmpl $0x69727943, %%ebx\n\t"
                             "jne NotSupported2\n\t"
                             "cmpl $0x736e4978, %%edx\n\t"
                             "jne NotSupported3\n\t"
                             "cmpl $0x64616574, %%ecx\n\t"
                             "jne NotSupported4\n\t"
                             /* Drop through to Cyrix... */
                             /* Cyrix Section */
                             /* See if extended CPUID level 80000001 is supported */
                             /* The value of CPUID/80000001 for the 6x86MX is undefined
                                according to the Cyrix CPU Detection Guide (Preliminary
                                Rev. 1.01 table 1), so we'll check the value of eax for
                                CPUID/0 to see if standard CPUID level 2 is supported.
                                According to the table, the only CPU which supports level
                                2 is also the only one which supports extended CPUID levels.
                              */
                             "cmpl $0x2, %%eax\n\t" "jne MMXtest\n\t"   /* Use standard CPUID instead */
                             /* Extended CPUID supported (in theory), so get extended
                                features */
                             "movl $0x80000001, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%eax\n\t"   /* Test for MMX */
                             "jz NotSupported5\n\t"     /* MMX not supported */
                             "testl $0x01000000, %%eax\n\t"     /* Test for Ext'd MMX */
                             "jnz EMMXSupported\n\t" "movl $1, %0:\n\n\t"       /* MMX Supported */
                             "jmp Return\n\n" "EMMXSupported:\n\t" "movl $3, %0:\n\n\t" /* EMMX and MMX Supported */
                             "jmp Return\n\t"
                             /* AMD Section */
                             "AMD:\n\t"
                             /* See if extended CPUID is supported */
                             "movl $0x80000000, %%eax\n\t" "cpuid\n\t" "cmpl $0x80000000, %%eax\n\t" "jl MMXtest\n\t"   /* Use standard CPUID instead */
                             /* Extended CPUID supported, so get extended features */
                             "movl $0x80000001, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%edx\n\t"   /* Test for MMX */
                             "jz NotSupported6\n\t"     /* MMX not supported */
                             "testl $0x80000000, %%edx\n\t"     /* Test for 3DNow! */
                             "jnz ThreeDNowSupported\n\t" "movl $1, %0:\n\n\t"  /* MMX Supported */
                             "jmp Return\n\n" "ThreeDNowSupported:\n\t" "movl $5, %0:\n\n\t"    /* 3DNow! and MMX Supported */
                             "jmp Return\n\t"
                             /* Intel Section */
                             "Intel:\n\t"
                             /* Check for MMX */
                             "MMXtest:\n\t" "movl $1, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%edx\n\t"     /* Test for MMX */
                             "jz NotSupported7\n\t"     /* MMX Not supported */
                             "movl $1, %0:\n\n\t"       /* MMX Supported */
                             "jmp Return\n\t"
                             /* Nothing supported */
                             "\nNotSupported1:\n\t" "#movl $101, %0:\n\n\t" "\nNotSupported2:\n\t" "#movl $102, %0:\n\n\t" "\nNotSupported3:\n\t" "#movl $103, %0:\n\n\t" "\nNotSupported4:\n\t" "#movl $104, %0:\n\n\t" "\nNotSupported5:\n\t" "#movl $105, %0:\n\n\t" "\nNotSupported6:\n\t" "#movl $106, %0:\n\n\t" "\nNotSupported7:\n\t" "#movl $107, %0:\n\n\t" "movl $0, %0:\n\n\t" "Return:\n\t":"=a" (rval):   /* no input */
                             :"eax", "ebx", "ecx", "edx");

    /* Return */
    return (rval);
}
slouken@689
   156
slouken@689
   157
/*	Function to test if mmx instructions are supported...

	Returns 1 when the CPU reports MMX, 0 otherwise.  MMX availability
	is bit 0 of the mm_support() feature code (1, 3, and 5 all have it
	set; 0 does not).
*/
inline extern int
mmx_ok (void)
{
    int features = mm_support ();
    return features & 1;
}
slouken@689
   165
#endif
slouken@689
   166
slouken@689
   167
/*	Helper functions for the instruction macros that follow...
	(note that memory-to-register, m2r, instructions are nearly
	 as efficient as register-to-register, r2r, instructions;
	 however, memory-to-memory instructions are really simulated
	 as a convenience, and are only 1/3 as efficient)

	Each helper stringizes the opcode and register names with the '#'
	operator and splices them into a GCC inline-asm template.
*/
#ifdef	MMX_TRACE

/*	Include the stuff for printing a trace to stderr...
	NOTE(review): the trace statements below actually use printf, which
	writes to stdout, not stderr — confirm which stream is intended.
	Each traced macro dumps the operands before the op and the
	destination after it, reading MMX registers back via movq.
*/

/* immediate -> MMX register, with trace output */
#define	mmx_i2r(op, imm, reg) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace.uq = (imm); \
		printf(#op "_i2r(" #imm "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#reg "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "X" (imm)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#reg "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

/* memory (mmx_t) -> MMX register, with trace output */
#define	mmx_m2r(op, mem, reg) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace = (mem); \
		printf(#op "_m2r(" #mem "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#reg "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %0, %%" #reg \
				      : /* nothing */ \
				      : "X" (mem)); \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#reg "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

/* MMX register -> memory (mmx_t), with trace output */
#define	mmx_r2m(op, reg, mem) \
	{ \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #reg ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#op "_r2m(" #reg "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		mmx_trace = (mem); \
		printf(#mem "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %%" #reg ", %0" \
				      : "=X" (mem) \
				      : /* nothing */ ); \
		mmx_trace = (mem); \
		printf(#mem "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

/* MMX register -> MMX register, with trace output */
#define	mmx_r2r(op, regs, regd) \
	{ \
		mmx_t mmx_trace; \
		__asm__ __volatile__ ("movq %%" #regs ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#op "_r2r(" #regs "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#regd "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
		__asm__ __volatile__ ("movq %%" #regd ", %0" \
				      : "=X" (mmx_trace) \
				      : /* nothing */ ); \
		printf(#regd "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

/* memory -> memory, simulated via mm0, with trace output.
   NOTE(review): %mm0 is overwritten but never declared as a clobber. */
#define	mmx_m2m(op, mems, memd) \
	{ \
		mmx_t mmx_trace; \
		mmx_trace = (mems); \
		printf(#op "_m2m(" #mems "=0x%08x%08x, ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		mmx_trace = (memd); \
		printf(#memd "=0x%08x%08x) => ", \
			mmx_trace.d[1], mmx_trace.d[0]); \
		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
				      #op " %1, %%mm0\n\t" \
				      "movq %%mm0, %0" \
				      : "=X" (memd) \
				      : "X" (mems)); \
		mmx_trace = (memd); \
		printf(#memd "=0x%08x%08x\n", \
			mmx_trace.d[1], mmx_trace.d[0]); \
	}

#else

/*	These macros are a lot simpler without the tracing...
*/

/* immediate -> MMX register */
#define	mmx_i2r(op, imm, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "X" (imm) )

/* memory (mmx_t) -> MMX register */
#define	mmx_m2r(op, mem, reg) \
	__asm__ __volatile__ (#op " %0, %%" #reg \
			      : /* nothing */ \
			      : "m" (mem))

/* MMX register -> memory (mmx_t) */
#define	mmx_r2m(op, reg, mem) \
	__asm__ __volatile__ (#op " %%" #reg ", %0" \
			      : "=X" (mem) \
			      : /* nothing */ )

/* MMX register -> MMX register */
#define	mmx_r2r(op, regs, regd) \
	__asm__ __volatile__ (#op " %" #regs ", %" #regd)

/* memory -> memory, simulated via mm0.
   NOTE(review): %mm0 is overwritten but never declared as a clobber. */
#define	mmx_m2m(op, mems, memd) \
	__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
			      #op " %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=X" (memd) \
			      : "X" (mems))

#endif
slouken@689
   310
slouken@689
   311
slouken@689
   312
/*	1x64 MOVe Quadword
	(this is both a load and a store...
	 in fact, it is the only way to store)
*/
#define	movq_m2r(var, reg)	mmx_m2r(movq, var, reg)
#define	movq_r2m(reg, var)	mmx_r2m(movq, reg, var)
#define	movq_r2r(regs, regd)	mmx_r2r(movq, regs, regd)
/* memory-to-memory copy, staged through mm0 (clobbers mm0) */
#define	movq(vars, vard) \
	__asm__ __volatile__ ("movq %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=X" (vard) \
			      : "X" (vars))


/*	1x32 MOVe Doubleword
	(like movq, this is both load and store...
	 but is most useful for moving things between
	 mmx registers and ordinary registers)
*/
#define	movd_m2r(var, reg)	mmx_m2r(movd, var, reg)
#define	movd_r2m(reg, var)	mmx_r2m(movd, reg, var)
#define	movd_r2r(regs, regd)	mmx_r2r(movd, regs, regd)
/* memory-to-memory 32-bit copy, staged through mm0 (clobbers mm0) */
#define	movd(vars, vard) \
	__asm__ __volatile__ ("movd %1, %%mm0\n\t" \
			      "movd %%mm0, %0" \
			      : "=X" (vard) \
			      : "X" (vars))
slouken@689
   339
slouken@689
   340
slouken@689
   341
/*	2x32, 4x16, and 8x8 Parallel ADDs
	(each group offers the m2r, r2r, and simulated m2m forms
	 built on the helper macros above)
*/
#define	paddd_m2r(var, reg)	mmx_m2r(paddd, var, reg)
#define	paddd_r2r(regs, regd)	mmx_r2r(paddd, regs, regd)
#define	paddd(vars, vard)	mmx_m2m(paddd, vars, vard)

#define	paddw_m2r(var, reg)	mmx_m2r(paddw, var, reg)
#define	paddw_r2r(regs, regd)	mmx_r2r(paddw, regs, regd)
#define	paddw(vars, vard)	mmx_m2m(paddw, vars, vard)

#define	paddb_m2r(var, reg)	mmx_m2r(paddb, var, reg)
#define	paddb_r2r(regs, regd)	mmx_r2r(paddb, regs, regd)
#define	paddb(vars, vard)	mmx_m2m(paddb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Saturation arithmetic
*/
#define	paddsw_m2r(var, reg)	mmx_m2r(paddsw, var, reg)
#define	paddsw_r2r(regs, regd)	mmx_r2r(paddsw, regs, regd)
#define	paddsw(vars, vard)	mmx_m2m(paddsw, vars, vard)

#define	paddsb_m2r(var, reg)	mmx_m2r(paddsb, var, reg)
#define	paddsb_r2r(regs, regd)	mmx_r2r(paddsb, regs, regd)
#define	paddsb(vars, vard)	mmx_m2m(paddsb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic
*/
#define	paddusw_m2r(var, reg)	mmx_m2r(paddusw, var, reg)
#define	paddusw_r2r(regs, regd)	mmx_r2r(paddusw, regs, regd)
#define	paddusw(vars, vard)	mmx_m2m(paddusw, vars, vard)

#define	paddusb_m2r(var, reg)	mmx_m2r(paddusb, var, reg)
#define	paddusb_r2r(regs, regd)	mmx_r2r(paddusb, regs, regd)
#define	paddusb(vars, vard)	mmx_m2m(paddusb, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel SUBs
*/
#define	psubd_m2r(var, reg)	mmx_m2r(psubd, var, reg)
#define	psubd_r2r(regs, regd)	mmx_r2r(psubd, regs, regd)
#define	psubd(vars, vard)	mmx_m2m(psubd, vars, vard)

#define	psubw_m2r(var, reg)	mmx_m2r(psubw, var, reg)
#define	psubw_r2r(regs, regd)	mmx_r2r(psubw, regs, regd)
#define	psubw(vars, vard)	mmx_m2m(psubw, vars, vard)

#define	psubb_m2r(var, reg)	mmx_m2r(psubb, var, reg)
#define	psubb_r2r(regs, regd)	mmx_r2r(psubb, regs, regd)
#define	psubb(vars, vard)	mmx_m2m(psubb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Saturation arithmetic
*/
#define	psubsw_m2r(var, reg)	mmx_m2r(psubsw, var, reg)
#define	psubsw_r2r(regs, regd)	mmx_r2r(psubsw, regs, regd)
#define	psubsw(vars, vard)	mmx_m2m(psubsw, vars, vard)

#define	psubsb_m2r(var, reg)	mmx_m2r(psubsb, var, reg)
#define	psubsb_r2r(regs, regd)	mmx_r2r(psubsb, regs, regd)
#define	psubsb(vars, vard)	mmx_m2m(psubsb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic
*/
#define	psubusw_m2r(var, reg)	mmx_m2r(psubusw, var, reg)
#define	psubusw_r2r(regs, regd)	mmx_r2r(psubusw, regs, regd)
#define	psubusw(vars, vard)	mmx_m2m(psubusw, vars, vard)

#define	psubusb_m2r(var, reg)	mmx_m2r(psubusb, var, reg)
#define	psubusb_r2r(regs, regd)	mmx_r2r(psubusb, regs, regd)
#define	psubusb(vars, vard)	mmx_m2m(psubusb, vars, vard)


/*	4x16 Parallel MULs giving Low 4x16 portions of results
*/
#define	pmullw_m2r(var, reg)	mmx_m2r(pmullw, var, reg)
#define	pmullw_r2r(regs, regd)	mmx_r2r(pmullw, regs, regd)
#define	pmullw(vars, vard)	mmx_m2m(pmullw, vars, vard)


/*	4x16 Parallel MULs giving High 4x16 portions of results
*/
#define	pmulhw_m2r(var, reg)	mmx_m2r(pmulhw, var, reg)
#define	pmulhw_r2r(regs, regd)	mmx_r2r(pmulhw, regs, regd)
#define	pmulhw(vars, vard)	mmx_m2m(pmulhw, vars, vard)


/*	4x16->2x32 Parallel Mul-ADD
	(muls like pmullw, then adds adjacent 16-bit fields
	 in the multiply result to make the final 2x32 result)
*/
#define	pmaddwd_m2r(var, reg)	mmx_m2r(pmaddwd, var, reg)
#define	pmaddwd_r2r(regs, regd)	mmx_r2r(pmaddwd, regs, regd)
#define	pmaddwd(vars, vard)	mmx_m2m(pmaddwd, vars, vard)
slouken@689
   436
slouken@689
   437
slouken@689
   438
/*	1x64 bitwise AND

	When BROKEN_PAND is defined (buggy gas that can't assemble "pand"),
	AND is emulated with two pandn steps:
	  pandn with -1 gives dest = ~dest, and
	  pandn with the source then gives dest = ~(~dest) & src = dest & src.
*/
#ifdef	BROKEN_PAND
#define	pand_m2r(var, reg) \
	{ \
		mmx_m2r(pandn, (mmx_t) -1LL, reg); \
		mmx_m2r(pandn, var, reg); \
	}
/* FIX(review): the second statement previously lacked its terminating
   semicolon, so this macro failed to compile when BROKEN_PAND was set. */
#define	pand_r2r(regs, regd) \
	{ \
		mmx_m2r(pandn, (mmx_t) -1LL, regd); \
		mmx_r2r(pandn, regs, regd); \
	}
/* memory-to-memory AND staged through mm0 (clobbers mm0) */
#define	pand(vars, vard) \
	{ \
		movq_m2r(vard, mm0); \
		mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
		mmx_m2r(pandn, vars, mm0); \
		movq_r2m(mm0, vard); \
	}
#else
#define	pand_m2r(var, reg)	mmx_m2r(pand, var, reg)
#define	pand_r2r(regs, regd)	mmx_r2r(pand, regs, regd)
#define	pand(vars, vard)	mmx_m2m(pand, vars, vard)
#endif


/*	1x64 bitwise AND with Not the destination
*/
#define	pandn_m2r(var, reg)	mmx_m2r(pandn, var, reg)
#define	pandn_r2r(regs, regd)	mmx_r2r(pandn, regs, regd)
#define	pandn(vars, vard)	mmx_m2m(pandn, vars, vard)


/*	1x64 bitwise OR
*/
#define	por_m2r(var, reg)	mmx_m2r(por, var, reg)
#define	por_r2r(regs, regd)	mmx_r2r(por, regs, regd)
#define	por(vars, vard)	mmx_m2m(por, vars, vard)


/*	1x64 bitwise eXclusive OR
*/
#define	pxor_m2r(var, reg)	mmx_m2r(pxor, var, reg)
#define	pxor_r2r(regs, regd)	mmx_r2r(pxor, regs, regd)
#define	pxor(vars, vard)	mmx_m2m(pxor, vars, vard)
slouken@689
   484
slouken@689
   485
slouken@689
   486
/*	2x32, 4x16, and 8x8 Parallel CoMPare for EQuality
	(resulting fields are either 0 or -1)
*/
#define	pcmpeqd_m2r(var, reg)	mmx_m2r(pcmpeqd, var, reg)
#define	pcmpeqd_r2r(regs, regd)	mmx_r2r(pcmpeqd, regs, regd)
#define	pcmpeqd(vars, vard)	mmx_m2m(pcmpeqd, vars, vard)

#define	pcmpeqw_m2r(var, reg)	mmx_m2r(pcmpeqw, var, reg)
#define	pcmpeqw_r2r(regs, regd)	mmx_r2r(pcmpeqw, regs, regd)
#define	pcmpeqw(vars, vard)	mmx_m2m(pcmpeqw, vars, vard)

#define	pcmpeqb_m2r(var, reg)	mmx_m2r(pcmpeqb, var, reg)
#define	pcmpeqb_r2r(regs, regd)	mmx_r2r(pcmpeqb, regs, regd)
#define	pcmpeqb(vars, vard)	mmx_m2m(pcmpeqb, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than
	(signed compare; resulting fields are either 0 or -1)
*/
#define	pcmpgtd_m2r(var, reg)	mmx_m2r(pcmpgtd, var, reg)
#define	pcmpgtd_r2r(regs, regd)	mmx_r2r(pcmpgtd, regs, regd)
#define	pcmpgtd(vars, vard)	mmx_m2m(pcmpgtd, vars, vard)

#define	pcmpgtw_m2r(var, reg)	mmx_m2r(pcmpgtw, var, reg)
#define	pcmpgtw_r2r(regs, regd)	mmx_r2r(pcmpgtw, regs, regd)
#define	pcmpgtw(vars, vard)	mmx_m2m(pcmpgtw, vars, vard)

#define	pcmpgtb_m2r(var, reg)	mmx_m2r(pcmpgtb, var, reg)
#define	pcmpgtb_r2r(regs, regd)	mmx_r2r(pcmpgtb, regs, regd)
#define	pcmpgtb(vars, vard)	mmx_m2m(pcmpgtb, vars, vard)
slouken@689
   516
slouken@689
   517
slouken@689
   518
/*	1x64, 2x32, and 4x16 Parallel Shift Left Logical
slouken@689
   519
*/
slouken@689
   520
#define	psllq_i2r(imm, reg)	mmx_i2r(psllq, imm, reg)
slouken@689
   521
#define	psllq_m2r(var, reg)	mmx_m2r(psllq, var, reg)
slouken@689
   522
#define	psllq_r2r(regs, regd)	mmx_r2r(psllq, regs, regd)
slouken@689
   523
#define	psllq(vars, vard)	mmx_m2m(psllq, vars, vard)
slouken@689
   524
slouken@689
   525
#define	pslld_i2r(imm, reg)	mmx_i2r(pslld, imm, reg)
slouken@689
   526
#define	pslld_m2r(var, reg)	mmx_m2r(pslld, var, reg)
slouken@689
   527
#define	pslld_r2r(regs, regd)	mmx_r2r(pslld, regs, regd)
slouken@689
   528
#define	pslld(vars, vard)	mmx_m2m(pslld, vars, vard)
slouken@689
   529
slouken@689
   530
#define	psllw_i2r(imm, reg)	mmx_i2r(psllw, imm, reg)
slouken@689
   531
#define	psllw_m2r(var, reg)	mmx_m2r(psllw, var, reg)
slouken@689
   532
#define	psllw_r2r(regs, regd)	mmx_r2r(psllw, regs, regd)
slouken@689
   533
#define	psllw(vars, vard)	mmx_m2m(psllw, vars, vard)
slouken@689
   534
slouken@689
   535
slouken@689
   536
/*	1x64 (q), 2x32 (d), and 4x16 (w) Parallel Shift Right Logical.
	Zero-filling right shifts; same operand-form suffixes as the
	psll* family (_i2r immediate, _m2r memory, _r2r register,
	bare name memory-to-memory).
*/
#define psrlq_i2r(imm, reg)   mmx_i2r(psrlq, imm, reg)
#define psrlq_m2r(var, reg)   mmx_m2r(psrlq, var, reg)
#define psrlq_r2r(regs, regd) mmx_r2r(psrlq, regs, regd)
#define psrlq(vars, vard)     mmx_m2m(psrlq, vars, vard)

#define psrld_i2r(imm, reg)   mmx_i2r(psrld, imm, reg)
#define psrld_m2r(var, reg)   mmx_m2r(psrld, var, reg)
#define psrld_r2r(regs, regd) mmx_r2r(psrld, regs, regd)
#define psrld(vars, vard)     mmx_m2m(psrld, vars, vard)

#define psrlw_i2r(imm, reg)   mmx_i2r(psrlw, imm, reg)
#define psrlw_m2r(var, reg)   mmx_m2r(psrlw, var, reg)
#define psrlw_r2r(regs, regd) mmx_r2r(psrlw, regs, regd)
#define psrlw(vars, vard)     mmx_m2m(psrlw, vars, vard)
slouken@689
   552
slouken@689
   553
slouken@689
   554
/*	2x32 (d) and 4x16 (w) Parallel Shift Right Arithmetic.
	Sign-extending right shifts; note there is no 64-bit (q)
	arithmetic variant in MMX.  Same operand-form suffixes as
	the other shift families.
*/
#define psrad_i2r(imm, reg)   mmx_i2r(psrad, imm, reg)
#define psrad_m2r(var, reg)   mmx_m2r(psrad, var, reg)
#define psrad_r2r(regs, regd) mmx_r2r(psrad, regs, regd)
#define psrad(vars, vard)     mmx_m2m(psrad, vars, vard)

#define psraw_i2r(imm, reg)   mmx_i2r(psraw, imm, reg)
#define psraw_m2r(var, reg)   mmx_m2r(psraw, var, reg)
#define psraw_r2r(regs, regd) mmx_r2r(psraw, regs, regd)
#define psraw(vars, vard)     mmx_m2m(psraw, vars, vard)
slouken@689
   565
slouken@689
   566
slouken@689
   567
/*	2x32->4x16 (packssdw) and 4x16->8x8 (packsswb) PACK with
	Signed Saturation.  Packs the source and destination fields
	into the destination, in that order.
*/
#define packssdw_m2r(var, reg)    mmx_m2r(packssdw, var, reg)
#define packssdw_r2r(regs, regd)  mmx_r2r(packssdw, regs, regd)
#define packssdw(vars, vard)      mmx_m2m(packssdw, vars, vard)

#define packsswb_m2r(var, reg)    mmx_m2r(packsswb, var, reg)
#define packsswb_r2r(regs, regd)  mmx_r2r(packsswb, regs, regd)
#define packsswb(vars, vard)      mmx_m2m(packsswb, vars, vard)
slouken@689
   577
slouken@689
   578
slouken@689
   579
/*	4x16->8x8 PACK with Unsigned Saturation (packuswb).
	Packs the source and destination fields into the destination,
	in that order.
*/
#define packuswb_m2r(var, reg)    mmx_m2r(packuswb, var, reg)
#define packuswb_r2r(regs, regd)  mmx_r2r(packuswb, regs, regd)
#define packuswb(vars, vard)      mmx_m2m(packuswb, vars, vard)
slouken@689
   585
slouken@689
   586
slouken@689
   587
/*	2x32->1x64 (dq), 4x16->2x32 (wd), and 8x8->4x16 (bw)
	UNPaCK Low.  Interleaves the low half of the destination
	with the low half of the source as padding in each result
	field.
*/
#define punpckldq_m2r(var, reg)    mmx_m2r(punpckldq, var, reg)
#define punpckldq_r2r(regs, regd)  mmx_r2r(punpckldq, regs, regd)
#define punpckldq(vars, vard)      mmx_m2m(punpckldq, vars, vard)

#define punpcklwd_m2r(var, reg)    mmx_m2r(punpcklwd, var, reg)
#define punpcklwd_r2r(regs, regd)  mmx_r2r(punpcklwd, regs, regd)
#define punpcklwd(vars, vard)      mmx_m2m(punpcklwd, vars, vard)

#define punpcklbw_m2r(var, reg)    mmx_m2r(punpcklbw, var, reg)
#define punpcklbw_r2r(regs, regd)  mmx_r2r(punpcklbw, regs, regd)
#define punpcklbw(vars, vard)      mmx_m2m(punpcklbw, vars, vard)
slouken@689
   602
slouken@689
   603
slouken@689
   604
/*	2x32->1x64 (dq), 4x16->2x32 (wd), and 8x8->4x16 (bw)
	UNPaCK High.  Interleaves the high half of the destination
	with the high half of the source as padding in each result
	field.
*/
#define punpckhdq_m2r(var, reg)    mmx_m2r(punpckhdq, var, reg)
#define punpckhdq_r2r(regs, regd)  mmx_r2r(punpckhdq, regs, regd)
#define punpckhdq(vars, vard)      mmx_m2m(punpckhdq, vars, vard)

#define punpckhwd_m2r(var, reg)    mmx_m2r(punpckhwd, var, reg)
#define punpckhwd_r2r(regs, regd)  mmx_r2r(punpckhwd, regs, regd)
#define punpckhwd(vars, vard)      mmx_m2m(punpckhwd, vars, vard)

#define punpckhbw_m2r(var, reg)    mmx_m2r(punpckhbw, var, reg)
#define punpckhbw_r2r(regs, regd)  mmx_r2r(punpckhbw, regs, regd)
#define punpckhbw(vars, vard)      mmx_m2m(punpckhbw, vars, vard)
slouken@689
   619
slouken@689
   620
slouken@689
   621
/*	Empty MMX State (emms).
	Clean-up when going from MMX to float use of the registers
	shared by both; no float-to-MMX counterpart is needed because
	only the float tag word is corruptible.

	Fixes vs. the original:
	- The MMX_TRACE variant was a bare `{ ... }` block, so
	  `if (x) emms(); else ...` would not parse once the trailing
	  semicolon was added; it is now wrapped in the standard
	  do { ... } while (0) idiom so emms() acts as one statement.
	- Per this file's header, MMX_TRACE output is documented to go
	  to stderr, but the original used printf (stdout); the trace
	  now uses fprintf(stderr, ...).
*/
#ifdef	MMX_TRACE

#define	emms() \
	do { \
		fprintf(stderr, "emms()\n"); \
		__asm__ __volatile__ ("emms"); \
	} while (0)

#else

#define	emms()			__asm__ __volatile__ ("emms")

#endif
slouken@689
   640
slouken@689
   641
#endif
slouken@1662
   642
/* vi: set ts=4 sw=4 expandtab: */