src/render/mmx.h
author Sam Lantinga <slouken@libsdl.org>
Wed, 05 Jun 2013 22:33:10 -0700
changeset 7286 04087a99e3f4
parent 5156 307ccc9c135e
permissions -rw-r--r--
Added testautomation to the Visual Studio 2008 project.
     1 /*	mmx.h
     2 
     3 	MultiMedia eXtensions GCC interface library for IA32.
     4 
     5 	To use this library, simply include this header file
     6 	and compile with GCC.  You MUST have inlining enabled
     7 	in order for mmx_ok() to work; this can be done by
     8 	simply using -O on the GCC command line.
     9 
    10 	Compiling with -DMMX_TRACE will cause detailed trace
    11 	output to be sent to stderr for each mmx operation.
    12 	This adds lots of code, and obviously slows execution to
    13 	a crawl, but can be very useful for debugging.
    14 
    15 	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
    16 	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
    17 	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
    18 	AND FITNESS FOR ANY PARTICULAR PURPOSE.
    19 
    20 	1997-99 by H. Dietz and R. Fisher
    21 
    22  Notes:
    23 	It appears that the latest gas has the pand problem fixed, therefore
    24 	  I'll undefine BROKEN_PAND by default.
    25 */
    26 
/* Include guard.
   NOTE(review): the name `_MMX_H` (leading underscore + uppercase) is in the
   implementation-reserved namespace per the C standard; kept as-is here for
   compatibility with existing includers, but worth renaming to `MMX_H`. */
#ifndef _MMX_H
#define _MMX_H


/*	Warning:  at this writing, the version of GAS packaged
	with most Linux distributions does not handle the
	parallel AND operation mnemonic correctly.  If the
	symbol BROKEN_PAND is defined, a slower alternative
	coding will be used.  If execution of mmxtest results
	in an illegal instruction fault, define this symbol.
*/
/* Default: assume a fixed assembler, so the native pand encoding is used. */
#undef	BROKEN_PAND
    39 
    40 
/*	The type of a value that fits in an MMX register
	(note that long long constant values MUST be suffixed
	 by LL and unsigned long long values by ULL, lest
	 they be truncated by the compiler)
*/
/* 64-bit MMX operand: one storage cell viewed as any of the packed-data
   shapes the MMX instructions operate on.  8-byte alignment matches the
   natural alignment of a quadword memory operand. */
typedef union
{
    long long q;                /* Quadword (64-bit) value */
    unsigned long long uq;      /* Unsigned Quadword */
    int d[2];                   /* 2 Doubleword (32-bit) values */
    unsigned int ud[2];         /* 2 Unsigned Doubleword */
    short w[4];                 /* 4 Word (16-bit) values */
    unsigned short uw[4];       /* 4 Unsigned Word */
    char b[8];                  /* 8 Byte (8-bit) values */
    unsigned char ub[8];        /* 8 Unsigned Byte */
    float s[2];                 /* 2 Single-precision (32-bit) values */
} __attribute__ ((aligned(8))) mmx_t;   /* On an 8-byte (64-bit) boundary */
    58 
    59 
#if 0
/* NOTE(review): this whole CPU-detection block is compiled out.  It is
   32-bit-only ia32 assembly (pushf/popl pairs and 32-bit EFLAGS handling)
   and uses non-local asm labels (TryAMD, Return, ...) that would collide
   if the function were emitted/inlined more than once -- presumably why
   it is disabled.  Kept verbatim for reference; verify the odd "%0:"
   operand spellings and the eax clobber-vs-"=a"-output overlap before
   ever re-enabling it. */
/*	Function to test if multimedia instructions are supported...
*/
inline extern int
mm_support(void)
{
    /* Returns 1 if MMX instructions are supported,
       3 if Cyrix MMX and Extended MMX instructions are supported
       5 if AMD MMX and 3DNow! instructions are supported
       0 if hardware does not support any of these
     */
    register int rval = 0;

    __asm__ __volatile__(
                            /* See if CPUID instruction is supported ... */
                            /* ... Get copies of EFLAGS into eax and ecx */
                            "pushf\n\t"
                            "popl %%eax\n\t" "movl %%eax, %%ecx\n\t"
                            /* ... Toggle the ID bit in one copy and store */
                            /*     to the EFLAGS reg */
                            "xorl $0x200000, %%eax\n\t"
                            "push %%eax\n\t" "popf\n\t"
                            /* ... Get the (hopefully modified) EFLAGS */
                            "pushf\n\t" "popl %%eax\n\t"
                            /* ... Compare and test result */
                            "xorl %%eax, %%ecx\n\t" "testl $0x200000, %%ecx\n\t" "jz NotSupported1\n\t" /* CPUID not supported */
                            /* Get standard CPUID information, and
                               go to a specific vendor section */
                            "movl $0, %%eax\n\t" "cpuid\n\t"
                            /* Check for Intel */
                            "cmpl $0x756e6547, %%ebx\n\t"
                            "jne TryAMD\n\t"
                            "cmpl $0x49656e69, %%edx\n\t"
                            "jne TryAMD\n\t"
                            "cmpl $0x6c65746e, %%ecx\n"
                            "jne TryAMD\n\t" "jmp Intel\n\t"
                            /* Check for AMD */
                            "\nTryAMD:\n\t"
                            "cmpl $0x68747541, %%ebx\n\t"
                            "jne TryCyrix\n\t"
                            "cmpl $0x69746e65, %%edx\n\t"
                            "jne TryCyrix\n\t"
                            "cmpl $0x444d4163, %%ecx\n"
                            "jne TryCyrix\n\t" "jmp AMD\n\t"
                            /* Check for Cyrix */
                            "\nTryCyrix:\n\t"
                            "cmpl $0x69727943, %%ebx\n\t"
                            "jne NotSupported2\n\t"
                            "cmpl $0x736e4978, %%edx\n\t"
                            "jne NotSupported3\n\t"
                            "cmpl $0x64616574, %%ecx\n\t"
                            "jne NotSupported4\n\t"
                            /* Drop through to Cyrix... */
                            /* Cyrix Section */
                            /* See if extended CPUID level 80000001 is supported */
                            /* The value of CPUID/80000001 for the 6x86MX is undefined
                               according to the Cyrix CPU Detection Guide (Preliminary
                               Rev. 1.01 table 1), so we'll check the value of eax for
                               CPUID/0 to see if standard CPUID level 2 is supported.
                               According to the table, the only CPU which supports level
                               2 is also the only one which supports extended CPUID levels.
                             */
                            "cmpl $0x2, %%eax\n\t" "jne MMXtest\n\t"    /* Use standard CPUID instead */
                            /* Extended CPUID supported (in theory), so get extended
                               features */
                            "movl $0x80000001, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%eax\n\t"    /* Test for MMX */
                            "jz NotSupported5\n\t"      /* MMX not supported */
                            "testl $0x01000000, %%eax\n\t"      /* Test for Ext'd MMX */
                            "jnz EMMXSupported\n\t" "movl $1, %0:\n\n\t"        /* MMX Supported */
                            "jmp Return\n\n" "EMMXSupported:\n\t" "movl $3, %0:\n\n\t"  /* EMMX and MMX Supported */
                            "jmp Return\n\t"
                            /* AMD Section */
                            "AMD:\n\t"
                            /* See if extended CPUID is supported */
                            "movl $0x80000000, %%eax\n\t" "cpuid\n\t" "cmpl $0x80000000, %%eax\n\t" "jl MMXtest\n\t"    /* Use standard CPUID instead */
                            /* Extended CPUID supported, so get extended features */
                            "movl $0x80000001, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%edx\n\t"    /* Test for MMX */
                            "jz NotSupported6\n\t"      /* MMX not supported */
                            "testl $0x80000000, %%edx\n\t"      /* Test for 3DNow! */
                            "jnz ThreeDNowSupported\n\t" "movl $1, %0:\n\n\t"   /* MMX Supported */
                            "jmp Return\n\n" "ThreeDNowSupported:\n\t" "movl $5, %0:\n\n\t"     /* 3DNow! and MMX Supported */
                            "jmp Return\n\t"
                            /* Intel Section */
                            "Intel:\n\t"
                            /* Check for MMX */
                            "MMXtest:\n\t" "movl $1, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%edx\n\t"      /* Test for MMX */
                            "jz NotSupported7\n\t"      /* MMX Not supported */
                            "movl $1, %0:\n\n\t"        /* MMX Supported */
                            "jmp Return\n\t"
                            /* Nothing supported */
                            "\nNotSupported1:\n\t" "#movl $101, %0:\n\n\t" "\nNotSupported2:\n\t" "#movl $102, %0:\n\n\t" "\nNotSupported3:\n\t" "#movl $103, %0:\n\n\t" "\nNotSupported4:\n\t" "#movl $104, %0:\n\n\t" "\nNotSupported5:\n\t" "#movl $105, %0:\n\n\t" "\nNotSupported6:\n\t" "#movl $106, %0:\n\n\t" "\nNotSupported7:\n\t" "#movl $107, %0:\n\n\t" "movl $0, %0:\n\n\t" "Return:\n\t":"=a"(rval):     /* no input */
                            :"eax", "ebx", "ecx", "edx");

    /* Return */
    return (rval);
}

/*	Function to test if mmx instructions are supported...
*/
/* Convenience wrapper: masks mm_support() down to the plain-MMX bit, so it
   returns 1 for any of the Intel/AMD/Cyrix MMX-capable results above. */
inline extern int
mmx_ok(void)
{
    /* Returns 1 if MMX instructions are supported, 0 otherwise */
    return (mm_support() & 0x1);
}
#endif
   166 
   167 /*	Helper functions for the instruction macros that follow...
   168 	(note that memory-to-register, m2r, instructions are nearly
   169 	 as efficient as register-to-register, r2r, instructions;
   170 	 however, memory-to-memory instructions are really simulated
   171 	 as a convenience, and are only 1/3 as efficient)
   172 */
   173 #ifdef	MMX_TRACE
   174 
   175 /*	Include the stuff for printing a trace to stderr...
   176 */
   177 
   178 #define	mmx_i2r(op, imm, reg) \
   179 	{ \
   180 		mmx_t mmx_trace; \
   181 		mmx_trace.uq = (imm); \
   182 		printf(#op "_i2r(" #imm "=0x%08x%08x, ", \
   183 			mmx_trace.d[1], mmx_trace.d[0]); \
   184 		__asm__ __volatile__ ("movq %%" #reg ", %0" \
   185 				      : "=X" (mmx_trace) \
   186 				      : /* nothing */ ); \
   187 		printf(#reg "=0x%08x%08x) => ", \
   188 			mmx_trace.d[1], mmx_trace.d[0]); \
   189 		__asm__ __volatile__ (#op " %0, %%" #reg \
   190 				      : /* nothing */ \
   191 				      : "X" (imm)); \
   192 		__asm__ __volatile__ ("movq %%" #reg ", %0" \
   193 				      : "=X" (mmx_trace) \
   194 				      : /* nothing */ ); \
   195 		printf(#reg "=0x%08x%08x\n", \
   196 			mmx_trace.d[1], mmx_trace.d[0]); \
   197 	}
   198 
   199 #define	mmx_m2r(op, mem, reg) \
   200 	{ \
   201 		mmx_t mmx_trace; \
   202 		mmx_trace = (mem); \
   203 		printf(#op "_m2r(" #mem "=0x%08x%08x, ", \
   204 			mmx_trace.d[1], mmx_trace.d[0]); \
   205 		__asm__ __volatile__ ("movq %%" #reg ", %0" \
   206 				      : "=X" (mmx_trace) \
   207 				      : /* nothing */ ); \
   208 		printf(#reg "=0x%08x%08x) => ", \
   209 			mmx_trace.d[1], mmx_trace.d[0]); \
   210 		__asm__ __volatile__ (#op " %0, %%" #reg \
   211 				      : /* nothing */ \
   212 				      : "X" (mem)); \
   213 		__asm__ __volatile__ ("movq %%" #reg ", %0" \
   214 				      : "=X" (mmx_trace) \
   215 				      : /* nothing */ ); \
   216 		printf(#reg "=0x%08x%08x\n", \
   217 			mmx_trace.d[1], mmx_trace.d[0]); \
   218 	}
   219 
   220 #define	mmx_r2m(op, reg, mem) \
   221 	{ \
   222 		mmx_t mmx_trace; \
   223 		__asm__ __volatile__ ("movq %%" #reg ", %0" \
   224 				      : "=X" (mmx_trace) \
   225 				      : /* nothing */ ); \
   226 		printf(#op "_r2m(" #reg "=0x%08x%08x, ", \
   227 			mmx_trace.d[1], mmx_trace.d[0]); \
   228 		mmx_trace = (mem); \
   229 		printf(#mem "=0x%08x%08x) => ", \
   230 			mmx_trace.d[1], mmx_trace.d[0]); \
   231 		__asm__ __volatile__ (#op " %%" #reg ", %0" \
   232 				      : "=X" (mem) \
   233 				      : /* nothing */ ); \
   234 		mmx_trace = (mem); \
   235 		printf(#mem "=0x%08x%08x\n", \
   236 			mmx_trace.d[1], mmx_trace.d[0]); \
   237 	}
   238 
   239 #define	mmx_r2r(op, regs, regd) \
   240 	{ \
   241 		mmx_t mmx_trace; \
   242 		__asm__ __volatile__ ("movq %%" #regs ", %0" \
   243 				      : "=X" (mmx_trace) \
   244 				      : /* nothing */ ); \
   245 		printf(#op "_r2r(" #regs "=0x%08x%08x, ", \
   246 			mmx_trace.d[1], mmx_trace.d[0]); \
   247 		__asm__ __volatile__ ("movq %%" #regd ", %0" \
   248 				      : "=X" (mmx_trace) \
   249 				      : /* nothing */ ); \
   250 		printf(#regd "=0x%08x%08x) => ", \
   251 			mmx_trace.d[1], mmx_trace.d[0]); \
   252 		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
   253 		__asm__ __volatile__ ("movq %%" #regd ", %0" \
   254 				      : "=X" (mmx_trace) \
   255 				      : /* nothing */ ); \
   256 		printf(#regd "=0x%08x%08x\n", \
   257 			mmx_trace.d[1], mmx_trace.d[0]); \
   258 	}
   259 
   260 #define	mmx_m2m(op, mems, memd) \
   261 	{ \
   262 		mmx_t mmx_trace; \
   263 		mmx_trace = (mems); \
   264 		printf(#op "_m2m(" #mems "=0x%08x%08x, ", \
   265 			mmx_trace.d[1], mmx_trace.d[0]); \
   266 		mmx_trace = (memd); \
   267 		printf(#memd "=0x%08x%08x) => ", \
   268 			mmx_trace.d[1], mmx_trace.d[0]); \
   269 		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
   270 				      #op " %1, %%mm0\n\t" \
   271 				      "movq %%mm0, %0" \
   272 				      : "=X" (memd) \
   273 				      : "X" (mems)); \
   274 		mmx_trace = (memd); \
   275 		printf(#memd "=0x%08x%08x\n", \
   276 			mmx_trace.d[1], mmx_trace.d[0]); \
   277 	}
   278 
   279 #else
   280 
   281 /*	These macros are a lot simpler without the tracing...
   282 */
   283 
   284 #define	mmx_i2r(op, imm, reg) \
   285 	__asm__ __volatile__ (#op " %0, %%" #reg \
   286 			      : /* nothing */ \
   287 			      : "X" (imm) )
   288 
   289 #define	mmx_m2r(op, mem, reg) \
   290 	__asm__ __volatile__ (#op " %0, %%" #reg \
   291 			      : /* nothing */ \
   292 			      : "m" (mem))
   293 
   294 #define	mmx_r2m(op, reg, mem) \
   295 	__asm__ __volatile__ (#op " %%" #reg ", %0" \
   296 			      : "=m" (mem) \
   297 			      : /* nothing */ )
   298 
   299 #define	mmx_r2r(op, regs, regd) \
   300 	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
   301 
   302 #define	mmx_m2m(op, mems, memd) \
   303 	__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
   304 			      #op " %1, %%mm0\n\t" \
   305 			      "movq %%mm0, %0" \
   306 			      : "=X" (memd) \
   307 			      : "X" (mems))
   308 
   309 #endif
   310 
   311 
/*	1x64 MOVe Quadword
	(this is both a load and a store...
	 in fact, it is the only way to store)
*/
/* Load 64 bits from var into an mmx register. */
#define	movq_m2r(var, reg)	mmx_m2r(movq, var, reg)
/* Store 64 bits from an mmx register to var. */
#define	movq_r2m(reg, var)	mmx_r2m(movq, reg, var)
/* Copy one mmx register to another. */
#define	movq_r2r(regs, regd)	mmx_r2r(movq, regs, regd)
/* Variable-to-variable 64-bit copy, staged through mm0 (clobbers mm0). */
#define	movq(vars, vard) \
	__asm__ __volatile__ ("movq %1, %%mm0\n\t" \
			      "movq %%mm0, %0" \
			      : "=X" (vard) \
			      : "X" (vars))
   324 
   325 
/*	1x32 MOVe Doubleword
	(like movq, this is both load and store...
	 but is most useful for moving things between
	 mmx registers and ordinary registers)
*/
/* Load 32 bits from var into an mmx register (upper half zeroed by movd). */
#define	movd_m2r(var, reg)	mmx_m2r(movd, var, reg)
/* Store the low 32 bits of an mmx register to var. */
#define	movd_r2m(reg, var)	mmx_r2m(movd, reg, var)
/* 32-bit register-to-register move. */
#define	movd_r2r(regs, regd)	mmx_r2r(movd, regs, regd)
/* Variable-to-variable 32-bit copy, staged through mm0 (clobbers mm0). */
#define	movd(vars, vard) \
	__asm__ __volatile__ ("movd %1, %%mm0\n\t" \
			      "movd %%mm0, %0" \
			      : "=X" (vard) \
			      : "X" (vars))
   339 
   340 
/*	2x32, 4x16, and 8x8 Parallel ADDs
	(suffixes: _m2r = memory/variable into register, _r2r = register to
	 register; the bare form is memory-to-memory via mmx_m2m and
	 clobbers mm0.  Plain padd* wraps around on overflow.)
*/
#define	paddd_m2r(var, reg)	mmx_m2r(paddd, var, reg)
#define	paddd_r2r(regs, regd)	mmx_r2r(paddd, regs, regd)
#define	paddd(vars, vard)	mmx_m2m(paddd, vars, vard)

#define	paddw_m2r(var, reg)	mmx_m2r(paddw, var, reg)
#define	paddw_r2r(regs, regd)	mmx_r2r(paddw, regs, regd)
#define	paddw(vars, vard)	mmx_m2m(paddw, vars, vard)

#define	paddb_m2r(var, reg)	mmx_m2r(paddb, var, reg)
#define	paddb_r2r(regs, regd)	mmx_r2r(paddb, regs, regd)
#define	paddb(vars, vard)	mmx_m2m(paddb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Saturation arithmetic
	(signed saturation: results clamp instead of wrapping)
*/
#define	paddsw_m2r(var, reg)	mmx_m2r(paddsw, var, reg)
#define	paddsw_r2r(regs, regd)	mmx_r2r(paddsw, regs, regd)
#define	paddsw(vars, vard)	mmx_m2m(paddsw, vars, vard)

#define	paddsb_m2r(var, reg)	mmx_m2r(paddsb, var, reg)
#define	paddsb_r2r(regs, regd)	mmx_r2r(paddsb, regs, regd)
#define	paddsb(vars, vard)	mmx_m2m(paddsb, vars, vard)


/*	4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic
*/
#define	paddusw_m2r(var, reg)	mmx_m2r(paddusw, var, reg)
#define	paddusw_r2r(regs, regd)	mmx_r2r(paddusw, regs, regd)
#define	paddusw(vars, vard)	mmx_m2m(paddusw, vars, vard)

#define	paddusb_m2r(var, reg)	mmx_m2r(paddusb, var, reg)
#define	paddusb_r2r(regs, regd)	mmx_r2r(paddusb, regs, regd)
#define	paddusb(vars, vard)	mmx_m2m(paddusb, vars, vard)
   376 
   377 
/*	2x32, 4x16, and 8x8 Parallel SUBs
	(dest -= src per field; bare forms go through mmx_m2m and
	 clobber mm0)
*/
#define	psubd_m2r(var, reg)	mmx_m2r(psubd, var, reg)
#define	psubd_r2r(regs, regd)	mmx_r2r(psubd, regs, regd)
#define	psubd(vars, vard)	mmx_m2m(psubd, vars, vard)

#define	psubw_m2r(var, reg)	mmx_m2r(psubw, var, reg)
#define	psubw_r2r(regs, regd)	mmx_r2r(psubw, regs, regd)
#define	psubw(vars, vard)	mmx_m2m(psubw, vars, vard)

#define	psubb_m2r(var, reg)	mmx_m2r(psubb, var, reg)
#define	psubb_r2r(regs, regd)	mmx_r2r(psubb, regs, regd)
#define	psubb(vars, vard)	mmx_m2m(psubb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Saturation arithmetic
*/
#define	psubsw_m2r(var, reg)	mmx_m2r(psubsw, var, reg)
#define	psubsw_r2r(regs, regd)	mmx_r2r(psubsw, regs, regd)
#define	psubsw(vars, vard)	mmx_m2m(psubsw, vars, vard)

#define	psubsb_m2r(var, reg)	mmx_m2r(psubsb, var, reg)
#define	psubsb_r2r(regs, regd)	mmx_r2r(psubsb, regs, regd)
#define	psubsb(vars, vard)	mmx_m2m(psubsb, vars, vard)


/*	4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic
*/
#define	psubusw_m2r(var, reg)	mmx_m2r(psubusw, var, reg)
#define	psubusw_r2r(regs, regd)	mmx_r2r(psubusw, regs, regd)
#define	psubusw(vars, vard)	mmx_m2m(psubusw, vars, vard)

#define	psubusb_m2r(var, reg)	mmx_m2r(psubusb, var, reg)
#define	psubusb_r2r(regs, regd)	mmx_r2r(psubusb, regs, regd)
#define	psubusb(vars, vard)	mmx_m2m(psubusb, vars, vard)
   413 
   414 
/*	4x16 Parallel MULs giving Low 4x16 portions of results
*/
#define	pmullw_m2r(var, reg)	mmx_m2r(pmullw, var, reg)
#define	pmullw_r2r(regs, regd)	mmx_r2r(pmullw, regs, regd)
#define	pmullw(vars, vard)	mmx_m2m(pmullw, vars, vard)


/*	4x16 Parallel MULs giving High 4x16 portions of results
*/
#define	pmulhw_m2r(var, reg)	mmx_m2r(pmulhw, var, reg)
#define	pmulhw_r2r(regs, regd)	mmx_r2r(pmulhw, regs, regd)
#define	pmulhw(vars, vard)	mmx_m2m(pmulhw, vars, vard)


/*	4x16->2x32 Parallel Mul-ADD
	(muls like pmullw, then adds adjacent 16-bit fields
	 in the multiply result to make the final 2x32 result)
*/
#define	pmaddwd_m2r(var, reg)	mmx_m2r(pmaddwd, var, reg)
#define	pmaddwd_r2r(regs, regd)	mmx_r2r(pmaddwd, regs, regd)
#define	pmaddwd(vars, vard)	mmx_m2m(pmaddwd, vars, vard)
   436 
   437 
   438 /*	1x64 bitwise AND
   439 */
   440 #ifdef	BROKEN_PAND
   441 #define	pand_m2r(var, reg) \
   442 	{ \
   443 		mmx_m2r(pandn, (mmx_t) -1LL, reg); \
   444 		mmx_m2r(pandn, var, reg); \
   445 	}
   446 #define	pand_r2r(regs, regd) \
   447 	{ \
   448 		mmx_m2r(pandn, (mmx_t) -1LL, regd); \
   449 		mmx_r2r(pandn, regs, regd) \
   450 	}
   451 #define	pand(vars, vard) \
   452 	{ \
   453 		movq_m2r(vard, mm0); \
   454 		mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
   455 		mmx_m2r(pandn, vars, mm0); \
   456 		movq_r2m(mm0, vard); \
   457 	}
   458 #else
   459 #define	pand_m2r(var, reg)	mmx_m2r(pand, var, reg)
   460 #define	pand_r2r(regs, regd)	mmx_r2r(pand, regs, regd)
   461 #define	pand(vars, vard)	mmx_m2m(pand, vars, vard)
   462 #endif
   463 
   464 
/*	1x64 bitwise AND with Not the destination
	(pandn: dest = ~dest & src)
*/
#define	pandn_m2r(var, reg)	mmx_m2r(pandn, var, reg)
#define	pandn_r2r(regs, regd)	mmx_r2r(pandn, regs, regd)
#define	pandn(vars, vard)	mmx_m2m(pandn, vars, vard)


/*	1x64 bitwise OR
*/
#define	por_m2r(var, reg)	mmx_m2r(por, var, reg)
#define	por_r2r(regs, regd)	mmx_r2r(por, regs, regd)
#define	por(vars, vard)	mmx_m2m(por, vars, vard)


/*	1x64 bitwise eXclusive OR
*/
#define	pxor_m2r(var, reg)	mmx_m2r(pxor, var, reg)
#define	pxor_r2r(regs, regd)	mmx_r2r(pxor, regs, regd)
#define	pxor(vars, vard)	mmx_m2m(pxor, vars, vard)
   484 
   485 
/*	2x32, 4x16, and 8x8 Parallel CoMPare for EQuality
	(resulting fields are either 0 or -1)
*/
#define	pcmpeqd_m2r(var, reg)	mmx_m2r(pcmpeqd, var, reg)
#define	pcmpeqd_r2r(regs, regd)	mmx_r2r(pcmpeqd, regs, regd)
#define	pcmpeqd(vars, vard)	mmx_m2m(pcmpeqd, vars, vard)

#define	pcmpeqw_m2r(var, reg)	mmx_m2r(pcmpeqw, var, reg)
#define	pcmpeqw_r2r(regs, regd)	mmx_r2r(pcmpeqw, regs, regd)
#define	pcmpeqw(vars, vard)	mmx_m2m(pcmpeqw, vars, vard)

#define	pcmpeqb_m2r(var, reg)	mmx_m2r(pcmpeqb, var, reg)
#define	pcmpeqb_r2r(regs, regd)	mmx_r2r(pcmpeqb, regs, regd)
#define	pcmpeqb(vars, vard)	mmx_m2m(pcmpeqb, vars, vard)


/*	2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than
	(resulting fields are either 0 or -1)
*/
#define	pcmpgtd_m2r(var, reg)	mmx_m2r(pcmpgtd, var, reg)
#define	pcmpgtd_r2r(regs, regd)	mmx_r2r(pcmpgtd, regs, regd)
#define	pcmpgtd(vars, vard)	mmx_m2m(pcmpgtd, vars, vard)

#define	pcmpgtw_m2r(var, reg)	mmx_m2r(pcmpgtw, var, reg)
#define	pcmpgtw_r2r(regs, regd)	mmx_r2r(pcmpgtw, regs, regd)
#define	pcmpgtw(vars, vard)	mmx_m2m(pcmpgtw, vars, vard)

#define	pcmpgtb_m2r(var, reg)	mmx_m2r(pcmpgtb, var, reg)
#define	pcmpgtb_r2r(regs, regd)	mmx_r2r(pcmpgtb, regs, regd)
#define	pcmpgtb(vars, vard)	mmx_m2m(pcmpgtb, vars, vard)
   516 
   517 
/*	1x64, 2x32, and 4x16 Parallel Shift Left Logical
	(the _i2r forms take an immediate shift count)
*/
#define	psllq_i2r(imm, reg)	mmx_i2r(psllq, imm, reg)
#define	psllq_m2r(var, reg)	mmx_m2r(psllq, var, reg)
#define	psllq_r2r(regs, regd)	mmx_r2r(psllq, regs, regd)
#define	psllq(vars, vard)	mmx_m2m(psllq, vars, vard)

#define	pslld_i2r(imm, reg)	mmx_i2r(pslld, imm, reg)
#define	pslld_m2r(var, reg)	mmx_m2r(pslld, var, reg)
#define	pslld_r2r(regs, regd)	mmx_r2r(pslld, regs, regd)
#define	pslld(vars, vard)	mmx_m2m(pslld, vars, vard)

#define	psllw_i2r(imm, reg)	mmx_i2r(psllw, imm, reg)
#define	psllw_m2r(var, reg)	mmx_m2r(psllw, var, reg)
#define	psllw_r2r(regs, regd)	mmx_r2r(psllw, regs, regd)
#define	psllw(vars, vard)	mmx_m2m(psllw, vars, vard)


/*	1x64, 2x32, and 4x16 Parallel Shift Right Logical
	(zero-fill from the left)
*/
#define	psrlq_i2r(imm, reg)	mmx_i2r(psrlq, imm, reg)
#define	psrlq_m2r(var, reg)	mmx_m2r(psrlq, var, reg)
#define	psrlq_r2r(regs, regd)	mmx_r2r(psrlq, regs, regd)
#define	psrlq(vars, vard)	mmx_m2m(psrlq, vars, vard)

#define	psrld_i2r(imm, reg)	mmx_i2r(psrld, imm, reg)
#define	psrld_m2r(var, reg)	mmx_m2r(psrld, var, reg)
#define	psrld_r2r(regs, regd)	mmx_r2r(psrld, regs, regd)
#define	psrld(vars, vard)	mmx_m2m(psrld, vars, vard)

#define	psrlw_i2r(imm, reg)	mmx_i2r(psrlw, imm, reg)
#define	psrlw_m2r(var, reg)	mmx_m2r(psrlw, var, reg)
#define	psrlw_r2r(regs, regd)	mmx_r2r(psrlw, regs, regd)
#define	psrlw(vars, vard)	mmx_m2m(psrlw, vars, vard)


/*	2x32 and 4x16 Parallel Shift Right Arithmetic
	(sign-extend from the left; note there is no 1x64 form)
*/
#define	psrad_i2r(imm, reg)	mmx_i2r(psrad, imm, reg)
#define	psrad_m2r(var, reg)	mmx_m2r(psrad, var, reg)
#define	psrad_r2r(regs, regd)	mmx_r2r(psrad, regs, regd)
#define	psrad(vars, vard)	mmx_m2m(psrad, vars, vard)

#define	psraw_i2r(imm, reg)	mmx_i2r(psraw, imm, reg)
#define	psraw_m2r(var, reg)	mmx_m2r(psraw, var, reg)
#define	psraw_r2r(regs, regd)	mmx_r2r(psraw, regs, regd)
#define	psraw(vars, vard)	mmx_m2m(psraw, vars, vard)
   565 
   566 
/*	2x32->4x16 and 4x16->8x8 PACK and Signed Saturate
	(packs source and dest fields into dest in that order)
*/
#define	packssdw_m2r(var, reg)	mmx_m2r(packssdw, var, reg)
#define	packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd)
#define	packssdw(vars, vard)	mmx_m2m(packssdw, vars, vard)

#define	packsswb_m2r(var, reg)	mmx_m2r(packsswb, var, reg)
#define	packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd)
#define	packsswb(vars, vard)	mmx_m2m(packsswb, vars, vard)


/*	4x16->8x8 PACK and Unsigned Saturate
	(packs source and dest fields into dest in that order)
*/
#define	packuswb_m2r(var, reg)	mmx_m2r(packuswb, var, reg)
#define	packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd)
#define	packuswb(vars, vard)	mmx_m2m(packuswb, vars, vard)
   585 
   586 
/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
	(interleaves low half of dest with low half of source
	 as padding in each result field)
*/
#define	punpckldq_m2r(var, reg)	mmx_m2r(punpckldq, var, reg)
#define	punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd)
#define	punpckldq(vars, vard)	mmx_m2m(punpckldq, vars, vard)

#define	punpcklwd_m2r(var, reg)	mmx_m2r(punpcklwd, var, reg)
#define	punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd)
#define	punpcklwd(vars, vard)	mmx_m2m(punpcklwd, vars, vard)

#define	punpcklbw_m2r(var, reg)	mmx_m2r(punpcklbw, var, reg)
#define	punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd)
#define	punpcklbw(vars, vard)	mmx_m2m(punpcklbw, vars, vard)


/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
	(interleaves high half of dest with high half of source
	 as padding in each result field)
*/
#define	punpckhdq_m2r(var, reg)	mmx_m2r(punpckhdq, var, reg)
#define	punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd)
#define	punpckhdq(vars, vard)	mmx_m2m(punpckhdq, vars, vard)

#define	punpckhwd_m2r(var, reg)	mmx_m2r(punpckhwd, var, reg)
#define	punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd)
#define	punpckhwd(vars, vard)	mmx_m2m(punpckhwd, vars, vard)

#define	punpckhbw_m2r(var, reg)	mmx_m2r(punpckhbw, var, reg)
#define	punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd)
#define	punpckhbw(vars, vard)	mmx_m2m(punpckhbw, vars, vard)
   619 
   620 
   621 /*	Empty MMx State
   622 	(used to clean-up when going from mmx to float use
   623 	 of the registers that are shared by both; note that
   624 	 there is no float-to-mmx operation needed, because
   625 	 only the float tag word info is corruptible)
   626 */
   627 #ifdef	MMX_TRACE
   628 
   629 #define	emms() \
   630 	{ \
   631 		printf("emms()\n"); \
   632 		__asm__ __volatile__ ("emms"); \
   633 	}
   634 
   635 #else
   636 
   637 #define	emms()			__asm__ __volatile__ ("emms")
   638 
   639 #endif
   640 
   641 #endif
   642 /* vi: set ts=4 sw=4 expandtab: */