src/render/mmx.h
changeset 5156 307ccc9c135e
parent 5128 f2c2f0ecba5f
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/render/mmx.h	Thu Feb 03 00:19:40 2011 -0800
     1.3 @@ -0,0 +1,642 @@
     1.4 +/*	mmx.h
     1.5 +
     1.6 +	MultiMedia eXtensions GCC interface library for IA32.
     1.7 +
     1.8 +	To use this library, simply include this header file
     1.9 +	and compile with GCC.  You MUST have inlining enabled
    1.10 +	in order for mmx_ok() to work; this can be done by
    1.11 +	simply using -O on the GCC command line.
    1.12 +
    1.13 +	Compiling with -DMMX_TRACE will cause detailed trace
    1.14 +	output to be sent to stderr for each mmx operation.
    1.15 +	This adds lots of code, and obviously slows execution to
    1.16 +	a crawl, but can be very useful for debugging.
    1.17 +
    1.18 +	THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
    1.19 +	EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
    1.20 +	LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
    1.21 +	AND FITNESS FOR ANY PARTICULAR PURPOSE.
    1.22 +
    1.23 +	1997-99 by H. Dietz and R. Fisher
    1.24 +
    1.25 + Notes:
    1.26 +	It appears that the latest gas has the pand problem fixed, therefore
    1.27 +	  I'll undefine BROKEN_PAND by default.
    1.28 +*/
    1.29 +
    1.30 +#ifndef _MMX_H
    1.31 +#define _MMX_H
    1.32 +
    1.33 +
    1.34 +/*	Warning:  at this writing, the version of GAS packaged
    1.35 +	with most Linux distributions does not handle the
    1.36 +	parallel AND operation mnemonic correctly.  If the
    1.37 +	symbol BROKEN_PAND is defined, a slower alternative
    1.38 +	coding will be used.  If execution of mmxtest results
    1.39 +	in an illegal instruction fault, define this symbol.
    1.40 +*/
    1.41 +#undef	BROKEN_PAND
    1.42 +
    1.43 +
     1.44 +/*	The type of a value that fits in an MMX register
    1.45 +	(note that long long constant values MUST be suffixed
    1.46 +	 by LL and unsigned long long values by ULL, lest
    1.47 +	 they be truncated by the compiler)
    1.48 +*/
    1.49 +typedef union
    1.50 +{
    1.51 +    long long q;                /* Quadword (64-bit) value */
    1.52 +    unsigned long long uq;      /* Unsigned Quadword */
    1.53 +    int d[2];                   /* 2 Doubleword (32-bit) values */
    1.54 +    unsigned int ud[2];         /* 2 Unsigned Doubleword */
    1.55 +    short w[4];                 /* 4 Word (16-bit) values */
    1.56 +    unsigned short uw[4];       /* 4 Unsigned Word */
    1.57 +    char b[8];                  /* 8 Byte (8-bit) values */
    1.58 +    unsigned char ub[8];        /* 8 Unsigned Byte */
    1.59 +    float s[2];                 /* Single-precision (32-bit) value */
    1.60 +} __attribute__ ((aligned(8))) mmx_t;   /* On an 8-byte (64-bit) boundary */
    1.61 +
    1.62 +
    1.63 +#if 0
    1.64 +/*	Function to test if multimedia instructions are supported...
    1.65 +*/
    1.66 +inline extern int
    1.67 +mm_support(void)
    1.68 +{
    1.69 +    /* Returns 1 if MMX instructions are supported,
    1.70 +       3 if Cyrix MMX and Extended MMX instructions are supported
    1.71 +       5 if AMD MMX and 3DNow! instructions are supported
    1.72 +       0 if hardware does not support any of these
    1.73 +     */
    1.74 +    register int rval = 0;
    1.75 +
    1.76 +    __asm__ __volatile__(
    1.77 +                            /* See if CPUID instruction is supported ... */
    1.78 +                            /* ... Get copies of EFLAGS into eax and ecx */
    1.79 +                            "pushf\n\t"
    1.80 +                            "popl %%eax\n\t" "movl %%eax, %%ecx\n\t"
    1.81 +                            /* ... Toggle the ID bit in one copy and store */
    1.82 +                            /*     to the EFLAGS reg */
    1.83 +                            "xorl $0x200000, %%eax\n\t"
    1.84 +                            "push %%eax\n\t" "popf\n\t"
    1.85 +                            /* ... Get the (hopefully modified) EFLAGS */
    1.86 +                            "pushf\n\t" "popl %%eax\n\t"
    1.87 +                            /* ... Compare and test result */
    1.88 +                            "xorl %%eax, %%ecx\n\t" "testl $0x200000, %%ecx\n\t" "jz NotSupported1\n\t" /* CPUID not supported */
    1.89 +                            /* Get standard CPUID information, and
    1.90 +                               go to a specific vendor section */
    1.91 +                            "movl $0, %%eax\n\t" "cpuid\n\t"
    1.92 +                            /* Check for Intel */
    1.93 +                            "cmpl $0x756e6547, %%ebx\n\t"
    1.94 +                            "jne TryAMD\n\t"
    1.95 +                            "cmpl $0x49656e69, %%edx\n\t"
    1.96 +                            "jne TryAMD\n\t"
    1.97 +                            "cmpl $0x6c65746e, %%ecx\n"
    1.98 +                            "jne TryAMD\n\t" "jmp Intel\n\t"
    1.99 +                            /* Check for AMD */
   1.100 +                            "\nTryAMD:\n\t"
   1.101 +                            "cmpl $0x68747541, %%ebx\n\t"
   1.102 +                            "jne TryCyrix\n\t"
   1.103 +                            "cmpl $0x69746e65, %%edx\n\t"
   1.104 +                            "jne TryCyrix\n\t"
   1.105 +                            "cmpl $0x444d4163, %%ecx\n"
   1.106 +                            "jne TryCyrix\n\t" "jmp AMD\n\t"
   1.107 +                            /* Check for Cyrix */
   1.108 +                            "\nTryCyrix:\n\t"
   1.109 +                            "cmpl $0x69727943, %%ebx\n\t"
   1.110 +                            "jne NotSupported2\n\t"
   1.111 +                            "cmpl $0x736e4978, %%edx\n\t"
   1.112 +                            "jne NotSupported3\n\t"
   1.113 +                            "cmpl $0x64616574, %%ecx\n\t"
   1.114 +                            "jne NotSupported4\n\t"
   1.115 +                            /* Drop through to Cyrix... */
   1.116 +                            /* Cyrix Section */
   1.117 +                            /* See if extended CPUID level 80000001 is supported */
   1.118 +                            /* The value of CPUID/80000001 for the 6x86MX is undefined
   1.119 +                               according to the Cyrix CPU Detection Guide (Preliminary
   1.120 +                               Rev. 1.01 table 1), so we'll check the value of eax for
   1.121 +                               CPUID/0 to see if standard CPUID level 2 is supported.
   1.122 +                               According to the table, the only CPU which supports level
   1.123 +                               2 is also the only one which supports extended CPUID levels.
   1.124 +                             */
   1.125 +                            "cmpl $0x2, %%eax\n\t" "jne MMXtest\n\t"    /* Use standard CPUID instead */
   1.126 +                            /* Extended CPUID supported (in theory), so get extended
   1.127 +                               features */
   1.128 +                            "movl $0x80000001, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%eax\n\t"    /* Test for MMX */
   1.129 +                            "jz NotSupported5\n\t"      /* MMX not supported */
   1.130 +                            "testl $0x01000000, %%eax\n\t"      /* Test for Ext'd MMX */
   1.131 +                            "jnz EMMXSupported\n\t" "movl $1, %0:\n\n\t"        /* MMX Supported */
   1.132 +                            "jmp Return\n\n" "EMMXSupported:\n\t" "movl $3, %0:\n\n\t"  /* EMMX and MMX Supported */
   1.133 +                            "jmp Return\n\t"
   1.134 +                            /* AMD Section */
   1.135 +                            "AMD:\n\t"
   1.136 +                            /* See if extended CPUID is supported */
   1.137 +                            "movl $0x80000000, %%eax\n\t" "cpuid\n\t" "cmpl $0x80000000, %%eax\n\t" "jl MMXtest\n\t"    /* Use standard CPUID instead */
   1.138 +                            /* Extended CPUID supported, so get extended features */
   1.139 +                            "movl $0x80000001, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%edx\n\t"    /* Test for MMX */
   1.140 +                            "jz NotSupported6\n\t"      /* MMX not supported */
   1.141 +                            "testl $0x80000000, %%edx\n\t"      /* Test for 3DNow! */
   1.142 +                            "jnz ThreeDNowSupported\n\t" "movl $1, %0:\n\n\t"   /* MMX Supported */
   1.143 +                            "jmp Return\n\n" "ThreeDNowSupported:\n\t" "movl $5, %0:\n\n\t"     /* 3DNow! and MMX Supported */
   1.144 +                            "jmp Return\n\t"
   1.145 +                            /* Intel Section */
   1.146 +                            "Intel:\n\t"
   1.147 +                            /* Check for MMX */
   1.148 +                            "MMXtest:\n\t" "movl $1, %%eax\n\t" "cpuid\n\t" "testl $0x00800000, %%edx\n\t"      /* Test for MMX */
   1.149 +                            "jz NotSupported7\n\t"      /* MMX Not supported */
   1.150 +                            "movl $1, %0:\n\n\t"        /* MMX Supported */
   1.151 +                            "jmp Return\n\t"
   1.152 +                            /* Nothing supported */
   1.153 +                            "\nNotSupported1:\n\t" "#movl $101, %0:\n\n\t" "\nNotSupported2:\n\t" "#movl $102, %0:\n\n\t" "\nNotSupported3:\n\t" "#movl $103, %0:\n\n\t" "\nNotSupported4:\n\t" "#movl $104, %0:\n\n\t" "\nNotSupported5:\n\t" "#movl $105, %0:\n\n\t" "\nNotSupported6:\n\t" "#movl $106, %0:\n\n\t" "\nNotSupported7:\n\t" "#movl $107, %0:\n\n\t" "movl $0, %0:\n\n\t" "Return:\n\t":"=a"(rval):     /* no input */
   1.154 +                            :"eax", "ebx", "ecx", "edx");
   1.155 +
   1.156 +    /* Return */
   1.157 +    return (rval);
   1.158 +}
   1.159 +
   1.160 +/*	Function to test if mmx instructions are supported...
   1.161 +*/
   1.162 +inline extern int
   1.163 +mmx_ok(void)
   1.164 +{
   1.165 +    /* Returns 1 if MMX instructions are supported, 0 otherwise */
   1.166 +    return (mm_support() & 0x1);
   1.167 +}
   1.168 +#endif
   1.169 +
   1.170 +/*	Helper functions for the instruction macros that follow...
   1.171 +	(note that memory-to-register, m2r, instructions are nearly
   1.172 +	 as efficient as register-to-register, r2r, instructions;
   1.173 +	 however, memory-to-memory instructions are really simulated
   1.174 +	 as a convenience, and are only 1/3 as efficient)
   1.175 +*/
   1.176 +#ifdef	MMX_TRACE
   1.177 +
   1.178 +/*	Include the stuff for printing a trace to stderr...
   1.179 +*/
   1.180 +
   1.181 +#define	mmx_i2r(op, imm, reg) \
   1.182 +	{ \
   1.183 +		mmx_t mmx_trace; \
   1.184 +		mmx_trace.uq = (imm); \
   1.185 +		printf(#op "_i2r(" #imm "=0x%08x%08x, ", \
   1.186 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.187 +		__asm__ __volatile__ ("movq %%" #reg ", %0" \
   1.188 +				      : "=X" (mmx_trace) \
   1.189 +				      : /* nothing */ ); \
   1.190 +		printf(#reg "=0x%08x%08x) => ", \
   1.191 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.192 +		__asm__ __volatile__ (#op " %0, %%" #reg \
   1.193 +				      : /* nothing */ \
   1.194 +				      : "X" (imm)); \
   1.195 +		__asm__ __volatile__ ("movq %%" #reg ", %0" \
   1.196 +				      : "=X" (mmx_trace) \
   1.197 +				      : /* nothing */ ); \
   1.198 +		printf(#reg "=0x%08x%08x\n", \
   1.199 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.200 +	}
   1.201 +
   1.202 +#define	mmx_m2r(op, mem, reg) \
   1.203 +	{ \
   1.204 +		mmx_t mmx_trace; \
   1.205 +		mmx_trace = (mem); \
   1.206 +		printf(#op "_m2r(" #mem "=0x%08x%08x, ", \
   1.207 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.208 +		__asm__ __volatile__ ("movq %%" #reg ", %0" \
   1.209 +				      : "=X" (mmx_trace) \
   1.210 +				      : /* nothing */ ); \
   1.211 +		printf(#reg "=0x%08x%08x) => ", \
   1.212 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.213 +		__asm__ __volatile__ (#op " %0, %%" #reg \
   1.214 +				      : /* nothing */ \
   1.215 +				      : "X" (mem)); \
   1.216 +		__asm__ __volatile__ ("movq %%" #reg ", %0" \
   1.217 +				      : "=X" (mmx_trace) \
   1.218 +				      : /* nothing */ ); \
   1.219 +		printf(#reg "=0x%08x%08x\n", \
   1.220 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.221 +	}
   1.222 +
   1.223 +#define	mmx_r2m(op, reg, mem) \
   1.224 +	{ \
   1.225 +		mmx_t mmx_trace; \
   1.226 +		__asm__ __volatile__ ("movq %%" #reg ", %0" \
   1.227 +				      : "=X" (mmx_trace) \
   1.228 +				      : /* nothing */ ); \
   1.229 +		printf(#op "_r2m(" #reg "=0x%08x%08x, ", \
   1.230 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.231 +		mmx_trace = (mem); \
   1.232 +		printf(#mem "=0x%08x%08x) => ", \
   1.233 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.234 +		__asm__ __volatile__ (#op " %%" #reg ", %0" \
   1.235 +				      : "=X" (mem) \
   1.236 +				      : /* nothing */ ); \
   1.237 +		mmx_trace = (mem); \
   1.238 +		printf(#mem "=0x%08x%08x\n", \
   1.239 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.240 +	}
   1.241 +
   1.242 +#define	mmx_r2r(op, regs, regd) \
   1.243 +	{ \
   1.244 +		mmx_t mmx_trace; \
   1.245 +		__asm__ __volatile__ ("movq %%" #regs ", %0" \
   1.246 +				      : "=X" (mmx_trace) \
   1.247 +				      : /* nothing */ ); \
   1.248 +		printf(#op "_r2r(" #regs "=0x%08x%08x, ", \
   1.249 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.250 +		__asm__ __volatile__ ("movq %%" #regd ", %0" \
   1.251 +				      : "=X" (mmx_trace) \
   1.252 +				      : /* nothing */ ); \
   1.253 +		printf(#regd "=0x%08x%08x) => ", \
   1.254 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.255 +		__asm__ __volatile__ (#op " %" #regs ", %" #regd); \
   1.256 +		__asm__ __volatile__ ("movq %%" #regd ", %0" \
   1.257 +				      : "=X" (mmx_trace) \
   1.258 +				      : /* nothing */ ); \
   1.259 +		printf(#regd "=0x%08x%08x\n", \
   1.260 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.261 +	}
   1.262 +
   1.263 +#define	mmx_m2m(op, mems, memd) \
   1.264 +	{ \
   1.265 +		mmx_t mmx_trace; \
   1.266 +		mmx_trace = (mems); \
   1.267 +		printf(#op "_m2m(" #mems "=0x%08x%08x, ", \
   1.268 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.269 +		mmx_trace = (memd); \
   1.270 +		printf(#memd "=0x%08x%08x) => ", \
   1.271 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.272 +		__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
   1.273 +				      #op " %1, %%mm0\n\t" \
   1.274 +				      "movq %%mm0, %0" \
   1.275 +				      : "=X" (memd) \
   1.276 +				      : "X" (mems)); \
   1.277 +		mmx_trace = (memd); \
   1.278 +		printf(#memd "=0x%08x%08x\n", \
   1.279 +			mmx_trace.d[1], mmx_trace.d[0]); \
   1.280 +	}
   1.281 +
   1.282 +#else
   1.283 +
   1.284 +/*	These macros are a lot simpler without the tracing...
   1.285 +*/
   1.286 +
   1.287 +#define	mmx_i2r(op, imm, reg) \
   1.288 +	__asm__ __volatile__ (#op " %0, %%" #reg \
   1.289 +			      : /* nothing */ \
   1.290 +			      : "X" (imm) )
   1.291 +
   1.292 +#define	mmx_m2r(op, mem, reg) \
   1.293 +	__asm__ __volatile__ (#op " %0, %%" #reg \
   1.294 +			      : /* nothing */ \
   1.295 +			      : "m" (mem))
   1.296 +
   1.297 +#define	mmx_r2m(op, reg, mem) \
   1.298 +	__asm__ __volatile__ (#op " %%" #reg ", %0" \
   1.299 +			      : "=m" (mem) \
   1.300 +			      : /* nothing */ )
   1.301 +
   1.302 +#define	mmx_r2r(op, regs, regd) \
   1.303 +	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
   1.304 +
   1.305 +#define	mmx_m2m(op, mems, memd) \
   1.306 +	__asm__ __volatile__ ("movq %0, %%mm0\n\t" \
   1.307 +			      #op " %1, %%mm0\n\t" \
   1.308 +			      "movq %%mm0, %0" \
   1.309 +			      : "=X" (memd) \
   1.310 +			      : "X" (mems))
   1.311 +
   1.312 +#endif
   1.313 +
   1.314 +
   1.315 +/*	1x64 MOVe Quadword
   1.316 +	(this is both a load and a store...
   1.317 +	 in fact, it is the only way to store)
   1.318 +*/
   1.319 +#define	movq_m2r(var, reg)	mmx_m2r(movq, var, reg)
   1.320 +#define	movq_r2m(reg, var)	mmx_r2m(movq, reg, var)
   1.321 +#define	movq_r2r(regs, regd)	mmx_r2r(movq, regs, regd)
   1.322 +#define	movq(vars, vard) \
   1.323 +	__asm__ __volatile__ ("movq %1, %%mm0\n\t" \
   1.324 +			      "movq %%mm0, %0" \
   1.325 +			      : "=X" (vard) \
   1.326 +			      : "X" (vars))
   1.327 +
   1.328 +
   1.329 +/*	1x32 MOVe Doubleword
   1.330 +	(like movq, this is both load and store...
   1.331 +	 but is most useful for moving things between
   1.332 +	 mmx registers and ordinary registers)
   1.333 +*/
   1.334 +#define	movd_m2r(var, reg)	mmx_m2r(movd, var, reg)
   1.335 +#define	movd_r2m(reg, var)	mmx_r2m(movd, reg, var)
   1.336 +#define	movd_r2r(regs, regd)	mmx_r2r(movd, regs, regd)
   1.337 +#define	movd(vars, vard) \
   1.338 +	__asm__ __volatile__ ("movd %1, %%mm0\n\t" \
   1.339 +			      "movd %%mm0, %0" \
   1.340 +			      : "=X" (vard) \
   1.341 +			      : "X" (vars))
   1.342 +
   1.343 +
   1.344 +/*	2x32, 4x16, and 8x8 Parallel ADDs
   1.345 +*/
   1.346 +#define	paddd_m2r(var, reg)	mmx_m2r(paddd, var, reg)
   1.347 +#define	paddd_r2r(regs, regd)	mmx_r2r(paddd, regs, regd)
   1.348 +#define	paddd(vars, vard)	mmx_m2m(paddd, vars, vard)
   1.349 +
   1.350 +#define	paddw_m2r(var, reg)	mmx_m2r(paddw, var, reg)
   1.351 +#define	paddw_r2r(regs, regd)	mmx_r2r(paddw, regs, regd)
   1.352 +#define	paddw(vars, vard)	mmx_m2m(paddw, vars, vard)
   1.353 +
   1.354 +#define	paddb_m2r(var, reg)	mmx_m2r(paddb, var, reg)
   1.355 +#define	paddb_r2r(regs, regd)	mmx_r2r(paddb, regs, regd)
   1.356 +#define	paddb(vars, vard)	mmx_m2m(paddb, vars, vard)
   1.357 +
   1.358 +
   1.359 +/*	4x16 and 8x8 Parallel ADDs using Saturation arithmetic
   1.360 +*/
   1.361 +#define	paddsw_m2r(var, reg)	mmx_m2r(paddsw, var, reg)
   1.362 +#define	paddsw_r2r(regs, regd)	mmx_r2r(paddsw, regs, regd)
   1.363 +#define	paddsw(vars, vard)	mmx_m2m(paddsw, vars, vard)
   1.364 +
   1.365 +#define	paddsb_m2r(var, reg)	mmx_m2r(paddsb, var, reg)
   1.366 +#define	paddsb_r2r(regs, regd)	mmx_r2r(paddsb, regs, regd)
   1.367 +#define	paddsb(vars, vard)	mmx_m2m(paddsb, vars, vard)
   1.368 +
   1.369 +
   1.370 +/*	4x16 and 8x8 Parallel ADDs using Unsigned Saturation arithmetic
   1.371 +*/
   1.372 +#define	paddusw_m2r(var, reg)	mmx_m2r(paddusw, var, reg)
   1.373 +#define	paddusw_r2r(regs, regd)	mmx_r2r(paddusw, regs, regd)
   1.374 +#define	paddusw(vars, vard)	mmx_m2m(paddusw, vars, vard)
   1.375 +
   1.376 +#define	paddusb_m2r(var, reg)	mmx_m2r(paddusb, var, reg)
   1.377 +#define	paddusb_r2r(regs, regd)	mmx_r2r(paddusb, regs, regd)
   1.378 +#define	paddusb(vars, vard)	mmx_m2m(paddusb, vars, vard)
   1.379 +
   1.380 +
   1.381 +/*	2x32, 4x16, and 8x8 Parallel SUBs
   1.382 +*/
   1.383 +#define	psubd_m2r(var, reg)	mmx_m2r(psubd, var, reg)
   1.384 +#define	psubd_r2r(regs, regd)	mmx_r2r(psubd, regs, regd)
   1.385 +#define	psubd(vars, vard)	mmx_m2m(psubd, vars, vard)
   1.386 +
   1.387 +#define	psubw_m2r(var, reg)	mmx_m2r(psubw, var, reg)
   1.388 +#define	psubw_r2r(regs, regd)	mmx_r2r(psubw, regs, regd)
   1.389 +#define	psubw(vars, vard)	mmx_m2m(psubw, vars, vard)
   1.390 +
   1.391 +#define	psubb_m2r(var, reg)	mmx_m2r(psubb, var, reg)
   1.392 +#define	psubb_r2r(regs, regd)	mmx_r2r(psubb, regs, regd)
   1.393 +#define	psubb(vars, vard)	mmx_m2m(psubb, vars, vard)
   1.394 +
   1.395 +
   1.396 +/*	4x16 and 8x8 Parallel SUBs using Saturation arithmetic
   1.397 +*/
   1.398 +#define	psubsw_m2r(var, reg)	mmx_m2r(psubsw, var, reg)
   1.399 +#define	psubsw_r2r(regs, regd)	mmx_r2r(psubsw, regs, regd)
   1.400 +#define	psubsw(vars, vard)	mmx_m2m(psubsw, vars, vard)
   1.401 +
   1.402 +#define	psubsb_m2r(var, reg)	mmx_m2r(psubsb, var, reg)
   1.403 +#define	psubsb_r2r(regs, regd)	mmx_r2r(psubsb, regs, regd)
   1.404 +#define	psubsb(vars, vard)	mmx_m2m(psubsb, vars, vard)
   1.405 +
   1.406 +
   1.407 +/*	4x16 and 8x8 Parallel SUBs using Unsigned Saturation arithmetic
   1.408 +*/
   1.409 +#define	psubusw_m2r(var, reg)	mmx_m2r(psubusw, var, reg)
   1.410 +#define	psubusw_r2r(regs, regd)	mmx_r2r(psubusw, regs, regd)
   1.411 +#define	psubusw(vars, vard)	mmx_m2m(psubusw, vars, vard)
   1.412 +
   1.413 +#define	psubusb_m2r(var, reg)	mmx_m2r(psubusb, var, reg)
   1.414 +#define	psubusb_r2r(regs, regd)	mmx_r2r(psubusb, regs, regd)
   1.415 +#define	psubusb(vars, vard)	mmx_m2m(psubusb, vars, vard)
   1.416 +
   1.417 +
   1.418 +/*	4x16 Parallel MULs giving Low 4x16 portions of results
   1.419 +*/
   1.420 +#define	pmullw_m2r(var, reg)	mmx_m2r(pmullw, var, reg)
   1.421 +#define	pmullw_r2r(regs, regd)	mmx_r2r(pmullw, regs, regd)
   1.422 +#define	pmullw(vars, vard)	mmx_m2m(pmullw, vars, vard)
   1.423 +
   1.424 +
   1.425 +/*	4x16 Parallel MULs giving High 4x16 portions of results
   1.426 +*/
   1.427 +#define	pmulhw_m2r(var, reg)	mmx_m2r(pmulhw, var, reg)
   1.428 +#define	pmulhw_r2r(regs, regd)	mmx_r2r(pmulhw, regs, regd)
   1.429 +#define	pmulhw(vars, vard)	mmx_m2m(pmulhw, vars, vard)
   1.430 +
   1.431 +
   1.432 +/*	4x16->2x32 Parallel Mul-ADD
   1.433 +	(muls like pmullw, then adds adjacent 16-bit fields
   1.434 +	 in the multiply result to make the final 2x32 result)
   1.435 +*/
   1.436 +#define	pmaddwd_m2r(var, reg)	mmx_m2r(pmaddwd, var, reg)
   1.437 +#define	pmaddwd_r2r(regs, regd)	mmx_r2r(pmaddwd, regs, regd)
   1.438 +#define	pmaddwd(vars, vard)	mmx_m2m(pmaddwd, vars, vard)
   1.439 +
   1.440 +
   1.441 +/*	1x64 bitwise AND
   1.442 +*/
   1.443 +#ifdef	BROKEN_PAND
   1.444 +#define	pand_m2r(var, reg) \
   1.445 +	{ \
   1.446 +		mmx_m2r(pandn, (mmx_t) -1LL, reg); \
   1.447 +		mmx_m2r(pandn, var, reg); \
   1.448 +	}
   1.449 +#define	pand_r2r(regs, regd) \
   1.450 +	{ \
   1.451 +		mmx_m2r(pandn, (mmx_t) -1LL, regd); \
   1.452 +		mmx_r2r(pandn, regs, regd) \
   1.453 +	}
   1.454 +#define	pand(vars, vard) \
   1.455 +	{ \
   1.456 +		movq_m2r(vard, mm0); \
   1.457 +		mmx_m2r(pandn, (mmx_t) -1LL, mm0); \
   1.458 +		mmx_m2r(pandn, vars, mm0); \
   1.459 +		movq_r2m(mm0, vard); \
   1.460 +	}
   1.461 +#else
   1.462 +#define	pand_m2r(var, reg)	mmx_m2r(pand, var, reg)
   1.463 +#define	pand_r2r(regs, regd)	mmx_r2r(pand, regs, regd)
   1.464 +#define	pand(vars, vard)	mmx_m2m(pand, vars, vard)
   1.465 +#endif
   1.466 +
   1.467 +
   1.468 +/*	1x64 bitwise AND with Not the destination
   1.469 +*/
   1.470 +#define	pandn_m2r(var, reg)	mmx_m2r(pandn, var, reg)
   1.471 +#define	pandn_r2r(regs, regd)	mmx_r2r(pandn, regs, regd)
   1.472 +#define	pandn(vars, vard)	mmx_m2m(pandn, vars, vard)
   1.473 +
   1.474 +
   1.475 +/*	1x64 bitwise OR
   1.476 +*/
   1.477 +#define	por_m2r(var, reg)	mmx_m2r(por, var, reg)
   1.478 +#define	por_r2r(regs, regd)	mmx_r2r(por, regs, regd)
   1.479 +#define	por(vars, vard)	mmx_m2m(por, vars, vard)
   1.480 +
   1.481 +
   1.482 +/*	1x64 bitwise eXclusive OR
   1.483 +*/
   1.484 +#define	pxor_m2r(var, reg)	mmx_m2r(pxor, var, reg)
   1.485 +#define	pxor_r2r(regs, regd)	mmx_r2r(pxor, regs, regd)
   1.486 +#define	pxor(vars, vard)	mmx_m2m(pxor, vars, vard)
   1.487 +
   1.488 +
   1.489 +/*	2x32, 4x16, and 8x8 Parallel CoMPare for EQuality
   1.490 +	(resulting fields are either 0 or -1)
   1.491 +*/
   1.492 +#define	pcmpeqd_m2r(var, reg)	mmx_m2r(pcmpeqd, var, reg)
   1.493 +#define	pcmpeqd_r2r(regs, regd)	mmx_r2r(pcmpeqd, regs, regd)
   1.494 +#define	pcmpeqd(vars, vard)	mmx_m2m(pcmpeqd, vars, vard)
   1.495 +
   1.496 +#define	pcmpeqw_m2r(var, reg)	mmx_m2r(pcmpeqw, var, reg)
   1.497 +#define	pcmpeqw_r2r(regs, regd)	mmx_r2r(pcmpeqw, regs, regd)
   1.498 +#define	pcmpeqw(vars, vard)	mmx_m2m(pcmpeqw, vars, vard)
   1.499 +
   1.500 +#define	pcmpeqb_m2r(var, reg)	mmx_m2r(pcmpeqb, var, reg)
   1.501 +#define	pcmpeqb_r2r(regs, regd)	mmx_r2r(pcmpeqb, regs, regd)
   1.502 +#define	pcmpeqb(vars, vard)	mmx_m2m(pcmpeqb, vars, vard)
   1.503 +
   1.504 +
   1.505 +/*	2x32, 4x16, and 8x8 Parallel CoMPare for Greater Than
   1.506 +	(resulting fields are either 0 or -1)
   1.507 +*/
   1.508 +#define	pcmpgtd_m2r(var, reg)	mmx_m2r(pcmpgtd, var, reg)
   1.509 +#define	pcmpgtd_r2r(regs, regd)	mmx_r2r(pcmpgtd, regs, regd)
   1.510 +#define	pcmpgtd(vars, vard)	mmx_m2m(pcmpgtd, vars, vard)
   1.511 +
   1.512 +#define	pcmpgtw_m2r(var, reg)	mmx_m2r(pcmpgtw, var, reg)
   1.513 +#define	pcmpgtw_r2r(regs, regd)	mmx_r2r(pcmpgtw, regs, regd)
   1.514 +#define	pcmpgtw(vars, vard)	mmx_m2m(pcmpgtw, vars, vard)
   1.515 +
   1.516 +#define	pcmpgtb_m2r(var, reg)	mmx_m2r(pcmpgtb, var, reg)
   1.517 +#define	pcmpgtb_r2r(regs, regd)	mmx_r2r(pcmpgtb, regs, regd)
   1.518 +#define	pcmpgtb(vars, vard)	mmx_m2m(pcmpgtb, vars, vard)
   1.519 +
   1.520 +
   1.521 +/*	1x64, 2x32, and 4x16 Parallel Shift Left Logical
   1.522 +*/
   1.523 +#define	psllq_i2r(imm, reg)	mmx_i2r(psllq, imm, reg)
   1.524 +#define	psllq_m2r(var, reg)	mmx_m2r(psllq, var, reg)
   1.525 +#define	psllq_r2r(regs, regd)	mmx_r2r(psllq, regs, regd)
   1.526 +#define	psllq(vars, vard)	mmx_m2m(psllq, vars, vard)
   1.527 +
   1.528 +#define	pslld_i2r(imm, reg)	mmx_i2r(pslld, imm, reg)
   1.529 +#define	pslld_m2r(var, reg)	mmx_m2r(pslld, var, reg)
   1.530 +#define	pslld_r2r(regs, regd)	mmx_r2r(pslld, regs, regd)
   1.531 +#define	pslld(vars, vard)	mmx_m2m(pslld, vars, vard)
   1.532 +
   1.533 +#define	psllw_i2r(imm, reg)	mmx_i2r(psllw, imm, reg)
   1.534 +#define	psllw_m2r(var, reg)	mmx_m2r(psllw, var, reg)
   1.535 +#define	psllw_r2r(regs, regd)	mmx_r2r(psllw, regs, regd)
   1.536 +#define	psllw(vars, vard)	mmx_m2m(psllw, vars, vard)
   1.537 +
   1.538 +
   1.539 +/*	1x64, 2x32, and 4x16 Parallel Shift Right Logical
   1.540 +*/
   1.541 +#define	psrlq_i2r(imm, reg)	mmx_i2r(psrlq, imm, reg)
   1.542 +#define	psrlq_m2r(var, reg)	mmx_m2r(psrlq, var, reg)
   1.543 +#define	psrlq_r2r(regs, regd)	mmx_r2r(psrlq, regs, regd)
   1.544 +#define	psrlq(vars, vard)	mmx_m2m(psrlq, vars, vard)
   1.545 +
   1.546 +#define	psrld_i2r(imm, reg)	mmx_i2r(psrld, imm, reg)
   1.547 +#define	psrld_m2r(var, reg)	mmx_m2r(psrld, var, reg)
   1.548 +#define	psrld_r2r(regs, regd)	mmx_r2r(psrld, regs, regd)
   1.549 +#define	psrld(vars, vard)	mmx_m2m(psrld, vars, vard)
   1.550 +
   1.551 +#define	psrlw_i2r(imm, reg)	mmx_i2r(psrlw, imm, reg)
   1.552 +#define	psrlw_m2r(var, reg)	mmx_m2r(psrlw, var, reg)
   1.553 +#define	psrlw_r2r(regs, regd)	mmx_r2r(psrlw, regs, regd)
   1.554 +#define	psrlw(vars, vard)	mmx_m2m(psrlw, vars, vard)
   1.555 +
   1.556 +
   1.557 +/*	2x32 and 4x16 Parallel Shift Right Arithmetic
   1.558 +*/
   1.559 +#define	psrad_i2r(imm, reg)	mmx_i2r(psrad, imm, reg)
   1.560 +#define	psrad_m2r(var, reg)	mmx_m2r(psrad, var, reg)
   1.561 +#define	psrad_r2r(regs, regd)	mmx_r2r(psrad, regs, regd)
   1.562 +#define	psrad(vars, vard)	mmx_m2m(psrad, vars, vard)
   1.563 +
   1.564 +#define	psraw_i2r(imm, reg)	mmx_i2r(psraw, imm, reg)
   1.565 +#define	psraw_m2r(var, reg)	mmx_m2r(psraw, var, reg)
   1.566 +#define	psraw_r2r(regs, regd)	mmx_r2r(psraw, regs, regd)
   1.567 +#define	psraw(vars, vard)	mmx_m2m(psraw, vars, vard)
   1.568 +
   1.569 +
   1.570 +/*	2x32->4x16 and 4x16->8x8 PACK and Signed Saturate
   1.571 +	(packs source and dest fields into dest in that order)
   1.572 +*/
   1.573 +#define	packssdw_m2r(var, reg)	mmx_m2r(packssdw, var, reg)
   1.574 +#define	packssdw_r2r(regs, regd) mmx_r2r(packssdw, regs, regd)
   1.575 +#define	packssdw(vars, vard)	mmx_m2m(packssdw, vars, vard)
   1.576 +
   1.577 +#define	packsswb_m2r(var, reg)	mmx_m2r(packsswb, var, reg)
   1.578 +#define	packsswb_r2r(regs, regd) mmx_r2r(packsswb, regs, regd)
   1.579 +#define	packsswb(vars, vard)	mmx_m2m(packsswb, vars, vard)
   1.580 +
   1.581 +
   1.582 +/*	4x16->8x8 PACK and Unsigned Saturate
   1.583 +	(packs source and dest fields into dest in that order)
   1.584 +*/
   1.585 +#define	packuswb_m2r(var, reg)	mmx_m2r(packuswb, var, reg)
   1.586 +#define	packuswb_r2r(regs, regd) mmx_r2r(packuswb, regs, regd)
   1.587 +#define	packuswb(vars, vard)	mmx_m2m(packuswb, vars, vard)
   1.588 +
   1.589 +
   1.590 +/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK Low
   1.591 +	(interleaves low half of dest with low half of source
   1.592 +	 as padding in each result field)
   1.593 +*/
   1.594 +#define	punpckldq_m2r(var, reg)	mmx_m2r(punpckldq, var, reg)
   1.595 +#define	punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd)
   1.596 +#define	punpckldq(vars, vard)	mmx_m2m(punpckldq, vars, vard)
   1.597 +
   1.598 +#define	punpcklwd_m2r(var, reg)	mmx_m2r(punpcklwd, var, reg)
   1.599 +#define	punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd)
   1.600 +#define	punpcklwd(vars, vard)	mmx_m2m(punpcklwd, vars, vard)
   1.601 +
   1.602 +#define	punpcklbw_m2r(var, reg)	mmx_m2r(punpcklbw, var, reg)
   1.603 +#define	punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd)
   1.604 +#define	punpcklbw(vars, vard)	mmx_m2m(punpcklbw, vars, vard)
   1.605 +
   1.606 +
   1.607 +/*	2x32->1x64, 4x16->2x32, and 8x8->4x16 UNPaCK High
   1.608 +	(interleaves high half of dest with high half of source
   1.609 +	 as padding in each result field)
   1.610 +*/
   1.611 +#define	punpckhdq_m2r(var, reg)	mmx_m2r(punpckhdq, var, reg)
   1.612 +#define	punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd)
   1.613 +#define	punpckhdq(vars, vard)	mmx_m2m(punpckhdq, vars, vard)
   1.614 +
   1.615 +#define	punpckhwd_m2r(var, reg)	mmx_m2r(punpckhwd, var, reg)
   1.616 +#define	punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd)
   1.617 +#define	punpckhwd(vars, vard)	mmx_m2m(punpckhwd, vars, vard)
   1.618 +
   1.619 +#define	punpckhbw_m2r(var, reg)	mmx_m2r(punpckhbw, var, reg)
   1.620 +#define	punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd)
   1.621 +#define	punpckhbw(vars, vard)	mmx_m2m(punpckhbw, vars, vard)
   1.622 +
   1.623 +
    1.624 +/*	Empty MMX State
   1.625 +	(used to clean-up when going from mmx to float use
   1.626 +	 of the registers that are shared by both; note that
   1.627 +	 there is no float-to-mmx operation needed, because
   1.628 +	 only the float tag word info is corruptible)
   1.629 +*/
   1.630 +#ifdef	MMX_TRACE
   1.631 +
   1.632 +#define	emms() \
   1.633 +	{ \
   1.634 +		printf("emms()\n"); \
   1.635 +		__asm__ __volatile__ ("emms"); \
   1.636 +	}
   1.637 +
   1.638 +#else
   1.639 +
   1.640 +#define	emms()			__asm__ __volatile__ ("emms")
   1.641 +
   1.642 +#endif
   1.643 +
   1.644 +#endif
   1.645 +/* vi: set ts=4 sw=4 expandtab: */