src/libm/k_rem_pio2.c
author Ryan C. Gordon <icculus@icculus.org>
Thu, 21 Apr 2016 03:16:44 -0400
changeset 11729 d1ce8396c356
parent 11683 48bcba563d9c
child 11838 5ef6e4e70103
permissions -rw-r--r--
Initial shot at a renderer target for Apple's Metal API.

This isn't complete, but is enough to run testsprite2. It's currently
Mac-only; with a little work to figure out how to properly glue in a Metal
layer to a UIView, this will likely work on iOS, too.

This is only wired up to the configure script right now, and disabled by
default. CMake and Xcode still need their bits filled in as appropriate.
     1 /*
     2  * ====================================================
     3  * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
     4  *
     5  * Developed at SunPro, a Sun Microsystems, Inc. business.
     6  * Permission to use, copy, modify, and distribute this
     7  * software is freely granted, provided that this notice
     8  * is preserved.
     9  * ====================================================
    10  */
    11 
    12 /*
    13  * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
    14  * double x[],y[]; int e0,nx,prec; int ipio2[];
    15  *
    16  * __kernel_rem_pio2 return the last three digits of N with
    17  *		y = x - N*pi/2
    18  * so that |y| < pi/2.
    19  *
    20  * The method is to compute the integer (mod 8) and fraction parts of
    21  * (2/pi)*x without doing the full multiplication. In general we
    22  * skip the part of the product that are known to be a huge integer (
    23  * more accurately, = 0 mod 8 ). Thus the number of operations are
    24  * independent of the exponent of the input.
    25  *
    26  * (2/pi) is represented by an array of 24-bit integers in ipio2[].
    27  *
    28  * Input parameters:
    29  * 	x[]	The input value (must be positive) is broken into nx
    30  *		pieces of 24-bit integers in double precision format.
    31  *		x[i] will be the i-th 24 bit of x. The scaled exponent
    32  *		of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
    33  *		match x's up to 24 bits.
    34  *
    35  *		Example of breaking a double positive z into x[0]+x[1]+x[2]:
    36  *			e0 = ilogb(z)-23
    37  *			z  = scalbn(z,-e0)
    38  *		for i = 0,1,2
    39  *			x[i] = floor(z)
    40  *			z    = (z-x[i])*2**24
    41  *
    42  *
    43  *	y[]	ouput result in an array of double precision numbers.
    44  *		The dimension of y[] is:
    45  *			24-bit  precision	1
    46  *			53-bit  precision	2
    47  *			64-bit  precision	2
    48  *			113-bit precision	3
    49  *		The actual value is the sum of them. Thus for 113-bit
    50  *		precison, one may have to do something like:
    51  *
    52  *		long double t,w,r_head, r_tail;
    53  *		t = (long double)y[2] + (long double)y[1];
    54  *		w = (long double)y[0];
    55  *		r_head = t+w;
    56  *		r_tail = w - (r_head - t);
    57  *
    58  *	e0	The exponent of x[0]
    59  *
    60  *	nx	dimension of x[]
    61  *
    62  *  	prec	an integer indicating the precision:
    63  *			0	24  bits (single)
    64  *			1	53  bits (double)
    65  *			2	64  bits (extended)
    66  *			3	113 bits (quad)
    67  *
    68  *	ipio2[]
    69  *		integer array, contains the (24*i)-th to (24*i+23)-th
    70  *		bit of 2/pi after binary point. The corresponding
    71  *		floating value is
    72  *
    73  *			ipio2[i] * 2^(-24(i+1)).
    74  *
    75  * External function:
    76  *	double scalbn(), floor();
    77  *
    78  *
    79  * Here is the description of some local variables:
    80  *
    81  * 	jk	jk+1 is the initial number of terms of ipio2[] needed
    82  *		in the computation. The recommended value is 2,3,4,
    83  *		6 for single, double, extended,and quad.
    84  *
    85  * 	jz	local integer variable indicating the number of
    86  *		terms of ipio2[] used.
    87  *
    88  *	jx	nx - 1
    89  *
    90  *	jv	index for pointing to the suitable ipio2[] for the
    91  *		computation. In general, we want
    92  *			( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
    93  *		is an integer. Thus
    94  *			e0-3-24*jv >= 0 or (e0-3)/24 >= jv
    95  *		Hence jv = max(0,(e0-3)/24).
    96  *
    97  *	jp	jp+1 is the number of terms in PIo2[] needed, jp = jk.
    98  *
    99  * 	q[]	double array with integral value, representing the
   100  *		24-bits chunk of the product of x and 2/pi.
   101  *
   102  *	q0	the corresponding exponent of q[0]. Note that the
   103  *		exponent for q[i] would be q0-24*i.
   104  *
   105  *	PIo2[]	double precision array, obtained by cutting pi/2
   106  *		into 24 bits chunks.
   107  *
   108  *	f[]	ipio2[] in floating point
   109  *
   110  *	iq[]	integer array by breaking up q[] in 24-bits chunk.
   111  *
   112  *	fq[]	final product of x*(2/pi) in fq[0],..,fq[jk]
   113  *
   114  *	ih	integer. If >0 it indicates q[] is >= 0.5, hence
   115  *		it also indicates the *sign* of the result.
   116  *
   117  */
   118 
   119 
   120 /*
   121  * Constants:
   122  * The hexadecimal values are the intended ones for the following
   123  * constants. The decimal values may be used, provided that the
   124  * compiler will convert from decimal to binary accurately enough
   125  * to produce the hexadecimal values shown.
   126  */
   127 
   128 #include "math_libm.h"
   129 #include "math_private.h"
   130 
   131 static const int init_jk[] = {2,3,4,6}; /* initial value for jk */
   132 
   133 static const double PIo2[] = {
   134   1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
   135   7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
   136   5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
   137   3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
   138   1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
   139   1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
   140   2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
   141   2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
   142 };
   143 
   144 static const double
   145 zero   = 0.0,
   146 one    = 1.0,
   147 two24   =  1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
   148 twon24  =  5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
   149 
   150 int attribute_hidden __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int32_t *ipio2)
   151 {
   152 	int32_t jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
   153 	double z,fw,f[20],fq[20],q[20];
   154 
   155     /* initialize jk*/
   156 	jk = init_jk[prec];
   157 	jp = jk;
   158 
   159     /* determine jx,jv,q0, note that 3>q0 */
   160 	jx =  nx-1;
   161 	jv = (e0-3)/24; if(jv<0) jv=0;
   162 	q0 =  e0-24*(jv+1);
   163 
   164     /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
   165 	j = jv-jx; m = jx+jk;
   166 	for(i=0;i<=m;i++,j++) f[i] = (j<0)? zero : (double) ipio2[j];
   167 
   168     /* compute q[0],q[1],...q[jk] */
   169 	for (i=0;i<=jk;i++) {
   170 	    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
   171 	}
   172 
   173 	jz = jk;
   174 recompute:
   175     /* distill q[] into iq[] reversingly */
   176 	for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
   177 	    fw    =  (double)((int32_t)(twon24* z));
   178 	    iq[i] =  (int32_t)(z-two24*fw);
   179 	    z     =  q[j-1]+fw;
   180 	}
   181 
   182     /* compute n */
   183 	z  = scalbn(z,q0);		/* actual value of z */
   184 	z -= 8.0*floor(z*0.125);		/* trim off integer >= 8 */
   185 	n  = (int32_t) z;
   186 	z -= (double)n;
   187 	ih = 0;
   188 	if(q0>0) {	/* need iq[jz-1] to determine n */
   189 	    i  = (iq[jz-1]>>(24-q0)); n += i;
   190 	    iq[jz-1] -= i<<(24-q0);
   191 	    ih = iq[jz-1]>>(23-q0);
   192 	}
   193 	else if(q0==0) ih = iq[jz-1]>>23;
   194 	else if(z>=0.5) ih=2;
   195 
   196 	if(ih>0) {	/* q > 0.5 */
   197 	    n += 1; carry = 0;
   198 	    for(i=0;i<jz ;i++) {	/* compute 1-q */
   199 		j = iq[i];
   200 		if(carry==0) {
   201 		    if(j!=0) {
   202 			carry = 1; iq[i] = 0x1000000- j;
   203 		    }
   204 		} else  iq[i] = 0xffffff - j;
   205 	    }
   206 	    if(q0>0) {		/* rare case: chance is 1 in 12 */
   207 	        switch(q0) {
   208 	        case 1:
   209 	    	   iq[jz-1] &= 0x7fffff; break;
   210 	    	case 2:
   211 	    	   iq[jz-1] &= 0x3fffff; break;
   212 	        }
   213 	    }
   214 	    if(ih==2) {
   215 		z = one - z;
   216 		if(carry!=0) z -= scalbn(one,q0);
   217 	    }
   218 	}
   219 
   220     /* check if recomputation is needed */
   221 	if(z==zero) {
   222 	    j = 0;
   223 	    for (i=jz-1;i>=jk;i--) j |= iq[i];
   224 	    if(j==0) { /* need recomputation */
   225 		for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
   226 
   227 		for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
   228 		    f[jx+i] = (double) ipio2[jv+i];
   229 		    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
   230 		    q[i] = fw;
   231 		}
   232 		jz += k;
   233 		goto recompute;
   234 	    }
   235 	}
   236 
   237     /* chop off zero terms */
   238 	if(z==0.0) {
   239 	    jz -= 1; q0 -= 24;
   240 	    while(iq[jz]==0) { jz--; q0-=24;}
   241 	} else { /* break z into 24-bit if necessary */
   242 	    z = scalbn(z,-q0);
   243 	    if(z>=two24) {
   244 		fw = (double)((int32_t)(twon24*z));
   245 		iq[jz] = (int32_t)(z-two24*fw);
   246 		jz += 1; q0 += 24;
   247 		iq[jz] = (int32_t) fw;
   248 	    } else iq[jz] = (int32_t) z ;
   249 	}
   250 
   251     /* convert integer "bit" chunk to floating-point value */
   252 	fw = scalbn(one,q0);
   253 	for(i=jz;i>=0;i--) {
   254 	    q[i] = fw*(double)iq[i]; fw*=twon24;
   255 	}
   256 
   257     /* compute PIo2[0,...,jp]*q[jz,...,0] */
   258 	for(i=jz;i>=0;i--) {
   259 	    for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
   260 	    fq[jz-i] = fw;
   261 	}
   262 
   263     /* compress fq[] into y[] */
   264 	switch(prec) {
   265 	    case 0:
   266 		fw = 0.0;
   267 		for (i=jz;i>=0;i--) fw += fq[i];
   268 		y[0] = (ih==0)? fw: -fw;
   269 		break;
   270 	    case 1:
   271 	    case 2:
   272 		fw = 0.0;
   273 		for (i=jz;i>=0;i--) fw += fq[i];
   274 		y[0] = (ih==0)? fw: -fw;
   275 		fw = fq[0]-fw;
   276 		for (i=1;i<=jz;i++) fw += fq[i];
   277 		y[1] = (ih==0)? fw: -fw;
   278 		break;
   279 	    case 3:	/* painful */
   280 		for (i=jz;i>0;i--) {
   281 		    fw      = fq[i-1]+fq[i];
   282 		    fq[i]  += fq[i-1]-fw;
   283 		    fq[i-1] = fw;
   284 		}
   285 		for (i=jz;i>1;i--) {
   286 		    fw      = fq[i-1]+fq[i];
   287 		    fq[i]  += fq[i-1]-fw;
   288 		    fq[i-1] = fw;
   289 		}
   290 		for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
   291 		if(ih==0) {
   292 		    y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
   293 		} else {
   294 		    y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
   295 		}
   296 	}
   297 	return n&7;
   298 }