src/libm/k_rem_pio2.c
author Ryan C. Gordon <icculus@icculus.org>
Thu, 21 Apr 2016 03:16:44 -0400
changeset 11729 d1ce8396c356
parent 11683 48bcba563d9c
child 11838 5ef6e4e70103
permissions -rw-r--r--
Initial shot at a renderer target for Apple's Metal API.

This isn't complete, but is enough to run testsprite2. It's currently
Mac-only; with a little work to figure out how to properly glue in a Metal
layer to a UIView, this will likely work on iOS, too.

This is only wired up to the configure script right now, and disabled by
default. CMake and Xcode still need their bits filled in as appropriate.
slouken@2757
     1
/*
slouken@2757
     2
 * ====================================================
slouken@2757
     3
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
slouken@2757
     4
 *
slouken@2757
     5
 * Developed at SunPro, a Sun Microsystems, Inc. business.
slouken@2757
     6
 * Permission to use, copy, modify, and distribute this
slouken@2757
     7
 * software is freely granted, provided that this notice
slouken@2757
     8
 * is preserved.
slouken@2757
     9
 * ====================================================
slouken@2757
    10
 */
slouken@2757
    11
slouken@2757
    12
/*
slouken@2757
    13
 * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
slouken@2757
    14
 * double x[],y[]; int e0,nx,prec; int ipio2[];
slouken@2757
    15
 *
slouken@2757
    16
 * __kernel_rem_pio2 return the last three digits of N with
slouken@2757
    17
 *		y = x - N*pi/2
slouken@2757
    18
 * so that |y| < pi/2.
slouken@2757
    19
 *
slouken@2757
    20
 * The method is to compute the integer (mod 8) and fraction parts of
slouken@2757
    21
 * (2/pi)*x without doing the full multiplication. In general we
slouken@2757
    22
 * skip the part of the product that are known to be a huge integer (
slouken@2757
    23
 * more accurately, = 0 mod 8 ). Thus the number of operations are
slouken@2757
    24
 * independent of the exponent of the input.
slouken@2757
    25
 *
slouken@2757
    26
 * (2/pi) is represented by an array of 24-bit integers in ipio2[].
slouken@2757
    27
 *
slouken@2757
    28
 * Input parameters:
slouken@2757
    29
 * 	x[]	The input value (must be positive) is broken into nx
slouken@2757
    30
 *		pieces of 24-bit integers in double precision format.
slouken@2757
    31
 *		x[i] will be the i-th 24 bit of x. The scaled exponent
slouken@2757
    32
 *		of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
slouken@2757
    33
 *		match x's up to 24 bits.
slouken@2757
    34
 *
slouken@2757
    35
 *		Example of breaking a double positive z into x[0]+x[1]+x[2]:
slouken@2757
    36
 *			e0 = ilogb(z)-23
slouken@2757
    37
 *			z  = scalbn(z,-e0)
slouken@2757
    38
 *		for i = 0,1,2
slouken@2757
    39
 *			x[i] = floor(z)
slouken@2757
    40
 *			z    = (z-x[i])*2**24
slouken@2757
    41
 *
slouken@2757
    42
 *
slouken@2757
    43
 *	y[]	ouput result in an array of double precision numbers.
slouken@2757
    44
 *		The dimension of y[] is:
slouken@2757
    45
 *			24-bit  precision	1
slouken@2757
    46
 *			53-bit  precision	2
slouken@2757
    47
 *			64-bit  precision	2
slouken@2757
    48
 *			113-bit precision	3
slouken@2757
    49
 *		The actual value is the sum of them. Thus for 113-bit
slouken@2757
    50
 *		precison, one may have to do something like:
slouken@2757
    51
 *
slouken@2757
    52
 *		long double t,w,r_head, r_tail;
slouken@2757
    53
 *		t = (long double)y[2] + (long double)y[1];
slouken@2757
    54
 *		w = (long double)y[0];
slouken@2757
    55
 *		r_head = t+w;
slouken@2757
    56
 *		r_tail = w - (r_head - t);
slouken@2757
    57
 *
slouken@2757
    58
 *	e0	The exponent of x[0]
slouken@2757
    59
 *
slouken@2757
    60
 *	nx	dimension of x[]
slouken@2757
    61
 *
slouken@2757
    62
 *  	prec	an integer indicating the precision:
slouken@2757
    63
 *			0	24  bits (single)
slouken@2757
    64
 *			1	53  bits (double)
slouken@2757
    65
 *			2	64  bits (extended)
slouken@2757
    66
 *			3	113 bits (quad)
slouken@2757
    67
 *
slouken@2757
    68
 *	ipio2[]
slouken@2757
    69
 *		integer array, contains the (24*i)-th to (24*i+23)-th
slouken@2757
    70
 *		bit of 2/pi after binary point. The corresponding
slouken@2757
    71
 *		floating value is
slouken@2757
    72
 *
slouken@2757
    73
 *			ipio2[i] * 2^(-24(i+1)).
slouken@2757
    74
 *
slouken@2757
    75
 * External function:
slouken@2757
    76
 *	double scalbn(), floor();
slouken@2757
    77
 *
slouken@2757
    78
 *
slouken@2757
    79
 * Here is the description of some local variables:
slouken@2757
    80
 *
slouken@2757
    81
 * 	jk	jk+1 is the initial number of terms of ipio2[] needed
slouken@2757
    82
 *		in the computation. The recommended value is 2,3,4,
slouken@2757
    83
 *		6 for single, double, extended,and quad.
slouken@2757
    84
 *
slouken@2757
    85
 * 	jz	local integer variable indicating the number of
slouken@2757
    86
 *		terms of ipio2[] used.
slouken@2757
    87
 *
slouken@2757
    88
 *	jx	nx - 1
slouken@2757
    89
 *
slouken@2757
    90
 *	jv	index for pointing to the suitable ipio2[] for the
slouken@2757
    91
 *		computation. In general, we want
slouken@2757
    92
 *			( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
slouken@2757
    93
 *		is an integer. Thus
slouken@2757
    94
 *			e0-3-24*jv >= 0 or (e0-3)/24 >= jv
slouken@2757
    95
 *		Hence jv = max(0,(e0-3)/24).
slouken@2757
    96
 *
slouken@2757
    97
 *	jp	jp+1 is the number of terms in PIo2[] needed, jp = jk.
slouken@2757
    98
 *
slouken@2757
    99
 * 	q[]	double array with integral value, representing the
slouken@2757
   100
 *		24-bits chunk of the product of x and 2/pi.
slouken@2757
   101
 *
slouken@2757
   102
 *	q0	the corresponding exponent of q[0]. Note that the
slouken@2757
   103
 *		exponent for q[i] would be q0-24*i.
slouken@2757
   104
 *
slouken@2757
   105
 *	PIo2[]	double precision array, obtained by cutting pi/2
slouken@2757
   106
 *		into 24 bits chunks.
slouken@2757
   107
 *
slouken@2757
   108
 *	f[]	ipio2[] in floating point
slouken@2757
   109
 *
slouken@2757
   110
 *	iq[]	integer array by breaking up q[] in 24-bits chunk.
slouken@2757
   111
 *
slouken@2757
   112
 *	fq[]	final product of x*(2/pi) in fq[0],..,fq[jk]
slouken@2757
   113
 *
slouken@2757
   114
 *	ih	integer. If >0 it indicates q[] is >= 0.5, hence
slouken@2757
   115
 *		it also indicates the *sign* of the result.
slouken@2757
   116
 *
slouken@2757
   117
 */
slouken@2757
   118
slouken@2757
   119
slouken@2757
   120
/*
slouken@2757
   121
 * Constants:
slouken@2757
   122
 * The hexadecimal values are the intended ones for the following
slouken@2757
   123
 * constants. The decimal values may be used, provided that the
slouken@2757
   124
 * compiler will convert from decimal to binary accurately enough
slouken@2757
   125
 * to produce the hexadecimal values shown.
slouken@2757
   126
 */
slouken@2757
   127
slouken@6044
   128
#include "math_libm.h"
slouken@2757
   129
#include "math_private.h"
slouken@2757
   130
slouken@11683
   131
static const int init_jk[] = {2,3,4,6}; /* initial value for jk */
icculus@8670
   132
slouken@2757
   133
static const double PIo2[] = {
slouken@11683
   134
  1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
slouken@11683
   135
  7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
slouken@11683
   136
  5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
slouken@11683
   137
  3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
slouken@11683
   138
  1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
slouken@11683
   139
  1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
slouken@11683
   140
  2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
slouken@11683
   141
  2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
slouken@2757
   142
};
slouken@2757
   143
slouken@2757
   144
static const double
slouken@11683
   145
zero   = 0.0,
slouken@11683
   146
one    = 1.0,
slouken@11683
   147
two24   =  1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
slouken@11683
   148
twon24  =  5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
slouken@2757
   149
slouken@11683
   150
int attribute_hidden __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int32_t *ipio2)
slouken@2757
   151
{
slouken@11683
   152
	int32_t jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
slouken@11683
   153
	double z,fw,f[20],fq[20],q[20];
slouken@2757
   154
slouken@11683
   155
    /* initialize jk*/
slouken@11683
   156
	jk = init_jk[prec];
slouken@11683
   157
	jp = jk;
slouken@2757
   158
slouken@2757
   159
    /* determine jx,jv,q0, note that 3>q0 */
slouken@11683
   160
	jx =  nx-1;
slouken@11683
   161
	jv = (e0-3)/24; if(jv<0) jv=0;
slouken@11683
   162
	q0 =  e0-24*(jv+1);
slouken@2757
   163
slouken@2757
   164
    /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
slouken@11683
   165
	j = jv-jx; m = jx+jk;
slouken@11683
   166
	for(i=0;i<=m;i++,j++) f[i] = (j<0)? zero : (double) ipio2[j];
slouken@2757
   167
slouken@2757
   168
    /* compute q[0],q[1],...q[jk] */
slouken@11683
   169
	for (i=0;i<=jk;i++) {
slouken@11683
   170
	    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
slouken@11683
   171
	}
slouken@2757
   172
slouken@11683
   173
	jz = jk;
slouken@11683
   174
recompute:
slouken@2757
   175
    /* distill q[] into iq[] reversingly */
slouken@11683
   176
	for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
slouken@11683
   177
	    fw    =  (double)((int32_t)(twon24* z));
slouken@11683
   178
	    iq[i] =  (int32_t)(z-two24*fw);
slouken@11683
   179
	    z     =  q[j-1]+fw;
slouken@11683
   180
	}
slouken@2757
   181
slouken@2757
   182
    /* compute n */
slouken@11683
   183
	z  = scalbn(z,q0);		/* actual value of z */
slouken@11683
   184
	z -= 8.0*floor(z*0.125);		/* trim off integer >= 8 */
slouken@11683
   185
	n  = (int32_t) z;
slouken@11683
   186
	z -= (double)n;
slouken@11683
   187
	ih = 0;
slouken@11683
   188
	if(q0>0) {	/* need iq[jz-1] to determine n */
slouken@11683
   189
	    i  = (iq[jz-1]>>(24-q0)); n += i;
slouken@11683
   190
	    iq[jz-1] -= i<<(24-q0);
slouken@11683
   191
	    ih = iq[jz-1]>>(23-q0);
slouken@11683
   192
	}
slouken@11683
   193
	else if(q0==0) ih = iq[jz-1]>>23;
slouken@11683
   194
	else if(z>=0.5) ih=2;
slouken@2757
   195
slouken@11683
   196
	if(ih>0) {	/* q > 0.5 */
slouken@11683
   197
	    n += 1; carry = 0;
slouken@11683
   198
	    for(i=0;i<jz ;i++) {	/* compute 1-q */
slouken@11683
   199
		j = iq[i];
slouken@11683
   200
		if(carry==0) {
slouken@11683
   201
		    if(j!=0) {
slouken@11683
   202
			carry = 1; iq[i] = 0x1000000- j;
slouken@11683
   203
		    }
slouken@11683
   204
		} else  iq[i] = 0xffffff - j;
slouken@11683
   205
	    }
slouken@11683
   206
	    if(q0>0) {		/* rare case: chance is 1 in 12 */
slouken@11683
   207
	        switch(q0) {
slouken@11683
   208
	        case 1:
slouken@11683
   209
	    	   iq[jz-1] &= 0x7fffff; break;
slouken@11683
   210
	    	case 2:
slouken@11683
   211
	    	   iq[jz-1] &= 0x3fffff; break;
slouken@11683
   212
	        }
slouken@11683
   213
	    }
slouken@11683
   214
	    if(ih==2) {
slouken@11683
   215
		z = one - z;
slouken@11683
   216
		if(carry!=0) z -= scalbn(one,q0);
slouken@11683
   217
	    }
slouken@11683
   218
	}
slouken@2757
   219
slouken@2757
   220
    /* check if recomputation is needed */
slouken@11683
   221
	if(z==zero) {
slouken@11683
   222
	    j = 0;
slouken@11683
   223
	    for (i=jz-1;i>=jk;i--) j |= iq[i];
slouken@11683
   224
	    if(j==0) { /* need recomputation */
slouken@11683
   225
		for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
slouken@2757
   226
slouken@11683
   227
		for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
slouken@11683
   228
		    f[jx+i] = (double) ipio2[jv+i];
slouken@11683
   229
		    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
slouken@11683
   230
		    q[i] = fw;
slouken@11683
   231
		}
slouken@11683
   232
		jz += k;
slouken@11683
   233
		goto recompute;
slouken@11683
   234
	    }
slouken@11683
   235
	}
slouken@2757
   236
slouken@2757
   237
    /* chop off zero terms */
slouken@11683
   238
	if(z==0.0) {
slouken@11683
   239
	    jz -= 1; q0 -= 24;
slouken@11683
   240
	    while(iq[jz]==0) { jz--; q0-=24;}
slouken@11683
   241
	} else { /* break z into 24-bit if necessary */
slouken@11683
   242
	    z = scalbn(z,-q0);
slouken@11683
   243
	    if(z>=two24) {
slouken@11683
   244
		fw = (double)((int32_t)(twon24*z));
slouken@11683
   245
		iq[jz] = (int32_t)(z-two24*fw);
slouken@11683
   246
		jz += 1; q0 += 24;
slouken@11683
   247
		iq[jz] = (int32_t) fw;
slouken@11683
   248
	    } else iq[jz] = (int32_t) z ;
slouken@11683
   249
	}
slouken@2757
   250
slouken@2757
   251
    /* convert integer "bit" chunk to floating-point value */
slouken@11683
   252
	fw = scalbn(one,q0);
slouken@11683
   253
	for(i=jz;i>=0;i--) {
slouken@11683
   254
	    q[i] = fw*(double)iq[i]; fw*=twon24;
slouken@11683
   255
	}
slouken@2757
   256
slouken@2757
   257
    /* compute PIo2[0,...,jp]*q[jz,...,0] */
slouken@11683
   258
	for(i=jz;i>=0;i--) {
slouken@11683
   259
	    for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
slouken@11683
   260
	    fq[jz-i] = fw;
slouken@11683
   261
	}
slouken@2757
   262
slouken@2757
   263
    /* compress fq[] into y[] */
slouken@11683
   264
	switch(prec) {
slouken@11683
   265
	    case 0:
slouken@11683
   266
		fw = 0.0;
slouken@11683
   267
		for (i=jz;i>=0;i--) fw += fq[i];
slouken@11683
   268
		y[0] = (ih==0)? fw: -fw;
slouken@11683
   269
		break;
slouken@11683
   270
	    case 1:
slouken@11683
   271
	    case 2:
slouken@11683
   272
		fw = 0.0;
slouken@11683
   273
		for (i=jz;i>=0;i--) fw += fq[i];
slouken@11683
   274
		y[0] = (ih==0)? fw: -fw;
slouken@11683
   275
		fw = fq[0]-fw;
slouken@11683
   276
		for (i=1;i<=jz;i++) fw += fq[i];
slouken@11683
   277
		y[1] = (ih==0)? fw: -fw;
slouken@11683
   278
		break;
slouken@11683
   279
	    case 3:	/* painful */
slouken@11683
   280
		for (i=jz;i>0;i--) {
slouken@11683
   281
		    fw      = fq[i-1]+fq[i];
slouken@11683
   282
		    fq[i]  += fq[i-1]-fw;
slouken@11683
   283
		    fq[i-1] = fw;
slouken@11683
   284
		}
slouken@11683
   285
		for (i=jz;i>1;i--) {
slouken@11683
   286
		    fw      = fq[i-1]+fq[i];
slouken@11683
   287
		    fq[i]  += fq[i-1]-fw;
slouken@11683
   288
		    fq[i-1] = fw;
slouken@11683
   289
		}
slouken@11683
   290
		for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
slouken@11683
   291
		if(ih==0) {
slouken@11683
   292
		    y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
slouken@11683
   293
		} else {
slouken@11683
   294
		    y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
slouken@11683
   295
		}
slouken@11683
   296
	}
slouken@11683
   297
	return n&7;
slouken@2757
   298
}