audio: Converting audio samples from int to float was using the wrong equation.
author Ryan C. Gordon <icculus@icculus.org>
date Tue, 29 Aug 2017 00:02:04 -0400
changeset 11403 4cdc242e4102
parent 11402 d459d8934897
child 11404 bd5b569b2a1b

Fixes Bugzilla #3775.
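
For reference: the old constants divided by the maximum positive sample value (127, 32767, 2147483647), but a signed n-bit sample ranges from -2^(n-1) to 2^(n-1)-1, so the correct divisor is 2^(n-1). With the old equation the most negative Sint16 (-32768) mapped to roughly -1.00003, just outside the nominal [-1.0, 1.0) range. A minimal scalar sketch of the corrected math (the helper names below are illustrative, not SDL functions):

    #include <stdint.h>

    /* Scale by 1/2^(bits-1) so the full signed range maps into [-1.0, 1.0). */
    static float s8_to_f32(int8_t v)   { return ((float) v) * 0.0078125f; }          /* 1/128 */
    static float s16_to_f32(int16_t v) { return ((float) v) * 0.000030517578125f; }  /* 1/32768 */

    /* Unsigned formats are biased around the midpoint: scale, then shift down by 1.0. */
    static float u8_to_f32(uint8_t v)  { return (((float) v) * 0.0078125f) - 1.0f; }

    /* 32-bit samples are multiplied in double precision, since a float's 24-bit
       mantissa cannot represent every Sint32 value. */
    static float s32_to_f32(int32_t v) { return (float) (((double) v) * 0.00000000046566128730773926); } /* 1/2^31 */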
src/audio/SDL_audiotypecvt.c
--- a/src/audio/SDL_audiotypecvt.c	Mon Aug 28 20:52:05 2017 -0700
+++ b/src/audio/SDL_audiotypecvt.c	Tue Aug 29 00:02:04 2017 -0400
@@ -60,9 +60,9 @@
 SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
 
 
-#define DIVBY127 0.0078740157480315f
-#define DIVBY32767 3.05185094759972e-05f
-#define DIVBY2147483647 4.6566128752458e-10f
+#define DIVBY128 0.0078125f
+#define DIVBY32768 0.000030517578125f
+#define DIVBY2147483648 0.00000000046566128730773926
 
 
 #if NEED_SCALAR_CONVERTER_FALLBACKS
@@ -76,7 +76,7 @@
     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
 
     for (i = cvt->len_cvt; i; --i, --src, --dst) {
-        *dst = (((float) *src) * DIVBY127);
+        *dst = ((float) *src) * DIVBY128;
     }
 
     cvt->len_cvt *= 4;
@@ -95,7 +95,7 @@
     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
 
     for (i = cvt->len_cvt; i; --i, --src, --dst) {
-        *dst = ((((float) *src) * DIVBY127) - 1.0f);
+        *dst = (((float) *src) * DIVBY128) - 1.0f;
     }
 
     cvt->len_cvt *= 4;
@@ -114,7 +114,7 @@
     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
 
     for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
-        *dst = (((float) *src) * DIVBY32767);
+        *dst = ((float) *src) * DIVBY32768;
     }
 
     cvt->len_cvt *= 2;
@@ -133,7 +133,7 @@
     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
 
     for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
-        *dst = ((((float) *src) * DIVBY32767) - 1.0f);
+        *dst = (((float) *src) * DIVBY32768) - 1.0f;
     }
 
     cvt->len_cvt *= 2;
@@ -152,7 +152,7 @@
     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
 
     for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
-        *dst = (float) (((double) *src) * DIVBY2147483647);
+        *dst = (float) (((double) *src) * DIVBY2147483648);
     }
 
     if (cvt->filters[++cvt->filter_index]) {
@@ -268,7 +268,7 @@
 
     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
-        *dst = (((float) *src) * DIVBY127);
+        *dst = ((float) *src) * DIVBY128;
     }
 
     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
@@ -279,7 +279,7 @@
         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
         const __m128i *mmsrc = (const __m128i *) src;
         const __m128i zero = _mm_setzero_si128();
-        const __m128 divby127 = _mm_set1_ps(DIVBY127);
+        const __m128 divby128 = _mm_set1_ps(DIVBY128);
         while (i >= 16) {   /* 16 * 8-bit */
             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
             /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
@@ -287,10 +287,10 @@
             /* right-shift-sign-extend gets us sint16 with the other set of values. */
             const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
             /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
-            const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby127);
-            const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby127);
-            const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby127);
-            const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby127);
+            const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
+            const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
+            const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
+            const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
             /* Interleave back into correct order, store. */
             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
@@ -306,7 +306,7 @@
 
     /* Finish off any leftovers with scalar operations. */
     while (i) {
-        *dst = (((float) *src) * DIVBY127);
+        *dst = ((float) *src) * DIVBY128;
         i--; src--; dst--;
     }
 
@@ -327,7 +327,7 @@
 
     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
-        *dst = ((((float) *src) * DIVBY127) - 1.0f);
+        *dst = (((float) *src) * DIVBY128) - 1.0f;
     }
 
     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
@@ -338,7 +338,7 @@
         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
         const __m128i *mmsrc = (const __m128i *) src;
         const __m128i zero = _mm_setzero_si128();
-        const __m128 divby127 = _mm_set1_ps(DIVBY127);
+        const __m128 divby128 = _mm_set1_ps(DIVBY128);
         const __m128 minus1 = _mm_set1_ps(-1.0f);
         while (i >= 16) {   /* 16 * 8-bit */
             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
@@ -348,10 +348,10 @@
             const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
             /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
             /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
-            const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby127), minus1);
-            const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby127), minus1);
-            const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby127), minus1);
-            const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby127), minus1);
+            const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
+            const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
+            const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
+            const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
             /* Interleave back into correct order, store. */
             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
@@ -367,7 +367,7 @@
 
     /* Finish off any leftovers with scalar operations. */
     while (i) {
-        *dst = ((((float) *src) * DIVBY127) - 1.0f);
+        *dst = (((float) *src) * DIVBY128) - 1.0f;
         i--; src--; dst--;
     }
 
@@ -388,7 +388,7 @@
 
     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
-        *dst = (((float) *src) * DIVBY32767);
+        *dst = ((float) *src) * DIVBY32768;
     }
 
     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
@@ -397,7 +397,7 @@
     /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
-        const __m128 divby32767 = _mm_set1_ps(DIVBY32767);
+        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
        while (i >= 8) {   /* 8 * 16-bit */
             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
             /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
@@ -405,8 +405,8 @@
             /* right-shift-sign-extend gets us sint32 with the other set of values. */
             const __m128i b = _mm_srai_epi32(ints, 16);
             /* Interleave these back into the right order, convert to float, multiply, store. */
-            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32767));
-            _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32767));
+            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
+            _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
             i -= 8; src -= 8; dst -= 8;
         }
     }
@@ -415,7 +415,7 @@
 
     /* Finish off any leftovers with scalar operations. */
     while (i) {
-        *dst = (((float) *src) * DIVBY32767);
+        *dst = ((float) *src) * DIVBY32768;
         i--; src--; dst--;
     }
 
@@ -436,7 +436,7 @@
 
     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
-        *dst = ((((float) *src) * DIVBY32767) - 1.0f);
+        *dst = (((float) *src) * DIVBY32768) - 1.0f;
     }
 
     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
@@ -445,7 +445,7 @@
     /* Make sure src is aligned too. */
     if ((((size_t) src) & 15) == 0) {
         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
-        const __m128 divby32767 = _mm_set1_ps(DIVBY32767);
+        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
        const __m128 minus1 = _mm_set1_ps(-1.0f);
        while (i >= 8) {   /* 8 * 16-bit */
             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
@@ -454,8 +454,8 @@
             /* right-shift-sign-extend gets us sint32 with the other set of values. */
             const __m128i b = _mm_srli_epi32(ints, 16);
             /* Interleave these back into the right order, convert to float, multiply, store. */
-            _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32767), minus1));
-            _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32767), minus1));
+            _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
+            _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
             i -= 8; src -= 8; dst -= 8;
         }
     }
@@ -464,7 +464,7 @@
 
     /* Finish off any leftovers with scalar operations. */
     while (i) {
-        *dst = ((((float) *src) * DIVBY32767) - 1.0f);
+        *dst = (((float) *src) * DIVBY32768) - 1.0f;
         i--; src--; dst--;
     }
 
@@ -485,7 +485,7 @@
 
     /* Get dst aligned to 16 bytes */
     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
-        *dst = (float) (((double) *src) * DIVBY2147483647);
+        *dst = (float) (((double) *src) * DIVBY2147483648);
     }
 
     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
@@ -493,13 +493,13 @@
 
     {
         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
-        const __m128d divby2147483647 = _mm_set1_pd(DIVBY2147483647);
+        const __m128d divby2147483648 = _mm_set1_pd(DIVBY2147483648);
         const __m128i *mmsrc = (const __m128i *) src;
         while (i >= 4) {   /* 4 * sint32 */
             const __m128i ints = _mm_load_si128(mmsrc);
             /* bitshift the whole register over, so _mm_cvtepi32_pd can read the top ints in the bottom of the vector. */
-            const __m128d doubles1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(ints, 8)), divby2147483647);
-            const __m128d doubles2 = _mm_mul_pd(_mm_cvtepi32_pd(ints), divby2147483647);
+            const __m128d doubles1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(ints, 8)), divby2147483648);
+            const __m128d doubles2 = _mm_mul_pd(_mm_cvtepi32_pd(ints), divby2147483648);
             /* convert to float32, bitshift/or to get these into a vector to store. */
             _mm_store_ps(dst, _mm_castsi128_ps(_mm_or_si128(_mm_slli_si128(_mm_castps_si128(_mm_cvtpd_ps(doubles1)), 8), _mm_castps_si128(_mm_cvtpd_ps(doubles2)))));
             i -= 4; mmsrc++; dst += 4;
@@ -509,7 +509,7 @@
 
     /* Finish off any leftovers with scalar operations. */
     while (i) {
-        *dst = (float) (((double) *src) * DIVBY2147483647);
+        *dst = (float) (((double) *src) * DIVBY2147483648);
         i--; src++; dst++;
     }
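
As a quick sanity check of the new constants (a standalone program, not part of this changeset), the extreme sample values now land exactly on the [-1.0, 1.0) boundaries:

    #include <stdio.h>

    #define DIVBY128        0.0078125f
    #define DIVBY32768      0.000030517578125f
    #define DIVBY2147483648 0.00000000046566128730773926

    int main(void)
    {
        /* Most negative samples map exactly to -1.0... */
        printf("%f\n", ((float) -128) * DIVBY128);                                /* -1.000000 */
        printf("%f\n", ((float) -32768) * DIVBY32768);                            /* -1.000000 */
        printf("%f\n", (float) (((double) (-2147483647 - 1)) * DIVBY2147483648)); /* -1.000000 */
        /* ...and most positive samples to just below 1.0. */
        printf("%.9f\n", ((float) 127) * DIVBY128);                               /* 0.992187500 */
        printf("%.9f\n", ((float) 32767) * DIVBY32768);                           /* 0.999969482 */
        return 0;
    }

With the old DIVBY32767 constant, the second line would have printed -1.000031 instead, overshooting the valid range for the most negative sample.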