audio: converting int32 to/from float shouldn't use doubles.
author Ryan C. Gordon <icculus@icculus.org>
Tue, 15 May 2018 01:04:11 -0400
changeset 11987 0c284754e25b
parent 11986 e307b74aa643
child 11988 8fa59be85dfb
audio: converting int32 to/from float shouldn't use doubles.

The concern is that a massive int sample, like 0x7FFFFFFF, can't be represented
exactly in a float32, which doesn't have enough bits to hold a whole number this
large, just to divide it to get a value between -1 and 1.
Previously we would convert to double, to get more bits, do the division, and
cast back to a float, but this is expensive.

Casting to double is more accurate, but it's 2x to 3x slower. Shifting out
the least significant byte of an int32, so that it definitely fits in a float,
and then dividing by 0x7FFFFF is still accurate to about 5 decimal places, and
the difference doesn't appear to be perceptible.
src/audio/SDL_audiotypecvt.c
     1.1 --- a/src/audio/SDL_audiotypecvt.c	Mon May 21 11:35:42 2018 -0400
     1.2 +++ b/src/audio/SDL_audiotypecvt.c	Tue May 15 01:04:11 2018 -0400
     1.3 @@ -62,7 +62,7 @@
     1.4  
     1.5  #define DIVBY128 0.0078125f
     1.6  #define DIVBY32768 0.000030517578125f
     1.7 -#define DIVBY2147483648 0.00000000046566128730773926
     1.8 +#define DIVBY8388607 0.00000011920930376163766f
     1.9  
    1.10  
    1.11  #if NEED_SCALAR_CONVERTER_FALLBACKS
    1.12 @@ -152,7 +152,7 @@
    1.13      LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
    1.14  
    1.15      for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
    1.16 -        *dst = (float) (((double) *src) * DIVBY2147483648);
    1.17 +        *dst = ((float) (*src>>8)) * DIVBY8388607;
    1.18      }
    1.19  
    1.20      if (cvt->filters[++cvt->filter_index]) {
    1.21 @@ -280,7 +280,7 @@
    1.22          } else if (sample < -1.0f) {
    1.23              *dst = -2147483647;
    1.24          } else {
    1.25 -            *dst = (Sint32)((double)sample * 2147483647.0);
    1.26 +            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
    1.27          }
    1.28      }
    1.29  
    1.30 @@ -509,16 +509,6 @@
    1.31      }
    1.32  }
    1.33  
    1.34 -#if defined(__GNUC__) && (__GNUC__ < 4)
    1.35 -/* these were added as of gcc-4.0: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19418 */
    1.36 -static inline __m128 _mm_castsi128_ps(__m128i __A) {
    1.37 -  return (__m128) __A;
    1.38 -}
    1.39 -static inline __m128i _mm_castps_si128(__m128 __A) {
    1.40 -  return (__m128i) __A;
    1.41 -}
    1.42 -#endif
    1.43 -
    1.44  static void SDLCALL
    1.45  SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    1.46  {
    1.47 @@ -530,7 +520,7 @@
    1.48  
    1.49      /* Get dst aligned to 16 bytes */
    1.50      for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
    1.51 -        *dst = (float) (((double) *src) * DIVBY2147483648);
    1.52 +        *dst = ((float) (*src>>8)) * DIVBY8388607;
    1.53      }
    1.54  
    1.55      SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    1.56 @@ -538,15 +528,11 @@
    1.57  
    1.58      {
    1.59          /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
    1.60 -        const __m128d divby2147483648 = _mm_set1_pd(DIVBY2147483648);
    1.61 +        const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
    1.62          const __m128i *mmsrc = (const __m128i *) src;
    1.63          while (i >= 4) {   /* 4 * sint32 */
    1.64 -            const __m128i ints = _mm_load_si128(mmsrc);
    1.65 -            /* bitshift the whole register over, so _mm_cvtepi32_pd can read the top ints in the bottom of the vector. */
    1.66 -            const __m128d doubles1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(ints, 8)), divby2147483648);
    1.67 -            const __m128d doubles2 = _mm_mul_pd(_mm_cvtepi32_pd(ints), divby2147483648);
    1.68 -            /* convert to float32, bitshift/or to get these into a vector to store. */
    1.69 -            _mm_store_ps(dst, _mm_castsi128_ps(_mm_or_si128(_mm_slli_si128(_mm_castps_si128(_mm_cvtpd_ps(doubles1)), 8), _mm_castps_si128(_mm_cvtpd_ps(doubles2)))));
    1.70 +            /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
    1.71 +            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srli_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
    1.72              i -= 4; mmsrc++; dst += 4;
    1.73          }
    1.74          src = (const Sint32 *) mmsrc;
    1.75 @@ -554,7 +540,7 @@
    1.76  
    1.77      /* Finish off any leftovers with scalar operations. */
    1.78      while (i) {
    1.79 -        *dst = (float) (((double) *src) * DIVBY2147483648);
    1.80 +        *dst = ((float) (*src>>8)) * DIVBY8388607;
    1.81          i--; src++; dst++;
    1.82      }
    1.83  
    1.84 @@ -755,7 +741,7 @@
    1.85  
    1.86      /* Get dst aligned to 16 bytes */
    1.87      for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
    1.88 -        *dst = (Sint32) (((double) *src) * 2147483647.0);
    1.89 +        *dst = ((Sint32)(*src * 8388607.0f)) << 8;
    1.90      }
    1.91  
    1.92      SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    1.93 @@ -763,14 +749,10 @@
    1.94  
    1.95      {
    1.96          /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
    1.97 -        const __m128d mulby2147483647 = _mm_set1_pd(2147483647.0);
    1.98 +        const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
    1.99          __m128i *mmdst = (__m128i *) dst;
   1.100          while (i >= 4) {   /* 4 * float32 */
   1.101 -            const __m128 floats = _mm_load_ps(src);
   1.102 -            /* bitshift the whole register over, so _mm_cvtps_pd can read the top floats in the bottom of the vector. */
   1.103 -            const __m128d doubles1 = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(floats), 8))), mulby2147483647);
   1.104 -            const __m128d doubles2 = _mm_mul_pd(_mm_cvtps_pd(floats), mulby2147483647);
   1.105 -            _mm_store_si128(mmdst, _mm_or_si128(_mm_slli_si128(_mm_cvtpd_epi32(doubles1), 8), _mm_cvtpd_epi32(doubles2)));
   1.106 +            _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby8388607)), 8));
   1.107              i -= 4; src += 4; mmdst++;
   1.108          }
   1.109          dst = (Sint32 *) mmdst;
   1.110 @@ -778,7 +760,7 @@
   1.111  
   1.112      /* Finish off any leftovers with scalar operations. */
   1.113      while (i) {
   1.114 -        *dst = (Sint32) (((double) *src) * 2147483647.0);
   1.115 +        *dst = ((Sint32)(*src * 8388607.0f)) << 8;
   1.116          i--; src++; dst++;
   1.117      }
   1.118