src/audio/SDL_audiotypecvt.c
changeset 10815 71bbe3233508
parent 10814 938218064f67
child 10835 0e9e7a128391
     1.1 --- a/src/audio/SDL_audiotypecvt.c	Sun Jan 15 05:01:59 2017 -0500
     1.2 +++ b/src/audio/SDL_audiotypecvt.c	Mon Jan 16 00:58:28 2017 -0500
     1.3 @@ -22,14 +22,55 @@
     1.4  #include "../SDL_internal.h"
     1.5  #include "SDL_audio.h"
     1.6  #include "SDL_audio_c.h"
     1.7 +#include "SDL_cpuinfo.h"
     1.8  #include "SDL_assert.h"
     1.9  
    1.10 +/* !!! FIXME: write NEON code. */
    1.11 +#define HAVE_NEON_INTRINSICS 0
    1.12 +
    1.13 +/* !!! FIXME: wire this up to the configure script, etc. */
    1.14 +#define HAVE_SSE2_INTRINSICS 0
    1.15 +
    1.16 +#if HAVE_SSE2_INTRINSICS
    1.17 +#include <emmintrin.h>
    1.18 +#endif
    1.19 +
    1.20 +#if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
    1.21 +#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
    1.22 +#elif __MACOSX__ && HAVE_SSE2_INTRINSICS
    1.23 +#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
    1.24 +#elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
    1.25 +#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
    1.26 +#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
    1.27 +#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
    1.28 +#endif
    1.29 +
    1.30 +/* Set to zero if platform is guaranteed to use a SIMD codepath here. */
    1.31 +#ifndef NEED_SCALAR_CONVERTER_FALLBACKS
    1.32 +#define NEED_SCALAR_CONVERTER_FALLBACKS 1
    1.33 +#endif
    1.34 +
    1.35 +/* Function pointers set to a CPU-specific implementation. */
    1.36 +SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
    1.37 +SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
    1.38 +SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
    1.39 +SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
    1.40 +SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
    1.41 +SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
    1.42 +SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
    1.43 +SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
    1.44 +SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
    1.45 +SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
    1.46 +
    1.47 +
    1.48  #define DIVBY127 0.0078740157480315f
    1.49  #define DIVBY32767 3.05185094759972e-05f
    1.50  #define DIVBY2147483647 4.6566128752458e-10f
    1.51  
    1.52 -void SDLCALL
    1.53 -SDL_Convert_S8_to_F32(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    1.54 +
    1.55 +#if NEED_SCALAR_CONVERTER_FALLBACKS
    1.56 +static void SDLCALL
    1.57 +SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    1.58  {
    1.59      const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    1.60      float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    1.61 @@ -47,8 +88,8 @@
    1.62      }
    1.63  }
    1.64  
    1.65 -void SDLCALL
    1.66 -SDL_Convert_U8_to_F32(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    1.67 +static void SDLCALL
    1.68 +SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    1.69  {
    1.70      const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    1.71      float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    1.72 @@ -66,8 +107,8 @@
    1.73      }
    1.74  }
    1.75  
    1.76 -void SDLCALL
    1.77 -SDL_Convert_S16_to_F32(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    1.78 +static void SDLCALL
    1.79 +SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    1.80  {
    1.81      const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    1.82      float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    1.83 @@ -85,8 +126,8 @@
    1.84      }
    1.85  }
    1.86  
    1.87 -void SDLCALL
    1.88 -SDL_Convert_U16_to_F32(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    1.89 +static void SDLCALL
    1.90 +SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    1.91  {
    1.92      const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    1.93      float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    1.94 @@ -104,8 +145,8 @@
    1.95      }
    1.96  }
    1.97  
    1.98 -void SDLCALL
    1.99 -SDL_Convert_S32_to_F32(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.100 +static void SDLCALL
   1.101 +SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.102  {
   1.103      const Sint32 *src = (const Sint32 *) cvt->buf;
   1.104      float *dst = (float *) cvt->buf;
   1.105 @@ -122,8 +163,8 @@
   1.106      }
   1.107  }
   1.108  
   1.109 -void SDLCALL
   1.110 -SDL_Convert_F32_to_S8(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.111 +static void SDLCALL
   1.112 +SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.113  {
   1.114      const float *src = (const float *) cvt->buf;
   1.115      Sint8 *dst = (Sint8 *) cvt->buf;
   1.116 @@ -141,8 +182,8 @@
   1.117      }
   1.118  }
   1.119  
   1.120 -void SDLCALL
   1.121 -SDL_Convert_F32_to_U8(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.122 +static void SDLCALL
   1.123 +SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.124  {
   1.125      const float *src = (const float *) cvt->buf;
   1.126      Uint8 *dst = (Uint8 *) cvt->buf;
   1.127 @@ -160,8 +201,8 @@
   1.128      }
   1.129  }
   1.130  
   1.131 -void SDLCALL
   1.132 -SDL_Convert_F32_to_S16(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.133 +static void SDLCALL
   1.134 +SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.135  {
   1.136      const float *src = (const float *) cvt->buf;
   1.137      Sint16 *dst = (Sint16 *) cvt->buf;
   1.138 @@ -179,8 +220,8 @@
   1.139      }
   1.140  }
   1.141  
   1.142 -void SDLCALL
   1.143 -SDL_Convert_F32_to_U16(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.144 +static void SDLCALL
   1.145 +SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.146  {
   1.147      const float *src = (const float *) cvt->buf;
   1.148      Uint16 *dst = (Uint16 *) cvt->buf;
   1.149 @@ -198,8 +239,8 @@
   1.150      }
   1.151  }
   1.152  
   1.153 -void SDLCALL
   1.154 -SDL_Convert_F32_to_S32(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.155 +static void SDLCALL
   1.156 +SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.157  {
   1.158      const float *src = (const float *) cvt->buf;
   1.159      Sint32 *dst = (Sint32 *) cvt->buf;
   1.160 @@ -215,5 +256,532 @@
   1.161          cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   1.162      }
   1.163  }
   1.164 +#endif
   1.165 +
   1.166 +
   1.167 +#if HAVE_SSE2_INTRINSICS
   1.168 +static void SDLCALL
   1.169 +SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.170 +{
   1.171 +    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   1.172 +    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   1.173 +    int i;
   1.174 +
   1.175 +    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
   1.176 +
   1.177 +    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   1.178 +    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   1.179 +        *dst = (((float) *src) * DIVBY127);
   1.180 +    }
   1.181 +
   1.182 +    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
   1.183 +    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1.184 +
   1.185 +    /* Make sure src is aligned too. */
   1.186 +    if ((((size_t) src) & 15) == 0) {
   1.187 +        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   1.188 +        const __m128i *mmsrc = (const __m128i *) src;
   1.189 +        const __m128i zero = _mm_setzero_si128();
   1.190 +        const __m128 divby127 = _mm_set1_ps(DIVBY127);
   1.191 +        while (i >= 16) {   /* 16 * 8-bit */
   1.192 +            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
   1.193 +            /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
   1.194 +            const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
   1.195 +            /* right-shift-sign-extend gets us sint16 with the other set of values. */
   1.196 +            const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
   1.197 +            /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
   1.198 +            const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby127);
   1.199 +            const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby127);
   1.200 +            const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby127);
   1.201 +            const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby127);
   1.202 +            /* Interleave back into correct order, store. */
   1.203 +            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
   1.204 +            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
   1.205 +            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
   1.206 +            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
   1.207 +            i -= 16; mmsrc--; dst -= 16;
   1.208 +        }
   1.209 +
   1.210 +        src = (const Sint8 *) mmsrc;
   1.211 +    }
   1.212 +
   1.213 +    src += 15; dst += 15;  /* adjust for any scalar finishing. */
   1.214 +
   1.215 +    /* Finish off any leftovers with scalar operations. */
   1.216 +    while (i) {
   1.217 +        *dst = (((float) *src) * DIVBY127);
   1.218 +        i--; src--; dst--;
   1.219 +    }
   1.220 +
   1.221 +    cvt->len_cvt *= 4;
   1.222 +    if (cvt->filters[++cvt->filter_index]) {
   1.223 +        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   1.224 +    }
   1.225 +}
   1.226 +
   1.227 +static void SDLCALL
   1.228 +SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.229 +{
   1.230 +    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   1.231 +    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   1.232 +    int i;
   1.233 +
   1.234 +    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
   1.235 +
   1.236 +    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   1.237 +    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   1.238 +        *dst = ((((float) *src) * DIVBY127) - 1.0f);
   1.239 +    }
   1.240 +
   1.241 +    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
   1.242 +    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1.243 +
   1.244 +    /* Make sure src is aligned too. */
   1.245 +    if ((((size_t) src) & 15) == 0) {
   1.246 +        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   1.247 +        const __m128i *mmsrc = (const __m128i *) src;
   1.248 +        const __m128i zero = _mm_setzero_si128();
   1.249 +        const __m128 divby127 = _mm_set1_ps(DIVBY127);
   1.250 +        const __m128 minus1 = _mm_set1_ps(-1.0f);
   1.251 +        while (i >= 16) {   /* 16 * 8-bit */
   1.252 +            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
   1.253 +            /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
   1.254 +            const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
   1.255 +            /* right-shift-zero-extend gets us uint16 with the other set of values. */
   1.256 +            const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
   1.257 +            /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
   1.258 +            /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
   1.259 +            const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby127), minus1);
   1.260 +            const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby127), minus1);
   1.261 +            const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby127), minus1);
   1.262 +            const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby127), minus1);
   1.263 +            /* Interleave back into correct order, store. */
   1.264 +            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
   1.265 +            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
   1.266 +            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
   1.267 +            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
   1.268 +            i -= 16; mmsrc--; dst -= 16;
   1.269 +        }
   1.270 +
   1.271 +        src = (const Uint8 *) mmsrc;
   1.272 +    }
   1.273 +
   1.274 +    src += 15; dst += 15;  /* adjust for any scalar finishing. */
   1.275 +
   1.276 +    /* Finish off any leftovers with scalar operations. */
   1.277 +    while (i) {
   1.278 +        *dst = ((((float) *src) * DIVBY127) - 1.0f);
   1.279 +        i--; src--; dst--;
   1.280 +    }
   1.281 +
   1.282 +    cvt->len_cvt *= 4;
   1.283 +    if (cvt->filters[++cvt->filter_index]) {
   1.284 +        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   1.285 +    }
   1.286 +}
   1.287 +
   1.288 +static void SDLCALL
   1.289 +SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.290 +{
   1.291 +    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   1.292 +    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   1.293 +    int i;
   1.294 +
   1.295 +    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
   1.296 +
   1.297 +    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   1.298 +    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   1.299 +        *dst = (((float) *src) * DIVBY32767);
   1.300 +    }
   1.301 +
   1.302 +    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   1.303 +    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1.304 +
   1.305 +    /* Make sure src is aligned too. */
   1.306 +    if ((((size_t) src) & 15) == 0) {
   1.307 +        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   1.308 +        const __m128 divby32767 = _mm_set1_ps(DIVBY32767);
   1.309 +        while (i >= 8) {   /* 8 * 16-bit */
   1.310 +            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   1.311 +            /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
   1.312 +            const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
   1.313 +            /* right-shift-sign-extend gets us sint32 with the other set of values. */
   1.314 +            const __m128i b = _mm_srai_epi32(ints, 16);
   1.315 +            /* Interleave these back into the right order, convert to float, multiply, store. */
   1.316 +            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32767));
   1.317 +            _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32767));
   1.318 +            i -= 8; src -= 8; dst -= 8;
   1.319 +        }
   1.320 +    }
   1.321 +
   1.322 +    src += 7; dst += 7;  /* adjust for any scalar finishing. */
   1.323 +
   1.324 +    /* Finish off any leftovers with scalar operations. */
   1.325 +    while (i) {
   1.326 +        *dst = (((float) *src) * DIVBY32767);
   1.327 +        i--; src--; dst--;
   1.328 +    }
   1.329 +
   1.330 +    cvt->len_cvt *= 2;
   1.331 +    if (cvt->filters[++cvt->filter_index]) {
   1.332 +        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   1.333 +    }
   1.334 +}
   1.335 +
   1.336 +static void SDLCALL
   1.337 +SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.338 +{
   1.339 +    const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   1.340 +    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   1.341 +    int i;
   1.342 +
   1.343 +    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
   1.344 +
   1.345 +    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   1.346 +    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   1.347 +        *dst = ((((float) *src) * DIVBY32767) - 1.0f);
   1.348 +    }
   1.349 +
   1.350 +    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   1.351 +    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1.352 +
   1.353 +    /* Make sure src is aligned too. */
   1.354 +    if ((((size_t) src) & 15) == 0) {
   1.355 +        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   1.356 +        const __m128 divby32767 = _mm_set1_ps(DIVBY32767);
   1.357 +        const __m128 minus1 = _mm_set1_ps(1.0f);
   1.358 +        while (i >= 8) {   /* 8 * 16-bit */
   1.359 +            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   1.360 +            /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
   1.361 +            const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
   1.362 +            /* right-shift-sign-extend gets us sint32 with the other set of values. */
   1.363 +            const __m128i b = _mm_srli_epi32(ints, 16);
   1.364 +            /* Interleave these back into the right order, convert to float, multiply, store. */
   1.365 +            _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32767), minus1));
   1.366 +            _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32767), minus1));
   1.367 +            i -= 8; src -= 8; dst -= 8;
   1.368 +        }
   1.369 +    }
   1.370 +
   1.371 +    src += 7; dst += 7;  /* adjust for any scalar finishing. */
   1.372 +
   1.373 +    /* Finish off any leftovers with scalar operations. */
   1.374 +    while (i) {
   1.375 +        *dst = ((((float) *src) * DIVBY32767) - 1.0f);
   1.376 +        i--; src--; dst--;
   1.377 +    }
   1.378 +
   1.379 +    cvt->len_cvt *= 2;
   1.380 +    if (cvt->filters[++cvt->filter_index]) {
   1.381 +        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   1.382 +    }
   1.383 +}
   1.384 +
   1.385 +static void SDLCALL
   1.386 +SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.387 +{
   1.388 +    const Sint32 *src = (const Sint32 *) cvt->buf;
   1.389 +    float *dst = (float *) cvt->buf;
   1.390 +    int i;
   1.391 +
   1.392 +    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
   1.393 +
   1.394 +    /* Get dst aligned to 16 bytes */
   1.395 +    for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   1.396 +        *dst = (float) (((double) *src) * DIVBY2147483647);
   1.397 +    }
   1.398 +
   1.399 +    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1.400 +    SDL_assert(!i || ((((size_t) src) & 15) == 0));
   1.401 +
   1.402 +    {
   1.403 +        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   1.404 +        const __m128d divby2147483647 = _mm_set1_pd(DIVBY2147483647);
   1.405 +        const __m128i *mmsrc = (const __m128i *) src;
   1.406 +        while (i >= 4) {   /* 4 * sint32 */
   1.407 +            const __m128i ints = _mm_load_si128(mmsrc);
   1.408 +            /* bitshift the whole register over, so _mm_cvtepi32_pd can read the top ints in the bottom of the vector. */
   1.409 +            const __m128d doubles1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_bsrli_si128(ints, 8)), divby2147483647);
   1.410 +            const __m128d doubles2 = _mm_mul_pd(_mm_cvtepi32_pd(ints), divby2147483647);
   1.411 +            /* convert to float32, bitshift/or to get these into a vector to store. */
   1.412 +            _mm_store_ps(dst, _mm_castsi128_ps(_mm_or_si128(_mm_bslli_si128(_mm_castps_si128(_mm_cvtpd_ps(doubles1)), 8), _mm_castps_si128(_mm_cvtpd_ps(doubles2)))));
   1.413 +            i -= 4; mmsrc++; dst += 4;
   1.414 +        }
   1.415 +        src = (const Sint32 *) mmsrc;
   1.416 +    }
   1.417 +
   1.418 +    /* Finish off any leftovers with scalar operations. */
   1.419 +    while (i) {
   1.420 +        *dst = (float) (((double) *src) * DIVBY2147483647);
   1.421 +        i--; src++; dst++;
   1.422 +    }
   1.423 +
   1.424 +    if (cvt->filters[++cvt->filter_index]) {
   1.425 +        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   1.426 +    }
   1.427 +}
   1.428 +
   1.429 +static void SDLCALL
   1.430 +SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.431 +{
   1.432 +    const float *src = (const float *) cvt->buf;
   1.433 +    Sint8 *dst = (Sint8 *) cvt->buf;
   1.434 +    int i;
   1.435 +
   1.436 +    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
   1.437 +
   1.438 +    /* Get dst aligned to 16 bytes */
   1.439 +    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   1.440 +        *dst = (Sint8) (*src * 127.0f);
   1.441 +    }
   1.442 +
   1.443 +    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1.444 +
   1.445 +    /* Make sure src is aligned too. */
   1.446 +    if ((((size_t) src) & 15) == 0) {
   1.447 +        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   1.448 +        const __m128 mulby127 = _mm_set1_ps(127.0f);
   1.449 +        __m128i *mmdst = (__m128i *) dst;
   1.450 +        while (i >= 16) {   /* 16 * float32 */
   1.451 +            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby127));  /* load 4 floats, convert to sint32 */
   1.452 +            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby127));  /* load 4 floats, convert to sint32 */
   1.453 +            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+8), mulby127));  /* load 4 floats, convert to sint32 */
   1.454 +            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+12), mulby127));  /* load 4 floats, convert to sint32 */
   1.455 +            _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   1.456 +            i -= 16; src += 16; mmdst++;
   1.457 +        }
   1.458 +        dst = (Sint8 *) mmdst;
   1.459 +    }
   1.460 +
   1.461 +    /* Finish off any leftovers with scalar operations. */
   1.462 +    while (i) {
   1.463 +        *dst = (Sint8) (*src * 127.0f);
   1.464 +        i--; src++; dst++;
   1.465 +    }
   1.466 +
   1.467 +    cvt->len_cvt /= 4;
   1.468 +    if (cvt->filters[++cvt->filter_index]) {
   1.469 +        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   1.470 +    }
   1.471 +}
   1.472 +
   1.473 +static void SDLCALL
   1.474 +SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.475 +{
   1.476 +    const float *src = (const float *) cvt->buf;
   1.477 +    Uint8 *dst = (Uint8 *) cvt->buf;
   1.478 +    int i;
   1.479 +
   1.480 +    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
   1.481 +
   1.482 +    /* Get dst aligned to 16 bytes */
   1.483 +    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   1.484 +        *dst = (Uint8) ((*src + 1.0f) * 127.0f);
   1.485 +    }
   1.486 +
   1.487 +    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1.488 +
   1.489 +    /* Make sure src is aligned too. */
   1.490 +    if ((((size_t) src) & 15) == 0) {
   1.491 +        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   1.492 +        const __m128 add1 = _mm_set1_ps(1.0f);
   1.493 +        const __m128 mulby127 = _mm_set1_ps(127.0f);
   1.494 +        __m128i *mmdst = (__m128i *) dst;
   1.495 +        while (i >= 16) {   /* 16 * float32 */
   1.496 +            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src), add1), mulby127));  /* load 4 floats, convert to sint32 */
   1.497 +            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+4), add1), mulby127));  /* load 4 floats, convert to sint32 */
   1.498 +            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+8), add1), mulby127));  /* load 4 floats, convert to sint32 */
   1.499 +            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+12), add1), mulby127));  /* load 4 floats, convert to sint32 */
   1.500 +            _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   1.501 +            i -= 16; src += 16; mmdst++;
   1.502 +        }
   1.503 +        dst = (Uint8 *) mmdst;
   1.504 +    }
   1.505 +
   1.506 +    /* Finish off any leftovers with scalar operations. */
   1.507 +    while (i) {
   1.508 +        *dst = (Uint8) ((*src + 1.0f) * 127.0f);
   1.509 +        i--; src++; dst++;
   1.510 +    }
   1.511 +
   1.512 +    cvt->len_cvt /= 4;
   1.513 +    if (cvt->filters[++cvt->filter_index]) {
   1.514 +        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   1.515 +    }
   1.516 +}
   1.517 +
   1.518 +static void SDLCALL
   1.519 +SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.520 +{
   1.521 +    const float *src = (const float *) cvt->buf;
   1.522 +    Sint16 *dst = (Sint16 *) cvt->buf;
   1.523 +    int i;
   1.524 +
   1.525 +    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
   1.526 +
   1.527 +    /* Get dst aligned to 16 bytes */
   1.528 +    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   1.529 +        *dst = (Sint16) (*src * 32767.0f);
   1.530 +    }
   1.531 +
   1.532 +    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1.533 +
   1.534 +    /* Make sure src is aligned too. */
   1.535 +    if ((((size_t) src) & 15) == 0) {
   1.536 +        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   1.537 +        const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   1.538 +        __m128i *mmdst = (__m128i *) dst;
   1.539 +        while (i >= 8) {   /* 8 * float32 */
   1.540 +            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767));  /* load 4 floats, convert to sint32 */
   1.541 +            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767));  /* load 4 floats, convert to sint32 */
   1.542 +            _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
   1.543 +            i -= 8; src += 8; mmdst++;
   1.544 +        }
   1.545 +        dst = (Sint16 *) mmdst;
   1.546 +    }
   1.547 +
   1.548 +    /* Finish off any leftovers with scalar operations. */
   1.549 +    while (i) {
   1.550 +        *dst = (((float) *src) * DIVBY32767);
   1.551 +        i--; src++; dst++;
   1.552 +    }
   1.553 +
   1.554 +    cvt->len_cvt /= 2;
   1.555 +    if (cvt->filters[++cvt->filter_index]) {
   1.556 +        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   1.557 +    }
   1.558 +}
   1.559 +
   1.560 +static void SDLCALL
   1.561 +SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.562 +{
   1.563 +    const float *src = (const float *) cvt->buf;
   1.564 +    Uint16 *dst = (Uint16 *) cvt->buf;
   1.565 +    int i;
   1.566 +
   1.567 +    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
   1.568 +
   1.569 +    /* Get dst aligned to 16 bytes */
   1.570 +    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   1.571 +        *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
   1.572 +    }
   1.573 +
   1.574 +    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1.575 +
   1.576 +    /* Make sure src is aligned too. */
   1.577 +    if ((((size_t) src) & 15) == 0) {
   1.578 +        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   1.579 +        /* This calculates differently than the scalar path because SSE2 can't
   1.580 +           pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
   1.581 +           saturation, so that would corrupt our data. _mm_packus_epi32 exists,
   1.582 +           but not before SSE 4.1. So we convert from float to sint16, packing
   1.583 +           that down with legit signed saturation, and then xor the top bit
   1.584 +           against 1. This results in the correct unsigned 16-bit value, even
   1.585 +           though it looks like dark magic. */
   1.586 +        const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   1.587 +        const __m128i topbit = _mm_set1_epi16(-32768);
   1.588 +        __m128i *mmdst = (__m128i *) dst;
   1.589 +        while (i >= 8) {   /* 8 * float32 */
   1.590 +            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767));  /* load 4 floats, convert to sint32 */
   1.591 +            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767));  /* load 4 floats, convert to sint32 */
   1.592 +            _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
   1.593 +            i -= 8; src += 8; mmdst++;
   1.594 +        }
   1.595 +        dst = (Uint16 *) mmdst;
   1.596 +    }
   1.597 +
   1.598 +    /* Finish off any leftovers with scalar operations. */
   1.599 +    while (i) {
   1.600 +        *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
   1.601 +        i--; src++; dst++;
   1.602 +    }
   1.603 +
   1.604 +    cvt->len_cvt /= 2;
   1.605 +    if (cvt->filters[++cvt->filter_index]) {
   1.606 +        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   1.607 +    }
   1.608 +}
   1.609 +
   1.610 +static void SDLCALL
   1.611 +SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   1.612 +{
   1.613 +    const float *src = (const float *) cvt->buf;
   1.614 +    Sint32 *dst = (Sint32 *) cvt->buf;
   1.615 +    int i;
   1.616 +
   1.617 +    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
   1.618 +
   1.619 +    /* Get dst aligned to 16 bytes */
   1.620 +    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   1.621 +        *dst = (Sint32) (((double) *src) * 2147483647.0);
   1.622 +    }
   1.623 +
   1.624 +    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   1.625 +    SDL_assert(!i || ((((size_t) src) & 15) == 0));
   1.626 +
   1.627 +    {
   1.628 +        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   1.629 +        const __m128d mulby2147483647 = _mm_set1_pd(2147483647.0);
   1.630 +        __m128i *mmdst = (__m128i *) dst;
   1.631 +        while (i >= 4) {   /* 4 * float32 */
   1.632 +            const __m128 floats = _mm_load_ps(src);
   1.633 +            /* bitshift the whole register over, so _mm_cvtps_pd can read the top floats in the bottom of the vector. */
   1.634 +            const __m128d doubles1 = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_bsrli_si128(_mm_castps_si128(floats), 8))), mulby2147483647);
   1.635 +            const __m128d doubles2 = _mm_mul_pd(_mm_cvtps_pd(floats), mulby2147483647);
   1.636 +            _mm_store_si128(mmdst, _mm_or_si128(_mm_bslli_si128(_mm_cvtpd_epi32(doubles1), 8), _mm_cvtpd_epi32(doubles2)));
   1.637 +            i -= 4; src += 4; mmdst++;
   1.638 +        }
   1.639 +        dst = (Sint32 *) mmdst;
   1.640 +    }
   1.641 +
   1.642 +    /* Finish off any leftovers with scalar operations. */
   1.643 +    while (i) {
   1.644 +        *dst = (Sint32) (((double) *src) * 2147483647.0);
   1.645 +        i--; src++; dst++;
   1.646 +    }
   1.647 +
   1.648 +    if (cvt->filters[++cvt->filter_index]) {
   1.649 +        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   1.650 +    }
   1.651 +}
   1.652 +#endif
   1.653 +
   1.654 +
   1.655 +void SDL_ChooseAudioConverters(void)
   1.656 +{
   1.657 +    static SDL_bool converters_chosen = SDL_FALSE;
   1.658 +
   1.659 +    if (converters_chosen) {
   1.660 +        return;
   1.661 +    }
   1.662 +
   1.663 +    #define SET_CONVERTER_FUNCS(fntype) \
   1.664 +        SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
   1.665 +        SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
   1.666 +        SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
   1.667 +        SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
   1.668 +        SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
   1.669 +        SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
   1.670 +        SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
   1.671 +        SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
   1.672 +        SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
   1.673 +        SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
   1.674 +        converters_chosen = SDL_TRUE
   1.675 +
   1.676 +    #if HAVE_SSE2_INTRINSICS
   1.677 +    if (SDL_HasSSE2()) {
   1.678 +        SET_CONVERTER_FUNCS(SSE2);
   1.679 +        return;
   1.680 +    }
   1.681 +    #endif
   1.682 +
   1.683 +    #if NEED_SCALAR_CONVERTER_FALLBACKS
   1.684 +    SET_CONVERTER_FUNCS(Scalar);
   1.685 +    #endif
   1.686 +
   1.687 +    #undef SET_CONVERTER_FUNCS
   1.688 +
   1.689 +    SDL_assert(converters_chosen == SDL_TRUE);
   1.690 +}
   1.691  
   1.692  /* vi: set ts=4 sw=4 expandtab: */