src/audio/SDL_audiotypecvt.c
author Sam Lantinga <slouken@libsdl.org>
Wed, 18 Oct 2017 19:30:47 -0700
changeset 11633 37aca00967db
parent 11406 f40c2dedaded
child 11811 5d94cb6b24d3
permissions -rw-r--r--
Fixed bug 3876 - Resampling of certain sounds adds heavy distortion

Simon Hug

Patch that adds [-1, 1] clamping to the scalar audio type conversions.

The distortion may come from the SDL_Convert_F32_to_X_Scalar functions. They don't clamp the float value to [-1, 1], so when they cast it to the target integer type the value may be too large or too small for that type and get truncated, causing horrible noise.

The attached patch adds clamping, but I don't know if that's the preferred way to fix this. For x86 (without SSE) the compiler (I tested MSVC) seems to generate a horrible amount of x87 code for it. It's a bit better with SSE, but probably still quite a performance hit. And with SSE2 it uses a branchless approach with maxss and minss.
     1 /*
     2   Simple DirectMedia Layer
     3   Copyright (C) 1997-2017 Sam Lantinga <slouken@libsdl.org>
     4 
     5   This software is provided 'as-is', without any express or implied
     6   warranty.  In no event will the authors be held liable for any damages
     7   arising from the use of this software.
     8 
     9   Permission is granted to anyone to use this software for any purpose,
    10   including commercial applications, and to alter it and redistribute it
    11   freely, subject to the following restrictions:
    12 
    13   1. The origin of this software must not be misrepresented; you must not
    14      claim that you wrote the original software. If you use this software
    15      in a product, an acknowledgment in the product documentation would be
    16      appreciated but is not required.
    17   2. Altered source versions must be plainly marked as such, and must not be
    18      misrepresented as being the original software.
    19   3. This notice may not be removed or altered from any source distribution.
    20 */
    21 
    22 #include "../SDL_internal.h"
    23 #include "SDL_audio.h"
    24 #include "SDL_audio_c.h"
    25 #include "SDL_cpuinfo.h"
    26 #include "SDL_assert.h"
    27 
    28 /* !!! FIXME: write NEON code. */
    29 #define HAVE_NEON_INTRINSICS 0
    30 
    31 #ifdef __SSE2__
    32 #define HAVE_SSE2_INTRINSICS 1
    33 #endif
    34 
    35 #if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
    36 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
    37 #elif __MACOSX__ && HAVE_SSE2_INTRINSICS
    38 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
    39 #elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
    40 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
    41 #elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
    42 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
    43 #endif
    44 
    45 /* Set to zero if platform is guaranteed to use a SIMD codepath here. */
    46 #ifndef NEED_SCALAR_CONVERTER_FALLBACKS
    47 #define NEED_SCALAR_CONVERTER_FALLBACKS 1
    48 #endif
    49 
    50 /* Function pointers set to a CPU-specific implementation. */
    51 SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
    52 SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
    53 SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
    54 SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
    55 SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
    56 SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
    57 SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
    58 SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
    59 SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
    60 SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
    61 
    62 
    63 #define DIVBY128 0.0078125f
    64 #define DIVBY32768 0.000030517578125f
    65 #define DIVBY2147483648 0.00000000046566128730773926
    66 
    67 
    68 #if NEED_SCALAR_CONVERTER_FALLBACKS
    69 static void SDLCALL
    70 SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    71 {
    72     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    73     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    74     int i;
    75 
    76     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
    77 
    78     for (i = cvt->len_cvt; i; --i, --src, --dst) {
    79         *dst = ((float) *src) * DIVBY128;
    80     }
    81 
    82     cvt->len_cvt *= 4;
    83     if (cvt->filters[++cvt->filter_index]) {
    84         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    85     }
    86 }
    87 
    88 static void SDLCALL
    89 SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    90 {
    91     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    92     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    93     int i;
    94 
    95     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
    96 
    97     for (i = cvt->len_cvt; i; --i, --src, --dst) {
    98         *dst = (((float) *src) * DIVBY128) - 1.0f;
    99     }
   100 
   101     cvt->len_cvt *= 4;
   102     if (cvt->filters[++cvt->filter_index]) {
   103         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   104     }
   105 }
   106 
   107 static void SDLCALL
   108 SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   109 {
   110     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   111     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   112     int i;
   113 
   114     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
   115 
   116     for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
   117         *dst = ((float) *src) * DIVBY32768;
   118     }
   119 
   120     cvt->len_cvt *= 2;
   121     if (cvt->filters[++cvt->filter_index]) {
   122         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   123     }
   124 }
   125 
   126 static void SDLCALL
   127 SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   128 {
   129     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   130     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   131     int i;
   132 
   133     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
   134 
   135     for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
   136         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   137     }
   138 
   139     cvt->len_cvt *= 2;
   140     if (cvt->filters[++cvt->filter_index]) {
   141         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   142     }
   143 }
   144 
   145 static void SDLCALL
   146 SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   147 {
   148     const Sint32 *src = (const Sint32 *) cvt->buf;
   149     float *dst = (float *) cvt->buf;
   150     int i;
   151 
   152     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
   153 
   154     for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
   155         *dst = (float) (((double) *src) * DIVBY2147483648);
   156     }
   157 
   158     if (cvt->filters[++cvt->filter_index]) {
   159         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   160     }
   161 }
   162 
   163 static void SDLCALL
   164 SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   165 {
   166     const float *src = (const float *) cvt->buf;
   167     Sint8 *dst = (Sint8 *) cvt->buf;
   168     int i;
   169 
   170     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
   171 
   172     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   173         const float sample = *src;
   174         if (sample > 1.0f) {
   175             *dst = 127;
   176         } else if (sample < -1.0f) {
   177             *dst = -127;
   178         } else {
   179             *dst = (Sint8)(sample * 127.0f);
   180         }
   181     }
   182 
   183     cvt->len_cvt /= 4;
   184     if (cvt->filters[++cvt->filter_index]) {
   185         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   186     }
   187 }
   188 
   189 static void SDLCALL
   190 SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   191 {
   192     const float *src = (const float *) cvt->buf;
   193     Uint8 *dst = (Uint8 *) cvt->buf;
   194     int i;
   195 
   196     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
   197 
   198     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   199         const float sample = *src;
   200         if (sample > 1.0f) {
   201             *dst = 255;
   202         } else if (sample < -1.0f) {
   203             *dst = 0;
   204         } else {
   205             *dst = (Uint8)((sample + 1.0f) * 127.0f);
   206         }
   207     }
   208 
   209     cvt->len_cvt /= 4;
   210     if (cvt->filters[++cvt->filter_index]) {
   211         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   212     }
   213 }
   214 
   215 static void SDLCALL
   216 SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   217 {
   218     const float *src = (const float *) cvt->buf;
   219     Sint16 *dst = (Sint16 *) cvt->buf;
   220     int i;
   221 
   222     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
   223 
   224     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   225         const float sample = *src;
   226         if (sample > 1.0f) {
   227             *dst = 32767;
   228         } else if (sample < -1.0f) {
   229             *dst = -32767;
   230         } else {
   231             *dst = (Sint16)(sample * 32767.0f);
   232         }
   233     }
   234 
   235     cvt->len_cvt /= 2;
   236     if (cvt->filters[++cvt->filter_index]) {
   237         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   238     }
   239 }
   240 
   241 static void SDLCALL
   242 SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   243 {
   244     const float *src = (const float *) cvt->buf;
   245     Uint16 *dst = (Uint16 *) cvt->buf;
   246     int i;
   247 
   248     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
   249 
   250     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   251         const float sample = *src;
   252         if (sample > 1.0f) {
   253             *dst = 65534;
   254         } else if (sample < -1.0f) {
   255             *dst = 0;
   256         } else {
   257             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   258         }
   259     }
   260 
   261     cvt->len_cvt /= 2;
   262     if (cvt->filters[++cvt->filter_index]) {
   263         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   264     }
   265 }
   266 
   267 static void SDLCALL
   268 SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   269 {
   270     const float *src = (const float *) cvt->buf;
   271     Sint32 *dst = (Sint32 *) cvt->buf;
   272     int i;
   273 
   274     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
   275 
   276     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   277         const float sample = *src;
   278         if (sample > 1.0f) {
   279             *dst = 2147483647;
   280         } else if (sample < -1.0f) {
   281             *dst = -2147483647;
   282         } else {
   283             *dst = (Sint32)((double)sample * 2147483647.0);
   284         }
   285     }
   286 
   287     if (cvt->filters[++cvt->filter_index]) {
   288         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   289     }
   290 }
   291 #endif
   292 
   293 
   294 #if HAVE_SSE2_INTRINSICS
   295 static void SDLCALL
   296 SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   297 {
   298     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   299     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   300     int i;
   301 
   302     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
   303 
   304     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   305     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   306         *dst = ((float) *src) * DIVBY128;
   307     }
   308 
   309     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
   310     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   311 
   312     /* Make sure src is aligned too. */
   313     if ((((size_t) src) & 15) == 0) {
   314         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   315         const __m128i *mmsrc = (const __m128i *) src;
   316         const __m128i zero = _mm_setzero_si128();
   317         const __m128 divby128 = _mm_set1_ps(DIVBY128);
   318         while (i >= 16) {   /* 16 * 8-bit */
   319             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
   320             /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
   321             const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
   322             /* right-shift-sign-extend gets us sint16 with the other set of values. */
   323             const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
   324             /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
   325             const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
   326             const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
   327             const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
   328             const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
   329             /* Interleave back into correct order, store. */
   330             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
   331             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
   332             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
   333             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
   334             i -= 16; mmsrc--; dst -= 16;
   335         }
   336 
   337         src = (const Sint8 *) mmsrc;
   338     }
   339 
   340     src += 15; dst += 15;  /* adjust for any scalar finishing. */
   341 
   342     /* Finish off any leftovers with scalar operations. */
   343     while (i) {
   344         *dst = ((float) *src) * DIVBY128;
   345         i--; src--; dst--;
   346     }
   347 
   348     cvt->len_cvt *= 4;
   349     if (cvt->filters[++cvt->filter_index]) {
   350         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   351     }
   352 }
   353 
   354 static void SDLCALL
   355 SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   356 {
   357     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   358     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   359     int i;
   360 
   361     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
   362 
   363     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   364     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   365         *dst = (((float) *src) * DIVBY128) - 1.0f;
   366     }
   367 
   368     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
   369     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   370 
   371     /* Make sure src is aligned too. */
   372     if ((((size_t) src) & 15) == 0) {
   373         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   374         const __m128i *mmsrc = (const __m128i *) src;
   375         const __m128i zero = _mm_setzero_si128();
   376         const __m128 divby128 = _mm_set1_ps(DIVBY128);
   377         const __m128 minus1 = _mm_set1_ps(-1.0f);
   378         while (i >= 16) {   /* 16 * 8-bit */
   379             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
   380             /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
   381             const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
   382             /* right-shift-zero-extend gets us uint16 with the other set of values. */
   383             const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
   384             /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
   385             /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
   386             const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
   387             const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
   388             const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
   389             const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
   390             /* Interleave back into correct order, store. */
   391             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
   392             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
   393             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
   394             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
   395             i -= 16; mmsrc--; dst -= 16;
   396         }
   397 
   398         src = (const Uint8 *) mmsrc;
   399     }
   400 
   401     src += 15; dst += 15;  /* adjust for any scalar finishing. */
   402 
   403     /* Finish off any leftovers with scalar operations. */
   404     while (i) {
   405         *dst = (((float) *src) * DIVBY128) - 1.0f;
   406         i--; src--; dst--;
   407     }
   408 
   409     cvt->len_cvt *= 4;
   410     if (cvt->filters[++cvt->filter_index]) {
   411         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   412     }
   413 }
   414 
   415 static void SDLCALL
   416 SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   417 {
   418     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   419     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   420     int i;
   421 
   422     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
   423 
   424     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   425     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   426         *dst = ((float) *src) * DIVBY32768;
   427     }
   428 
   429     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   430     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   431 
   432     /* Make sure src is aligned too. */
   433     if ((((size_t) src) & 15) == 0) {
   434         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   435         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
   436         while (i >= 8) {   /* 8 * 16-bit */
   437             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   438             /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
   439             const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
   440             /* right-shift-sign-extend gets us sint32 with the other set of values. */
   441             const __m128i b = _mm_srai_epi32(ints, 16);
   442             /* Interleave these back into the right order, convert to float, multiply, store. */
   443             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
   444             _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
   445             i -= 8; src -= 8; dst -= 8;
   446         }
   447     }
   448 
   449     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   450 
   451     /* Finish off any leftovers with scalar operations. */
   452     while (i) {
   453         *dst = ((float) *src) * DIVBY32768;
   454         i--; src--; dst--;
   455     }
   456 
   457     cvt->len_cvt *= 2;
   458     if (cvt->filters[++cvt->filter_index]) {
   459         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   460     }
   461 }
   462 
   463 static void SDLCALL
   464 SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   465 {
   466     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   467     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   468     int i;
   469 
   470     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
   471 
   472     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   473     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   474         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   475     }
   476 
   477     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   478     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   479 
   480     /* Make sure src is aligned too. */
   481     if ((((size_t) src) & 15) == 0) {
   482         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   483         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
   484         const __m128 minus1 = _mm_set1_ps(1.0f);
   485         while (i >= 8) {   /* 8 * 16-bit */
   486             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   487             /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
   488             const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
   489             /* right-shift-sign-extend gets us sint32 with the other set of values. */
   490             const __m128i b = _mm_srli_epi32(ints, 16);
   491             /* Interleave these back into the right order, convert to float, multiply, store. */
   492             _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
   493             _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
   494             i -= 8; src -= 8; dst -= 8;
   495         }
   496     }
   497 
   498     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   499 
   500     /* Finish off any leftovers with scalar operations. */
   501     while (i) {
   502         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   503         i--; src--; dst--;
   504     }
   505 
   506     cvt->len_cvt *= 2;
   507     if (cvt->filters[++cvt->filter_index]) {
   508         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   509     }
   510 }
   511 
   512 static void SDLCALL
   513 SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   514 {
   515     const Sint32 *src = (const Sint32 *) cvt->buf;
   516     float *dst = (float *) cvt->buf;
   517     int i;
   518 
   519     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
   520 
   521     /* Get dst aligned to 16 bytes */
   522     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   523         *dst = (float) (((double) *src) * DIVBY2147483648);
   524     }
   525 
   526     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   527     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   528 
   529     {
   530         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   531         const __m128d divby2147483648 = _mm_set1_pd(DIVBY2147483648);
   532         const __m128i *mmsrc = (const __m128i *) src;
   533         while (i >= 4) {   /* 4 * sint32 */
   534             const __m128i ints = _mm_load_si128(mmsrc);
   535             /* bitshift the whole register over, so _mm_cvtepi32_pd can read the top ints in the bottom of the vector. */
   536             const __m128d doubles1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(ints, 8)), divby2147483648);
   537             const __m128d doubles2 = _mm_mul_pd(_mm_cvtepi32_pd(ints), divby2147483648);
   538             /* convert to float32, bitshift/or to get these into a vector to store. */
   539             _mm_store_ps(dst, _mm_castsi128_ps(_mm_or_si128(_mm_slli_si128(_mm_castps_si128(_mm_cvtpd_ps(doubles1)), 8), _mm_castps_si128(_mm_cvtpd_ps(doubles2)))));
   540             i -= 4; mmsrc++; dst += 4;
   541         }
   542         src = (const Sint32 *) mmsrc;
   543     }
   544 
   545     /* Finish off any leftovers with scalar operations. */
   546     while (i) {
   547         *dst = (float) (((double) *src) * DIVBY2147483648);
   548         i--; src++; dst++;
   549     }
   550 
   551     if (cvt->filters[++cvt->filter_index]) {
   552         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   553     }
   554 }
   555 
   556 static void SDLCALL
   557 SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   558 {
   559     const float *src = (const float *) cvt->buf;
   560     Sint8 *dst = (Sint8 *) cvt->buf;
   561     int i;
   562 
   563     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
   564 
   565     /* Get dst aligned to 16 bytes */
   566     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   567         *dst = (Sint8) (*src * 127.0f);
   568     }
   569 
   570     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   571 
   572     /* Make sure src is aligned too. */
   573     if ((((size_t) src) & 15) == 0) {
   574         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   575         const __m128 mulby127 = _mm_set1_ps(127.0f);
   576         __m128i *mmdst = (__m128i *) dst;
   577         while (i >= 16) {   /* 16 * float32 */
   578             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby127));  /* load 4 floats, convert to sint32 */
   579             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby127));  /* load 4 floats, convert to sint32 */
   580             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+8), mulby127));  /* load 4 floats, convert to sint32 */
   581             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+12), mulby127));  /* load 4 floats, convert to sint32 */
   582             _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   583             i -= 16; src += 16; mmdst++;
   584         }
   585         dst = (Sint8 *) mmdst;
   586     }
   587 
   588     /* Finish off any leftovers with scalar operations. */
   589     while (i) {
   590         *dst = (Sint8) (*src * 127.0f);
   591         i--; src++; dst++;
   592     }
   593 
   594     cvt->len_cvt /= 4;
   595     if (cvt->filters[++cvt->filter_index]) {
   596         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   597     }
   598 }
   599 
   600 static void SDLCALL
   601 SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   602 {
   603     const float *src = (const float *) cvt->buf;
   604     Uint8 *dst = (Uint8 *) cvt->buf;
   605     int i;
   606 
   607     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
   608 
   609     /* Get dst aligned to 16 bytes */
   610     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   611         *dst = (Uint8) ((*src + 1.0f) * 127.0f);
   612     }
   613 
   614     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   615 
   616     /* Make sure src is aligned too. */
   617     if ((((size_t) src) & 15) == 0) {
   618         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   619         const __m128 add1 = _mm_set1_ps(1.0f);
   620         const __m128 mulby127 = _mm_set1_ps(127.0f);
   621         __m128i *mmdst = (__m128i *) dst;
   622         while (i >= 16) {   /* 16 * float32 */
   623             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src), add1), mulby127));  /* load 4 floats, convert to sint32 */
   624             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+4), add1), mulby127));  /* load 4 floats, convert to sint32 */
   625             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+8), add1), mulby127));  /* load 4 floats, convert to sint32 */
   626             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+12), add1), mulby127));  /* load 4 floats, convert to sint32 */
   627             _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   628             i -= 16; src += 16; mmdst++;
   629         }
   630         dst = (Uint8 *) mmdst;
   631     }
   632 
   633     /* Finish off any leftovers with scalar operations. */
   634     while (i) {
   635         *dst = (Uint8) ((*src + 1.0f) * 127.0f);
   636         i--; src++; dst++;
   637     }
   638 
   639     cvt->len_cvt /= 4;
   640     if (cvt->filters[++cvt->filter_index]) {
   641         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   642     }
   643 }
   644 
   645 static void SDLCALL
   646 SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   647 {
   648     const float *src = (const float *) cvt->buf;
   649     Sint16 *dst = (Sint16 *) cvt->buf;
   650     int i;
   651 
   652     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
   653 
   654     /* Get dst aligned to 16 bytes */
   655     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   656         *dst = (Sint16) (*src * 32767.0f);
   657     }
   658 
   659     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   660 
   661     /* Make sure src is aligned too. */
   662     if ((((size_t) src) & 15) == 0) {
   663         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   664         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   665         __m128i *mmdst = (__m128i *) dst;
   666         while (i >= 8) {   /* 8 * float32 */
   667             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767));  /* load 4 floats, convert to sint32 */
   668             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767));  /* load 4 floats, convert to sint32 */
   669             _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
   670             i -= 8; src += 8; mmdst++;
   671         }
   672         dst = (Sint16 *) mmdst;
   673     }
   674 
   675     /* Finish off any leftovers with scalar operations. */
   676     while (i) {
   677         *dst = (Sint16) (*src * 32767.0f);
   678         i--; src++; dst++;
   679     }
   680 
   681     cvt->len_cvt /= 2;
   682     if (cvt->filters[++cvt->filter_index]) {
   683         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   684     }
   685 }
   686 
   687 static void SDLCALL
   688 SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   689 {
   690     const float *src = (const float *) cvt->buf;
   691     Uint16 *dst = (Uint16 *) cvt->buf;
   692     int i;
   693 
   694     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
   695 
   696     /* Get dst aligned to 16 bytes */
   697     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   698         *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
   699     }
   700 
   701     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   702 
   703     /* Make sure src is aligned too. */
   704     if ((((size_t) src) & 15) == 0) {
   705         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   706         /* This calculates differently than the scalar path because SSE2 can't
   707            pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
   708            saturation, so that would corrupt our data. _mm_packus_epi32 exists,
   709            but not before SSE 4.1. So we convert from float to sint16, packing
   710            that down with legit signed saturation, and then xor the top bit
   711            against 1. This results in the correct unsigned 16-bit value, even
   712            though it looks like dark magic. */
   713         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   714         const __m128i topbit = _mm_set1_epi16(-32768);
   715         __m128i *mmdst = (__m128i *) dst;
   716         while (i >= 8) {   /* 8 * float32 */
   717             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767));  /* load 4 floats, convert to sint32 */
   718             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767));  /* load 4 floats, convert to sint32 */
   719             _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
   720             i -= 8; src += 8; mmdst++;
   721         }
   722         dst = (Uint16 *) mmdst;
   723     }
   724 
   725     /* Finish off any leftovers with scalar operations. */
   726     while (i) {
   727         *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
   728         i--; src++; dst++;
   729     }
   730 
   731     cvt->len_cvt /= 2;
   732     if (cvt->filters[++cvt->filter_index]) {
   733         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   734     }
   735 }
   736 
   737 static void SDLCALL
   738 SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   739 {
   740     const float *src = (const float *) cvt->buf;
   741     Sint32 *dst = (Sint32 *) cvt->buf;
   742     int i;
   743 
   744     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
   745 
   746     /* Get dst aligned to 16 bytes */
   747     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   748         *dst = (Sint32) (((double) *src) * 2147483647.0);
   749     }
   750 
   751     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   752     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   753 
   754     {
   755         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   756         const __m128d mulby2147483647 = _mm_set1_pd(2147483647.0);
   757         __m128i *mmdst = (__m128i *) dst;
   758         while (i >= 4) {   /* 4 * float32 */
   759             const __m128 floats = _mm_load_ps(src);
   760             /* bitshift the whole register over, so _mm_cvtps_pd can read the top floats in the bottom of the vector. */
   761             const __m128d doubles1 = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(floats), 8))), mulby2147483647);
   762             const __m128d doubles2 = _mm_mul_pd(_mm_cvtps_pd(floats), mulby2147483647);
   763             _mm_store_si128(mmdst, _mm_or_si128(_mm_slli_si128(_mm_cvtpd_epi32(doubles1), 8), _mm_cvtpd_epi32(doubles2)));
   764             i -= 4; src += 4; mmdst++;
   765         }
   766         dst = (Sint32 *) mmdst;
   767     }
   768 
   769     /* Finish off any leftovers with scalar operations. */
   770     while (i) {
   771         *dst = (Sint32) (((double) *src) * 2147483647.0);
   772         i--; src++; dst++;
   773     }
   774 
   775     if (cvt->filters[++cvt->filter_index]) {
   776         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   777     }
   778 }
   779 #endif
   780 
   781 
   782 void SDL_ChooseAudioConverters(void)
   783 {
   784     static SDL_bool converters_chosen = SDL_FALSE;
   785 
   786     if (converters_chosen) {
   787         return;
   788     }
   789 
   790 #define SET_CONVERTER_FUNCS(fntype) \
   791         SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
   792         SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
   793         SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
   794         SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
   795         SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
   796         SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
   797         SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
   798         SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
   799         SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
   800         SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
   801         converters_chosen = SDL_TRUE
   802 
   803 #if HAVE_SSE2_INTRINSICS
   804     if (SDL_HasSSE2()) {
   805         SET_CONVERTER_FUNCS(SSE2);
   806         return;
   807     }
   808 #endif
   809 
   810 #if NEED_SCALAR_CONVERTER_FALLBACKS
   811     SET_CONVERTER_FUNCS(Scalar);
   812 #endif
   813 
   814 #undef SET_CONVERTER_FUNCS
   815 
   816     SDL_assert(converters_chosen == SDL_TRUE);
   817 }
   818 
   819 /* vi: set ts=4 sw=4 expandtab: */