src/audio/SDL_audiotypecvt.c
author Ryan C. Gordon <icculus@icculus.org>
Mon, 16 Jan 2017 00:58:28 -0500
changeset 10815 71bbe3233508
parent 10814 938218064f67
child 10835 0e9e7a128391
permissions -rw-r--r--
audio: Implemented SIMD support for audio data type converters.

This currently adds an SSE2 implementation (but it's #ifdef'd out for now,
until it's hooked up to the configure script and such).
     1 /*
     2   Simple DirectMedia Layer
     3   Copyright (C) 1997-2017 Sam Lantinga <slouken@libsdl.org>
     4 
     5   This software is provided 'as-is', without any express or implied
     6   warranty.  In no event will the authors be held liable for any damages
     7   arising from the use of this software.
     8 
     9   Permission is granted to anyone to use this software for any purpose,
    10   including commercial applications, and to alter it and redistribute it
    11   freely, subject to the following restrictions:
    12 
    13   1. The origin of this software must not be misrepresented; you must not
    14      claim that you wrote the original software. If you use this software
    15      in a product, an acknowledgment in the product documentation would be
    16      appreciated but is not required.
    17   2. Altered source versions must be plainly marked as such, and must not be
    18      misrepresented as being the original software.
    19   3. This notice may not be removed or altered from any source distribution.
    20 */
    21 
    22 #include "../SDL_internal.h"
    23 #include "SDL_audio.h"
    24 #include "SDL_audio_c.h"
    25 #include "SDL_cpuinfo.h"
    26 #include "SDL_assert.h"
    27 
    /* Compile-time switches for the SIMD sample-format converters. */

    /* !!! FIXME: write NEON code. */
    #define HAVE_NEON_INTRINSICS 0

    /* !!! FIXME: wire this up to the configure script, etc. */
    #define HAVE_SSE2_INTRINSICS 0

    #if HAVE_SSE2_INTRINSICS
    #include <emmintrin.h>
    #endif

    /*
     * NEED_SCALAR_CONVERTER_FALLBACKS is forced to zero when the platform is
     * guaranteed to use a SIMD codepath here:
     *   - x86_64 guarantees SSE2.
     *   - Mac OS X/Intel guarantees SSE2.
     *   - ARMv8+ promise NEON.
     *   - All Apple ARMv7 chips promise NEON support.
     * Each guarantee only counts when the matching HAVE_*_INTRINSICS switch
     * above is enabled.
     */
    #if (HAVE_SSE2_INTRINSICS && (defined(__x86_64__) || __MACOSX__)) || \
        (HAVE_NEON_INTRINSICS && defined(__ARM_ARCH) && \
         ((__ARM_ARCH >= 8) || (defined(__APPLE__) && (__ARM_ARCH >= 7))))
    #define NEED_SCALAR_CONVERTER_FALLBACKS 0
    #endif

    /* Nothing decided above (or on the command line): keep the scalar code. */
    #ifndef NEED_SCALAR_CONVERTER_FALLBACKS
    #define NEED_SCALAR_CONVERTER_FALLBACKS 1
    #endif
    52 
    53 /* Function pointers set to a CPU-specific implementation. */
    54 SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
    55 SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
    56 SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
    57 SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
    58 SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
    59 SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
    60 SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
    61 SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
    62 SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
    63 SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
    64 
    65 
    /* Reciprocals used to scale integer samples when converting to float32. */
    #define DIVBY127         0.0078740157480315f    /* 1 / 127        */
    #define DIVBY32767       3.05185094759972e-05f  /* 1 / 32767      */
    #define DIVBY2147483647  4.6566128752458e-10f   /* 1 / 2147483647 */
    69 
    70 
    71 #if NEED_SCALAR_CONVERTER_FALLBACKS
    72 static void SDLCALL
    73 SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    74 {
    75     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    76     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    77     int i;
    78 
    79     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
    80 
    81     for (i = cvt->len_cvt; i; --i, --src, --dst) {
    82         *dst = (((float) *src) * DIVBY127);
    83     }
    84 
    85     cvt->len_cvt *= 4;
    86     if (cvt->filters[++cvt->filter_index]) {
    87         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    88     }
    89 }
    90 
    91 static void SDLCALL
    92 SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    93 {
    94     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    95     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    96     int i;
    97 
    98     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
    99 
   100     for (i = cvt->len_cvt; i; --i, --src, --dst) {
   101         *dst = ((((float) *src) * DIVBY127) - 1.0f);
   102     }
   103 
   104     cvt->len_cvt *= 4;
   105     if (cvt->filters[++cvt->filter_index]) {
   106         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   107     }
   108 }
   109 
   110 static void SDLCALL
   111 SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   112 {
   113     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   114     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   115     int i;
   116 
   117     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
   118 
   119     for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
   120         *dst = (((float) *src) * DIVBY32767);
   121     }
   122 
   123     cvt->len_cvt *= 2;
   124     if (cvt->filters[++cvt->filter_index]) {
   125         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   126     }
   127 }
   128 
   129 static void SDLCALL
   130 SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   131 {
   132     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   133     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   134     int i;
   135 
   136     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
   137 
   138     for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
   139         *dst = ((((float) *src) * DIVBY32767) - 1.0f);
   140     }
   141 
   142     cvt->len_cvt *= 2;
   143     if (cvt->filters[++cvt->filter_index]) {
   144         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   145     }
   146 }
   147 
   148 static void SDLCALL
   149 SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   150 {
   151     const Sint32 *src = (const Sint32 *) cvt->buf;
   152     float *dst = (float *) cvt->buf;
   153     int i;
   154 
   155     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
   156 
   157     for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
   158         *dst = (float) (((double) *src) * DIVBY2147483647);
   159     }
   160 
   161     if (cvt->filters[++cvt->filter_index]) {
   162         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   163     }
   164 }
   165 
   166 static void SDLCALL
   167 SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   168 {
   169     const float *src = (const float *) cvt->buf;
   170     Sint8 *dst = (Sint8 *) cvt->buf;
   171     int i;
   172 
   173     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
   174 
   175     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   176         *dst = (Sint8) (*src * 127.0f);
   177     }
   178 
   179     cvt->len_cvt /= 4;
   180     if (cvt->filters[++cvt->filter_index]) {
   181         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   182     }
   183 }
   184 
   185 static void SDLCALL
   186 SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   187 {
   188     const float *src = (const float *) cvt->buf;
   189     Uint8 *dst = (Uint8 *) cvt->buf;
   190     int i;
   191 
   192     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
   193 
   194     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   195         *dst = (Uint8) ((*src + 1.0f) * 127.0f);
   196     }
   197 
   198     cvt->len_cvt /= 4;
   199     if (cvt->filters[++cvt->filter_index]) {
   200         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   201     }
   202 }
   203 
   204 static void SDLCALL
   205 SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   206 {
   207     const float *src = (const float *) cvt->buf;
   208     Sint16 *dst = (Sint16 *) cvt->buf;
   209     int i;
   210 
   211     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
   212 
   213     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   214         *dst = (Sint16) (*src * 32767.0f);
   215     }
   216 
   217     cvt->len_cvt /= 2;
   218     if (cvt->filters[++cvt->filter_index]) {
   219         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   220     }
   221 }
   222 
   223 static void SDLCALL
   224 SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   225 {
   226     const float *src = (const float *) cvt->buf;
   227     Uint16 *dst = (Uint16 *) cvt->buf;
   228     int i;
   229 
   230     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
   231 
   232     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   233         *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
   234     }
   235 
   236     cvt->len_cvt /= 2;
   237     if (cvt->filters[++cvt->filter_index]) {
   238         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   239     }
   240 }
   241 
   242 static void SDLCALL
   243 SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   244 {
   245     const float *src = (const float *) cvt->buf;
   246     Sint32 *dst = (Sint32 *) cvt->buf;
   247     int i;
   248 
   249     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
   250 
   251     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   252         *dst = (Sint32) (((double) *src) * 2147483647.0);
   253     }
   254 
   255     if (cvt->filters[++cvt->filter_index]) {
   256         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   257     }
   258 }
   259 #endif
   260 
   261 
   262 #if HAVE_SSE2_INTRINSICS
   263 static void SDLCALL
   264 SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   265 {
   266     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   267     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   268     int i;
   269 
   270     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
   271 
   272     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   273     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   274         *dst = (((float) *src) * DIVBY127);
   275     }
   276 
   277     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
   278     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   279 
   280     /* Make sure src is aligned too. */
   281     if ((((size_t) src) & 15) == 0) {
   282         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   283         const __m128i *mmsrc = (const __m128i *) src;
   284         const __m128i zero = _mm_setzero_si128();
   285         const __m128 divby127 = _mm_set1_ps(DIVBY127);
   286         while (i >= 16) {   /* 16 * 8-bit */
   287             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
   288             /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
   289             const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
   290             /* right-shift-sign-extend gets us sint16 with the other set of values. */
   291             const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
   292             /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
   293             const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby127);
   294             const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby127);
   295             const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby127);
   296             const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby127);
   297             /* Interleave back into correct order, store. */
   298             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
   299             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
   300             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
   301             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
   302             i -= 16; mmsrc--; dst -= 16;
   303         }
   304 
   305         src = (const Sint8 *) mmsrc;
   306     }
   307 
   308     src += 15; dst += 15;  /* adjust for any scalar finishing. */
   309 
   310     /* Finish off any leftovers with scalar operations. */
   311     while (i) {
   312         *dst = (((float) *src) * DIVBY127);
   313         i--; src--; dst--;
   314     }
   315 
   316     cvt->len_cvt *= 4;
   317     if (cvt->filters[++cvt->filter_index]) {
   318         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   319     }
   320 }
   321 
   322 static void SDLCALL
   323 SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   324 {
   325     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   326     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   327     int i;
   328 
   329     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
   330 
   331     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   332     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   333         *dst = ((((float) *src) * DIVBY127) - 1.0f);
   334     }
   335 
   336     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
   337     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   338 
   339     /* Make sure src is aligned too. */
   340     if ((((size_t) src) & 15) == 0) {
   341         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   342         const __m128i *mmsrc = (const __m128i *) src;
   343         const __m128i zero = _mm_setzero_si128();
   344         const __m128 divby127 = _mm_set1_ps(DIVBY127);
   345         const __m128 minus1 = _mm_set1_ps(-1.0f);
   346         while (i >= 16) {   /* 16 * 8-bit */
   347             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
   348             /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
   349             const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
   350             /* right-shift-zero-extend gets us uint16 with the other set of values. */
   351             const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
   352             /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
   353             /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
   354             const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby127), minus1);
   355             const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby127), minus1);
   356             const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby127), minus1);
   357             const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby127), minus1);
   358             /* Interleave back into correct order, store. */
   359             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
   360             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
   361             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
   362             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
   363             i -= 16; mmsrc--; dst -= 16;
   364         }
   365 
   366         src = (const Uint8 *) mmsrc;
   367     }
   368 
   369     src += 15; dst += 15;  /* adjust for any scalar finishing. */
   370 
   371     /* Finish off any leftovers with scalar operations. */
   372     while (i) {
   373         *dst = ((((float) *src) * DIVBY127) - 1.0f);
   374         i--; src--; dst--;
   375     }
   376 
   377     cvt->len_cvt *= 4;
   378     if (cvt->filters[++cvt->filter_index]) {
   379         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   380     }
   381 }
   382 
   383 static void SDLCALL
   384 SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   385 {
   386     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   387     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   388     int i;
   389 
   390     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
   391 
   392     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   393     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   394         *dst = (((float) *src) * DIVBY32767);
   395     }
   396 
   397     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   398     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   399 
   400     /* Make sure src is aligned too. */
   401     if ((((size_t) src) & 15) == 0) {
   402         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   403         const __m128 divby32767 = _mm_set1_ps(DIVBY32767);
   404         while (i >= 8) {   /* 8 * 16-bit */
   405             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   406             /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
   407             const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
   408             /* right-shift-sign-extend gets us sint32 with the other set of values. */
   409             const __m128i b = _mm_srai_epi32(ints, 16);
   410             /* Interleave these back into the right order, convert to float, multiply, store. */
   411             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32767));
   412             _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32767));
   413             i -= 8; src -= 8; dst -= 8;
   414         }
   415     }
   416 
   417     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   418 
   419     /* Finish off any leftovers with scalar operations. */
   420     while (i) {
   421         *dst = (((float) *src) * DIVBY32767);
   422         i--; src--; dst--;
   423     }
   424 
   425     cvt->len_cvt *= 2;
   426     if (cvt->filters[++cvt->filter_index]) {
   427         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   428     }
   429 }
   430 
   431 static void SDLCALL
   432 SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   433 {
   434     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   435     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   436     int i;
   437 
   438     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
   439 
   440     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   441     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   442         *dst = ((((float) *src) * DIVBY32767) - 1.0f);
   443     }
   444 
   445     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   446     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   447 
   448     /* Make sure src is aligned too. */
   449     if ((((size_t) src) & 15) == 0) {
   450         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   451         const __m128 divby32767 = _mm_set1_ps(DIVBY32767);
   452         const __m128 minus1 = _mm_set1_ps(1.0f);
   453         while (i >= 8) {   /* 8 * 16-bit */
   454             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   455             /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
   456             const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
   457             /* right-shift-sign-extend gets us sint32 with the other set of values. */
   458             const __m128i b = _mm_srli_epi32(ints, 16);
   459             /* Interleave these back into the right order, convert to float, multiply, store. */
   460             _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32767), minus1));
   461             _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32767), minus1));
   462             i -= 8; src -= 8; dst -= 8;
   463         }
   464     }
   465 
   466     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   467 
   468     /* Finish off any leftovers with scalar operations. */
   469     while (i) {
   470         *dst = ((((float) *src) * DIVBY32767) - 1.0f);
   471         i--; src--; dst--;
   472     }
   473 
   474     cvt->len_cvt *= 2;
   475     if (cvt->filters[++cvt->filter_index]) {
   476         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   477     }
   478 }
   479 
   480 static void SDLCALL
   481 SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   482 {
   483     const Sint32 *src = (const Sint32 *) cvt->buf;
   484     float *dst = (float *) cvt->buf;
   485     int i;
   486 
   487     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
   488 
   489     /* Get dst aligned to 16 bytes */
   490     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   491         *dst = (float) (((double) *src) * DIVBY2147483647);
   492     }
   493 
   494     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   495     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   496 
   497     {
   498         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   499         const __m128d divby2147483647 = _mm_set1_pd(DIVBY2147483647);
   500         const __m128i *mmsrc = (const __m128i *) src;
   501         while (i >= 4) {   /* 4 * sint32 */
   502             const __m128i ints = _mm_load_si128(mmsrc);
   503             /* bitshift the whole register over, so _mm_cvtepi32_pd can read the top ints in the bottom of the vector. */
   504             const __m128d doubles1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_bsrli_si128(ints, 8)), divby2147483647);
   505             const __m128d doubles2 = _mm_mul_pd(_mm_cvtepi32_pd(ints), divby2147483647);
   506             /* convert to float32, bitshift/or to get these into a vector to store. */
   507             _mm_store_ps(dst, _mm_castsi128_ps(_mm_or_si128(_mm_bslli_si128(_mm_castps_si128(_mm_cvtpd_ps(doubles1)), 8), _mm_castps_si128(_mm_cvtpd_ps(doubles2)))));
   508             i -= 4; mmsrc++; dst += 4;
   509         }
   510         src = (const Sint32 *) mmsrc;
   511     }
   512 
   513     /* Finish off any leftovers with scalar operations. */
   514     while (i) {
   515         *dst = (float) (((double) *src) * DIVBY2147483647);
   516         i--; src++; dst++;
   517     }
   518 
   519     if (cvt->filters[++cvt->filter_index]) {
   520         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   521     }
   522 }
   523 
   524 static void SDLCALL
   525 SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   526 {
   527     const float *src = (const float *) cvt->buf;
   528     Sint8 *dst = (Sint8 *) cvt->buf;
   529     int i;
   530 
   531     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
   532 
   533     /* Get dst aligned to 16 bytes */
   534     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   535         *dst = (Sint8) (*src * 127.0f);
   536     }
   537 
   538     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   539 
   540     /* Make sure src is aligned too. */
   541     if ((((size_t) src) & 15) == 0) {
   542         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   543         const __m128 mulby127 = _mm_set1_ps(127.0f);
   544         __m128i *mmdst = (__m128i *) dst;
   545         while (i >= 16) {   /* 16 * float32 */
   546             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby127));  /* load 4 floats, convert to sint32 */
   547             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby127));  /* load 4 floats, convert to sint32 */
   548             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+8), mulby127));  /* load 4 floats, convert to sint32 */
   549             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+12), mulby127));  /* load 4 floats, convert to sint32 */
   550             _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   551             i -= 16; src += 16; mmdst++;
   552         }
   553         dst = (Sint8 *) mmdst;
   554     }
   555 
   556     /* Finish off any leftovers with scalar operations. */
   557     while (i) {
   558         *dst = (Sint8) (*src * 127.0f);
   559         i--; src++; dst++;
   560     }
   561 
   562     cvt->len_cvt /= 4;
   563     if (cvt->filters[++cvt->filter_index]) {
   564         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   565     }
   566 }
   567 
   568 static void SDLCALL
   569 SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   570 {
   571     const float *src = (const float *) cvt->buf;
   572     Uint8 *dst = (Uint8 *) cvt->buf;
   573     int i;
   574 
   575     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
   576 
   577     /* Get dst aligned to 16 bytes */
   578     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   579         *dst = (Uint8) ((*src + 1.0f) * 127.0f);
   580     }
   581 
   582     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   583 
   584     /* Make sure src is aligned too. */
   585     if ((((size_t) src) & 15) == 0) {
   586         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   587         const __m128 add1 = _mm_set1_ps(1.0f);
   588         const __m128 mulby127 = _mm_set1_ps(127.0f);
   589         __m128i *mmdst = (__m128i *) dst;
   590         while (i >= 16) {   /* 16 * float32 */
   591             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src), add1), mulby127));  /* load 4 floats, convert to sint32 */
   592             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+4), add1), mulby127));  /* load 4 floats, convert to sint32 */
   593             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+8), add1), mulby127));  /* load 4 floats, convert to sint32 */
   594             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+12), add1), mulby127));  /* load 4 floats, convert to sint32 */
   595             _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   596             i -= 16; src += 16; mmdst++;
   597         }
   598         dst = (Uint8 *) mmdst;
   599     }
   600 
   601     /* Finish off any leftovers with scalar operations. */
   602     while (i) {
   603         *dst = (Uint8) ((*src + 1.0f) * 127.0f);
   604         i--; src++; dst++;
   605     }
   606 
   607     cvt->len_cvt /= 4;
   608     if (cvt->filters[++cvt->filter_index]) {
   609         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   610     }
   611 }
   612 
   613 static void SDLCALL
   614 SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   615 {
   616     const float *src = (const float *) cvt->buf;
   617     Sint16 *dst = (Sint16 *) cvt->buf;
   618     int i;
   619 
   620     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
   621 
   622     /* Get dst aligned to 16 bytes */
   623     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   624         *dst = (Sint16) (*src * 32767.0f);
   625     }
   626 
   627     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   628 
   629     /* Make sure src is aligned too. */
   630     if ((((size_t) src) & 15) == 0) {
   631         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   632         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   633         __m128i *mmdst = (__m128i *) dst;
   634         while (i >= 8) {   /* 8 * float32 */
   635             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767));  /* load 4 floats, convert to sint32 */
   636             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767));  /* load 4 floats, convert to sint32 */
   637             _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
   638             i -= 8; src += 8; mmdst++;
   639         }
   640         dst = (Sint16 *) mmdst;
   641     }
   642 
   643     /* Finish off any leftovers with scalar operations. */
   644     while (i) {
   645         *dst = (((float) *src) * DIVBY32767);
   646         i--; src++; dst++;
   647     }
   648 
   649     cvt->len_cvt /= 2;
   650     if (cvt->filters[++cvt->filter_index]) {
   651         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   652     }
   653 }
   654 
   655 static void SDLCALL
   656 SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   657 {
   658     const float *src = (const float *) cvt->buf;
   659     Uint16 *dst = (Uint16 *) cvt->buf;
   660     int i;
   661 
   662     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
   663 
   664     /* Get dst aligned to 16 bytes */
   665     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   666         *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
   667     }
   668 
   669     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   670 
   671     /* Make sure src is aligned too. */
   672     if ((((size_t) src) & 15) == 0) {
   673         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   674         /* This calculates differently than the scalar path because SSE2 can't
   675            pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
   676            saturation, so that would corrupt our data. _mm_packus_epi32 exists,
   677            but not before SSE 4.1. So we convert from float to sint16, packing
   678            that down with legit signed saturation, and then xor the top bit
   679            against 1. This results in the correct unsigned 16-bit value, even
   680            though it looks like dark magic. */
   681         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   682         const __m128i topbit = _mm_set1_epi16(-32768);
   683         __m128i *mmdst = (__m128i *) dst;
   684         while (i >= 8) {   /* 8 * float32 */
   685             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767));  /* load 4 floats, convert to sint32 */
   686             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767));  /* load 4 floats, convert to sint32 */
   687             _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
   688             i -= 8; src += 8; mmdst++;
   689         }
   690         dst = (Uint16 *) mmdst;
   691     }
   692 
   693     /* Finish off any leftovers with scalar operations. */
   694     while (i) {
   695         *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
   696         i--; src++; dst++;
   697     }
   698 
   699     cvt->len_cvt /= 2;
   700     if (cvt->filters[++cvt->filter_index]) {
   701         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   702     }
   703 }
   704 
   705 static void SDLCALL
   706 SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   707 {
   708     const float *src = (const float *) cvt->buf;
   709     Sint32 *dst = (Sint32 *) cvt->buf;
   710     int i;
   711 
   712     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
   713 
   714     /* Get dst aligned to 16 bytes */
   715     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   716         *dst = (Sint32) (((double) *src) * 2147483647.0);
   717     }
   718 
   719     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   720     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   721 
   722     {
   723         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   724         const __m128d mulby2147483647 = _mm_set1_pd(2147483647.0);
   725         __m128i *mmdst = (__m128i *) dst;
   726         while (i >= 4) {   /* 4 * float32 */
   727             const __m128 floats = _mm_load_ps(src);
   728             /* bitshift the whole register over, so _mm_cvtps_pd can read the top floats in the bottom of the vector. */
   729             const __m128d doubles1 = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_bsrli_si128(_mm_castps_si128(floats), 8))), mulby2147483647);
   730             const __m128d doubles2 = _mm_mul_pd(_mm_cvtps_pd(floats), mulby2147483647);
   731             _mm_store_si128(mmdst, _mm_or_si128(_mm_bslli_si128(_mm_cvtpd_epi32(doubles1), 8), _mm_cvtpd_epi32(doubles2)));
   732             i -= 4; src += 4; mmdst++;
   733         }
   734         dst = (Sint32 *) mmdst;
   735     }
   736 
   737     /* Finish off any leftovers with scalar operations. */
   738     while (i) {
   739         *dst = (Sint32) (((double) *src) * 2147483647.0);
   740         i--; src++; dst++;
   741     }
   742 
   743     if (cvt->filters[++cvt->filter_index]) {
   744         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   745     }
   746 }
   747 #endif
   748 
   749 
   750 void SDL_ChooseAudioConverters(void)
   751 {
   752     static SDL_bool converters_chosen = SDL_FALSE;
   753 
   754     if (converters_chosen) {
   755         return;
   756     }
   757 
   758     #define SET_CONVERTER_FUNCS(fntype) \
   759         SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
   760         SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
   761         SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
   762         SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
   763         SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
   764         SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
   765         SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
   766         SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
   767         SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
   768         SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
   769         converters_chosen = SDL_TRUE
   770 
   771     #if HAVE_SSE2_INTRINSICS
   772     if (SDL_HasSSE2()) {
   773         SET_CONVERTER_FUNCS(SSE2);
   774         return;
   775     }
   776     #endif
   777 
   778     #if NEED_SCALAR_CONVERTER_FALLBACKS
   779     SET_CONVERTER_FUNCS(Scalar);
   780     #endif
   781 
   782     #undef SET_CONVERTER_FUNCS
   783 
   784     SDL_assert(converters_chosen == SDL_TRUE);
   785 }
   786 
   787 /* vi: set ts=4 sw=4 expandtab: */