src/audio/SDL_audiotypecvt.c
author Ryan C. Gordon <icculus@icculus.org>
Mon, 23 Jan 2017 01:05:44 -0500
changeset 10835 0e9e7a128391
parent 10815 71bbe3233508
child 10836 8f0aa225f261
permissions -rw-r--r--
audio: Wired up new SSE code to build system.
     1 /*
     2   Simple DirectMedia Layer
     3   Copyright (C) 1997-2017 Sam Lantinga <slouken@libsdl.org>
     4 
     5   This software is provided 'as-is', without any express or implied
     6   warranty.  In no event will the authors be held liable for any damages
     7   arising from the use of this software.
     8 
     9   Permission is granted to anyone to use this software for any purpose,
    10   including commercial applications, and to alter it and redistribute it
    11   freely, subject to the following restrictions:
    12 
    13   1. The origin of this software must not be misrepresented; you must not
    14      claim that you wrote the original software. If you use this software
    15      in a product, an acknowledgment in the product documentation would be
    16      appreciated but is not required.
    17   2. Altered source versions must be plainly marked as such, and must not be
    18      misrepresented as being the original software.
    19   3. This notice may not be removed or altered from any source distribution.
    20 */
    21 
    22 #include "../SDL_internal.h"
    23 #include "SDL_audio.h"
    24 #include "SDL_audio_c.h"
    25 #include "SDL_cpuinfo.h"
    26 #include "SDL_assert.h"
    27 
/* !!! FIXME: write NEON code. */
/* No NEON converters exist in this file yet, so this is hard-wired to 0. */
#define HAVE_NEON_INTRINSICS 0

/* The SSE2 converters are only compiled when the compiler itself targets
   SSE2 (__SSE2__ defined). When this macro is left undefined, it evaluates
   as 0 inside the #if/#elif expressions below. */
#ifdef __SSE2__
#define HAVE_SSE2_INTRINSICS 1
#endif

/* Work out whether the plain-C converters are needed at all: each branch
   below names a platform where a SIMD code path is promised to exist, in
   which case the scalar fallbacks are compiled out. */
#if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
#elif __MACOSX__ && HAVE_SSE2_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
#elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
#endif

/* Set to zero if platform is guaranteed to use a SIMD codepath here. */
/* Default: none of the branches above matched, so build the scalar code. */
#ifndef NEED_SCALAR_CONVERTER_FALLBACKS
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif
    49 
/* Function pointers set to a CPU-specific implementation. */
/* They all start out NULL here; the code that assigns them is not part of
   this section of the file. */
SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;


/* Reciprocals of 127, 32767 and 2147483647 (as float literals). The
   converters below multiply integer samples by these to scale them to
   float. */
#define DIVBY127 0.0078740157480315f
#define DIVBY32767 3.05185094759972e-05f
#define DIVBY2147483647 4.6566128752458e-10f
    66 
    67 
    68 #if NEED_SCALAR_CONVERTER_FALLBACKS
    69 static void SDLCALL
    70 SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    71 {
    72     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    73     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    74     int i;
    75 
    76     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
    77 
    78     for (i = cvt->len_cvt; i; --i, --src, --dst) {
    79         *dst = (((float) *src) * DIVBY127);
    80     }
    81 
    82     cvt->len_cvt *= 4;
    83     if (cvt->filters[++cvt->filter_index]) {
    84         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    85     }
    86 }
    87 
    88 static void SDLCALL
    89 SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    90 {
    91     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    92     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    93     int i;
    94 
    95     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
    96 
    97     for (i = cvt->len_cvt; i; --i, --src, --dst) {
    98         *dst = ((((float) *src) * DIVBY127) - 1.0f);
    99     }
   100 
   101     cvt->len_cvt *= 4;
   102     if (cvt->filters[++cvt->filter_index]) {
   103         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   104     }
   105 }
   106 
   107 static void SDLCALL
   108 SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   109 {
   110     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   111     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   112     int i;
   113 
   114     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
   115 
   116     for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
   117         *dst = (((float) *src) * DIVBY32767);
   118     }
   119 
   120     cvt->len_cvt *= 2;
   121     if (cvt->filters[++cvt->filter_index]) {
   122         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   123     }
   124 }
   125 
   126 static void SDLCALL
   127 SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   128 {
   129     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   130     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   131     int i;
   132 
   133     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
   134 
   135     for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
   136         *dst = ((((float) *src) * DIVBY32767) - 1.0f);
   137     }
   138 
   139     cvt->len_cvt *= 2;
   140     if (cvt->filters[++cvt->filter_index]) {
   141         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   142     }
   143 }
   144 
   145 static void SDLCALL
   146 SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   147 {
   148     const Sint32 *src = (const Sint32 *) cvt->buf;
   149     float *dst = (float *) cvt->buf;
   150     int i;
   151 
   152     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
   153 
   154     for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
   155         *dst = (float) (((double) *src) * DIVBY2147483647);
   156     }
   157 
   158     if (cvt->filters[++cvt->filter_index]) {
   159         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   160     }
   161 }
   162 
   163 static void SDLCALL
   164 SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   165 {
   166     const float *src = (const float *) cvt->buf;
   167     Sint8 *dst = (Sint8 *) cvt->buf;
   168     int i;
   169 
   170     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
   171 
   172     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   173         *dst = (Sint8) (*src * 127.0f);
   174     }
   175 
   176     cvt->len_cvt /= 4;
   177     if (cvt->filters[++cvt->filter_index]) {
   178         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   179     }
   180 }
   181 
   182 static void SDLCALL
   183 SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   184 {
   185     const float *src = (const float *) cvt->buf;
   186     Uint8 *dst = (Uint8 *) cvt->buf;
   187     int i;
   188 
   189     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
   190 
   191     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   192         *dst = (Uint8) ((*src + 1.0f) * 127.0f);
   193     }
   194 
   195     cvt->len_cvt /= 4;
   196     if (cvt->filters[++cvt->filter_index]) {
   197         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   198     }
   199 }
   200 
   201 static void SDLCALL
   202 SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   203 {
   204     const float *src = (const float *) cvt->buf;
   205     Sint16 *dst = (Sint16 *) cvt->buf;
   206     int i;
   207 
   208     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
   209 
   210     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   211         *dst = (Sint16) (*src * 32767.0f);
   212     }
   213 
   214     cvt->len_cvt /= 2;
   215     if (cvt->filters[++cvt->filter_index]) {
   216         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   217     }
   218 }
   219 
   220 static void SDLCALL
   221 SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   222 {
   223     const float *src = (const float *) cvt->buf;
   224     Uint16 *dst = (Uint16 *) cvt->buf;
   225     int i;
   226 
   227     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
   228 
   229     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   230         *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
   231     }
   232 
   233     cvt->len_cvt /= 2;
   234     if (cvt->filters[++cvt->filter_index]) {
   235         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   236     }
   237 }
   238 
   239 static void SDLCALL
   240 SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   241 {
   242     const float *src = (const float *) cvt->buf;
   243     Sint32 *dst = (Sint32 *) cvt->buf;
   244     int i;
   245 
   246     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
   247 
   248     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   249         *dst = (Sint32) (((double) *src) * 2147483647.0);
   250     }
   251 
   252     if (cvt->filters[++cvt->filter_index]) {
   253         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   254     }
   255 }
   256 #endif
   257 
   258 
   259 #if HAVE_SSE2_INTRINSICS
/*
 * SSE2 version of the signed 8-bit to float32 converter. Works in place on
 * cvt->buf, walking from the end of the buffer toward the start. A scalar
 * loop first brings the 16-float output block to a 16-byte boundary; then,
 * only if src turns out to be 16-byte aligned too, blocks of 16 samples are
 * converted with SSE2; whatever is left is finished by a scalar loop. Each
 * sample is multiplied by DIVBY127. On return cvt->len_cvt has been
 * multiplied by 4 and the next entry in cvt->filters, if non-NULL, has been
 * called with AUDIO_F32SYS. The "format" parameter is not read.
 */
static void SDLCALL
SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    /* Both pointers start on the LAST sample of their respective regions. */
    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");

    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    /* The test is on dst-15 because that is where a 16-float block ending at dst would begin. */
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
        *dst = (((float) *src) * DIVBY127);
    }

    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        const __m128i *mmsrc = (const __m128i *) src;
        const __m128i zero = _mm_setzero_si128();
        const __m128 divby127 = _mm_set1_ps(DIVBY127);
        while (i >= 16) {   /* 16 * 8-bit */
            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
            /* treat as int16, shift left to clear the upper byte of each lane, then back right with sign-extend. Now sint16 (even-numbered bytes). */
            const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
            /* right-shift-sign-extend gets us sint16 with the other set of values (odd-numbered bytes). */
            const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
            /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
            const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby127);
            const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby127);
            const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby127);
            const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby127);
            /* Interleave back into correct order, store. */
            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
            /* step one 16-sample block toward the start of the buffer. */
            i -= 16; mmsrc--; dst -= 16;
        }

        src = (const Sint8 *) mmsrc;
    }

    src += 15; dst += 15;  /* adjust for any scalar finishing. */

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = (((float) *src) * DIVBY127);
        i--; src--; dst--;
    }

    cvt->len_cvt *= 4;
    /* Advance to the next filter in the chain and run it, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
   318 
/*
 * SSE2 version of the unsigned 8-bit to float32 converter. Works in place on
 * cvt->buf, walking from the end of the buffer toward the start. A scalar
 * loop first brings the 16-float output block to a 16-byte boundary; then,
 * only if src turns out to be 16-byte aligned too, blocks of 16 samples are
 * converted with SSE2; whatever is left is finished by a scalar loop. Each
 * output is (sample * DIVBY127) - 1.0f. On return cvt->len_cvt has been
 * multiplied by 4 and the next entry in cvt->filters, if non-NULL, has been
 * called with AUDIO_F32SYS. The "format" parameter is not read.
 */
static void SDLCALL
SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    /* Both pointers start on the LAST sample of their respective regions. */
    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");

    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    /* The test is on dst-15 because that is where a 16-float block ending at dst would begin. */
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
        *dst = ((((float) *src) * DIVBY127) - 1.0f);
    }

    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        const __m128i *mmsrc = (const __m128i *) src;
        const __m128i zero = _mm_setzero_si128();
        const __m128 divby127 = _mm_set1_ps(DIVBY127);
        const __m128 minus1 = _mm_set1_ps(-1.0f);
        while (i >= 16) {   /* 16 * 8-bit */
            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
            /* treat as int16, shift left to clear the upper byte of each lane, then back right with zero-extend. Now uint16 (even-numbered bytes). */
            const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
            /* right-shift-zero-extend gets us uint16 with the other set of values (odd-numbered bytes). */
            const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
            /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
            /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
            const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby127), minus1);
            const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby127), minus1);
            const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby127), minus1);
            const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby127), minus1);
            /* Interleave back into correct order, store. */
            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
            /* step one 16-sample block toward the start of the buffer. */
            i -= 16; mmsrc--; dst -= 16;
        }

        src = (const Uint8 *) mmsrc;
    }

    src += 15; dst += 15;  /* adjust for any scalar finishing. */

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = ((((float) *src) * DIVBY127) - 1.0f);
        i--; src--; dst--;
    }

    cvt->len_cvt *= 4;
    /* Advance to the next filter in the chain and run it, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
   379 
/*
 * SSE2 version of the signed 16-bit to float32 converter. Works in place on
 * cvt->buf, walking from the end of the buffer toward the start. A scalar
 * loop first brings the 8-float output block to a 16-byte boundary; then,
 * only if src turns out to be 16-byte aligned too, blocks of 8 samples are
 * converted with SSE2; whatever is left is finished by a scalar loop. Each
 * sample is multiplied by DIVBY32767. On return cvt->len_cvt has been
 * doubled and the next entry in cvt->filters, if non-NULL, has been called
 * with AUDIO_F32SYS. The "format" parameter is not read.
 */
static void SDLCALL
SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    /* Both pointers start on the LAST sample of their respective regions. */
    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");

    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    /* The test is on dst-7 because that is where an 8-float block ending at dst would begin. */
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
        *dst = (((float) *src) * DIVBY32767);
    }

    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        const __m128 divby32767 = _mm_set1_ps(DIVBY32767);
        while (i >= 8) {   /* 8 * 16-bit */
            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
            /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
            const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
            /* right-shift-sign-extend gets us sint32 with the other set of values. */
            const __m128i b = _mm_srai_epi32(ints, 16);
            /* Interleave these back into the right order, convert to float, multiply, store. */
            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32767));
            _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32767));
            /* step one 8-sample block toward the start of the buffer. */
            i -= 8; src -= 8; dst -= 8;
        }
    }

    src += 7; dst += 7;  /* adjust for any scalar finishing. */

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = (((float) *src) * DIVBY32767);
        i--; src--; dst--;
    }

    cvt->len_cvt *= 2;
    /* Advance to the next filter in the chain and run it, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
   427 
   428 static void SDLCALL
   429 SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   430 {
   431     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   432     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   433     int i;
   434 
   435     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
   436 
   437     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   438     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   439         *dst = ((((float) *src) * DIVBY32767) - 1.0f);
   440     }
   441 
   442     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   443     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   444 
   445     /* Make sure src is aligned too. */
   446     if ((((size_t) src) & 15) == 0) {
   447         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   448         const __m128 divby32767 = _mm_set1_ps(DIVBY32767);
   449         const __m128 minus1 = _mm_set1_ps(1.0f);
   450         while (i >= 8) {   /* 8 * 16-bit */
   451             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   452             /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
   453             const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
   454             /* right-shift-sign-extend gets us sint32 with the other set of values. */
   455             const __m128i b = _mm_srli_epi32(ints, 16);
   456             /* Interleave these back into the right order, convert to float, multiply, store. */
   457             _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32767), minus1));
   458             _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32767), minus1));
   459             i -= 8; src -= 8; dst -= 8;
   460         }
   461     }
   462 
   463     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   464 
   465     /* Finish off any leftovers with scalar operations. */
   466     while (i) {
   467         *dst = ((((float) *src) * DIVBY32767) - 1.0f);
   468         i--; src--; dst--;
   469     }
   470 
   471     cvt->len_cvt *= 2;
   472     if (cvt->filters[++cvt->filter_index]) {
   473         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   474     }
   475 }
   476 
   477 static void SDLCALL
   478 SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   479 {
   480     const Sint32 *src = (const Sint32 *) cvt->buf;
   481     float *dst = (float *) cvt->buf;
   482     int i;
   483 
   484     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
   485 
   486     /* Get dst aligned to 16 bytes */
   487     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   488         *dst = (float) (((double) *src) * DIVBY2147483647);
   489     }
   490 
   491     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   492     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   493 
   494     {
   495         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   496         const __m128d divby2147483647 = _mm_set1_pd(DIVBY2147483647);
   497         const __m128i *mmsrc = (const __m128i *) src;
   498         while (i >= 4) {   /* 4 * sint32 */
   499             const __m128i ints = _mm_load_si128(mmsrc);
   500             /* bitshift the whole register over, so _mm_cvtepi32_pd can read the top ints in the bottom of the vector. */
   501             const __m128d doubles1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_bsrli_si128(ints, 8)), divby2147483647);
   502             const __m128d doubles2 = _mm_mul_pd(_mm_cvtepi32_pd(ints), divby2147483647);
   503             /* convert to float32, bitshift/or to get these into a vector to store. */
   504             _mm_store_ps(dst, _mm_castsi128_ps(_mm_or_si128(_mm_bslli_si128(_mm_castps_si128(_mm_cvtpd_ps(doubles1)), 8), _mm_castps_si128(_mm_cvtpd_ps(doubles2)))));
   505             i -= 4; mmsrc++; dst += 4;
   506         }
   507         src = (const Sint32 *) mmsrc;
   508     }
   509 
   510     /* Finish off any leftovers with scalar operations. */
   511     while (i) {
   512         *dst = (float) (((double) *src) * DIVBY2147483647);
   513         i--; src++; dst++;
   514     }
   515 
   516     if (cvt->filters[++cvt->filter_index]) {
   517         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   518     }
   519 }
   520 
   521 static void SDLCALL
   522 SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   523 {
   524     const float *src = (const float *) cvt->buf;
   525     Sint8 *dst = (Sint8 *) cvt->buf;
   526     int i;
   527 
   528     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
   529 
   530     /* Get dst aligned to 16 bytes */
   531     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   532         *dst = (Sint8) (*src * 127.0f);
   533     }
   534 
   535     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   536 
   537     /* Make sure src is aligned too. */
   538     if ((((size_t) src) & 15) == 0) {
   539         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   540         const __m128 mulby127 = _mm_set1_ps(127.0f);
   541         __m128i *mmdst = (__m128i *) dst;
   542         while (i >= 16) {   /* 16 * float32 */
   543             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby127));  /* load 4 floats, convert to sint32 */
   544             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby127));  /* load 4 floats, convert to sint32 */
   545             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+8), mulby127));  /* load 4 floats, convert to sint32 */
   546             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+12), mulby127));  /* load 4 floats, convert to sint32 */
   547             _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   548             i -= 16; src += 16; mmdst++;
   549         }
   550         dst = (Sint8 *) mmdst;
   551     }
   552 
   553     /* Finish off any leftovers with scalar operations. */
   554     while (i) {
   555         *dst = (Sint8) (*src * 127.0f);
   556         i--; src++; dst++;
   557     }
   558 
   559     cvt->len_cvt /= 4;
   560     if (cvt->filters[++cvt->filter_index]) {
   561         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   562     }
   563 }
   564 
   565 static void SDLCALL
   566 SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   567 {
   568     const float *src = (const float *) cvt->buf;
   569     Uint8 *dst = (Uint8 *) cvt->buf;
   570     int i;
   571 
   572     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
   573 
   574     /* Get dst aligned to 16 bytes */
   575     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   576         *dst = (Uint8) ((*src + 1.0f) * 127.0f);
   577     }
   578 
   579     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   580 
   581     /* Make sure src is aligned too. */
   582     if ((((size_t) src) & 15) == 0) {
   583         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   584         const __m128 add1 = _mm_set1_ps(1.0f);
   585         const __m128 mulby127 = _mm_set1_ps(127.0f);
   586         __m128i *mmdst = (__m128i *) dst;
   587         while (i >= 16) {   /* 16 * float32 */
   588             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src), add1), mulby127));  /* load 4 floats, convert to sint32 */
   589             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+4), add1), mulby127));  /* load 4 floats, convert to sint32 */
   590             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+8), add1), mulby127));  /* load 4 floats, convert to sint32 */
   591             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+12), add1), mulby127));  /* load 4 floats, convert to sint32 */
   592             _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   593             i -= 16; src += 16; mmdst++;
   594         }
   595         dst = (Uint8 *) mmdst;
   596     }
   597 
   598     /* Finish off any leftovers with scalar operations. */
   599     while (i) {
   600         *dst = (Uint8) ((*src + 1.0f) * 127.0f);
   601         i--; src++; dst++;
   602     }
   603 
   604     cvt->len_cvt /= 4;
   605     if (cvt->filters[++cvt->filter_index]) {
   606         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   607     }
   608 }
   609 
   610 static void SDLCALL
   611 SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   612 {
   613     const float *src = (const float *) cvt->buf;
   614     Sint16 *dst = (Sint16 *) cvt->buf;
   615     int i;
   616 
   617     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
   618 
   619     /* Get dst aligned to 16 bytes */
   620     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   621         *dst = (Sint16) (*src * 32767.0f);
   622     }
   623 
   624     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   625 
   626     /* Make sure src is aligned too. */
   627     if ((((size_t) src) & 15) == 0) {
   628         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   629         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   630         __m128i *mmdst = (__m128i *) dst;
   631         while (i >= 8) {   /* 8 * float32 */
   632             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767));  /* load 4 floats, convert to sint32 */
   633             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767));  /* load 4 floats, convert to sint32 */
   634             _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
   635             i -= 8; src += 8; mmdst++;
   636         }
   637         dst = (Sint16 *) mmdst;
   638     }
   639 
   640     /* Finish off any leftovers with scalar operations. */
   641     while (i) {
   642         *dst = (((float) *src) * DIVBY32767);
   643         i--; src++; dst++;
   644     }
   645 
   646     cvt->len_cvt /= 2;
   647     if (cvt->filters[++cvt->filter_index]) {
   648         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   649     }
   650 }
   651 
/*
 * SSE2 version of the float32 to unsigned 16-bit converter. Works in place
 * on cvt->buf from front to back. Samples are converted one at a time until
 * dst reaches a 16-byte boundary; if src is then 16-byte aligned as well,
 * groups of 8 floats are converted with SSE2; anything left over is
 * converted one at a time again. On return cvt->len_cvt has been halved and
 * the next entry in cvt->filters, if non-NULL, has been called with
 * AUDIO_U16SYS. The "format" parameter is not read.
 *
 * NOTE(review): the scalar loops compute (sample + 1.0f) * 32767.0f, while
 * the vector loop computes sample * 32767.0f, packs to sint16 and flips the
 * top bit (i.e. adds 32768). For the same input the two appear to differ by
 * about one LSB -- confirm that this is acceptable.
 */
static void SDLCALL
SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *src = (const float *) cvt->buf;
    Uint16 *dst = (Uint16 *) cvt->buf;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");

    /* Get dst aligned to 16 bytes */
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
        *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
    }

    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        /* This calculates differently than the scalar path because SSE2 can't
           pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
           saturation, so that would corrupt our data. _mm_packus_epi32 exists,
           but not before SSE 4.1. So we convert from float to sint16, packing
           that down with legit signed saturation, and then xor the top bit
           against 1. This results in the correct unsigned 16-bit value, even
           though it looks like dark magic. */
        const __m128 mulby32767 = _mm_set1_ps(32767.0f);
        const __m128i topbit = _mm_set1_epi16(-32768);  /* 0x8000 in every 16-bit lane */
        __m128i *mmdst = (__m128i *) dst;
        while (i >= 8) {   /* 8 * float32 */
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767));  /* load 4 floats, convert to sint32 */
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767));  /* load 4 floats, convert to sint32 */
            _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
            i -= 8; src += 8; mmdst++;
        }
        dst = (Uint16 *) mmdst;
    }

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
        i--; src++; dst++;
    }

    cvt->len_cvt /= 2;
    /* Advance to the next filter in the chain and run it, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
    }
}
   701 
   702 static void SDLCALL
   703 SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   704 {
   705     const float *src = (const float *) cvt->buf;
   706     Sint32 *dst = (Sint32 *) cvt->buf;
   707     int i;
   708 
   709     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
   710 
   711     /* Get dst aligned to 16 bytes */
   712     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   713         *dst = (Sint32) (((double) *src) * 2147483647.0);
   714     }
   715 
   716     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   717     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   718 
   719     {
   720         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   721         const __m128d mulby2147483647 = _mm_set1_pd(2147483647.0);
   722         __m128i *mmdst = (__m128i *) dst;
   723         while (i >= 4) {   /* 4 * float32 */
   724             const __m128 floats = _mm_load_ps(src);
   725             /* bitshift the whole register over, so _mm_cvtps_pd can read the top floats in the bottom of the vector. */
   726             const __m128d doubles1 = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_bsrli_si128(_mm_castps_si128(floats), 8))), mulby2147483647);
   727             const __m128d doubles2 = _mm_mul_pd(_mm_cvtps_pd(floats), mulby2147483647);
   728             _mm_store_si128(mmdst, _mm_or_si128(_mm_bslli_si128(_mm_cvtpd_epi32(doubles1), 8), _mm_cvtpd_epi32(doubles2)));
   729             i -= 4; src += 4; mmdst++;
   730         }
   731         dst = (Sint32 *) mmdst;
   732     }
   733 
   734     /* Finish off any leftovers with scalar operations. */
   735     while (i) {
   736         *dst = (Sint32) (((double) *src) * 2147483647.0);
   737         i--; src++; dst++;
   738     }
   739 
   740     if (cvt->filters[++cvt->filter_index]) {
   741         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   742     }
   743 }
   744 #endif
   745 
   746 
   747 void SDL_ChooseAudioConverters(void)
   748 {
   749     static SDL_bool converters_chosen = SDL_FALSE;
   750 
   751     if (converters_chosen) {
   752         return;
   753     }
   754 
   755     #define SET_CONVERTER_FUNCS(fntype) \
   756         SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
   757         SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
   758         SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
   759         SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
   760         SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
   761         SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
   762         SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
   763         SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
   764         SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
   765         SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
   766         converters_chosen = SDL_TRUE
   767 
   768     #if HAVE_SSE2_INTRINSICS
   769     if (SDL_HasSSE2()) {
   770         SET_CONVERTER_FUNCS(SSE2);
   771         return;
   772     }
   773     #endif
   774 
   775     #if NEED_SCALAR_CONVERTER_FALLBACKS
   776     SET_CONVERTER_FUNCS(Scalar);
   777     #endif
   778 
   779     #undef SET_CONVERTER_FUNCS
   780 
   781     SDL_assert(converters_chosen == SDL_TRUE);
   782 }
   783 
   784 /* vi: set ts=4 sw=4 expandtab: */