src/audio/SDL_audiotypecvt.c
author Ryan C. Gordon <icculus@icculus.org>
Tue, 15 May 2018 02:29:35 -0400
changeset 11991 2a487acdb306
parent 11990 ce0099c8d037
child 11992 08c415f14810
permissions -rw-r--r--
audio: SSE2 float-to-int converters should clamp input.

The scalar versions already do this.
     1 /*
     2   Simple DirectMedia Layer
     3   Copyright (C) 1997-2018 Sam Lantinga <slouken@libsdl.org>
     4 
     5   This software is provided 'as-is', without any express or implied
     6   warranty.  In no event will the authors be held liable for any damages
     7   arising from the use of this software.
     8 
     9   Permission is granted to anyone to use this software for any purpose,
    10   including commercial applications, and to alter it and redistribute it
    11   freely, subject to the following restrictions:
    12 
    13   1. The origin of this software must not be misrepresented; you must not
    14      claim that you wrote the original software. If you use this software
    15      in a product, an acknowledgment in the product documentation would be
    16      appreciated but is not required.
    17   2. Altered source versions must be plainly marked as such, and must not be
    18      misrepresented as being the original software.
    19   3. This notice may not be removed or altered from any source distribution.
    20 */
    21 
    22 #include "../SDL_internal.h"
    23 #include "SDL_audio.h"
    24 #include "SDL_audio_c.h"
    25 #include "SDL_cpuinfo.h"
    26 #include "SDL_assert.h"
    27 
/* !!! FIXME: write NEON code. */
/* Hard-wired to 0 until the NEON code mentioned above is written; this keeps
   every NEON-dependent branch of the checks below disabled. */
#define HAVE_NEON_INTRINSICS 0

/* Only defined when the compiler predefines __SSE2__. When it is left
   undefined, the "#if ... HAVE_SSE2_INTRINSICS" tests below evaluate it as 0. */
#ifdef __SSE2__
#define HAVE_SSE2_INTRINSICS 1
#endif

/* Decide whether the plain-C converters have to be compiled. They are left
   out only on targets where a SIMD implementation is always usable. */
#if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
#elif __MACOSX__ && HAVE_SSE2_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
#elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
#endif

/* Set to zero if platform is guaranteed to use a SIMD codepath here. */
/* Default for every target not matched above: build the scalar converters. */
#ifndef NEED_SCALAR_CONVERTER_FALLBACKS
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif

/* Function pointers set to a CPU-specific implementation. */
/* All of them start out NULL here; the assignment happens outside the part
   of the file shown in this section. */
SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;


/* Reciprocal scale factors, so integer-to-float conversion can multiply
   instead of divide. */
#define DIVBY128 0.0078125f                      /* 1.0 / 128 */
#define DIVBY32768 0.000030517578125f            /* 1.0 / 32768 */
#define DIVBY8388607 0.00000011920930376163766f  /* 1.0 / 8388607; applied to 32-bit samples after ">> 8" */
    66 
    67 
    68 #if NEED_SCALAR_CONVERTER_FALLBACKS
    69 static void SDLCALL
    70 SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    71 {
    72     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    73     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    74     int i;
    75 
    76     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
    77 
    78     for (i = cvt->len_cvt; i; --i, --src, --dst) {
    79         *dst = ((float) *src) * DIVBY128;
    80     }
    81 
    82     cvt->len_cvt *= 4;
    83     if (cvt->filters[++cvt->filter_index]) {
    84         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    85     }
    86 }
    87 
    88 static void SDLCALL
    89 SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    90 {
    91     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    92     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    93     int i;
    94 
    95     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
    96 
    97     for (i = cvt->len_cvt; i; --i, --src, --dst) {
    98         *dst = (((float) *src) * DIVBY128) - 1.0f;
    99     }
   100 
   101     cvt->len_cvt *= 4;
   102     if (cvt->filters[++cvt->filter_index]) {
   103         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   104     }
   105 }
   106 
   107 static void SDLCALL
   108 SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   109 {
   110     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   111     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   112     int i;
   113 
   114     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
   115 
   116     for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
   117         *dst = ((float) *src) * DIVBY32768;
   118     }
   119 
   120     cvt->len_cvt *= 2;
   121     if (cvt->filters[++cvt->filter_index]) {
   122         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   123     }
   124 }
   125 
   126 static void SDLCALL
   127 SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   128 {
   129     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   130     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   131     int i;
   132 
   133     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
   134 
   135     for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
   136         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   137     }
   138 
   139     cvt->len_cvt *= 2;
   140     if (cvt->filters[++cvt->filter_index]) {
   141         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   142     }
   143 }
   144 
   145 static void SDLCALL
   146 SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   147 {
   148     const Sint32 *src = (const Sint32 *) cvt->buf;
   149     float *dst = (float *) cvt->buf;
   150     int i;
   151 
   152     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
   153 
   154     for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
   155         *dst = ((float) (*src>>8)) * DIVBY8388607;
   156     }
   157 
   158     if (cvt->filters[++cvt->filter_index]) {
   159         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   160     }
   161 }
   162 
   163 static void SDLCALL
   164 SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   165 {
   166     const float *src = (const float *) cvt->buf;
   167     Sint8 *dst = (Sint8 *) cvt->buf;
   168     int i;
   169 
   170     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
   171 
   172     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   173         const float sample = *src;
   174         if (sample >= 1.0f) {
   175             *dst = 127;
   176         } else if (sample <= -1.0f) {
   177             *dst = -128;
   178         } else {
   179             *dst = (Sint8)(sample * 127.0f);
   180         }
   181     }
   182 
   183     cvt->len_cvt /= 4;
   184     if (cvt->filters[++cvt->filter_index]) {
   185         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   186     }
   187 }
   188 
   189 static void SDLCALL
   190 SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   191 {
   192     const float *src = (const float *) cvt->buf;
   193     Uint8 *dst = (Uint8 *) cvt->buf;
   194     int i;
   195 
   196     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
   197 
   198     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   199         const float sample = *src;
   200         if (sample >= 1.0f) {
   201             *dst = 255;
   202         } else if (sample <= -1.0f) {
   203             *dst = 0;
   204         } else {
   205             *dst = (Uint8)((sample + 1.0f) * 127.0f);
   206         }
   207     }
   208 
   209     cvt->len_cvt /= 4;
   210     if (cvt->filters[++cvt->filter_index]) {
   211         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   212     }
   213 }
   214 
   215 static void SDLCALL
   216 SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   217 {
   218     const float *src = (const float *) cvt->buf;
   219     Sint16 *dst = (Sint16 *) cvt->buf;
   220     int i;
   221 
   222     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
   223 
   224     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   225         const float sample = *src;
   226         if (sample >= 1.0f) {
   227             *dst = 32767;
   228         } else if (sample <= -1.0f) {
   229             *dst = -32768;
   230         } else {
   231             *dst = (Sint16)(sample * 32767.0f);
   232         }
   233     }
   234 
   235     cvt->len_cvt /= 2;
   236     if (cvt->filters[++cvt->filter_index]) {
   237         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   238     }
   239 }
   240 
   241 static void SDLCALL
   242 SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   243 {
   244     const float *src = (const float *) cvt->buf;
   245     Uint16 *dst = (Uint16 *) cvt->buf;
   246     int i;
   247 
   248     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
   249 
   250     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   251         const float sample = *src;
   252         if (sample >= 1.0f) {
   253             *dst = 65535;
   254         } else if (sample <= -1.0f) {
   255             *dst = 0;
   256         } else {
   257             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   258         }
   259     }
   260 
   261     cvt->len_cvt /= 2;
   262     if (cvt->filters[++cvt->filter_index]) {
   263         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   264     }
   265 }
   266 
   267 static void SDLCALL
   268 SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   269 {
   270     const float *src = (const float *) cvt->buf;
   271     Sint32 *dst = (Sint32 *) cvt->buf;
   272     int i;
   273 
   274     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
   275 
   276     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   277         const float sample = *src;
   278         if (sample >= 1.0f) {
   279             *dst = 2147483647;
   280         } else if (sample <= -1.0f) {
   281             *dst = -2147483648;
   282         } else {
   283             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
   284         }
   285     }
   286 
   287     if (cvt->filters[++cvt->filter_index]) {
   288         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   289     }
   290 }
   291 #endif
   292 
   293 
   294 #if HAVE_SSE2_INTRINSICS
/*
 * SSE2 version: expand signed 8-bit samples in cvt->buf to float32, in place.
 *
 * The output is four times the size of the input (it is written up to
 * cvt->buf + cvt->len_cvt * 4), so the buffer is processed from the end
 * toward the start. Work is done in three phases:
 *   1. scalar conversion until the next 16-float output block is 16-byte aligned;
 *   2. if the matching 16-byte input block is aligned too, SSE2 blocks of
 *      16 samples at a time;
 *   3. scalar conversion of whatever is left.
 *
 * cvt->len_cvt is multiplied by 4, then the next filter in cvt->filters
 * (if any) is invoked with AUDIO_F32SYS. The format argument is not used.
 */
static void SDLCALL
SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    /* Both cursors start on the last sample of their respective arrays. */
    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");

    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
        *dst = ((float) *src) * DIVBY128;
    }

    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        const __m128i *mmsrc = (const __m128i *) src;
        const __m128i zero = _mm_setzero_si128();
        const __m128 divby128 = _mm_set1_ps(DIVBY128);
        while (i >= 16) {   /* 16 * 8-bit */
            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
            /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
            /* shorts1 holds the even-indexed bytes (0, 2, 4, ...). */
            const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
            /* right-shift-sign-extend gets us sint16 with the other set of values. */
            /* shorts2 holds the odd-indexed bytes (1, 3, 5, ...). */
            const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
            /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
            const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
            const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
            const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
            const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
            /* Interleave back into correct order, store. */
            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
            /* Step one whole block (16 samples) toward the start of the buffer. */
            i -= 16; mmsrc--; dst -= 16;
        }

        src = (const Sint8 *) mmsrc;
    }

    src += 15; dst += 15;  /* adjust for any scalar finishing. */

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = ((float) *src) * DIVBY128;
        i--; src--; dst--;
    }

    cvt->len_cvt *= 4;
    /* Advance to the next filter in the chain and run it, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
   353 
/*
 * SSE2 version: expand unsigned 8-bit samples in cvt->buf to float32, in place.
 *
 * Each sample is scaled by DIVBY128 and then has 1.0f subtracted. The output
 * is four times the size of the input (it is written up to
 * cvt->buf + cvt->len_cvt * 4), so the buffer is processed from the end
 * toward the start: scalar conversion until the output block is 16-byte
 * aligned, SSE2 blocks of 16 samples if the input block is aligned as well,
 * then scalar conversion of the remainder.
 *
 * cvt->len_cvt is multiplied by 4, then the next filter in cvt->filters
 * (if any) is invoked with AUDIO_F32SYS. The format argument is not used.
 */
static void SDLCALL
SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    /* Both cursors start on the last sample of their respective arrays. */
    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");

    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
        *dst = (((float) *src) * DIVBY128) - 1.0f;
    }

    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        const __m128i *mmsrc = (const __m128i *) src;
        const __m128i zero = _mm_setzero_si128();
        const __m128 divby128 = _mm_set1_ps(DIVBY128);
        const __m128 minus1 = _mm_set1_ps(-1.0f);
        while (i >= 16) {   /* 16 * 8-bit */
            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
            /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
            /* shorts1 holds the even-indexed bytes (0, 2, 4, ...). */
            const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
            /* right-shift-zero-extend gets us uint16 with the other set of values. */
            /* shorts2 holds the odd-indexed bytes (1, 3, 5, ...). */
            const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
            /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
            /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
            const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
            const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
            const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
            const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
            /* Interleave back into correct order, store. */
            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
            /* Step one whole block (16 samples) toward the start of the buffer. */
            i -= 16; mmsrc--; dst -= 16;
        }

        src = (const Uint8 *) mmsrc;
    }

    src += 15; dst += 15;  /* adjust for any scalar finishing. */

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = (((float) *src) * DIVBY128) - 1.0f;
        i--; src--; dst--;
    }

    cvt->len_cvt *= 4;
    /* Advance to the next filter in the chain and run it, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
   414 
/*
 * SSE2 version: expand signed 16-bit samples in cvt->buf to float32, in place.
 *
 * Each sample is scaled by DIVBY32768. The output is twice the size of the
 * input (it is written up to cvt->buf + cvt->len_cvt * 2), so the buffer is
 * processed from the end toward the start: scalar conversion until the
 * 8-float output block is 16-byte aligned, SSE2 blocks of 8 samples if the
 * input block is aligned as well, then scalar conversion of the remainder.
 *
 * cvt->len_cvt is doubled, then the next filter in cvt->filters (if any) is
 * invoked with AUDIO_F32SYS. The format argument is not used.
 */
static void SDLCALL
SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    /* Both cursors start on the last sample of their respective arrays. */
    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");

    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
        *dst = ((float) *src) * DIVBY32768;
    }

    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
        while (i >= 8) {   /* 8 * 16-bit */
            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
            /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
            /* a holds the even-indexed samples (0, 2, 4, 6). */
            const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
            /* right-shift-sign-extend gets us sint32 with the other set of values. */
            /* b holds the odd-indexed samples (1, 3, 5, 7). */
            const __m128i b = _mm_srai_epi32(ints, 16);
            /* Interleave these back into the right order, convert to float, multiply, store. */
            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
            _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
            /* Step one whole block (8 samples) toward the start of the buffer. */
            i -= 8; src -= 8; dst -= 8;
        }
    }

    src += 7; dst += 7;  /* adjust for any scalar finishing. */

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = ((float) *src) * DIVBY32768;
        i--; src--; dst--;
    }

    cvt->len_cvt *= 2;
    /* Advance to the next filter in the chain and run it, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
   462 
   463 static void SDLCALL
   464 SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   465 {
   466     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   467     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   468     int i;
   469 
   470     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
   471 
   472     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   473     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   474         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   475     }
   476 
   477     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   478     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   479 
   480     /* Make sure src is aligned too. */
   481     if ((((size_t) src) & 15) == 0) {
   482         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   483         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
   484         const __m128 minus1 = _mm_set1_ps(1.0f);
   485         while (i >= 8) {   /* 8 * 16-bit */
   486             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   487             /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
   488             const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
   489             /* right-shift-sign-extend gets us sint32 with the other set of values. */
   490             const __m128i b = _mm_srli_epi32(ints, 16);
   491             /* Interleave these back into the right order, convert to float, multiply, store. */
   492             _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
   493             _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
   494             i -= 8; src -= 8; dst -= 8;
   495         }
   496     }
   497 
   498     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   499 
   500     /* Finish off any leftovers with scalar operations. */
   501     while (i) {
   502         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   503         i--; src--; dst--;
   504     }
   505 
   506     cvt->len_cvt *= 2;
   507     if (cvt->filters[++cvt->filter_index]) {
   508         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   509     }
   510 }
   511 
   512 static void SDLCALL
   513 SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   514 {
   515     const Sint32 *src = (const Sint32 *) cvt->buf;
   516     float *dst = (float *) cvt->buf;
   517     int i;
   518 
   519     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
   520 
   521     /* Get dst aligned to 16 bytes */
   522     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   523         *dst = ((float) (*src>>8)) * DIVBY8388607;
   524     }
   525 
   526     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   527     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   528 
   529     {
   530         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   531         const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
   532         const __m128i *mmsrc = (const __m128i *) src;
   533         while (i >= 4) {   /* 4 * sint32 */
   534             /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
   535             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srli_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
   536             i -= 4; mmsrc++; dst += 4;
   537         }
   538         src = (const Sint32 *) mmsrc;
   539     }
   540 
   541     /* Finish off any leftovers with scalar operations. */
   542     while (i) {
   543         *dst = ((float) (*src>>8)) * DIVBY8388607;
   544         i--; src++; dst++;
   545     }
   546 
   547     if (cvt->filters[++cvt->filter_index]) {
   548         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   549     }
   550 }
   551 
/*
 * SSE2 version: convert float32 samples in cvt->buf to signed 8-bit, in place.
 *
 * The output is a quarter of the input size, so the buffer is processed
 * front to back: scalar conversion until the output position is 16-byte
 * aligned, SSE2 blocks of 16 samples if the input position is aligned as
 * well, then scalar conversion of the remainder.
 *
 * NOTE(review): the two paths are not bit-identical. The scalar code maps
 * samples <= -1.0f to -128 and truncates toward zero; the SSE2 code clamps to
 * [-1.0f, 1.0f] before multiplying by 127 (so the minimum is -127) and
 * converts with _mm_cvtps_epi32, which uses the current SSE rounding mode.
 *
 * cvt->len_cvt is divided by 4, then the next filter in cvt->filters (if
 * any) is invoked with AUDIO_S8. The format argument is not used.
 */
static void SDLCALL
SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *src = (const float *) cvt->buf;
    Sint8 *dst = (Sint8 *) cvt->buf;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");

    /* Get dst aligned to 16 bytes */
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
        const float sample = *src;
        if (sample >= 1.0f) {
            *dst = 127;
        } else if (sample <= -1.0f) {
            *dst = -128;
        } else {
            *dst = (Sint8)(sample * 127.0f);
        }
    }

    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        const __m128 one = _mm_set1_ps(1.0f);
        const __m128 negone = _mm_set1_ps(-1.0f);
        const __m128 mulby127 = _mm_set1_ps(127.0f);
        __m128i *mmdst = (__m128i *) dst;
        while (i >= 16) {   /* 16 * float32 */
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
            /* Two packing steps: int32 -> int16 (packs_epi32), then int16 -> int8 (packs_epi16). */
            _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
            i -= 16; src += 16; mmdst++;
        }
        dst = (Sint8 *) mmdst;
    }

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        const float sample = *src;
        if (sample >= 1.0f) {
            *dst = 127;
        } else if (sample <= -1.0f) {
            *dst = -128;
        } else {
            *dst = (Sint8)(sample * 127.0f);
        }
        i--; src++; dst++;
    }

    cvt->len_cvt /= 4;
    /* Advance to the next filter in the chain and run it, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
    }
}
   611 
/*
 * SSE2 version: convert float32 samples in cvt->buf to unsigned 8-bit, in place.
 *
 * The output is a quarter of the input size, so the buffer is processed
 * front to back: scalar conversion until the output position is 16-byte
 * aligned, SSE2 blocks of 16 samples if the input position is aligned as
 * well, then scalar conversion of the remainder.
 *
 * NOTE(review): the two paths are not bit-identical. The scalar code maps
 * samples >= 1.0f to 255 and truncates toward zero; the SSE2 code clamps to
 * [-1.0f, 1.0f], adds 1 and multiplies by 127 (so the maximum is 254), and
 * converts with _mm_cvtps_epi32, which uses the current SSE rounding mode.
 *
 * cvt->len_cvt is divided by 4, then the next filter in cvt->filters (if
 * any) is invoked with AUDIO_U8. The format argument is not used.
 */
static void SDLCALL
SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *src = (const float *) cvt->buf;
    Uint8 *dst = (Uint8 *) cvt->buf;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");

    /* Get dst aligned to 16 bytes */
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
        const float sample = *src;
        if (sample >= 1.0f) {
            *dst = 255;
        } else if (sample <= -1.0f) {
            *dst = 0;
        } else {
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
        }
    }

    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        const __m128 one = _mm_set1_ps(1.0f);
        const __m128 negone = _mm_set1_ps(-1.0f);
        const __m128 mulby127 = _mm_set1_ps(127.0f);
        __m128i *mmdst = (__m128i *) dst;
        while (i >= 16) {   /* 16 * float32 */
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
            /* Two packing steps: int32 -> int16 (packs_epi32), then int16 -> uint8 with the unsigned pack (packus_epi16). */
            _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
            i -= 16; src += 16; mmdst++;
        }
        dst = (Uint8 *) mmdst;
    }

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        const float sample = *src;
        if (sample >= 1.0f) {
            *dst = 255;
        } else if (sample <= -1.0f) {
            *dst = 0;
        } else {
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
        }
        i--; src++; dst++;
    }

    cvt->len_cvt /= 4;
    /* Advance to the next filter in the chain and run it, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
    }
}
   671 
/*
 * SSE2 version: convert float32 samples in cvt->buf to signed 16-bit, in place.
 *
 * The output is half the input size, so the buffer is processed front to
 * back: scalar conversion until the output position is 16-byte aligned,
 * SSE2 blocks of 8 samples if the input position is aligned as well, then
 * scalar conversion of the remainder.
 *
 * NOTE(review): the two paths are not bit-identical. The scalar code maps
 * samples <= -1.0f to -32768 and truncates toward zero; the SSE2 code clamps
 * to [-1.0f, 1.0f] before multiplying by 32767 (so the minimum is -32767)
 * and converts with _mm_cvtps_epi32, which uses the current SSE rounding mode.
 *
 * cvt->len_cvt is halved, then the next filter in cvt->filters (if any) is
 * invoked with AUDIO_S16SYS. The format argument is not used.
 */
static void SDLCALL
SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *src = (const float *) cvt->buf;
    Sint16 *dst = (Sint16 *) cvt->buf;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");

    /* Get dst aligned to 16 bytes */
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
        const float sample = *src;
        if (sample >= 1.0f) {
            *dst = 32767;
        } else if (sample <= -1.0f) {
            *dst = -32768;
        } else {
            *dst = (Sint16)(sample * 32767.0f);
        }
    }

    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        const __m128 one = _mm_set1_ps(1.0f);
        const __m128 negone = _mm_set1_ps(-1.0f);
        const __m128 mulby32767 = _mm_set1_ps(32767.0f);
        __m128i *mmdst = (__m128i *) dst;
        while (i >= 8) {   /* 8 * float32 */
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
            _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
            i -= 8; src += 8; mmdst++;
        }
        dst = (Sint16 *) mmdst;
    }

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        const float sample = *src;
        if (sample >= 1.0f) {
            *dst = 32767;
        } else if (sample <= -1.0f) {
            *dst = -32768;
        } else {
            *dst = (Sint16)(sample * 32767.0f);
        }
        i--; src++; dst++;
    }

    cvt->len_cvt /= 2;
    /* Advance to the next filter in the chain and run it, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
    }
}
   729 
   730 static void SDLCALL
   731 SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   732 {
   733     const float *src = (const float *) cvt->buf;
   734     Uint16 *dst = (Uint16 *) cvt->buf;
   735     int i;
   736 
   737     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
   738 
   739     /* Get dst aligned to 16 bytes */
   740     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   741         const float sample = *src;
   742         if (sample >= 1.0f) {
   743             *dst = 65535;
   744         } else if (sample <= -1.0f) {
   745             *dst = 0;
   746         } else {
   747             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   748         }
   749     }
   750 
   751     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   752 
   753     /* Make sure src is aligned too. */
   754     if ((((size_t) src) & 15) == 0) {
   755         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   756         /* This calculates differently than the scalar path because SSE2 can't
   757            pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
   758            saturation, so that would corrupt our data. _mm_packus_epi32 exists,
   759            but not before SSE 4.1. So we convert from float to sint16, packing
   760            that down with legit signed saturation, and then xor the top bit
   761            against 1. This results in the correct unsigned 16-bit value, even
   762            though it looks like dark magic. */
   763         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   764         const __m128i topbit = _mm_set1_epi16(-32768);
   765         const __m128 one = _mm_set1_ps(1.0f);
   766         const __m128 negone = _mm_set1_ps(-1.0f);
   767         __m128i *mmdst = (__m128i *) dst;
   768         while (i >= 8) {   /* 8 * float32 */
   769             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
   770             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
   771             _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
   772             i -= 8; src += 8; mmdst++;
   773         }
   774         dst = (Uint16 *) mmdst;
   775     }
   776 
   777     /* Finish off any leftovers with scalar operations. */
   778     while (i) {
   779         const float sample = *src;
   780         if (sample >= 1.0f) {
   781             *dst = 65535;
   782         } else if (sample <= -1.0f) {
   783             *dst = 0;
   784         } else {
   785             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   786         }
   787         i--; src++; dst++;
   788     }
   789 
   790     cvt->len_cvt /= 2;
   791     if (cvt->filters[++cvt->filter_index]) {
   792         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   793     }
   794 }
   795 
   796 static void SDLCALL
   797 SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   798 {
   799     const float *src = (const float *) cvt->buf;
   800     Sint32 *dst = (Sint32 *) cvt->buf;
   801     int i;
   802 
   803     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
   804 
   805     /* Get dst aligned to 16 bytes */
   806     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   807         const float sample = *src;
   808         if (sample >= 1.0f) {
   809             *dst = 2147483647;
   810         } else if (sample <= -1.0f) {
   811             *dst = -2147483648;
   812         } else {
   813             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
   814         }
   815     }
   816 
   817     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   818     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   819 
   820     {
   821         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   822         const __m128 one = _mm_set1_ps(1.0f);
   823         const __m128 negone = _mm_set1_ps(-1.0f);
   824         const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
   825         __m128i *mmdst = (__m128i *) dst;
   826         while (i >= 4) {   /* 4 * float32 */
   827             _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8));  /* load 4 floats, clamp, convert to sint32 */
   828             i -= 4; src += 4; mmdst++;
   829         }
   830         dst = (Sint32 *) mmdst;
   831     }
   832 
   833     /* Finish off any leftovers with scalar operations. */
   834     while (i) {
   835         const float sample = *src;
   836         if (sample >= 1.0f) {
   837             *dst = 2147483647;
   838         } else if (sample <= -1.0f) {
   839             *dst = -2147483648;
   840         } else {
   841             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
   842         }
   843         i--; src++; dst++;
   844     }
   845 
   846     if (cvt->filters[++cvt->filter_index]) {
   847         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   848     }
   849 }
   850 #endif
   851 
   852 
   853 void SDL_ChooseAudioConverters(void)
   854 {
   855     static SDL_bool converters_chosen = SDL_FALSE;
   856 
   857     if (converters_chosen) {
   858         return;
   859     }
   860 
   861 #define SET_CONVERTER_FUNCS(fntype) \
   862         SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
   863         SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
   864         SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
   865         SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
   866         SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
   867         SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
   868         SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
   869         SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
   870         SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
   871         SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
   872         converters_chosen = SDL_TRUE
   873 
   874 #if HAVE_SSE2_INTRINSICS
   875     if (SDL_HasSSE2()) {
   876         SET_CONVERTER_FUNCS(SSE2);
   877         return;
   878     }
   879 #endif
   880 
   881 #if NEED_SCALAR_CONVERTER_FALLBACKS
   882     SET_CONVERTER_FUNCS(Scalar);
   883 #endif
   884 
   885 #undef SET_CONVERTER_FUNCS
   886 
   887     SDL_assert(converters_chosen == SDL_TRUE);
   888 }
   889 
   890 /* vi: set ts=4 sw=4 expandtab: */