src/audio/SDL_audiotypecvt.c
author Ryan C. Gordon <icculus@icculus.org>
Tue, 15 May 2018 01:35:53 -0400
changeset 11989 d194f4f41437
parent 11987 0c284754e25b
child 11990 ce0099c8d037
permissions -rw-r--r--
audio: float to int converters should clamp inclusively.

If we have to test if a sample is > 1.0f anyhow, we might as well use this
to avoid the unnecessary multiplication when it's == 1.0f, too. (etc).
     1 /*
     2   Simple DirectMedia Layer
     3   Copyright (C) 1997-2018 Sam Lantinga <slouken@libsdl.org>
     4 
     5   This software is provided 'as-is', without any express or implied
     6   warranty.  In no event will the authors be held liable for any damages
     7   arising from the use of this software.
     8 
     9   Permission is granted to anyone to use this software for any purpose,
    10   including commercial applications, and to alter it and redistribute it
    11   freely, subject to the following restrictions:
    12 
    13   1. The origin of this software must not be misrepresented; you must not
    14      claim that you wrote the original software. If you use this software
    15      in a product, an acknowledgment in the product documentation would be
    16      appreciated but is not required.
    17   2. Altered source versions must be plainly marked as such, and must not be
    18      misrepresented as being the original software.
    19   3. This notice may not be removed or altered from any source distribution.
    20 */
    21 
    22 #include "../SDL_internal.h"
    23 #include "SDL_audio.h"
    24 #include "SDL_audio_c.h"
    25 #include "SDL_cpuinfo.h"
    26 #include "SDL_assert.h"
    27 
    28 /* !!! FIXME: write NEON code. */
    29 #define HAVE_NEON_INTRINSICS 0
    30 
    31 #ifdef __SSE2__
    32 #define HAVE_SSE2_INTRINSICS 1
    33 #endif
    34 
    35 #if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
    36 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
    37 #elif __MACOSX__ && HAVE_SSE2_INTRINSICS
    38 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
    39 #elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
    40 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
    41 #elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
    42 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
    43 #endif
    44 
    45 /* Set to zero if platform is guaranteed to use a SIMD codepath here. */
    46 #ifndef NEED_SCALAR_CONVERTER_FALLBACKS
    47 #define NEED_SCALAR_CONVERTER_FALLBACKS 1
    48 #endif
    49 
    50 /* Public converter hooks, one per integer<->float32 sample-format pair. Each is NULL here and is meant to be set to a CPU-specific implementation. NOTE(review): the code that assigns them is not in this part of the file. */
    51 SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;  /* signed 8-bit -> float32 */
    52 SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;  /* unsigned 8-bit -> float32 */
    53 SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;  /* signed 16-bit -> float32 */
    54 SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;  /* unsigned 16-bit -> float32 */
    55 SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;  /* signed 32-bit -> float32 */
    56 SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;  /* float32 -> signed 8-bit */
    57 SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;  /* float32 -> unsigned 8-bit */
    58 SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;  /* float32 -> signed 16-bit */
    59 SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;  /* float32 -> unsigned 16-bit */
    60 SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;  /* float32 -> signed 32-bit */
    61 
    62 
    63 #define DIVBY128 0.0078125f  /* 1/128: multiplier the 8-bit to float32 converters apply to each sample */
    64 #define DIVBY32768 0.000030517578125f  /* 1/32768: multiplier the 16-bit to float32 converters apply to each sample */
    65 #define DIVBY8388607 0.00000011920930376163766f  /* 1/8388607 (2^23 - 1): multiplier the 32-bit to float32 converters apply after shifting the sample right by 8 bits */
    66 
    67 
    68 #if NEED_SCALAR_CONVERTER_FALLBACKS
    69 static void SDLCALL
    70 SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    71 {
    72     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    73     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    74     int i;
    75 
    76     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
    77 
    78     for (i = cvt->len_cvt; i; --i, --src, --dst) {
    79         *dst = ((float) *src) * DIVBY128;
    80     }
    81 
    82     cvt->len_cvt *= 4;
    83     if (cvt->filters[++cvt->filter_index]) {
    84         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    85     }
    86 }
    87 
    88 static void SDLCALL
    89 SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    90 {
    91     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    92     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    93     int i;
    94 
    95     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
    96 
    97     for (i = cvt->len_cvt; i; --i, --src, --dst) {
    98         *dst = (((float) *src) * DIVBY128) - 1.0f;
    99     }
   100 
   101     cvt->len_cvt *= 4;
   102     if (cvt->filters[++cvt->filter_index]) {
   103         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   104     }
   105 }
   106 
   107 static void SDLCALL
   108 SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   109 {
   110     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   111     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   112     int i;
   113 
   114     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
   115 
   116     for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
   117         *dst = ((float) *src) * DIVBY32768;
   118     }
   119 
   120     cvt->len_cvt *= 2;
   121     if (cvt->filters[++cvt->filter_index]) {
   122         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   123     }
   124 }
   125 
   126 static void SDLCALL
   127 SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   128 {
   129     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   130     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   131     int i;
   132 
   133     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
   134 
   135     for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
   136         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   137     }
   138 
   139     cvt->len_cvt *= 2;
   140     if (cvt->filters[++cvt->filter_index]) {
   141         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   142     }
   143 }
   144 
   145 static void SDLCALL
   146 SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   147 {
   148     const Sint32 *src = (const Sint32 *) cvt->buf;
   149     float *dst = (float *) cvt->buf;
   150     int i;
   151 
   152     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
   153 
   154     for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
   155         *dst = ((float) (*src>>8)) * DIVBY8388607;
   156     }
   157 
   158     if (cvt->filters[++cvt->filter_index]) {
   159         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   160     }
   161 }
   162 
   163 static void SDLCALL
   164 SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   165 {
   166     const float *src = (const float *) cvt->buf;
   167     Sint8 *dst = (Sint8 *) cvt->buf;
   168     int i;
   169 
   170     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
   171 
   172     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   173         const float sample = *src;
   174         if (sample >= 1.0f) {
   175             *dst = 127;
   176         } else if (sample <= -1.0f) {
   177             *dst = -127;
   178         } else {
   179             *dst = (Sint8)(sample * 127.0f);
   180         }
   181     }
   182 
   183     cvt->len_cvt /= 4;
   184     if (cvt->filters[++cvt->filter_index]) {
   185         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   186     }
   187 }
   188 
   189 static void SDLCALL
   190 SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   191 {
   192     const float *src = (const float *) cvt->buf;
   193     Uint8 *dst = (Uint8 *) cvt->buf;
   194     int i;
   195 
   196     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
   197 
   198     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   199         const float sample = *src;
   200         if (sample >= 1.0f) {
   201             *dst = 255;
   202         } else if (sample <= -1.0f) {
   203             *dst = 0;
   204         } else {
   205             *dst = (Uint8)((sample + 1.0f) * 127.0f);
   206         }
   207     }
   208 
   209     cvt->len_cvt /= 4;
   210     if (cvt->filters[++cvt->filter_index]) {
   211         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   212     }
   213 }
   214 
   215 static void SDLCALL
   216 SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   217 {
   218     const float *src = (const float *) cvt->buf;
   219     Sint16 *dst = (Sint16 *) cvt->buf;
   220     int i;
   221 
   222     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
   223 
   224     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   225         const float sample = *src;
   226         if (sample >= 1.0f) {
   227             *dst = 32767;
   228         } else if (sample <= -1.0f) {
   229             *dst = -32767;
   230         } else {
   231             *dst = (Sint16)(sample * 32767.0f);
   232         }
   233     }
   234 
   235     cvt->len_cvt /= 2;
   236     if (cvt->filters[++cvt->filter_index]) {
   237         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   238     }
   239 }
   240 
   241 static void SDLCALL
   242 SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   243 {
   244     const float *src = (const float *) cvt->buf;
   245     Uint16 *dst = (Uint16 *) cvt->buf;
   246     int i;
   247 
   248     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
   249 
   250     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   251         const float sample = *src;
   252         if (sample >= 1.0f) {
   253             *dst = 65534;
   254         } else if (sample <= -1.0f) {
   255             *dst = 0;
   256         } else {
   257             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   258         }
   259     }
   260 
   261     cvt->len_cvt /= 2;
   262     if (cvt->filters[++cvt->filter_index]) {
   263         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   264     }
   265 }
   266 
   267 static void SDLCALL
   268 SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   269 {
   270     const float *src = (const float *) cvt->buf;
   271     Sint32 *dst = (Sint32 *) cvt->buf;
   272     int i;
   273 
   274     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
   275 
   276     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   277         const float sample = *src;
   278         if (sample >= 1.0f) {
   279             *dst = 2147483647;
   280         } else if (sample <= -1.0f) {
   281             *dst = -2147483647;
   282         } else {
   283             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
   284         }
   285     }
   286 
   287     if (cvt->filters[++cvt->filter_index]) {
   288         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   289     }
   290 }
   291 #endif
   292 
   293 
   294 #if HAVE_SSE2_INTRINSICS
   295 static void SDLCALL
   296 SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   297 {
   298     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   299     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   300     int i;
   301 
   302     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
   303 
   304     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   305     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   306         *dst = ((float) *src) * DIVBY128;
   307     }
   308 
   309     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
   310     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   311 
   312     /* Make sure src is aligned too. */
   313     if ((((size_t) src) & 15) == 0) {
   314         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   315         const __m128i *mmsrc = (const __m128i *) src;
   316         const __m128i zero = _mm_setzero_si128();
   317         const __m128 divby128 = _mm_set1_ps(DIVBY128);
   318         while (i >= 16) {   /* 16 * 8-bit */
   319             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
   320             /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
   321             const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
   322             /* right-shift-sign-extend gets us sint16 with the other set of values. */
   323             const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
   324             /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
   325             const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
   326             const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
   327             const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
   328             const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
   329             /* Interleave back into correct order, store. */
   330             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
   331             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
   332             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
   333             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
   334             i -= 16; mmsrc--; dst -= 16;
   335         }
   336 
   337         src = (const Sint8 *) mmsrc;
   338     }
   339 
   340     src += 15; dst += 15;  /* adjust for any scalar finishing. */
   341 
   342     /* Finish off any leftovers with scalar operations. */
   343     while (i) {
   344         *dst = ((float) *src) * DIVBY128;
   345         i--; src--; dst--;
   346     }
   347 
   348     cvt->len_cvt *= 4;
   349     if (cvt->filters[++cvt->filter_index]) {
   350         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   351     }
   352 }
   353 
   354 static void SDLCALL
   355 SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   356 {
   357     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   358     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   359     int i;
   360 
   361     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
   362 
   363     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   364     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   365         *dst = (((float) *src) * DIVBY128) - 1.0f;
   366     }
   367 
   368     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
   369     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   370 
   371     /* Make sure src is aligned too. */
   372     if ((((size_t) src) & 15) == 0) {
   373         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   374         const __m128i *mmsrc = (const __m128i *) src;
   375         const __m128i zero = _mm_setzero_si128();
   376         const __m128 divby128 = _mm_set1_ps(DIVBY128);
   377         const __m128 minus1 = _mm_set1_ps(-1.0f);
   378         while (i >= 16) {   /* 16 * 8-bit */
   379             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
   380             /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
   381             const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
   382             /* right-shift-zero-extend gets us uint16 with the other set of values. */
   383             const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
   384             /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
   385             /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
   386             const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
   387             const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
   388             const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
   389             const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
   390             /* Interleave back into correct order, store. */
   391             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
   392             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
   393             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
   394             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
   395             i -= 16; mmsrc--; dst -= 16;
   396         }
   397 
   398         src = (const Uint8 *) mmsrc;
   399     }
   400 
   401     src += 15; dst += 15;  /* adjust for any scalar finishing. */
   402 
   403     /* Finish off any leftovers with scalar operations. */
   404     while (i) {
   405         *dst = (((float) *src) * DIVBY128) - 1.0f;
   406         i--; src--; dst--;
   407     }
   408 
   409     cvt->len_cvt *= 4;
   410     if (cvt->filters[++cvt->filter_index]) {
   411         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   412     }
   413 }
   414 
   415 static void SDLCALL
   416 SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   417 {
   418     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   419     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   420     int i;
   421 
   422     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
   423 
   424     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   425     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   426         *dst = ((float) *src) * DIVBY32768;
   427     }
   428 
   429     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   430     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   431 
   432     /* Make sure src is aligned too. */
   433     if ((((size_t) src) & 15) == 0) {
   434         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   435         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
   436         while (i >= 8) {   /* 8 * 16-bit */
   437             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   438             /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
   439             const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
   440             /* right-shift-sign-extend gets us sint32 with the other set of values. */
   441             const __m128i b = _mm_srai_epi32(ints, 16);
   442             /* Interleave these back into the right order, convert to float, multiply, store. */
   443             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
   444             _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
   445             i -= 8; src -= 8; dst -= 8;
   446         }
   447     }
   448 
   449     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   450 
   451     /* Finish off any leftovers with scalar operations. */
   452     while (i) {
   453         *dst = ((float) *src) * DIVBY32768;
   454         i--; src--; dst--;
   455     }
   456 
   457     cvt->len_cvt *= 2;
   458     if (cvt->filters[++cvt->filter_index]) {
   459         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   460     }
   461 }
   462 
   463 static void SDLCALL
   464 SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   465 {
   466     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   467     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   468     int i;
   469 
   470     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
   471 
   472     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   473     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   474         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   475     }
   476 
   477     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   478     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   479 
   480     /* Make sure src is aligned too. */
   481     if ((((size_t) src) & 15) == 0) {
   482         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   483         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
   484         const __m128 minus1 = _mm_set1_ps(1.0f);
   485         while (i >= 8) {   /* 8 * 16-bit */
   486             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   487             /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
   488             const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
   489             /* right-shift-sign-extend gets us sint32 with the other set of values. */
   490             const __m128i b = _mm_srli_epi32(ints, 16);
   491             /* Interleave these back into the right order, convert to float, multiply, store. */
   492             _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
   493             _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
   494             i -= 8; src -= 8; dst -= 8;
   495         }
   496     }
   497 
   498     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   499 
   500     /* Finish off any leftovers with scalar operations. */
   501     while (i) {
   502         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   503         i--; src--; dst--;
   504     }
   505 
   506     cvt->len_cvt *= 2;
   507     if (cvt->filters[++cvt->filter_index]) {
   508         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   509     }
   510 }
   511 
   512 static void SDLCALL
   513 SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   514 {
   515     const Sint32 *src = (const Sint32 *) cvt->buf;
   516     float *dst = (float *) cvt->buf;
   517     int i;
   518 
   519     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
   520 
   521     /* Get dst aligned to 16 bytes */
   522     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   523         *dst = ((float) (*src>>8)) * DIVBY8388607;
   524     }
   525 
   526     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   527     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   528 
   529     {
   530         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   531         const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
   532         const __m128i *mmsrc = (const __m128i *) src;
   533         while (i >= 4) {   /* 4 * sint32 */
   534             /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
   535             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srli_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
   536             i -= 4; mmsrc++; dst += 4;
   537         }
   538         src = (const Sint32 *) mmsrc;
   539     }
   540 
   541     /* Finish off any leftovers with scalar operations. */
   542     while (i) {
   543         *dst = ((float) (*src>>8)) * DIVBY8388607;
   544         i--; src++; dst++;
   545     }
   546 
   547     if (cvt->filters[++cvt->filter_index]) {
   548         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   549     }
   550 }
   551 
   552 static void SDLCALL
   553 SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   554 {
   555     const float *src = (const float *) cvt->buf;
   556     Sint8 *dst = (Sint8 *) cvt->buf;
   557     int i;
   558 
   559     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
   560 
   561     /* Get dst aligned to 16 bytes */
   562     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   563         *dst = (Sint8) (*src * 127.0f);
   564     }
   565 
   566     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   567 
   568     /* Make sure src is aligned too. */
   569     if ((((size_t) src) & 15) == 0) {
   570         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   571         const __m128 mulby127 = _mm_set1_ps(127.0f);
   572         __m128i *mmdst = (__m128i *) dst;
   573         while (i >= 16) {   /* 16 * float32 */
   574             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby127));  /* load 4 floats, convert to sint32 */
   575             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby127));  /* load 4 floats, convert to sint32 */
   576             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+8), mulby127));  /* load 4 floats, convert to sint32 */
   577             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+12), mulby127));  /* load 4 floats, convert to sint32 */
   578             _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   579             i -= 16; src += 16; mmdst++;
   580         }
   581         dst = (Sint8 *) mmdst;
   582     }
   583 
   584     /* Finish off any leftovers with scalar operations. */
   585     while (i) {
   586         *dst = (Sint8) (*src * 127.0f);
   587         i--; src++; dst++;
   588     }
   589 
   590     cvt->len_cvt /= 4;
   591     if (cvt->filters[++cvt->filter_index]) {
   592         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   593     }
   594 }
   595 
   596 static void SDLCALL
   597 SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   598 {
   599     const float *src = (const float *) cvt->buf;
   600     Uint8 *dst = (Uint8 *) cvt->buf;
   601     int i;
   602 
   603     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
   604 
   605     /* Get dst aligned to 16 bytes */
   606     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   607         *dst = (Uint8) ((*src + 1.0f) * 127.0f);
   608     }
   609 
   610     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   611 
   612     /* Make sure src is aligned too. */
   613     if ((((size_t) src) & 15) == 0) {
   614         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   615         const __m128 add1 = _mm_set1_ps(1.0f);
   616         const __m128 mulby127 = _mm_set1_ps(127.0f);
   617         __m128i *mmdst = (__m128i *) dst;
   618         while (i >= 16) {   /* 16 * float32 */
   619             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src), add1), mulby127));  /* load 4 floats, convert to sint32 */
   620             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+4), add1), mulby127));  /* load 4 floats, convert to sint32 */
   621             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+8), add1), mulby127));  /* load 4 floats, convert to sint32 */
   622             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+12), add1), mulby127));  /* load 4 floats, convert to sint32 */
   623             _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   624             i -= 16; src += 16; mmdst++;
   625         }
   626         dst = (Uint8 *) mmdst;
   627     }
   628 
   629     /* Finish off any leftovers with scalar operations. */
   630     while (i) {
   631         *dst = (Uint8) ((*src + 1.0f) * 127.0f);
   632         i--; src++; dst++;
   633     }
   634 
   635     cvt->len_cvt /= 4;
   636     if (cvt->filters[++cvt->filter_index]) {
   637         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   638     }
   639 }
   640 
   641 static void SDLCALL
   642 SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   643 {
   644     const float *src = (const float *) cvt->buf;
   645     Sint16 *dst = (Sint16 *) cvt->buf;
   646     int i;
   647 
   648     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
   649 
   650     /* Get dst aligned to 16 bytes */
   651     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   652         *dst = (Sint16) (*src * 32767.0f);
   653     }
   654 
   655     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   656 
   657     /* Make sure src is aligned too. */
   658     if ((((size_t) src) & 15) == 0) {
   659         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   660         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   661         __m128i *mmdst = (__m128i *) dst;
   662         while (i >= 8) {   /* 8 * float32 */
   663             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767));  /* load 4 floats, convert to sint32 */
   664             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767));  /* load 4 floats, convert to sint32 */
   665             _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
   666             i -= 8; src += 8; mmdst++;
   667         }
   668         dst = (Sint16 *) mmdst;
   669     }
   670 
   671     /* Finish off any leftovers with scalar operations. */
   672     while (i) {
   673         *dst = (Sint16) (*src * 32767.0f);
   674         i--; src++; dst++;
   675     }
   676 
   677     cvt->len_cvt /= 2;
   678     if (cvt->filters[++cvt->filter_index]) {
   679         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   680     }
   681 }
   682 
   683 static void SDLCALL
   684 SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   685 {
   686     const float *src = (const float *) cvt->buf;
   687     Uint16 *dst = (Uint16 *) cvt->buf;
   688     int i;
   689 
   690     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
   691 
   692     /* Get dst aligned to 16 bytes */
   693     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   694         *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
   695     }
   696 
   697     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   698 
   699     /* Make sure src is aligned too. */
   700     if ((((size_t) src) & 15) == 0) {
   701         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   702         /* This calculates differently than the scalar path because SSE2 can't
   703            pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
   704            saturation, so that would corrupt our data. _mm_packus_epi32 exists,
   705            but not before SSE 4.1. So we convert from float to sint16, packing
   706            that down with legit signed saturation, and then xor the top bit
   707            against 1. This results in the correct unsigned 16-bit value, even
   708            though it looks like dark magic. */
   709         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   710         const __m128i topbit = _mm_set1_epi16(-32768);
   711         __m128i *mmdst = (__m128i *) dst;
   712         while (i >= 8) {   /* 8 * float32 */
   713             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767));  /* load 4 floats, convert to sint32 */
   714             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767));  /* load 4 floats, convert to sint32 */
   715             _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
   716             i -= 8; src += 8; mmdst++;
   717         }
   718         dst = (Uint16 *) mmdst;
   719     }
   720 
   721     /* Finish off any leftovers with scalar operations. */
   722     while (i) {
   723         *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
   724         i--; src++; dst++;
   725     }
   726 
   727     cvt->len_cvt /= 2;
   728     if (cvt->filters[++cvt->filter_index]) {
   729         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   730     }
   731 }
   732 
   733 static void SDLCALL
   734 SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   735 {
   736     const float *src = (const float *) cvt->buf;
   737     Sint32 *dst = (Sint32 *) cvt->buf;
   738     int i;
   739 
   740     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
   741 
   742     /* Get dst aligned to 16 bytes */
   743     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   744         *dst = ((Sint32)(*src * 8388607.0f)) << 8;
   745     }
   746 
   747     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   748     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   749 
   750     {
   751         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   752         const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
   753         __m128i *mmdst = (__m128i *) dst;
   754         while (i >= 4) {   /* 4 * float32 */
   755             _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby8388607)), 8));
   756             i -= 4; src += 4; mmdst++;
   757         }
   758         dst = (Sint32 *) mmdst;
   759     }
   760 
   761     /* Finish off any leftovers with scalar operations. */
   762     while (i) {
   763         *dst = ((Sint32)(*src * 8388607.0f)) << 8;
   764         i--; src++; dst++;
   765     }
   766 
   767     if (cvt->filters[++cvt->filter_index]) {
   768         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   769     }
   770 }
   771 #endif
   772 
   773 
   774 void SDL_ChooseAudioConverters(void)
   775 {
   776     static SDL_bool converters_chosen = SDL_FALSE;
   777 
   778     if (converters_chosen) {
   779         return;
   780     }
   781 
   782 #define SET_CONVERTER_FUNCS(fntype) \
   783         SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
   784         SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
   785         SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
   786         SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
   787         SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
   788         SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
   789         SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
   790         SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
   791         SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
   792         SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
   793         converters_chosen = SDL_TRUE
   794 
   795 #if HAVE_SSE2_INTRINSICS
   796     if (SDL_HasSSE2()) {
   797         SET_CONVERTER_FUNCS(SSE2);
   798         return;
   799     }
   800 #endif
   801 
   802 #if NEED_SCALAR_CONVERTER_FALLBACKS
   803     SET_CONVERTER_FUNCS(Scalar);
   804 #endif
   805 
   806 #undef SET_CONVERTER_FUNCS
   807 
   808     SDL_assert(converters_chosen == SDL_TRUE);
   809 }
   810 
   811 /* vi: set ts=4 sw=4 expandtab: */