src/audio/SDL_audiotypecvt.c
author Ryan C. Gordon <icculus@icculus.org>
Mon, 21 May 2018 11:54:09 -0400
changeset 11993 fdf104726ced
parent 11992 08c415f14810
child 11995 b34d86386ee1
permissions -rw-r--r--
audio: Patched to compile on Visual Studio.

(It gets upset at the -2147483648, thinking this should be an unsigned value
because 2147483648 is too large for an int32, so the negative sign upsets the
compiler.)
     1 /*
     2   Simple DirectMedia Layer
     3   Copyright (C) 1997-2018 Sam Lantinga <slouken@libsdl.org>
     4 
     5   This software is provided 'as-is', without any express or implied
     6   warranty.  In no event will the authors be held liable for any damages
     7   arising from the use of this software.
     8 
     9   Permission is granted to anyone to use this software for any purpose,
    10   including commercial applications, and to alter it and redistribute it
    11   freely, subject to the following restrictions:
    12 
    13   1. The origin of this software must not be misrepresented; you must not
    14      claim that you wrote the original software. If you use this software
    15      in a product, an acknowledgment in the product documentation would be
    16      appreciated but is not required.
    17   2. Altered source versions must be plainly marked as such, and must not be
    18      misrepresented as being the original software.
    19   3. This notice may not be removed or altered from any source distribution.
    20 */
    21 
    22 #include "../SDL_internal.h"
    23 #include "SDL_audio.h"
    24 #include "SDL_audio_c.h"
    25 #include "SDL_cpuinfo.h"
    26 #include "SDL_assert.h"
    27 
/* Record which SIMD intrinsic families the compiler is targeting. */
#ifdef __ARM_NEON__
#define HAVE_NEON_INTRINSICS 1
#endif

#ifdef __SSE2__
#define HAVE_SSE2_INTRINSICS 1
#endif

/* On targets where a SIMD path is always available, define
   NEED_SCALAR_CONVERTER_FALLBACKS to 0 so the scalar converters are
   compiled out. An undefined HAVE_*_INTRINSICS macro evaluates to 0 in
   these preprocessor expressions. */
#if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
#elif __MACOSX__ && HAVE_SSE2_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
#elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
#endif

/* Set to zero if platform is guaranteed to use a SIMD codepath here.
   If none of the cases above matched, default to building the scalar code. */
#ifndef NEED_SCALAR_CONVERTER_FALLBACKS
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif
    50 
/* Function pointers set to a CPU-specific implementation.
   One pointer per supported conversion to or from float32. All of them
   start out NULL; they are assigned outside this section of the file. */
SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
    62 
    63 
/* Reciprocal scale factors, so integer samples can be normalized with a
   multiply instead of a divide. */
#define DIVBY128 0.0078125f  /* 1.0 / 128 */
#define DIVBY32768 0.000030517578125f  /* 1.0 / 32768 */
#define DIVBY8388607 0.00000011920930376163766f  /* 1.0 / 8388607 (applied to 32-bit samples after they are shifted right by 8) */
    67 
    68 
    69 #if NEED_SCALAR_CONVERTER_FALLBACKS
    70 static void SDLCALL
    71 SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    72 {
    73     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    74     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    75     int i;
    76 
    77     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
    78 
    79     for (i = cvt->len_cvt; i; --i, --src, --dst) {
    80         *dst = ((float) *src) * DIVBY128;
    81     }
    82 
    83     cvt->len_cvt *= 4;
    84     if (cvt->filters[++cvt->filter_index]) {
    85         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    86     }
    87 }
    88 
    89 static void SDLCALL
    90 SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    91 {
    92     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    93     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    94     int i;
    95 
    96     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
    97 
    98     for (i = cvt->len_cvt; i; --i, --src, --dst) {
    99         *dst = (((float) *src) * DIVBY128) - 1.0f;
   100     }
   101 
   102     cvt->len_cvt *= 4;
   103     if (cvt->filters[++cvt->filter_index]) {
   104         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   105     }
   106 }
   107 
   108 static void SDLCALL
   109 SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   110 {
   111     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   112     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   113     int i;
   114 
   115     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
   116 
   117     for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
   118         *dst = ((float) *src) * DIVBY32768;
   119     }
   120 
   121     cvt->len_cvt *= 2;
   122     if (cvt->filters[++cvt->filter_index]) {
   123         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   124     }
   125 }
   126 
   127 static void SDLCALL
   128 SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   129 {
   130     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   131     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   132     int i;
   133 
   134     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
   135 
   136     for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
   137         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   138     }
   139 
   140     cvt->len_cvt *= 2;
   141     if (cvt->filters[++cvt->filter_index]) {
   142         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   143     }
   144 }
   145 
   146 static void SDLCALL
   147 SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   148 {
   149     const Sint32 *src = (const Sint32 *) cvt->buf;
   150     float *dst = (float *) cvt->buf;
   151     int i;
   152 
   153     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
   154 
   155     for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
   156         *dst = ((float) (*src>>8)) * DIVBY8388607;
   157     }
   158 
   159     if (cvt->filters[++cvt->filter_index]) {
   160         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   161     }
   162 }
   163 
   164 static void SDLCALL
   165 SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   166 {
   167     const float *src = (const float *) cvt->buf;
   168     Sint8 *dst = (Sint8 *) cvt->buf;
   169     int i;
   170 
   171     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
   172 
   173     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   174         const float sample = *src;
   175         if (sample >= 1.0f) {
   176             *dst = 127;
   177         } else if (sample <= -1.0f) {
   178             *dst = -128;
   179         } else {
   180             *dst = (Sint8)(sample * 127.0f);
   181         }
   182     }
   183 
   184     cvt->len_cvt /= 4;
   185     if (cvt->filters[++cvt->filter_index]) {
   186         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   187     }
   188 }
   189 
   190 static void SDLCALL
   191 SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   192 {
   193     const float *src = (const float *) cvt->buf;
   194     Uint8 *dst = (Uint8 *) cvt->buf;
   195     int i;
   196 
   197     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
   198 
   199     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   200         const float sample = *src;
   201         if (sample >= 1.0f) {
   202             *dst = 255;
   203         } else if (sample <= -1.0f) {
   204             *dst = 0;
   205         } else {
   206             *dst = (Uint8)((sample + 1.0f) * 127.0f);
   207         }
   208     }
   209 
   210     cvt->len_cvt /= 4;
   211     if (cvt->filters[++cvt->filter_index]) {
   212         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   213     }
   214 }
   215 
   216 static void SDLCALL
   217 SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   218 {
   219     const float *src = (const float *) cvt->buf;
   220     Sint16 *dst = (Sint16 *) cvt->buf;
   221     int i;
   222 
   223     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
   224 
   225     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   226         const float sample = *src;
   227         if (sample >= 1.0f) {
   228             *dst = 32767;
   229         } else if (sample <= -1.0f) {
   230             *dst = -32768;
   231         } else {
   232             *dst = (Sint16)(sample * 32767.0f);
   233         }
   234     }
   235 
   236     cvt->len_cvt /= 2;
   237     if (cvt->filters[++cvt->filter_index]) {
   238         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   239     }
   240 }
   241 
   242 static void SDLCALL
   243 SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   244 {
   245     const float *src = (const float *) cvt->buf;
   246     Uint16 *dst = (Uint16 *) cvt->buf;
   247     int i;
   248 
   249     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
   250 
   251     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   252         const float sample = *src;
   253         if (sample >= 1.0f) {
   254             *dst = 65535;
   255         } else if (sample <= -1.0f) {
   256             *dst = 0;
   257         } else {
   258             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   259         }
   260     }
   261 
   262     cvt->len_cvt /= 2;
   263     if (cvt->filters[++cvt->filter_index]) {
   264         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   265     }
   266 }
   267 
   268 static void SDLCALL
   269 SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   270 {
   271     const float *src = (const float *) cvt->buf;
   272     Sint32 *dst = (Sint32 *) cvt->buf;
   273     int i;
   274 
   275     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
   276 
   277     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   278         const float sample = *src;
   279         if (sample >= 1.0f) {
   280             *dst = 2147483647;
   281         } else if (sample <= -1.0f) {
   282             *dst = (Sint32) -2147483648LL;
   283         } else {
   284             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
   285         }
   286     }
   287 
   288     if (cvt->filters[++cvt->filter_index]) {
   289         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   290     }
   291 }
   292 #endif
   293 
   294 
   295 #if HAVE_SSE2_INTRINSICS
/* SSE2 path: expand signed 8-bit samples in cvt->buf to float32, in place.
   The output is four times the size of the input, so the buffer is walked
   from its last sample toward its first. A scalar prologue aligns the
   destination, an SSE2 loop converts 16 samples per iteration when the
   source is also 16-byte aligned, and a scalar epilogue handles whatever
   is left. Afterwards len_cvt is updated and the next filter, if any,
   is run. */
static void SDLCALL
SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;  /* last input sample */
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;  /* last output sample */
    int i;

    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");

    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    /* The test is on dst-15 because a 16-sample block ending at dst starts 15 floats earlier. */
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
        *dst = ((float) *src) * DIVBY128;
    }

    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        const __m128i *mmsrc = (const __m128i *) src;
        const __m128i zero = _mm_setzero_si128();
        const __m128 divby128 = _mm_set1_ps(DIVBY128);
        while (i >= 16) {   /* 16 * 8-bit */
            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
            /* treat as int16, shift left to drop the upper byte of each lane, then back right with sign-extend. Now sint16 (even-indexed samples). */
            const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
            /* right-shift-sign-extend gets us sint16 with the other set of values (odd-indexed samples). */
            const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
            /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
            const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
            const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
            const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
            const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
            /* Interleave back into correct order, store. */
            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
            i -= 16; mmsrc--; dst -= 16;  /* step one block backwards. */
        }

        src = (const Sint8 *) mmsrc;
    }

    src += 15; dst += 15;  /* adjust for any scalar finishing. */

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = ((float) *src) * DIVBY128;
        i--; src--; dst--;
    }

    cvt->len_cvt *= 4;
    /* Run the next filter in the chain, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
   354 
/* SSE2 path: expand unsigned 8-bit samples in cvt->buf to float32, in place.
   Each sample is scaled by 1/128 and then shifted down by 1.0. The output
   is four times the size of the input, so the buffer is walked from its
   last sample toward its first. A scalar prologue aligns the destination,
   an SSE2 loop converts 16 samples per iteration when the source is also
   16-byte aligned, and a scalar epilogue handles whatever is left.
   Afterwards len_cvt is updated and the next filter, if any, is run. */
static void SDLCALL
SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;  /* last input sample */
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;  /* last output sample */
    int i;

    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");

    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    /* The test is on dst-15 because a 16-sample block ending at dst starts 15 floats earlier. */
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
        *dst = (((float) *src) * DIVBY128) - 1.0f;
    }

    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        const __m128i *mmsrc = (const __m128i *) src;
        const __m128i zero = _mm_setzero_si128();
        const __m128 divby128 = _mm_set1_ps(DIVBY128);
        const __m128 minus1 = _mm_set1_ps(-1.0f);
        while (i >= 16) {   /* 16 * 8-bit */
            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
            /* treat as int16, shift left to drop the upper byte of each lane, then back right with zero-extend. Now uint16 (even-indexed samples). */
            const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
            /* right-shift-zero-extend gets us uint16 with the other set of values (odd-indexed samples). */
            const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
            /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
            /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
            const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
            const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
            const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
            const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
            /* Interleave back into correct order, store. */
            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
            i -= 16; mmsrc--; dst -= 16;  /* step one block backwards. */
        }

        src = (const Uint8 *) mmsrc;
    }

    src += 15; dst += 15;  /* adjust for any scalar finishing. */

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = (((float) *src) * DIVBY128) - 1.0f;
        i--; src--; dst--;
    }

    cvt->len_cvt *= 4;
    /* Run the next filter in the chain, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
   415 
/* SSE2 path: expand signed 16-bit samples in cvt->buf to float32, in place.
   Each sample is scaled by 1/32768. The output is twice the size of the
   input, so the buffer is walked from its last sample toward its first.
   A scalar prologue aligns the destination, an SSE2 loop converts 8
   samples per iteration when the source is also 16-byte aligned, and a
   scalar epilogue handles whatever is left. Afterwards len_cvt is updated
   and the next filter, if any, is run. */
static void SDLCALL
SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;  /* last input sample */
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;  /* last output sample */
    int i;

    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");

    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    /* The test is on dst-7 because an 8-sample block ending at dst starts 7 floats earlier. */
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
        *dst = ((float) *src) * DIVBY32768;
    }

    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
        while (i >= 8) {   /* 8 * 16-bit */
            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
            /* treat as int32, shift left to drop the upper sint16 of each lane, then back right with sign-extend. Now sint32 (even-indexed samples). */
            const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
            /* right-shift-sign-extend gets us sint32 with the other set of values (odd-indexed samples). */
            const __m128i b = _mm_srai_epi32(ints, 16);
            /* Interleave these back into the right order, convert to float, multiply, store. */
            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
            _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
            i -= 8; src -= 8; dst -= 8;  /* step one block backwards. */
        }
    }

    src += 7; dst += 7;  /* adjust for any scalar finishing. */

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = ((float) *src) * DIVBY32768;
        i--; src--; dst--;
    }

    cvt->len_cvt *= 2;
    /* Run the next filter in the chain, if there is one. */
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
   463 
   464 static void SDLCALL
   465 SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   466 {
   467     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   468     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   469     int i;
   470 
   471     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
   472 
   473     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   474     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   475         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   476     }
   477 
   478     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   479     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   480 
   481     /* Make sure src is aligned too. */
   482     if ((((size_t) src) & 15) == 0) {
   483         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   484         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
   485         const __m128 minus1 = _mm_set1_ps(1.0f);
   486         while (i >= 8) {   /* 8 * 16-bit */
   487             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   488             /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
   489             const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
   490             /* right-shift-sign-extend gets us sint32 with the other set of values. */
   491             const __m128i b = _mm_srli_epi32(ints, 16);
   492             /* Interleave these back into the right order, convert to float, multiply, store. */
   493             _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
   494             _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
   495             i -= 8; src -= 8; dst -= 8;
   496         }
   497     }
   498 
   499     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   500 
   501     /* Finish off any leftovers with scalar operations. */
   502     while (i) {
   503         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   504         i--; src--; dst--;
   505     }
   506 
   507     cvt->len_cvt *= 2;
   508     if (cvt->filters[++cvt->filter_index]) {
   509         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   510     }
   511 }
   512 
   513 static void SDLCALL
   514 SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   515 {
   516     const Sint32 *src = (const Sint32 *) cvt->buf;
   517     float *dst = (float *) cvt->buf;
   518     int i;
   519 
   520     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
   521 
   522     /* Get dst aligned to 16 bytes */
   523     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   524         *dst = ((float) (*src>>8)) * DIVBY8388607;
   525     }
   526 
   527     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   528     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   529 
   530     {
   531         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   532         const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
   533         const __m128i *mmsrc = (const __m128i *) src;
   534         while (i >= 4) {   /* 4 * sint32 */
   535             /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
   536             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srli_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
   537             i -= 4; mmsrc++; dst += 4;
   538         }
   539         src = (const Sint32 *) mmsrc;
   540     }
   541 
   542     /* Finish off any leftovers with scalar operations. */
   543     while (i) {
   544         *dst = ((float) (*src>>8)) * DIVBY8388607;
   545         i--; src++; dst++;
   546     }
   547 
   548     if (cvt->filters[++cvt->filter_index]) {
   549         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   550     }
   551 }
   552 
   553 static void SDLCALL
   554 SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   555 {
   556     const float *src = (const float *) cvt->buf;
   557     Sint8 *dst = (Sint8 *) cvt->buf;
   558     int i;
   559 
   560     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
   561 
   562     /* Get dst aligned to 16 bytes */
   563     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   564         const float sample = *src;
   565         if (sample >= 1.0f) {
   566             *dst = 127;
   567         } else if (sample <= -1.0f) {
   568             *dst = -128;
   569         } else {
   570             *dst = (Sint8)(sample * 127.0f);
   571         }
   572     }
   573 
   574     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   575 
   576     /* Make sure src is aligned too. */
   577     if ((((size_t) src) & 15) == 0) {
   578         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   579         const __m128 one = _mm_set1_ps(1.0f);
   580         const __m128 negone = _mm_set1_ps(-1.0f);
   581         const __m128 mulby127 = _mm_set1_ps(127.0f);
   582         __m128i *mmdst = (__m128i *) dst;
   583         while (i >= 16) {   /* 16 * float32 */
   584             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   585             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   586             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   587             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   588             _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   589             i -= 16; src += 16; mmdst++;
   590         }
   591         dst = (Sint8 *) mmdst;
   592     }
   593 
   594     /* Finish off any leftovers with scalar operations. */
   595     while (i) {
   596         const float sample = *src;
   597         if (sample >= 1.0f) {
   598             *dst = 127;
   599         } else if (sample <= -1.0f) {
   600             *dst = -128;
   601         } else {
   602             *dst = (Sint8)(sample * 127.0f);
   603         }
   604         i--; src++; dst++;
   605     }
   606 
   607     cvt->len_cvt /= 4;
   608     if (cvt->filters[++cvt->filter_index]) {
   609         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   610     }
   611 }
   612 
   613 static void SDLCALL
   614 SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   615 {
   616     const float *src = (const float *) cvt->buf;
   617     Uint8 *dst = (Uint8 *) cvt->buf;
   618     int i;
   619 
   620     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
   621 
   622     /* Get dst aligned to 16 bytes */
   623     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   624         const float sample = *src;
   625         if (sample >= 1.0f) {
   626             *dst = 255;
   627         } else if (sample <= -1.0f) {
   628             *dst = 0;
   629         } else {
   630             *dst = (Uint8)((sample + 1.0f) * 127.0f);
   631         }
   632     }
   633 
   634     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   635 
   636     /* Make sure src is aligned too. */
   637     if ((((size_t) src) & 15) == 0) {
   638         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   639         const __m128 one = _mm_set1_ps(1.0f);
   640         const __m128 negone = _mm_set1_ps(-1.0f);
   641         const __m128 mulby127 = _mm_set1_ps(127.0f);
   642         __m128i *mmdst = (__m128i *) dst;
   643         while (i >= 16) {   /* 16 * float32 */
   644             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   645             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   646             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   647             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   648             _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   649             i -= 16; src += 16; mmdst++;
   650         }
   651         dst = (Uint8 *) mmdst;
   652     }
   653 
   654     /* Finish off any leftovers with scalar operations. */
   655     while (i) {
   656         const float sample = *src;
   657         if (sample >= 1.0f) {
   658             *dst = 255;
   659         } else if (sample <= -1.0f) {
   660             *dst = 0;
   661         } else {
   662             *dst = (Uint8)((sample + 1.0f) * 127.0f);
   663         }
   664         i--; src++; dst++;
   665     }
   666 
   667     cvt->len_cvt /= 4;
   668     if (cvt->filters[++cvt->filter_index]) {
   669         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   670     }
   671 }
   672 
   673 static void SDLCALL
   674 SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   675 {
   676     const float *src = (const float *) cvt->buf;
   677     Sint16 *dst = (Sint16 *) cvt->buf;
   678     int i;
   679 
   680     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
   681 
   682     /* Get dst aligned to 16 bytes */
   683     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   684         const float sample = *src;
   685         if (sample >= 1.0f) {
   686             *dst = 32767;
   687         } else if (sample <= -1.0f) {
   688             *dst = -32768;
   689         } else {
   690             *dst = (Sint16)(sample * 32767.0f);
   691         }
   692     }
   693 
   694     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   695 
   696     /* Make sure src is aligned too. */
   697     if ((((size_t) src) & 15) == 0) {
   698         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   699         const __m128 one = _mm_set1_ps(1.0f);
   700         const __m128 negone = _mm_set1_ps(-1.0f);
   701         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   702         __m128i *mmdst = (__m128i *) dst;
   703         while (i >= 8) {   /* 8 * float32 */
   704             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
   705             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
   706             _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
   707             i -= 8; src += 8; mmdst++;
   708         }
   709         dst = (Sint16 *) mmdst;
   710     }
   711 
   712     /* Finish off any leftovers with scalar operations. */
   713     while (i) {
   714         const float sample = *src;
   715         if (sample >= 1.0f) {
   716             *dst = 32767;
   717         } else if (sample <= -1.0f) {
   718             *dst = -32768;
   719         } else {
   720             *dst = (Sint16)(sample * 32767.0f);
   721         }
   722         i--; src++; dst++;
   723     }
   724 
   725     cvt->len_cvt /= 2;
   726     if (cvt->filters[++cvt->filter_index]) {
   727         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   728     }
   729 }
   730 
   731 static void SDLCALL
   732 SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   733 {
   734     const float *src = (const float *) cvt->buf;
   735     Uint16 *dst = (Uint16 *) cvt->buf;
   736     int i;
   737 
   738     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
   739 
   740     /* Get dst aligned to 16 bytes */
   741     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   742         const float sample = *src;
   743         if (sample >= 1.0f) {
   744             *dst = 65535;
   745         } else if (sample <= -1.0f) {
   746             *dst = 0;
   747         } else {
   748             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   749         }
   750     }
   751 
   752     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   753 
   754     /* Make sure src is aligned too. */
   755     if ((((size_t) src) & 15) == 0) {
   756         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   757         /* This calculates differently than the scalar path because SSE2 can't
   758            pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
   759            saturation, so that would corrupt our data. _mm_packus_epi32 exists,
   760            but not before SSE 4.1. So we convert from float to sint16, packing
   761            that down with legit signed saturation, and then xor the top bit
   762            against 1. This results in the correct unsigned 16-bit value, even
   763            though it looks like dark magic. */
   764         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   765         const __m128i topbit = _mm_set1_epi16(-32768);
   766         const __m128 one = _mm_set1_ps(1.0f);
   767         const __m128 negone = _mm_set1_ps(-1.0f);
   768         __m128i *mmdst = (__m128i *) dst;
   769         while (i >= 8) {   /* 8 * float32 */
   770             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
   771             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
   772             _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
   773             i -= 8; src += 8; mmdst++;
   774         }
   775         dst = (Uint16 *) mmdst;
   776     }
   777 
   778     /* Finish off any leftovers with scalar operations. */
   779     while (i) {
   780         const float sample = *src;
   781         if (sample >= 1.0f) {
   782             *dst = 65535;
   783         } else if (sample <= -1.0f) {
   784             *dst = 0;
   785         } else {
   786             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   787         }
   788         i--; src++; dst++;
   789     }
   790 
   791     cvt->len_cvt /= 2;
   792     if (cvt->filters[++cvt->filter_index]) {
   793         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   794     }
   795 }
   796 
   797 static void SDLCALL
   798 SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   799 {
   800     const float *src = (const float *) cvt->buf;
   801     Sint32 *dst = (Sint32 *) cvt->buf;
   802     int i;
   803 
   804     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
   805 
   806     /* Get dst aligned to 16 bytes */
   807     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   808         const float sample = *src;
   809         if (sample >= 1.0f) {
   810             *dst = 2147483647;
   811         } else if (sample <= -1.0f) {
   812             *dst = -2147483648;
   813         } else {
   814             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
   815         }
   816     }
   817 
   818     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   819     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   820 
   821     {
   822         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   823         const __m128 one = _mm_set1_ps(1.0f);
   824         const __m128 negone = _mm_set1_ps(-1.0f);
   825         const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
   826         __m128i *mmdst = (__m128i *) dst;
   827         while (i >= 4) {   /* 4 * float32 */
   828             _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8));  /* load 4 floats, clamp, convert to sint32 */
   829             i -= 4; src += 4; mmdst++;
   830         }
   831         dst = (Sint32 *) mmdst;
   832     }
   833 
   834     /* Finish off any leftovers with scalar operations. */
   835     while (i) {
   836         const float sample = *src;
   837         if (sample >= 1.0f) {
   838             *dst = 2147483647;
   839         } else if (sample <= -1.0f) {
   840             *dst = -2147483648;
   841         } else {
   842             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
   843         }
   844         i--; src++; dst++;
   845     }
   846 
   847     if (cvt->filters[++cvt->filter_index]) {
   848         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   849     }
   850 }
   851 #endif
   852 
   853 
   854 #if HAVE_NEON_INTRINSICS
   855 static void SDLCALL
   856 SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   857 {
   858     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   859     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   860     int i;
   861 
   862     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using NEON)");
   863 
   864     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   865     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   866         *dst = ((float) *src) * DIVBY128;
   867     }
   868 
   869     src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
   870     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   871 
   872     /* Make sure src is aligned too. */
   873     if ((((size_t) src) & 15) == 0) {
   874         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   875         const int8_t *mmsrc = (const int8_t *) src;
   876         const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
   877         while (i >= 16) {   /* 16 * 8-bit */
   878             const int8x16_t bytes = vld1q_s8(mmsrc);  /* get 16 sint8 into a NEON register. */
   879             const int16x8_t int16hi = vmovl_s8(vget_high_s8(bytes));  /* convert top 8 bytes to 8 int16 */
   880             const int16x8_t int16lo = vmovl_s8(vget_low_s8(bytes));   /* convert bottom 8 bytes to 8 int16 */
   881             /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
   882             vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16hi))), divby128));
   883             vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16hi))), divby128));
   884             vst1q_f32(dst+8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16lo))), divby128));
   885             vst1q_f32(dst+12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16lo))), divby128));
   886             i -= 16; mmsrc -= 16; dst -= 16;
   887         }
   888 
   889         src = (const Sint8 *) mmsrc;
   890     }
   891 
   892     src += 15; dst += 15;  /* adjust for any scalar finishing. */
   893 
   894     /* Finish off any leftovers with scalar operations. */
   895     while (i) {
   896         *dst = ((float) *src) * DIVBY128;
   897         i--; src--; dst--;
   898     }
   899 
   900     cvt->len_cvt *= 4;
   901     if (cvt->filters[++cvt->filter_index]) {
   902         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   903     }
   904 }
   905 
   906 static void SDLCALL
   907 SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   908 {
   909     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   910     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   911     int i;
   912 
   913     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using NEON)");
   914 
   915     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   916     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   917         *dst = (((float) *src) * DIVBY128) - 1.0f;
   918     }
   919 
   920     src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
   921     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   922 
   923     /* Make sure src is aligned too. */
   924     if ((((size_t) src) & 15) == 0) {
   925         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   926         const uint8_t *mmsrc = (const uint8_t *) src;
   927         const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
   928         const float32x4_t one = vdupq_n_f32(1.0f);
   929         while (i >= 16) {   /* 16 * 8-bit */
   930             const uint8x16_t bytes = vld1q_u8(mmsrc);  /* get 16 uint8 into a NEON register. */
   931             const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes));  /* convert top 8 bytes to 8 uint16 */
   932             const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes));   /* convert bottom 8 bytes to 8 uint16 */
   933             /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
   934             vst1q_f32(dst, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128, one));
   935             vst1q_f32(dst+4, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128, one));
   936             vst1q_f32(dst+8, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128, one));
   937             vst1q_f32(dst+12, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128, one));
   938             i -= 16; mmsrc -= 16; dst -= 16;
   939         }
   940 
   941         src = (const Uint8 *) mmsrc;
   942     }
   943 
   944     src += 15; dst += 15;  /* adjust for any scalar finishing. */
   945 
   946     /* Finish off any leftovers with scalar operations. */
   947     while (i) {
   948         *dst = (((float) *src) * DIVBY128) - 1.0f;
   949         i--; src--; dst--;
   950     }
   951 
   952     cvt->len_cvt *= 4;
   953     if (cvt->filters[++cvt->filter_index]) {
   954         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   955     }
   956 }
   957 
   958 static void SDLCALL
   959 SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   960 {
   961     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   962     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   963     int i;
   964 
   965     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using NEON)");
   966 
   967     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   968     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   969         *dst = ((float) *src) * DIVBY32768;
   970     }
   971 
   972     src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
   973     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   974 
   975     /* Make sure src is aligned too. */
   976     if ((((size_t) src) & 15) == 0) {
   977         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   978         const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
   979         while (i >= 8) {   /* 8 * 16-bit */
   980             const int16x8_t ints = vld1q_s16((int16_t const *) src);  /* get 8 sint16 into a NEON register. */
   981             /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
   982             vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
   983             vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
   984             i -= 8; src -= 8; dst -= 8;
   985         }
   986     }
   987 
   988     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   989 
   990     /* Finish off any leftovers with scalar operations. */
   991     while (i) {
   992         *dst = ((float) *src) * DIVBY32768;
   993         i--; src--; dst--;
   994     }
   995 
   996     cvt->len_cvt *= 2;
   997     if (cvt->filters[++cvt->filter_index]) {
   998         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   999     }
  1000 }
  1001 
  1002 static void SDLCALL
  1003 SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1004 {
  1005     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
  1006     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
  1007     int i;
  1008 
  1009     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using NEON)");
  1010 
  1011     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
  1012     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
  1013         *dst = (((float) *src) * DIVBY32768) - 1.0f;
  1014     }
  1015 
  1016     src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
  1017     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1018 
  1019     /* Make sure src is aligned too. */
  1020     if ((((size_t) src) & 15) == 0) {
  1021         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1022         const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
  1023         const float32x4_t one = vdupq_n_f32(1.0f);
  1024         while (i >= 8) {   /* 8 * 16-bit */
  1025             const uint16x8_t uints = vld1q_u16((uint16_t const *) src);  /* get 8 uint16 into a NEON register. */
  1026             /* split uint16 to two int32, then convert to float, then multiply to normalize, subtract for sign, store. */
  1027             vst1q_f32(dst, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768));
  1028             vst1q_f32(dst+4, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768));
  1029             i -= 8; src -= 8; dst -= 8;
  1030         }
  1031     }
  1032 
  1033     src += 7; dst += 7;  /* adjust for any scalar finishing. */
  1034 
  1035     /* Finish off any leftovers with scalar operations. */
  1036     while (i) {
  1037         *dst = (((float) *src) * DIVBY32768) - 1.0f;
  1038         i--; src--; dst--;
  1039     }
  1040 
  1041     cvt->len_cvt *= 2;
  1042     if (cvt->filters[++cvt->filter_index]) {
  1043         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
  1044     }
  1045 }
  1046 
  1047 static void SDLCALL
  1048 SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1049 {
  1050     const Sint32 *src = (const Sint32 *) cvt->buf;
  1051     float *dst = (float *) cvt->buf;
  1052     int i;
  1053 
  1054     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using NEON)");
  1055 
  1056     /* Get dst aligned to 16 bytes */
  1057     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
  1058         *dst = ((float) (*src>>8)) * DIVBY8388607;
  1059     }
  1060 
  1061     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1062     SDL_assert(!i || ((((size_t) src) & 15) == 0));
  1063 
  1064     {
  1065         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1066         const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
  1067         const int32_t *mmsrc = (const int32_t *) src;
  1068         while (i >= 4) {   /* 4 * sint32 */
  1069             /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
  1070             vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
  1071             i -= 4; mmsrc += 4; dst += 4;
  1072         }
  1073         src = (const Sint32 *) mmsrc;
  1074     }
  1075 
  1076     /* Finish off any leftovers with scalar operations. */
  1077     while (i) {
  1078         *dst = ((float) (*src>>8)) * DIVBY8388607;
  1079         i--; src++; dst++;
  1080     }
  1081 
  1082     if (cvt->filters[++cvt->filter_index]) {
  1083         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
  1084     }
  1085 }
  1086 
  1087 static void SDLCALL
  1088 SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1089 {
  1090     const float *src = (const float *) cvt->buf;
  1091     Sint8 *dst = (Sint8 *) cvt->buf;
  1092     int i;
  1093 
  1094     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using NEON)");
  1095 
  1096     /* Get dst aligned to 16 bytes */
  1097     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
  1098         const float sample = *src;
  1099         if (sample >= 1.0f) {
  1100             *dst = 127;
  1101         } else if (sample <= -1.0f) {
  1102             *dst = -128;
  1103         } else {
  1104             *dst = (Sint8)(sample * 127.0f);
  1105         }
  1106     }
  1107 
  1108     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1109 
  1110     /* Make sure src is aligned too. */
  1111     if ((((size_t) src) & 15) == 0) {
  1112         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1113         const float32x4_t one = vdupq_n_f32(1.0f);
  1114         const float32x4_t negone = vdupq_n_f32(-1.0f);
  1115         const float32x4_t mulby127 = vdupq_n_f32(127.0f);
  1116         int8_t *mmdst = (int8_t *) dst;
  1117         while (i >= 16) {   /* 16 * float32 */
  1118             const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
  1119             const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
  1120             const int32x4_t ints3 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
  1121             const int32x4_t ints4 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
  1122             const int8x8_t i8lo = vmovn_s16(vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, narrow to sint8 */
  1123             const int8x8_t i8hi = vmovn_s16(vcombine_s16(vmovn_s32(ints3), vmovn_s32(ints4))); /* narrow to sint16, combine, narrow to sint8 */
  1124             vst1q_s8(mmdst, vcombine_s8(i8lo, i8hi));  /* combine to int8x16_t, store out */
  1125             i -= 16; src += 16; mmdst += 16;
  1126         }
  1127         dst = (Sint8 *) mmdst;
  1128     }
  1129 
  1130     /* Finish off any leftovers with scalar operations. */
  1131     while (i) {
  1132         const float sample = *src;
  1133         if (sample >= 1.0f) {
  1134             *dst = 127;
  1135         } else if (sample <= -1.0f) {
  1136             *dst = -128;
  1137         } else {
  1138             *dst = (Sint8)(sample * 127.0f);
  1139         }
  1140         i--; src++; dst++;
  1141     }
  1142 
  1143     cvt->len_cvt /= 4;
  1144     if (cvt->filters[++cvt->filter_index]) {
  1145         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
  1146     }
  1147 }
  1148 
  1149 static void SDLCALL
  1150 SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1151 {
  1152     const float *src = (const float *) cvt->buf;
  1153     Uint8 *dst = (Uint8 *) cvt->buf;
  1154     int i;
  1155 
  1156     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using NEON)");
  1157 
  1158     /* Get dst aligned to 16 bytes */
  1159     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
  1160         const float sample = *src;
  1161         if (sample >= 1.0f) {
  1162             *dst = 255;
  1163         } else if (sample <= -1.0f) {
  1164             *dst = 0;
  1165         } else {
  1166             *dst = (Uint8)((sample + 1.0f) * 127.0f);
  1167         }
  1168     }
  1169 
  1170     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1171 
  1172     /* Make sure src is aligned too. */
  1173     if ((((size_t) src) & 15) == 0) {
  1174         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1175         const float32x4_t one = vdupq_n_f32(1.0f);
  1176         const float32x4_t negone = vdupq_n_f32(-1.0f);
  1177         const float32x4_t mulby127 = vdupq_n_f32(127.0f);
  1178         uint8_t *mmdst = (uint8_t *) dst;
  1179         while (i >= 16) {   /* 16 * float32 */
  1180             const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
  1181             const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
  1182             const uint32x4_t uints3 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
  1183             const uint32x4_t uints4 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
  1184             const uint8x8_t ui8lo = vmovn_u16(vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, narrow to uint8 */
  1185             const uint8x8_t ui8hi = vmovn_u16(vcombine_u16(vmovn_u32(uints3), vmovn_u32(uints4))); /* narrow to uint16, combine, narrow to uint8 */
  1186             vst1q_u8(mmdst, vcombine_u8(ui8lo, ui8hi));  /* combine to uint8x16_t, store out */
  1187             i -= 16; src += 16; mmdst += 16;
  1188         }
  1189 
  1190         dst = (Uint8 *) mmdst;
  1191     }
  1192 
  1193     /* Finish off any leftovers with scalar operations. */
  1194     while (i) {
  1195         const float sample = *src;
  1196         if (sample >= 1.0f) {
  1197             *dst = 255;
  1198         } else if (sample <= -1.0f) {
  1199             *dst = 0;
  1200         } else {
  1201             *dst = (Uint8)((sample + 1.0f) * 127.0f);
  1202         }
  1203         i--; src++; dst++;
  1204     }
  1205 
  1206     cvt->len_cvt /= 4;
  1207     if (cvt->filters[++cvt->filter_index]) {
  1208         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
  1209     }
  1210 }
  1211 
  1212 static void SDLCALL
  1213 SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1214 {
  1215     const float *src = (const float *) cvt->buf;
  1216     Sint16 *dst = (Sint16 *) cvt->buf;
  1217     int i;
  1218 
  1219     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using NEON)");
  1220 
  1221     /* Get dst aligned to 16 bytes */
  1222     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
  1223         const float sample = *src;
  1224         if (sample >= 1.0f) {
  1225             *dst = 32767;
  1226         } else if (sample <= -1.0f) {
  1227             *dst = -32768;
  1228         } else {
  1229             *dst = (Sint16)(sample * 32767.0f);
  1230         }
  1231     }
  1232 
  1233     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1234 
  1235     /* Make sure src is aligned too. */
  1236     if ((((size_t) src) & 15) == 0) {
  1237         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1238         const float32x4_t one = vdupq_n_f32(1.0f);
  1239         const float32x4_t negone = vdupq_n_f32(-1.0f);
  1240         const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
  1241         int16_t *mmdst = (int16_t *) dst;
  1242         while (i >= 8) {   /* 8 * float32 */
  1243             const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
  1244             const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
  1245             vst1q_s16(mmdst, vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2)));  /* narrow to sint16, combine, store out. */
  1246             i -= 8; src += 8; mmdst += 8;
  1247         }
  1248         dst = (Sint16 *) mmdst;
  1249     }
  1250 
  1251     /* Finish off any leftovers with scalar operations. */
  1252     while (i) {
  1253         const float sample = *src;
  1254         if (sample >= 1.0f) {
  1255             *dst = 32767;
  1256         } else if (sample <= -1.0f) {
  1257             *dst = -32768;
  1258         } else {
  1259             *dst = (Sint16)(sample * 32767.0f);
  1260         }
  1261         i--; src++; dst++;
  1262     }
  1263 
  1264     cvt->len_cvt /= 2;
  1265     if (cvt->filters[++cvt->filter_index]) {
  1266         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
  1267     }
  1268 }
  1269 
  1270 static void SDLCALL
  1271 SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1272 {
  1273     const float *src = (const float *) cvt->buf;
  1274     Uint16 *dst = (Uint16 *) cvt->buf;
  1275     int i;
  1276 
  1277     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using NEON)");
  1278 
  1279     /* Get dst aligned to 16 bytes */
  1280     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
  1281         const float sample = *src;
  1282         if (sample >= 1.0f) {
  1283             *dst = 65535;
  1284         } else if (sample <= -1.0f) {
  1285             *dst = 0;
  1286         } else {
  1287             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
  1288         }
  1289     }
  1290 
  1291     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1292 
  1293     /* Make sure src is aligned too. */
  1294     if ((((size_t) src) & 15) == 0) {
  1295         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1296         const float32x4_t one = vdupq_n_f32(1.0f);
  1297         const float32x4_t negone = vdupq_n_f32(-1.0f);
  1298         const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
  1299         uint16_t *mmdst = (uint16_t *) dst;
  1300         while (i >= 8) {   /* 8 * float32 */
  1301             const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby32767));  /* load 4 floats, clamp, convert to uint32 */
  1302             const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby32767));  /* load 4 floats, clamp, convert to uint32 */
  1303             vst1q_u16(mmdst, vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2)));  /* narrow to uint16, combine, store out. */
  1304             i -= 8; src += 8; mmdst += 8;
  1305         }
  1306         dst = (Uint16 *) mmdst;
  1307     }
  1308 
  1309     /* Finish off any leftovers with scalar operations. */
  1310     while (i) {
  1311         const float sample = *src;
  1312         if (sample >= 1.0f) {
  1313             *dst = 65535;
  1314         } else if (sample <= -1.0f) {
  1315             *dst = 0;
  1316         } else {
  1317             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
  1318         }
  1319         i--; src++; dst++;
  1320     }
  1321 
  1322     cvt->len_cvt /= 2;
  1323     if (cvt->filters[++cvt->filter_index]) {
  1324         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
  1325     }
  1326 }
  1327 
  1328 static void SDLCALL
  1329 SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1330 {
  1331     const float *src = (const float *) cvt->buf;
  1332     Sint32 *dst = (Sint32 *) cvt->buf;
  1333     int i;
  1334 
  1335     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using NEON)");
  1336 
  1337     /* Get dst aligned to 16 bytes */
  1338     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
  1339         const float sample = *src;
  1340         if (sample >= 1.0f) {
  1341             *dst = 2147483647;
  1342         } else if (sample <= -1.0f) {
  1343             *dst = -2147483648;
  1344         } else {
  1345             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
  1346         }
  1347     }
  1348 
  1349     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1350     SDL_assert(!i || ((((size_t) src) & 15) == 0));
  1351 
  1352     {
  1353         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1354         const float32x4_t one = vdupq_n_f32(1.0f);
  1355         const float32x4_t negone = vdupq_n_f32(-1.0f);
  1356         const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f);
  1357         int32_t *mmdst = (int32_t *) dst;
  1358         while (i >= 4) {   /* 4 * float32 */
  1359             vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8));
  1360             i -= 4; src += 4; mmdst += 4;
  1361         }
  1362         dst = (Sint32 *) mmdst;
  1363     }
  1364 
  1365     /* Finish off any leftovers with scalar operations. */
  1366     while (i) {
  1367         const float sample = *src;
  1368         if (sample >= 1.0f) {
  1369             *dst = 2147483647;
  1370         } else if (sample <= -1.0f) {
  1371             *dst = -2147483648;
  1372         } else {
  1373             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
  1374         }
  1375         i--; src++; dst++;
  1376     }
  1377 
  1378     if (cvt->filters[++cvt->filter_index]) {
  1379         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
  1380     }
  1381 }
  1382 #endif
  1383 
  1384 
  1385 
  1386 void SDL_ChooseAudioConverters(void)
  1387 {
  1388     static SDL_bool converters_chosen = SDL_FALSE;
  1389 
  1390     if (converters_chosen) {
  1391         return;
  1392     }
  1393 
  1394 #define SET_CONVERTER_FUNCS(fntype) \
  1395         SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
  1396         SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
  1397         SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
  1398         SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
  1399         SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
  1400         SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
  1401         SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
  1402         SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
  1403         SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
  1404         SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
  1405         converters_chosen = SDL_TRUE
  1406 
  1407 #if HAVE_SSE2_INTRINSICS
  1408     if (SDL_HasSSE2()) {
  1409         SET_CONVERTER_FUNCS(SSE2);
  1410         return;
  1411     }
  1412 #endif
  1413 
  1414 #if HAVE_NEON_INTRINSICS
  1415     if (SDL_HasNEON()) {
  1416         SET_CONVERTER_FUNCS(NEON);
  1417         return;
  1418     }
  1419 #endif
  1420 
  1421 #if NEED_SCALAR_CONVERTER_FALLBACKS
  1422     SET_CONVERTER_FUNCS(Scalar);
  1423 #endif
  1424 
  1425 #undef SET_CONVERTER_FUNCS
  1426 
  1427     SDL_assert(converters_chosen == SDL_TRUE);
  1428 }
  1429 
  1430 /* vi: set ts=4 sw=4 expandtab: */