src/audio/SDL_audiotypecvt.c
author Sam Lantinga <slouken@libsdl.org>
Tue, 21 May 2019 17:33:31 -0700
changeset 12747 cdf53e16feb7
parent 12503 806492103856
permissions -rw-r--r--
Fixed bug 4639 - CMake build does not generate libhidapi.so for Android

Manuel Sabogal

I noticed that the current Android.mk builds a libhidapi.so library for Android but the CMake build hasn't been updated to do so. I'll attach a patch that fixes this issue.
     1 /*
     2   Simple DirectMedia Layer
     3   Copyright (C) 1997-2019 Sam Lantinga <slouken@libsdl.org>
     4 
     5   This software is provided 'as-is', without any express or implied
     6   warranty.  In no event will the authors be held liable for any damages
     7   arising from the use of this software.
     8 
     9   Permission is granted to anyone to use this software for any purpose,
    10   including commercial applications, and to alter it and redistribute it
    11   freely, subject to the following restrictions:
    12 
    13   1. The origin of this software must not be misrepresented; you must not
    14      claim that you wrote the original software. If you use this software
    15      in a product, an acknowledgment in the product documentation would be
    16      appreciated but is not required.
    17   2. Altered source versions must be plainly marked as such, and must not be
    18      misrepresented as being the original software.
    19   3. This notice may not be removed or altered from any source distribution.
    20 */
    21 
    22 #include "../SDL_internal.h"
    23 #include "SDL_audio.h"
    24 #include "SDL_audio_c.h"
    25 #include "SDL_cpuinfo.h"
    26 #include "SDL_assert.h"
    27 
    28 /* !!! FIXME: disabled until we fix https://bugzilla.libsdl.org/show_bug.cgi?id=4186 */
    29 #if 0 /*def __ARM_NEON */
    30 #define HAVE_NEON_INTRINSICS 1
    31 #endif
    32 
    33 #ifdef __SSE2__
    34 #define HAVE_SSE2_INTRINSICS 1
    35 #endif
    36 
    37 #if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
    38 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
    39 #elif __MACOSX__ && HAVE_SSE2_INTRINSICS
    40 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
    41 #elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
    42 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
    43 #elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
    44 #define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
    45 #endif
    46 
    47 /* Set to zero if platform is guaranteed to use a SIMD codepath here. */
    48 #ifndef NEED_SCALAR_CONVERTER_FALLBACKS
    49 #define NEED_SCALAR_CONVERTER_FALLBACKS 1
    50 #endif
    51 
/*
 * Conversion entry points, one per "integer format -> float32" and
 * "float32 -> integer format" pair. They are non-static function pointers
 * and all start out as NULL in this file.
 * NOTE(review): the code that assigns the CPU-specific implementations is
 * not part of this section of the file.
 */
/* Function pointers set to a CPU-specific implementation. */
SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
    63 
    64 
/*
 * Reciprocal scale factors, written out as float literals so the converters
 * can multiply instead of divide:
 *   DIVBY128     = 1/128      (8-bit samples)
 *   DIVBY32768   = 1/32768    (16-bit samples)
 *   DIVBY8388607 = 1/8388607  (32-bit samples after their low 8 bits are shifted out)
 */
#define DIVBY128 0.0078125f
#define DIVBY32768 0.000030517578125f
#define DIVBY8388607 0.00000011920930376163766f
    68 
    69 
    70 #if NEED_SCALAR_CONVERTER_FALLBACKS
    71 static void SDLCALL
    72 SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    73 {
    74     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    75     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    76     int i;
    77 
    78     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
    79 
    80     for (i = cvt->len_cvt; i; --i, --src, --dst) {
    81         *dst = ((float) *src) * DIVBY128;
    82     }
    83 
    84     cvt->len_cvt *= 4;
    85     if (cvt->filters[++cvt->filter_index]) {
    86         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    87     }
    88 }
    89 
    90 static void SDLCALL
    91 SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
    92 {
    93     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    94     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    95     int i;
    96 
    97     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
    98 
    99     for (i = cvt->len_cvt; i; --i, --src, --dst) {
   100         *dst = (((float) *src) * DIVBY128) - 1.0f;
   101     }
   102 
   103     cvt->len_cvt *= 4;
   104     if (cvt->filters[++cvt->filter_index]) {
   105         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   106     }
   107 }
   108 
   109 static void SDLCALL
   110 SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   111 {
   112     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   113     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   114     int i;
   115 
   116     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
   117 
   118     for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
   119         *dst = ((float) *src) * DIVBY32768;
   120     }
   121 
   122     cvt->len_cvt *= 2;
   123     if (cvt->filters[++cvt->filter_index]) {
   124         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   125     }
   126 }
   127 
   128 static void SDLCALL
   129 SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   130 {
   131     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   132     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   133     int i;
   134 
   135     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
   136 
   137     for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
   138         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   139     }
   140 
   141     cvt->len_cvt *= 2;
   142     if (cvt->filters[++cvt->filter_index]) {
   143         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   144     }
   145 }
   146 
   147 static void SDLCALL
   148 SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   149 {
   150     const Sint32 *src = (const Sint32 *) cvt->buf;
   151     float *dst = (float *) cvt->buf;
   152     int i;
   153 
   154     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
   155 
   156     for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
   157         *dst = ((float) (*src>>8)) * DIVBY8388607;
   158     }
   159 
   160     if (cvt->filters[++cvt->filter_index]) {
   161         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   162     }
   163 }
   164 
   165 static void SDLCALL
   166 SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   167 {
   168     const float *src = (const float *) cvt->buf;
   169     Sint8 *dst = (Sint8 *) cvt->buf;
   170     int i;
   171 
   172     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
   173 
   174     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   175         const float sample = *src;
   176         if (sample >= 1.0f) {
   177             *dst = 127;
   178         } else if (sample <= -1.0f) {
   179             *dst = -128;
   180         } else {
   181             *dst = (Sint8)(sample * 127.0f);
   182         }
   183     }
   184 
   185     cvt->len_cvt /= 4;
   186     if (cvt->filters[++cvt->filter_index]) {
   187         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   188     }
   189 }
   190 
   191 static void SDLCALL
   192 SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   193 {
   194     const float *src = (const float *) cvt->buf;
   195     Uint8 *dst = (Uint8 *) cvt->buf;
   196     int i;
   197 
   198     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
   199 
   200     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   201         const float sample = *src;
   202         if (sample >= 1.0f) {
   203             *dst = 255;
   204         } else if (sample <= -1.0f) {
   205             *dst = 0;
   206         } else {
   207             *dst = (Uint8)((sample + 1.0f) * 127.0f);
   208         }
   209     }
   210 
   211     cvt->len_cvt /= 4;
   212     if (cvt->filters[++cvt->filter_index]) {
   213         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   214     }
   215 }
   216 
   217 static void SDLCALL
   218 SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   219 {
   220     const float *src = (const float *) cvt->buf;
   221     Sint16 *dst = (Sint16 *) cvt->buf;
   222     int i;
   223 
   224     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
   225 
   226     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   227         const float sample = *src;
   228         if (sample >= 1.0f) {
   229             *dst = 32767;
   230         } else if (sample <= -1.0f) {
   231             *dst = -32768;
   232         } else {
   233             *dst = (Sint16)(sample * 32767.0f);
   234         }
   235     }
   236 
   237     cvt->len_cvt /= 2;
   238     if (cvt->filters[++cvt->filter_index]) {
   239         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   240     }
   241 }
   242 
   243 static void SDLCALL
   244 SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   245 {
   246     const float *src = (const float *) cvt->buf;
   247     Uint16 *dst = (Uint16 *) cvt->buf;
   248     int i;
   249 
   250     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
   251 
   252     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   253         const float sample = *src;
   254         if (sample >= 1.0f) {
   255             *dst = 65535;
   256         } else if (sample <= -1.0f) {
   257             *dst = 0;
   258         } else {
   259             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   260         }
   261     }
   262 
   263     cvt->len_cvt /= 2;
   264     if (cvt->filters[++cvt->filter_index]) {
   265         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   266     }
   267 }
   268 
   269 static void SDLCALL
   270 SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   271 {
   272     const float *src = (const float *) cvt->buf;
   273     Sint32 *dst = (Sint32 *) cvt->buf;
   274     int i;
   275 
   276     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
   277 
   278     for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
   279         const float sample = *src;
   280         if (sample >= 1.0f) {
   281             *dst = 2147483647;
   282         } else if (sample <= -1.0f) {
   283             *dst = (Sint32) -2147483648LL;
   284         } else {
   285             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
   286         }
   287     }
   288 
   289     if (cvt->filters[++cvt->filter_index]) {
   290         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   291     }
   292 }
   293 #endif
   294 
   295 
   296 #if HAVE_SSE2_INTRINSICS
   297 static void SDLCALL
   298 SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   299 {
   300     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   301     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   302     int i;
   303 
   304     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
   305 
   306     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   307     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   308         *dst = ((float) *src) * DIVBY128;
   309     }
   310 
   311     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
   312     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   313 
   314     /* Make sure src is aligned too. */
   315     if ((((size_t) src) & 15) == 0) {
   316         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   317         const __m128i *mmsrc = (const __m128i *) src;
   318         const __m128i zero = _mm_setzero_si128();
   319         const __m128 divby128 = _mm_set1_ps(DIVBY128);
   320         while (i >= 16) {   /* 16 * 8-bit */
   321             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
   322             /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
   323             const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
   324             /* right-shift-sign-extend gets us sint16 with the other set of values. */
   325             const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
   326             /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
   327             const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
   328             const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
   329             const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
   330             const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
   331             /* Interleave back into correct order, store. */
   332             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
   333             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
   334             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
   335             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
   336             i -= 16; mmsrc--; dst -= 16;
   337         }
   338 
   339         src = (const Sint8 *) mmsrc;
   340     }
   341 
   342     src += 15; dst += 15;  /* adjust for any scalar finishing. */
   343 
   344     /* Finish off any leftovers with scalar operations. */
   345     while (i) {
   346         *dst = ((float) *src) * DIVBY128;
   347         i--; src--; dst--;
   348     }
   349 
   350     cvt->len_cvt *= 4;
   351     if (cvt->filters[++cvt->filter_index]) {
   352         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   353     }
   354 }
   355 
   356 static void SDLCALL
   357 SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   358 {
   359     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   360     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   361     int i;
   362 
   363     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
   364 
   365     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   366     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   367         *dst = (((float) *src) * DIVBY128) - 1.0f;
   368     }
   369 
   370     src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
   371     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   372 
   373     /* Make sure src is aligned too. */
   374     if ((((size_t) src) & 15) == 0) {
   375         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   376         const __m128i *mmsrc = (const __m128i *) src;
   377         const __m128i zero = _mm_setzero_si128();
   378         const __m128 divby128 = _mm_set1_ps(DIVBY128);
   379         const __m128 minus1 = _mm_set1_ps(-1.0f);
   380         while (i >= 16) {   /* 16 * 8-bit */
   381             const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
   382             /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
   383             const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
   384             /* right-shift-zero-extend gets us uint16 with the other set of values. */
   385             const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
   386             /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
   387             /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
   388             const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
   389             const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
   390             const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
   391             const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
   392             /* Interleave back into correct order, store. */
   393             _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
   394             _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
   395             _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
   396             _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
   397             i -= 16; mmsrc--; dst -= 16;
   398         }
   399 
   400         src = (const Uint8 *) mmsrc;
   401     }
   402 
   403     src += 15; dst += 15;  /* adjust for any scalar finishing. */
   404 
   405     /* Finish off any leftovers with scalar operations. */
   406     while (i) {
   407         *dst = (((float) *src) * DIVBY128) - 1.0f;
   408         i--; src--; dst--;
   409     }
   410 
   411     cvt->len_cvt *= 4;
   412     if (cvt->filters[++cvt->filter_index]) {
   413         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   414     }
   415 }
   416 
   417 static void SDLCALL
   418 SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   419 {
   420     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   421     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   422     int i;
   423 
   424     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
   425 
   426     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   427     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   428         *dst = ((float) *src) * DIVBY32768;
   429     }
   430 
   431     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   432     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   433 
   434     /* Make sure src is aligned too. */
   435     if ((((size_t) src) & 15) == 0) {
   436         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   437         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
   438         while (i >= 8) {   /* 8 * 16-bit */
   439             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   440             /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
   441             const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
   442             /* right-shift-sign-extend gets us sint32 with the other set of values. */
   443             const __m128i b = _mm_srai_epi32(ints, 16);
   444             /* Interleave these back into the right order, convert to float, multiply, store. */
   445             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
   446             _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
   447             i -= 8; src -= 8; dst -= 8;
   448         }
   449     }
   450 
   451     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   452 
   453     /* Finish off any leftovers with scalar operations. */
   454     while (i) {
   455         *dst = ((float) *src) * DIVBY32768;
   456         i--; src--; dst--;
   457     }
   458 
   459     cvt->len_cvt *= 2;
   460     if (cvt->filters[++cvt->filter_index]) {
   461         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   462     }
   463 }
   464 
   465 static void SDLCALL
   466 SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   467 {
   468     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   469     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   470     int i;
   471 
   472     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
   473 
   474     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   475     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   476         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   477     }
   478 
   479     src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
   480     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   481 
   482     /* Make sure src is aligned too. */
   483     if ((((size_t) src) & 15) == 0) {
   484         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   485         const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
   486         const __m128 minus1 = _mm_set1_ps(1.0f);
   487         while (i >= 8) {   /* 8 * 16-bit */
   488             const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
   489             /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
   490             const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
   491             /* right-shift-sign-extend gets us sint32 with the other set of values. */
   492             const __m128i b = _mm_srli_epi32(ints, 16);
   493             /* Interleave these back into the right order, convert to float, multiply, store. */
   494             _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
   495             _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
   496             i -= 8; src -= 8; dst -= 8;
   497         }
   498     }
   499 
   500     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   501 
   502     /* Finish off any leftovers with scalar operations. */
   503     while (i) {
   504         *dst = (((float) *src) * DIVBY32768) - 1.0f;
   505         i--; src--; dst--;
   506     }
   507 
   508     cvt->len_cvt *= 2;
   509     if (cvt->filters[++cvt->filter_index]) {
   510         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   511     }
   512 }
   513 
   514 static void SDLCALL
   515 SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   516 {
   517     const Sint32 *src = (const Sint32 *) cvt->buf;
   518     float *dst = (float *) cvt->buf;
   519     int i;
   520 
   521     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
   522 
   523     /* Get dst aligned to 16 bytes */
   524     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   525         *dst = ((float) (*src>>8)) * DIVBY8388607;
   526     }
   527 
   528     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   529     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   530 
   531     {
   532         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   533         const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
   534         const __m128i *mmsrc = (const __m128i *) src;
   535         while (i >= 4) {   /* 4 * sint32 */
   536             /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
   537             _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
   538             i -= 4; mmsrc++; dst += 4;
   539         }
   540         src = (const Sint32 *) mmsrc;
   541     }
   542 
   543     /* Finish off any leftovers with scalar operations. */
   544     while (i) {
   545         *dst = ((float) (*src>>8)) * DIVBY8388607;
   546         i--; src++; dst++;
   547     }
   548 
   549     if (cvt->filters[++cvt->filter_index]) {
   550         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   551     }
   552 }
   553 
   554 static void SDLCALL
   555 SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   556 {
   557     const float *src = (const float *) cvt->buf;
   558     Sint8 *dst = (Sint8 *) cvt->buf;
   559     int i;
   560 
   561     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
   562 
   563     /* Get dst aligned to 16 bytes */
   564     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   565         const float sample = *src;
   566         if (sample >= 1.0f) {
   567             *dst = 127;
   568         } else if (sample <= -1.0f) {
   569             *dst = -128;
   570         } else {
   571             *dst = (Sint8)(sample * 127.0f);
   572         }
   573     }
   574 
   575     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   576 
   577     /* Make sure src is aligned too. */
   578     if ((((size_t) src) & 15) == 0) {
   579         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   580         const __m128 one = _mm_set1_ps(1.0f);
   581         const __m128 negone = _mm_set1_ps(-1.0f);
   582         const __m128 mulby127 = _mm_set1_ps(127.0f);
   583         __m128i *mmdst = (__m128i *) dst;
   584         while (i >= 16) {   /* 16 * float32 */
   585             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   586             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   587             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   588             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   589             _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   590             i -= 16; src += 16; mmdst++;
   591         }
   592         dst = (Sint8 *) mmdst;
   593     }
   594 
   595     /* Finish off any leftovers with scalar operations. */
   596     while (i) {
   597         const float sample = *src;
   598         if (sample >= 1.0f) {
   599             *dst = 127;
   600         } else if (sample <= -1.0f) {
   601             *dst = -128;
   602         } else {
   603             *dst = (Sint8)(sample * 127.0f);
   604         }
   605         i--; src++; dst++;
   606     }
   607 
   608     cvt->len_cvt /= 4;
   609     if (cvt->filters[++cvt->filter_index]) {
   610         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
   611     }
   612 }
   613 
   614 static void SDLCALL
   615 SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   616 {
   617     const float *src = (const float *) cvt->buf;
   618     Uint8 *dst = (Uint8 *) cvt->buf;
   619     int i;
   620 
   621     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
   622 
   623     /* Get dst aligned to 16 bytes */
   624     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   625         const float sample = *src;
   626         if (sample >= 1.0f) {
   627             *dst = 255;
   628         } else if (sample <= -1.0f) {
   629             *dst = 0;
   630         } else {
   631             *dst = (Uint8)((sample + 1.0f) * 127.0f);
   632         }
   633     }
   634 
   635     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   636 
   637     /* Make sure src is aligned too. */
   638     if ((((size_t) src) & 15) == 0) {
   639         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   640         const __m128 one = _mm_set1_ps(1.0f);
   641         const __m128 negone = _mm_set1_ps(-1.0f);
   642         const __m128 mulby127 = _mm_set1_ps(127.0f);
   643         __m128i *mmdst = (__m128i *) dst;
   644         while (i >= 16) {   /* 16 * float32 */
   645             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   646             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   647             const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   648             const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
   649             _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
   650             i -= 16; src += 16; mmdst++;
   651         }
   652         dst = (Uint8 *) mmdst;
   653     }
   654 
   655     /* Finish off any leftovers with scalar operations. */
   656     while (i) {
   657         const float sample = *src;
   658         if (sample >= 1.0f) {
   659             *dst = 255;
   660         } else if (sample <= -1.0f) {
   661             *dst = 0;
   662         } else {
   663             *dst = (Uint8)((sample + 1.0f) * 127.0f);
   664         }
   665         i--; src++; dst++;
   666     }
   667 
   668     cvt->len_cvt /= 4;
   669     if (cvt->filters[++cvt->filter_index]) {
   670         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
   671     }
   672 }
   673 
   674 static void SDLCALL
   675 SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   676 {
   677     const float *src = (const float *) cvt->buf;
   678     Sint16 *dst = (Sint16 *) cvt->buf;
   679     int i;
   680 
   681     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
   682 
   683     /* Get dst aligned to 16 bytes */
   684     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   685         const float sample = *src;
   686         if (sample >= 1.0f) {
   687             *dst = 32767;
   688         } else if (sample <= -1.0f) {
   689             *dst = -32768;
   690         } else {
   691             *dst = (Sint16)(sample * 32767.0f);
   692         }
   693     }
   694 
   695     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   696 
   697     /* Make sure src is aligned too. */
   698     if ((((size_t) src) & 15) == 0) {
   699         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   700         const __m128 one = _mm_set1_ps(1.0f);
   701         const __m128 negone = _mm_set1_ps(-1.0f);
   702         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   703         __m128i *mmdst = (__m128i *) dst;
   704         while (i >= 8) {   /* 8 * float32 */
   705             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
   706             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
   707             _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
   708             i -= 8; src += 8; mmdst++;
   709         }
   710         dst = (Sint16 *) mmdst;
   711     }
   712 
   713     /* Finish off any leftovers with scalar operations. */
   714     while (i) {
   715         const float sample = *src;
   716         if (sample >= 1.0f) {
   717             *dst = 32767;
   718         } else if (sample <= -1.0f) {
   719             *dst = -32768;
   720         } else {
   721             *dst = (Sint16)(sample * 32767.0f);
   722         }
   723         i--; src++; dst++;
   724     }
   725 
   726     cvt->len_cvt /= 2;
   727     if (cvt->filters[++cvt->filter_index]) {
   728         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
   729     }
   730 }
   731 
   732 static void SDLCALL
   733 SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   734 {
   735     const float *src = (const float *) cvt->buf;
   736     Uint16 *dst = (Uint16 *) cvt->buf;
   737     int i;
   738 
   739     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
   740 
   741     /* Get dst aligned to 16 bytes */
   742     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   743         const float sample = *src;
   744         if (sample >= 1.0f) {
   745             *dst = 65535;
   746         } else if (sample <= -1.0f) {
   747             *dst = 0;
   748         } else {
   749             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   750         }
   751     }
   752 
   753     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   754 
   755     /* Make sure src is aligned too. */
   756     if ((((size_t) src) & 15) == 0) {
   757         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   758         /* This calculates differently than the scalar path because SSE2 can't
   759            pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
   760            saturation, so that would corrupt our data. _mm_packus_epi32 exists,
   761            but not before SSE 4.1. So we convert from float to sint16, packing
   762            that down with legit signed saturation, and then xor the top bit
   763            against 1. This results in the correct unsigned 16-bit value, even
   764            though it looks like dark magic. */
   765         const __m128 mulby32767 = _mm_set1_ps(32767.0f);
   766         const __m128i topbit = _mm_set1_epi16(-32768);
   767         const __m128 one = _mm_set1_ps(1.0f);
   768         const __m128 negone = _mm_set1_ps(-1.0f);
   769         __m128i *mmdst = (__m128i *) dst;
   770         while (i >= 8) {   /* 8 * float32 */
   771             const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
   772             const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
   773             _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
   774             i -= 8; src += 8; mmdst++;
   775         }
   776         dst = (Uint16 *) mmdst;
   777     }
   778 
   779     /* Finish off any leftovers with scalar operations. */
   780     while (i) {
   781         const float sample = *src;
   782         if (sample >= 1.0f) {
   783             *dst = 65535;
   784         } else if (sample <= -1.0f) {
   785             *dst = 0;
   786         } else {
   787             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
   788         }
   789         i--; src++; dst++;
   790     }
   791 
   792     cvt->len_cvt /= 2;
   793     if (cvt->filters[++cvt->filter_index]) {
   794         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
   795     }
   796 }
   797 
   798 static void SDLCALL
   799 SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   800 {
   801     const float *src = (const float *) cvt->buf;
   802     Sint32 *dst = (Sint32 *) cvt->buf;
   803     int i;
   804 
   805     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
   806 
   807     /* Get dst aligned to 16 bytes */
   808     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
   809         const float sample = *src;
   810         if (sample >= 1.0f) {
   811             *dst = 2147483647;
   812         } else if (sample <= -1.0f) {
   813             *dst = (Sint32) -2147483648LL;
   814         } else {
   815             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
   816         }
   817     }
   818 
   819     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   820     SDL_assert(!i || ((((size_t) src) & 15) == 0));
   821 
   822     {
   823         /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
   824         const __m128 one = _mm_set1_ps(1.0f);
   825         const __m128 negone = _mm_set1_ps(-1.0f);
   826         const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
   827         __m128i *mmdst = (__m128i *) dst;
   828         while (i >= 4) {   /* 4 * float32 */
   829             _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8));  /* load 4 floats, clamp, convert to sint32 */
   830             i -= 4; src += 4; mmdst++;
   831         }
   832         dst = (Sint32 *) mmdst;
   833     }
   834 
   835     /* Finish off any leftovers with scalar operations. */
   836     while (i) {
   837         const float sample = *src;
   838         if (sample >= 1.0f) {
   839             *dst = 2147483647;
   840         } else if (sample <= -1.0f) {
   841             *dst = (Sint32) -2147483648LL;
   842         } else {
   843             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
   844         }
   845         i--; src++; dst++;
   846     }
   847 
   848     if (cvt->filters[++cvt->filter_index]) {
   849         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
   850     }
   851 }
   852 #endif
   853 
   854 
   855 #if HAVE_NEON_INTRINSICS
   856 static void SDLCALL
   857 SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   858 {
   859     const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   860     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   861     int i;
   862 
   863     LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using NEON)");
   864 
   865     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   866     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   867         *dst = ((float) *src) * DIVBY128;
   868     }
   869 
   870     src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
   871     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   872 
   873     /* Make sure src is aligned too. */
   874     if ((((size_t) src) & 15) == 0) {
   875         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   876         const int8_t *mmsrc = (const int8_t *) src;
   877         const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
   878         while (i >= 16) {   /* 16 * 8-bit */
   879             const int8x16_t bytes = vld1q_s8(mmsrc);  /* get 16 sint8 into a NEON register. */
   880             const int16x8_t int16hi = vmovl_s8(vget_high_s8(bytes));  /* convert top 8 bytes to 8 int16 */
   881             const int16x8_t int16lo = vmovl_s8(vget_low_s8(bytes));   /* convert bottom 8 bytes to 8 int16 */
   882             /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
   883             vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16hi))), divby128));
   884             vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16hi))), divby128));
   885             vst1q_f32(dst+8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16lo))), divby128));
   886             vst1q_f32(dst+12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16lo))), divby128));
   887             i -= 16; mmsrc -= 16; dst -= 16;
   888         }
   889 
   890         src = (const Sint8 *) mmsrc;
   891     }
   892 
   893     src += 15; dst += 15;  /* adjust for any scalar finishing. */
   894 
   895     /* Finish off any leftovers with scalar operations. */
   896     while (i) {
   897         *dst = ((float) *src) * DIVBY128;
   898         i--; src--; dst--;
   899     }
   900 
   901     cvt->len_cvt *= 4;
   902     if (cvt->filters[++cvt->filter_index]) {
   903         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   904     }
   905 }
   906 
   907 static void SDLCALL
   908 SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   909 {
   910     const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
   911     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
   912     int i;
   913 
   914     LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using NEON)");
   915 
   916     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   917     for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
   918         *dst = (((float) *src) * DIVBY128) - 1.0f;
   919     }
   920 
   921     src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
   922     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   923 
   924     /* Make sure src is aligned too. */
   925     if ((((size_t) src) & 15) == 0) {
   926         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   927         const uint8_t *mmsrc = (const uint8_t *) src;
   928         const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
   929         const float32x4_t one = vdupq_n_f32(1.0f);
   930         while (i >= 16) {   /* 16 * 8-bit */
   931             const uint8x16_t bytes = vld1q_u8(mmsrc);  /* get 16 uint8 into a NEON register. */
   932             const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes));  /* convert top 8 bytes to 8 uint16 */
   933             const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes));   /* convert bottom 8 bytes to 8 uint16 */
   934             /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
   935             vst1q_f32(dst, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128, one));
   936             vst1q_f32(dst+4, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128, one));
   937             vst1q_f32(dst+8, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128, one));
   938             vst1q_f32(dst+12, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128, one));
   939             i -= 16; mmsrc -= 16; dst -= 16;
   940         }
   941 
   942         src = (const Uint8 *) mmsrc;
   943     }
   944 
   945     src += 15; dst += 15;  /* adjust for any scalar finishing. */
   946 
   947     /* Finish off any leftovers with scalar operations. */
   948     while (i) {
   949         *dst = (((float) *src) * DIVBY128) - 1.0f;
   950         i--; src--; dst--;
   951     }
   952 
   953     cvt->len_cvt *= 4;
   954     if (cvt->filters[++cvt->filter_index]) {
   955         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
   956     }
   957 }
   958 
   959 static void SDLCALL
   960 SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
   961 {
   962     const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
   963     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
   964     int i;
   965 
   966     LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using NEON)");
   967 
   968     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
   969     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
   970         *dst = ((float) *src) * DIVBY32768;
   971     }
   972 
   973     src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
   974     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
   975 
   976     /* Make sure src is aligned too. */
   977     if ((((size_t) src) & 15) == 0) {
   978         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   979         const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
   980         while (i >= 8) {   /* 8 * 16-bit */
   981             const int16x8_t ints = vld1q_s16((int16_t const *) src);  /* get 8 sint16 into a NEON register. */
   982             /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
   983             vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
   984             vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
   985             i -= 8; src -= 8; dst -= 8;
   986         }
   987     }
   988 
   989     src += 7; dst += 7;  /* adjust for any scalar finishing. */
   990 
   991     /* Finish off any leftovers with scalar operations. */
   992     while (i) {
   993         *dst = ((float) *src) * DIVBY32768;
   994         i--; src--; dst--;
   995     }
   996 
   997     cvt->len_cvt *= 2;
   998     if (cvt->filters[++cvt->filter_index]) {
   999         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
  1000     }
  1001 }
  1002 
  1003 static void SDLCALL
  1004 SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1005 {
  1006     const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
  1007     float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
  1008     int i;
  1009 
  1010     LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using NEON)");
  1011 
  1012     /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
  1013     for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
  1014         *dst = (((float) *src) * DIVBY32768) - 1.0f;
  1015     }
  1016 
  1017     src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
  1018     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1019 
  1020     /* Make sure src is aligned too. */
  1021     if ((((size_t) src) & 15) == 0) {
  1022         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1023         const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
  1024         const float32x4_t one = vdupq_n_f32(1.0f);
  1025         while (i >= 8) {   /* 8 * 16-bit */
  1026             const uint16x8_t uints = vld1q_u16((uint16_t const *) src);  /* get 8 uint16 into a NEON register. */
  1027             /* split uint16 to two int32, then convert to float, then multiply to normalize, subtract for sign, store. */
  1028             vst1q_f32(dst, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768));
  1029             vst1q_f32(dst+4, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768));
  1030             i -= 8; src -= 8; dst -= 8;
  1031         }
  1032     }
  1033 
  1034     src += 7; dst += 7;  /* adjust for any scalar finishing. */
  1035 
  1036     /* Finish off any leftovers with scalar operations. */
  1037     while (i) {
  1038         *dst = (((float) *src) * DIVBY32768) - 1.0f;
  1039         i--; src--; dst--;
  1040     }
  1041 
  1042     cvt->len_cvt *= 2;
  1043     if (cvt->filters[++cvt->filter_index]) {
  1044         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
  1045     }
  1046 }
  1047 
  1048 static void SDLCALL
  1049 SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1050 {
  1051     const Sint32 *src = (const Sint32 *) cvt->buf;
  1052     float *dst = (float *) cvt->buf;
  1053     int i;
  1054 
  1055     LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using NEON)");
  1056 
  1057     /* Get dst aligned to 16 bytes */
  1058     for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
  1059         *dst = ((float) (*src>>8)) * DIVBY8388607;
  1060     }
  1061 
  1062     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1063     SDL_assert(!i || ((((size_t) src) & 15) == 0));
  1064 
  1065     {
  1066         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1067         const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
  1068         const int32_t *mmsrc = (const int32_t *) src;
  1069         while (i >= 4) {   /* 4 * sint32 */
  1070             /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
  1071             vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
  1072             i -= 4; mmsrc += 4; dst += 4;
  1073         }
  1074         src = (const Sint32 *) mmsrc;
  1075     }
  1076 
  1077     /* Finish off any leftovers with scalar operations. */
  1078     while (i) {
  1079         *dst = ((float) (*src>>8)) * DIVBY8388607;
  1080         i--; src++; dst++;
  1081     }
  1082 
  1083     if (cvt->filters[++cvt->filter_index]) {
  1084         cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
  1085     }
  1086 }
  1087 
  1088 static void SDLCALL
  1089 SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1090 {
  1091     const float *src = (const float *) cvt->buf;
  1092     Sint8 *dst = (Sint8 *) cvt->buf;
  1093     int i;
  1094 
  1095     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using NEON)");
  1096 
  1097     /* Get dst aligned to 16 bytes */
  1098     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
  1099         const float sample = *src;
  1100         if (sample >= 1.0f) {
  1101             *dst = 127;
  1102         } else if (sample <= -1.0f) {
  1103             *dst = -128;
  1104         } else {
  1105             *dst = (Sint8)(sample * 127.0f);
  1106         }
  1107     }
  1108 
  1109     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1110 
  1111     /* Make sure src is aligned too. */
  1112     if ((((size_t) src) & 15) == 0) {
  1113         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1114         const float32x4_t one = vdupq_n_f32(1.0f);
  1115         const float32x4_t negone = vdupq_n_f32(-1.0f);
  1116         const float32x4_t mulby127 = vdupq_n_f32(127.0f);
  1117         int8_t *mmdst = (int8_t *) dst;
  1118         while (i >= 16) {   /* 16 * float32 */
  1119             const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
  1120             const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
  1121             const int32x4_t ints3 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
  1122             const int32x4_t ints4 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
  1123             const int8x8_t i8lo = vmovn_s16(vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, narrow to sint8 */
  1124             const int8x8_t i8hi = vmovn_s16(vcombine_s16(vmovn_s32(ints3), vmovn_s32(ints4))); /* narrow to sint16, combine, narrow to sint8 */
  1125             vst1q_s8(mmdst, vcombine_s8(i8lo, i8hi));  /* combine to int8x16_t, store out */
  1126             i -= 16; src += 16; mmdst += 16;
  1127         }
  1128         dst = (Sint8 *) mmdst;
  1129     }
  1130 
  1131     /* Finish off any leftovers with scalar operations. */
  1132     while (i) {
  1133         const float sample = *src;
  1134         if (sample >= 1.0f) {
  1135             *dst = 127;
  1136         } else if (sample <= -1.0f) {
  1137             *dst = -128;
  1138         } else {
  1139             *dst = (Sint8)(sample * 127.0f);
  1140         }
  1141         i--; src++; dst++;
  1142     }
  1143 
  1144     cvt->len_cvt /= 4;
  1145     if (cvt->filters[++cvt->filter_index]) {
  1146         cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
  1147     }
  1148 }
  1149 
  1150 static void SDLCALL
  1151 SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1152 {
  1153     const float *src = (const float *) cvt->buf;
  1154     Uint8 *dst = (Uint8 *) cvt->buf;
  1155     int i;
  1156 
  1157     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using NEON)");
  1158 
  1159     /* Get dst aligned to 16 bytes */
  1160     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
  1161         const float sample = *src;
  1162         if (sample >= 1.0f) {
  1163             *dst = 255;
  1164         } else if (sample <= -1.0f) {
  1165             *dst = 0;
  1166         } else {
  1167             *dst = (Uint8)((sample + 1.0f) * 127.0f);
  1168         }
  1169     }
  1170 
  1171     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1172 
  1173     /* Make sure src is aligned too. */
  1174     if ((((size_t) src) & 15) == 0) {
  1175         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1176         const float32x4_t one = vdupq_n_f32(1.0f);
  1177         const float32x4_t negone = vdupq_n_f32(-1.0f);
  1178         const float32x4_t mulby127 = vdupq_n_f32(127.0f);
  1179         uint8_t *mmdst = (uint8_t *) dst;
  1180         while (i >= 16) {   /* 16 * float32 */
  1181             const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
  1182             const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
  1183             const uint32x4_t uints3 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
  1184             const uint32x4_t uints4 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
  1185             const uint8x8_t ui8lo = vmovn_u16(vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, narrow to uint8 */
  1186             const uint8x8_t ui8hi = vmovn_u16(vcombine_u16(vmovn_u32(uints3), vmovn_u32(uints4))); /* narrow to uint16, combine, narrow to uint8 */
  1187             vst1q_u8(mmdst, vcombine_u8(ui8lo, ui8hi));  /* combine to uint8x16_t, store out */
  1188             i -= 16; src += 16; mmdst += 16;
  1189         }
  1190 
  1191         dst = (Uint8 *) mmdst;
  1192     }
  1193 
  1194     /* Finish off any leftovers with scalar operations. */
  1195     while (i) {
  1196         const float sample = *src;
  1197         if (sample >= 1.0f) {
  1198             *dst = 255;
  1199         } else if (sample <= -1.0f) {
  1200             *dst = 0;
  1201         } else {
  1202             *dst = (Uint8)((sample + 1.0f) * 127.0f);
  1203         }
  1204         i--; src++; dst++;
  1205     }
  1206 
  1207     cvt->len_cvt /= 4;
  1208     if (cvt->filters[++cvt->filter_index]) {
  1209         cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
  1210     }
  1211 }
  1212 
  1213 static void SDLCALL
  1214 SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1215 {
  1216     const float *src = (const float *) cvt->buf;
  1217     Sint16 *dst = (Sint16 *) cvt->buf;
  1218     int i;
  1219 
  1220     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using NEON)");
  1221 
  1222     /* Get dst aligned to 16 bytes */
  1223     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
  1224         const float sample = *src;
  1225         if (sample >= 1.0f) {
  1226             *dst = 32767;
  1227         } else if (sample <= -1.0f) {
  1228             *dst = -32768;
  1229         } else {
  1230             *dst = (Sint16)(sample * 32767.0f);
  1231         }
  1232     }
  1233 
  1234     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1235 
  1236     /* Make sure src is aligned too. */
  1237     if ((((size_t) src) & 15) == 0) {
  1238         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1239         const float32x4_t one = vdupq_n_f32(1.0f);
  1240         const float32x4_t negone = vdupq_n_f32(-1.0f);
  1241         const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
  1242         int16_t *mmdst = (int16_t *) dst;
  1243         while (i >= 8) {   /* 8 * float32 */
  1244             const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
  1245             const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
  1246             vst1q_s16(mmdst, vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2)));  /* narrow to sint16, combine, store out. */
  1247             i -= 8; src += 8; mmdst += 8;
  1248         }
  1249         dst = (Sint16 *) mmdst;
  1250     }
  1251 
  1252     /* Finish off any leftovers with scalar operations. */
  1253     while (i) {
  1254         const float sample = *src;
  1255         if (sample >= 1.0f) {
  1256             *dst = 32767;
  1257         } else if (sample <= -1.0f) {
  1258             *dst = -32768;
  1259         } else {
  1260             *dst = (Sint16)(sample * 32767.0f);
  1261         }
  1262         i--; src++; dst++;
  1263     }
  1264 
  1265     cvt->len_cvt /= 2;
  1266     if (cvt->filters[++cvt->filter_index]) {
  1267         cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
  1268     }
  1269 }
  1270 
  1271 static void SDLCALL
  1272 SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1273 {
  1274     const float *src = (const float *) cvt->buf;
  1275     Uint16 *dst = (Uint16 *) cvt->buf;
  1276     int i;
  1277 
  1278     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using NEON)");
  1279 
  1280     /* Get dst aligned to 16 bytes */
  1281     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
  1282         const float sample = *src;
  1283         if (sample >= 1.0f) {
  1284             *dst = 65535;
  1285         } else if (sample <= -1.0f) {
  1286             *dst = 0;
  1287         } else {
  1288             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
  1289         }
  1290     }
  1291 
  1292     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1293 
  1294     /* Make sure src is aligned too. */
  1295     if ((((size_t) src) & 15) == 0) {
  1296         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1297         const float32x4_t one = vdupq_n_f32(1.0f);
  1298         const float32x4_t negone = vdupq_n_f32(-1.0f);
  1299         const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
  1300         uint16_t *mmdst = (uint16_t *) dst;
  1301         while (i >= 8) {   /* 8 * float32 */
  1302             const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby32767));  /* load 4 floats, clamp, convert to uint32 */
  1303             const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby32767));  /* load 4 floats, clamp, convert to uint32 */
  1304             vst1q_u16(mmdst, vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2)));  /* narrow to uint16, combine, store out. */
  1305             i -= 8; src += 8; mmdst += 8;
  1306         }
  1307         dst = (Uint16 *) mmdst;
  1308     }
  1309 
  1310     /* Finish off any leftovers with scalar operations. */
  1311     while (i) {
  1312         const float sample = *src;
  1313         if (sample >= 1.0f) {
  1314             *dst = 65535;
  1315         } else if (sample <= -1.0f) {
  1316             *dst = 0;
  1317         } else {
  1318             *dst = (Uint16)((sample + 1.0f) * 32767.0f);
  1319         }
  1320         i--; src++; dst++;
  1321     }
  1322 
  1323     cvt->len_cvt /= 2;
  1324     if (cvt->filters[++cvt->filter_index]) {
  1325         cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
  1326     }
  1327 }
  1328 
  1329 static void SDLCALL
  1330 SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
  1331 {
  1332     const float *src = (const float *) cvt->buf;
  1333     Sint32 *dst = (Sint32 *) cvt->buf;
  1334     int i;
  1335 
  1336     LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using NEON)");
  1337 
  1338     /* Get dst aligned to 16 bytes */
  1339     for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
  1340         const float sample = *src;
  1341         if (sample >= 1.0f) {
  1342             *dst = 2147483647;
  1343         } else if (sample <= -1.0f) {
  1344             *dst = -2147483648;
  1345         } else {
  1346             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
  1347         }
  1348     }
  1349 
  1350     SDL_assert(!i || ((((size_t) dst) & 15) == 0));
  1351     SDL_assert(!i || ((((size_t) src) & 15) == 0));
  1352 
  1353     {
  1354         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1355         const float32x4_t one = vdupq_n_f32(1.0f);
  1356         const float32x4_t negone = vdupq_n_f32(-1.0f);
  1357         const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f);
  1358         int32_t *mmdst = (int32_t *) dst;
  1359         while (i >= 4) {   /* 4 * float32 */
  1360             vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8));
  1361             i -= 4; src += 4; mmdst += 4;
  1362         }
  1363         dst = (Sint32 *) mmdst;
  1364     }
  1365 
  1366     /* Finish off any leftovers with scalar operations. */
  1367     while (i) {
  1368         const float sample = *src;
  1369         if (sample >= 1.0f) {
  1370             *dst = 2147483647;
  1371         } else if (sample <= -1.0f) {
  1372             *dst = -2147483648;
  1373         } else {
  1374             *dst = ((Sint32)(sample * 8388607.0f)) << 8;
  1375         }
  1376         i--; src++; dst++;
  1377     }
  1378 
  1379     if (cvt->filters[++cvt->filter_index]) {
  1380         cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
  1381     }
  1382 }
  1383 #endif
  1384 
  1385 
  1386 
  1387 void SDL_ChooseAudioConverters(void)
  1388 {
  1389     static SDL_bool converters_chosen = SDL_FALSE;
  1390 
  1391     if (converters_chosen) {
  1392         return;
  1393     }
  1394 
  1395 #define SET_CONVERTER_FUNCS(fntype) \
  1396         SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
  1397         SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
  1398         SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
  1399         SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
  1400         SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
  1401         SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
  1402         SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
  1403         SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
  1404         SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
  1405         SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
  1406         converters_chosen = SDL_TRUE
  1407 
  1408 #if HAVE_SSE2_INTRINSICS
  1409     if (SDL_HasSSE2()) {
  1410         SET_CONVERTER_FUNCS(SSE2);
  1411         return;
  1412     }
  1413 #endif
  1414 
  1415 #if HAVE_NEON_INTRINSICS
  1416     if (SDL_HasNEON()) {
  1417         SET_CONVERTER_FUNCS(NEON);
  1418         return;
  1419     }
  1420 #endif
  1421 
  1422 #if NEED_SCALAR_CONVERTER_FALLBACKS
  1423     SET_CONVERTER_FUNCS(Scalar);
  1424 #endif
  1425 
  1426 #undef SET_CONVERTER_FUNCS
  1427 
  1428     SDL_assert(converters_chosen == SDL_TRUE);
  1429 }
  1430 
  1431 /* vi: set ts=4 sw=4 expandtab: */