src/audio/SDL_audiotypecvt.c
author Ethan Lee <flibitijibibo@flibitijibibo.com>
Fri, 14 Jun 2019 09:51:22 -0400
changeset 12856 cfcd6e8aee7f
parent 12503 806492103856
child 12857 93cc810247aa
permissions -rw-r--r--
Check src alignment for S32_to_F32 conversions
icculus@1982
     1
/*
slouken@5535
     2
  Simple DirectMedia Layer
slouken@12503
     3
  Copyright (C) 1997-2019 Sam Lantinga <slouken@libsdl.org>
slouken@5535
     4
slouken@5535
     5
  This software is provided 'as-is', without any express or implied
slouken@5535
     6
  warranty.  In no event will the authors be held liable for any damages
slouken@5535
     7
  arising from the use of this software.
slouken@5535
     8
slouken@5535
     9
  Permission is granted to anyone to use this software for any purpose,
slouken@5535
    10
  including commercial applications, and to alter it and redistribute it
slouken@5535
    11
  freely, subject to the following restrictions:
slouken@5535
    12
slouken@5535
    13
  1. The origin of this software must not be misrepresented; you must not
slouken@5535
    14
     claim that you wrote the original software. If you use this software
slouken@5535
    15
     in a product, an acknowledgment in the product documentation would be
slouken@5535
    16
     appreciated but is not required.
slouken@5535
    17
  2. Altered source versions must be plainly marked as such, and must not be
slouken@5535
    18
     misrepresented as being the original software.
slouken@5535
    19
  3. This notice may not be removed or altered from any source distribution.
icculus@1982
    20
*/
icculus@1982
    21
icculus@8093
    22
#include "../SDL_internal.h"
icculus@1982
    23
#include "SDL_audio.h"
icculus@1982
    24
#include "SDL_audio_c.h"
icculus@10815
    25
#include "SDL_cpuinfo.h"
icculus@10575
    26
#include "SDL_assert.h"
icculus@1982
    27
icculus@12265
    28
/* !!! FIXME: disabled until we fix https://bugzilla.libsdl.org/show_bug.cgi?id=4186 */
sylvain@12449
    29
#if 0 /*def __ARM_NEON */
icculus@11992
    30
#define HAVE_NEON_INTRINSICS 1
icculus@11992
    31
#endif
icculus@10815
    32
icculus@10835
    33
#ifdef __SSE2__
icculus@10835
    34
#define HAVE_SSE2_INTRINSICS 1
icculus@10815
    35
#endif
icculus@10815
    36
icculus@10815
    37
#if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
icculus@10815
    38
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
icculus@10815
    39
#elif __MACOSX__ && HAVE_SSE2_INTRINSICS
icculus@10815
    40
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
icculus@10815
    41
#elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
icculus@10815
    42
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
icculus@10815
    43
#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
icculus@10815
    44
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
icculus@10815
    45
#endif
icculus@10815
    46
icculus@10815
    47
/* Set to zero if platform is guaranteed to use a SIMD codepath here. */
icculus@10815
    48
#ifndef NEED_SCALAR_CONVERTER_FALLBACKS
icculus@10815
    49
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
icculus@10815
    50
#endif
icculus@10815
    51
icculus@10815
    52
/* Function pointers set to a CPU-specific implementation. */
icculus@10815
    53
SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
icculus@10815
    54
SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
icculus@10815
    55
SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
icculus@10815
    56
SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
icculus@10815
    57
SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
icculus@10815
    58
SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
icculus@10815
    59
SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
icculus@10815
    60
SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
icculus@10815
    61
SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
icculus@10815
    62
SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
icculus@10815
    63
icculus@10815
    64
icculus@11403
    65
/* Reciprocal scale factors used by the integer -> float converters below,
   so each sample is scaled with a multiply instead of a divide. */
/* 1.0f / 128.0f: scale for 8-bit samples. */
#define DIVBY128 0.0078125f
icculus@11403
    66
/* 1.0f / 32768.0f: scale for 16-bit samples. */
#define DIVBY32768 0.000030517578125f
icculus@11987
    67
/* Approximately 1.0f / 8388607.0f (8388607 == 2^23 - 1): scale for 32-bit
   samples after they have been shifted right by 8 bits. */
#define DIVBY8388607 0.00000011920930376163766f
icculus@1982
    68
icculus@10815
    69
icculus@10815
    70
#if NEED_SCALAR_CONVERTER_FALLBACKS
icculus@10815
    71
static void SDLCALL
icculus@10815
    72
SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10575
    73
{
icculus@10814
    74
    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10575
    75
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@10575
    76
    int i;
icculus@3021
    77
icculus@10575
    78
    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
icculus@1982
    79
icculus@10814
    80
    for (i = cvt->len_cvt; i; --i, --src, --dst) {
icculus@11403
    81
        *dst = ((float) *src) * DIVBY128;
icculus@1982
    82
    }
icculus@1982
    83
icculus@10575
    84
    cvt->len_cvt *= 4;
icculus@1982
    85
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
    86
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@1982
    87
    }
icculus@1982
    88
}
icculus@1982
    89
icculus@10815
    90
static void SDLCALL
icculus@10815
    91
SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
    92
{
icculus@10575
    93
    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10575
    94
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@1982
    95
    int i;
icculus@1982
    96
icculus@10575
    97
    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
icculus@1982
    98
icculus@10814
    99
    for (i = cvt->len_cvt; i; --i, --src, --dst) {
icculus@11403
   100
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@10575
   101
    }
icculus@10575
   102
icculus@10575
   103
    cvt->len_cvt *= 4;
icculus@10575
   104
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   105
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10575
   106
    }
icculus@10575
   107
}
icculus@10575
   108
icculus@10815
   109
static void SDLCALL
icculus@10815
   110
SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10575
   111
{
icculus@10575
   112
    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10575
   113
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@10575
   114
    int i;
icculus@10575
   115
icculus@10575
   116
    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
icculus@10575
   117
icculus@10575
   118
    for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
icculus@11403
   119
        *dst = ((float) *src) * DIVBY32768;
icculus@1982
   120
    }
icculus@1982
   121
icculus@1982
   122
    cvt->len_cvt *= 2;
icculus@1982
   123
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   124
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@1982
   125
    }
icculus@1982
   126
}
icculus@1982
   127
icculus@10815
   128
static void SDLCALL
icculus@10815
   129
SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   130
{
icculus@10575
   131
    const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10575
   132
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@1982
   133
    int i;
icculus@1982
   134
icculus@10575
   135
    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
icculus@1982
   136
icculus@10575
   137
    for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
icculus@11403
   138
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@1982
   139
    }
icculus@1982
   140
icculus@1982
   141
    cvt->len_cvt *= 2;
icculus@1982
   142
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   143
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@1982
   144
    }
icculus@1982
   145
}
icculus@1982
   146
icculus@10815
   147
static void SDLCALL
icculus@10815
   148
SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   149
{
icculus@10814
   150
    const Sint32 *src = (const Sint32 *) cvt->buf;
icculus@10575
   151
    float *dst = (float *) cvt->buf;
icculus@1982
   152
    int i;
icculus@1982
   153
icculus@10575
   154
    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
icculus@1982
   155
icculus@10575
   156
    for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
icculus@11987
   157
        *dst = ((float) (*src>>8)) * DIVBY8388607;
icculus@1982
   158
    }
icculus@1982
   159
icculus@1982
   160
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   161
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@1982
   162
    }
icculus@1982
   163
}
icculus@1982
   164
icculus@10815
   165
static void SDLCALL
icculus@10815
   166
SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   167
{
icculus@10575
   168
    const float *src = (const float *) cvt->buf;
icculus@10575
   169
    Sint8 *dst = (Sint8 *) cvt->buf;
icculus@1982
   170
    int i;
icculus@1982
   171
icculus@10575
   172
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
icculus@1982
   173
icculus@10575
   174
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   175
        const float sample = *src;
icculus@11989
   176
        if (sample >= 1.0f) {
slouken@11633
   177
            *dst = 127;
icculus@11989
   178
        } else if (sample <= -1.0f) {
icculus@11990
   179
            *dst = -128;
slouken@11633
   180
        } else {
slouken@11633
   181
            *dst = (Sint8)(sample * 127.0f);
slouken@11633
   182
        }
icculus@1982
   183
    }
icculus@1982
   184
icculus@10575
   185
    cvt->len_cvt /= 4;
icculus@1982
   186
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   187
        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
icculus@1982
   188
    }
icculus@1982
   189
}
icculus@1982
   190
icculus@10815
   191
static void SDLCALL
icculus@10815
   192
SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   193
{
icculus@10575
   194
    const float *src = (const float *) cvt->buf;
icculus@10575
   195
    Uint8 *dst = (Uint8 *) cvt->buf;
icculus@1982
   196
    int i;
icculus@1982
   197
icculus@10575
   198
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
icculus@1982
   199
icculus@10575
   200
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   201
        const float sample = *src;
icculus@11989
   202
        if (sample >= 1.0f) {
slouken@11633
   203
            *dst = 255;
icculus@11989
   204
        } else if (sample <= -1.0f) {
slouken@11633
   205
            *dst = 0;
slouken@11633
   206
        } else {
slouken@11633
   207
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
slouken@11633
   208
        }
icculus@1982
   209
    }
icculus@1982
   210
icculus@10575
   211
    cvt->len_cvt /= 4;
icculus@1982
   212
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   213
        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
icculus@1982
   214
    }
icculus@1982
   215
}
icculus@1982
   216
icculus@10815
   217
static void SDLCALL
icculus@10815
   218
SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   219
{
icculus@10575
   220
    const float *src = (const float *) cvt->buf;
icculus@10575
   221
    Sint16 *dst = (Sint16 *) cvt->buf;
icculus@1982
   222
    int i;
icculus@1982
   223
icculus@10575
   224
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
icculus@1982
   225
icculus@10575
   226
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   227
        const float sample = *src;
icculus@11989
   228
        if (sample >= 1.0f) {
slouken@11633
   229
            *dst = 32767;
icculus@11989
   230
        } else if (sample <= -1.0f) {
icculus@11990
   231
            *dst = -32768;
slouken@11633
   232
        } else {
slouken@11633
   233
            *dst = (Sint16)(sample * 32767.0f);
slouken@11633
   234
        }
icculus@1982
   235
    }
icculus@1982
   236
icculus@1982
   237
    cvt->len_cvt /= 2;
icculus@1982
   238
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   239
        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
icculus@1982
   240
    }
icculus@1982
   241
}
icculus@1982
   242
icculus@10815
   243
static void SDLCALL
icculus@10815
   244
SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   245
{
icculus@10575
   246
    const float *src = (const float *) cvt->buf;
icculus@10575
   247
    Uint16 *dst = (Uint16 *) cvt->buf;
icculus@1982
   248
    int i;
icculus@1982
   249
icculus@10575
   250
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
icculus@1982
   251
icculus@10575
   252
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   253
        const float sample = *src;
icculus@11989
   254
        if (sample >= 1.0f) {
icculus@11990
   255
            *dst = 65535;
icculus@11989
   256
        } else if (sample <= -1.0f) {
slouken@11633
   257
            *dst = 0;
slouken@11633
   258
        } else {
slouken@11633
   259
            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
slouken@11633
   260
        }
icculus@1982
   261
    }
icculus@1982
   262
icculus@1982
   263
    cvt->len_cvt /= 2;
icculus@1982
   264
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   265
        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
icculus@1982
   266
    }
icculus@1982
   267
}
icculus@1982
   268
icculus@10815
   269
static void SDLCALL
icculus@10815
   270
SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   271
{
icculus@10575
   272
    const float *src = (const float *) cvt->buf;
icculus@10575
   273
    Sint32 *dst = (Sint32 *) cvt->buf;
icculus@1982
   274
    int i;
icculus@1982
   275
icculus@10575
   276
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
icculus@1982
   277
icculus@10575
   278
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   279
        const float sample = *src;
icculus@11989
   280
        if (sample >= 1.0f) {
slouken@11633
   281
            *dst = 2147483647;
icculus@11989
   282
        } else if (sample <= -1.0f) {
icculus@11993
   283
            *dst = (Sint32) -2147483648LL;
slouken@11633
   284
        } else {
icculus@11987
   285
            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
slouken@11633
   286
        }
icculus@1982
   287
    }
icculus@1982
   288
icculus@1982
   289
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   290
        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
icculus@1982
   291
    }
icculus@1982
   292
}
icculus@10815
   293
#endif
icculus@10815
   294
icculus@10815
   295
icculus@10815
   296
#if HAVE_SSE2_INTRINSICS
icculus@10815
   297
static void SDLCALL
icculus@10815
   298
SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   299
{
icculus@10815
   300
    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10815
   301
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@10815
   302
    int i;
icculus@10815
   303
icculus@10815
   304
    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
icculus@10815
   305
icculus@10815
   306
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@10815
   307
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
icculus@11403
   308
        *dst = ((float) *src) * DIVBY128;
icculus@10815
   309
    }
icculus@10815
   310
icculus@10815
   311
    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
icculus@10815
   312
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   313
icculus@10815
   314
    /* Make sure src is aligned too. */
icculus@10815
   315
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   316
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   317
        const __m128i *mmsrc = (const __m128i *) src;
icculus@10815
   318
        const __m128i zero = _mm_setzero_si128();
icculus@11403
   319
        const __m128 divby128 = _mm_set1_ps(DIVBY128);
icculus@10815
   320
        while (i >= 16) {   /* 16 * 8-bit */
icculus@10815
   321
            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
icculus@10815
   322
            /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
icculus@10815
   323
            const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
icculus@10815
   324
            /* right-shift-sign-extend gets us sint16 with the other set of values. */
icculus@10815
   325
            const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
icculus@10815
   326
            /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
icculus@11403
   327
            const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
icculus@11403
   328
            const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
icculus@11403
   329
            const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
icculus@11403
   330
            const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
icculus@10815
   331
            /* Interleave back into correct order, store. */
icculus@10815
   332
            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
icculus@10815
   333
            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
icculus@10815
   334
            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
icculus@10815
   335
            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
icculus@10815
   336
            i -= 16; mmsrc--; dst -= 16;
icculus@10815
   337
        }
icculus@10815
   338
icculus@10815
   339
        src = (const Sint8 *) mmsrc;
icculus@10815
   340
    }
icculus@10815
   341
icculus@10815
   342
    src += 15; dst += 15;  /* adjust for any scalar finishing. */
icculus@10815
   343
icculus@10815
   344
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   345
    while (i) {
icculus@11403
   346
        *dst = ((float) *src) * DIVBY128;
icculus@10815
   347
        i--; src--; dst--;
icculus@10815
   348
    }
icculus@10815
   349
icculus@10815
   350
    cvt->len_cvt *= 4;
icculus@10815
   351
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   352
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   353
    }
icculus@10815
   354
}
icculus@10815
   355
icculus@10815
   356
static void SDLCALL
icculus@10815
   357
SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   358
{
icculus@10815
   359
    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10815
   360
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@10815
   361
    int i;
icculus@10815
   362
icculus@10815
   363
    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
icculus@10815
   364
icculus@10815
   365
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@10815
   366
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
icculus@11403
   367
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@10815
   368
    }
icculus@10815
   369
icculus@10815
   370
    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
icculus@10815
   371
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   372
icculus@10815
   373
    /* Make sure src is aligned too. */
icculus@10815
   374
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   375
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   376
        const __m128i *mmsrc = (const __m128i *) src;
icculus@10815
   377
        const __m128i zero = _mm_setzero_si128();
icculus@11403
   378
        const __m128 divby128 = _mm_set1_ps(DIVBY128);
icculus@10815
   379
        const __m128 minus1 = _mm_set1_ps(-1.0f);
icculus@10815
   380
        while (i >= 16) {   /* 16 * 8-bit */
icculus@10815
   381
            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
icculus@10815
   382
            /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
icculus@10815
   383
            const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
icculus@10815
   384
            /* right-shift-zero-extend gets us uint16 with the other set of values. */
icculus@10815
   385
            const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
icculus@10815
   386
            /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
icculus@10815
   387
            /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
icculus@11403
   388
            const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
icculus@11403
   389
            const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
icculus@11403
   390
            const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
icculus@11403
   391
            const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
icculus@10815
   392
            /* Interleave back into correct order, store. */
icculus@10815
   393
            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
icculus@10815
   394
            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
icculus@10815
   395
            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
icculus@10815
   396
            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
icculus@10815
   397
            i -= 16; mmsrc--; dst -= 16;
icculus@10815
   398
        }
icculus@10815
   399
icculus@10815
   400
        src = (const Uint8 *) mmsrc;
icculus@10815
   401
    }
icculus@10815
   402
icculus@10815
   403
    src += 15; dst += 15;  /* adjust for any scalar finishing. */
icculus@10815
   404
icculus@10815
   405
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   406
    while (i) {
icculus@11403
   407
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@10815
   408
        i--; src--; dst--;
icculus@10815
   409
    }
icculus@10815
   410
icculus@10815
   411
    cvt->len_cvt *= 4;
icculus@10815
   412
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   413
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   414
    }
icculus@10815
   415
}
icculus@10815
   416
icculus@10815
   417
static void SDLCALL
icculus@10815
   418
SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   419
{
icculus@10815
   420
    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10815
   421
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@10815
   422
    int i;
icculus@10815
   423
icculus@10815
   424
    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
icculus@10815
   425
icculus@10815
   426
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@10815
   427
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
icculus@11403
   428
        *dst = ((float) *src) * DIVBY32768;
icculus@10815
   429
    }
icculus@10815
   430
icculus@10815
   431
    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
icculus@10815
   432
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   433
icculus@10815
   434
    /* Make sure src is aligned too. */
icculus@10815
   435
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   436
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11403
   437
        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
icculus@10815
   438
        while (i >= 8) {   /* 8 * 16-bit */
icculus@10815
   439
            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
icculus@10815
   440
            /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
icculus@10815
   441
            const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
icculus@10815
   442
            /* right-shift-sign-extend gets us sint32 with the other set of values. */
icculus@10815
   443
            const __m128i b = _mm_srai_epi32(ints, 16);
icculus@10815
   444
            /* Interleave these back into the right order, convert to float, multiply, store. */
icculus@11403
   445
            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
icculus@11403
   446
            _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
icculus@10815
   447
            i -= 8; src -= 8; dst -= 8;
icculus@10815
   448
        }
icculus@10815
   449
    }
icculus@10815
   450
icculus@10815
   451
    src += 7; dst += 7;  /* adjust for any scalar finishing. */
icculus@10815
   452
icculus@10815
   453
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   454
    while (i) {
icculus@11403
   455
        *dst = ((float) *src) * DIVBY32768;
icculus@10815
   456
        i--; src--; dst--;
icculus@10815
   457
    }
icculus@10815
   458
icculus@10815
   459
    cvt->len_cvt *= 2;
icculus@10815
   460
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   461
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   462
    }
icculus@10815
   463
}
icculus@10815
   464
icculus@10815
   465
static void SDLCALL
icculus@10815
   466
SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   467
{
icculus@10815
   468
    const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10815
   469
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@10815
   470
    int i;
icculus@10815
   471
icculus@10815
   472
    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
icculus@10815
   473
icculus@10815
   474
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@10815
   475
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
icculus@11403
   476
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@10815
   477
    }
icculus@10815
   478
icculus@10815
   479
    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
icculus@10815
   480
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   481
icculus@10815
   482
    /* Make sure src is aligned too. */
icculus@10815
   483
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   484
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11403
   485
        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
icculus@10815
   486
        const __m128 minus1 = _mm_set1_ps(1.0f);
icculus@10815
   487
        while (i >= 8) {   /* 8 * 16-bit */
icculus@10815
   488
            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
icculus@10815
   489
            /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
icculus@10815
   490
            const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
icculus@10815
   491
            /* right-shift-sign-extend gets us sint32 with the other set of values. */
icculus@10815
   492
            const __m128i b = _mm_srli_epi32(ints, 16);
icculus@10815
   493
            /* Interleave these back into the right order, convert to float, multiply, store. */
icculus@11403
   494
            _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
icculus@11403
   495
            _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
icculus@10815
   496
            i -= 8; src -= 8; dst -= 8;
icculus@10815
   497
        }
icculus@10815
   498
    }
icculus@10815
   499
icculus@10815
   500
    src += 7; dst += 7;  /* adjust for any scalar finishing. */
icculus@10815
   501
icculus@10815
   502
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   503
    while (i) {
icculus@11403
   504
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@10815
   505
        i--; src--; dst--;
icculus@10815
   506
    }
icculus@10815
   507
icculus@10815
   508
    cvt->len_cvt *= 2;
icculus@10815
   509
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   510
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   511
    }
icculus@10815
   512
}
icculus@10815
   513
icculus@10815
   514
static void SDLCALL
icculus@10815
   515
SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   516
{
icculus@10815
   517
    const Sint32 *src = (const Sint32 *) cvt->buf;
icculus@10815
   518
    float *dst = (float *) cvt->buf;
icculus@10815
   519
    int i;
icculus@10815
   520
icculus@10815
   521
    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
icculus@10815
   522
icculus@10815
   523
    /* Get dst aligned to 16 bytes */
icculus@10815
   524
    for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11987
   525
        *dst = ((float) (*src>>8)) * DIVBY8388607;
icculus@10815
   526
    }
icculus@10815
   527
icculus@10815
   528
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   529
flibitijibibo@12856
   530
    /* Make sure src is aligned too. */
flibitijibibo@12856
   531
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   532
    {
icculus@10815
   533
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11987
   534
        const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
icculus@10815
   535
        const __m128i *mmsrc = (const __m128i *) src;
icculus@10815
   536
        while (i >= 4) {   /* 4 * sint32 */
icculus@11987
   537
            /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
admin@12056
   538
            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
icculus@10815
   539
            i -= 4; mmsrc++; dst += 4;
icculus@10815
   540
        }
icculus@10815
   541
        src = (const Sint32 *) mmsrc;
icculus@10815
   542
    }
icculus@10815
   543
icculus@10815
   544
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   545
    while (i) {
icculus@11987
   546
        *dst = ((float) (*src>>8)) * DIVBY8388607;
icculus@10815
   547
        i--; src++; dst++;
icculus@10815
   548
    }
icculus@10815
   549
icculus@10815
   550
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   551
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   552
    }
icculus@10815
   553
}
icculus@10815
   554
icculus@10815
   555
static void SDLCALL
icculus@10815
   556
SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   557
{
icculus@10815
   558
    const float *src = (const float *) cvt->buf;
icculus@10815
   559
    Sint8 *dst = (Sint8 *) cvt->buf;
icculus@10815
   560
    int i;
icculus@10815
   561
icculus@10815
   562
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
icculus@10815
   563
icculus@10815
   564
    /* Get dst aligned to 16 bytes */
icculus@10815
   565
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11991
   566
        const float sample = *src;
icculus@11991
   567
        if (sample >= 1.0f) {
icculus@11991
   568
            *dst = 127;
icculus@11991
   569
        } else if (sample <= -1.0f) {
icculus@11991
   570
            *dst = -128;
icculus@11991
   571
        } else {
icculus@11991
   572
            *dst = (Sint8)(sample * 127.0f);
icculus@11991
   573
        }
icculus@10815
   574
    }
icculus@10815
   575
icculus@10815
   576
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   577
icculus@10815
   578
    /* Make sure src is aligned too. */
icculus@10815
   579
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   580
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11991
   581
        const __m128 one = _mm_set1_ps(1.0f);
icculus@11991
   582
        const __m128 negone = _mm_set1_ps(-1.0f);
icculus@10815
   583
        const __m128 mulby127 = _mm_set1_ps(127.0f);
icculus@10815
   584
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   585
        while (i >= 16) {   /* 16 * float32 */
icculus@11991
   586
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   587
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   588
            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   589
            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@10815
   590
            _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
icculus@10815
   591
            i -= 16; src += 16; mmdst++;
icculus@10815
   592
        }
icculus@10815
   593
        dst = (Sint8 *) mmdst;
icculus@10815
   594
    }
icculus@10815
   595
icculus@10815
   596
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   597
    while (i) {
icculus@11991
   598
        const float sample = *src;
icculus@11991
   599
        if (sample >= 1.0f) {
icculus@11991
   600
            *dst = 127;
icculus@11991
   601
        } else if (sample <= -1.0f) {
icculus@11991
   602
            *dst = -128;
icculus@11991
   603
        } else {
icculus@11991
   604
            *dst = (Sint8)(sample * 127.0f);
icculus@11991
   605
        }
icculus@10815
   606
        i--; src++; dst++;
icculus@10815
   607
    }
icculus@10815
   608
icculus@10815
   609
    cvt->len_cvt /= 4;
icculus@10815
   610
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   611
        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
icculus@10815
   612
    }
icculus@10815
   613
}
icculus@10815
   614
icculus@10815
   615
static void SDLCALL
icculus@10815
   616
SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   617
{
icculus@10815
   618
    const float *src = (const float *) cvt->buf;
icculus@10815
   619
    Uint8 *dst = (Uint8 *) cvt->buf;
icculus@10815
   620
    int i;
icculus@10815
   621
icculus@10815
   622
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
icculus@10815
   623
icculus@10815
   624
    /* Get dst aligned to 16 bytes */
icculus@10815
   625
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11991
   626
        const float sample = *src;
icculus@11991
   627
        if (sample >= 1.0f) {
icculus@11991
   628
            *dst = 255;
icculus@11991
   629
        } else if (sample <= -1.0f) {
icculus@11991
   630
            *dst = 0;
icculus@11991
   631
        } else {
icculus@11991
   632
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
icculus@11991
   633
        }
icculus@10815
   634
    }
icculus@10815
   635
icculus@10815
   636
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   637
icculus@10815
   638
    /* Make sure src is aligned too. */
icculus@10815
   639
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   640
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11991
   641
        const __m128 one = _mm_set1_ps(1.0f);
icculus@11991
   642
        const __m128 negone = _mm_set1_ps(-1.0f);
icculus@10815
   643
        const __m128 mulby127 = _mm_set1_ps(127.0f);
icculus@10815
   644
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   645
        while (i >= 16) {   /* 16 * float32 */
icculus@11991
   646
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   647
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   648
            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   649
            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@10815
   650
            _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
icculus@10815
   651
            i -= 16; src += 16; mmdst++;
icculus@10815
   652
        }
icculus@10815
   653
        dst = (Uint8 *) mmdst;
icculus@10815
   654
    }
icculus@10815
   655
icculus@10815
   656
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   657
    while (i) {
icculus@11991
   658
        const float sample = *src;
icculus@11991
   659
        if (sample >= 1.0f) {
icculus@11991
   660
            *dst = 255;
icculus@11991
   661
        } else if (sample <= -1.0f) {
icculus@11991
   662
            *dst = 0;
icculus@11991
   663
        } else {
icculus@11991
   664
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
icculus@11991
   665
        }
icculus@10815
   666
        i--; src++; dst++;
icculus@10815
   667
    }
icculus@10815
   668
icculus@10815
   669
    cvt->len_cvt /= 4;
icculus@10815
   670
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   671
        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
icculus@10815
   672
    }
icculus@10815
   673
}
icculus@10815
   674
icculus@10815
   675
static void SDLCALL
icculus@10815
   676
SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   677
{
icculus@10815
   678
    const float *src = (const float *) cvt->buf;
icculus@10815
   679
    Sint16 *dst = (Sint16 *) cvt->buf;
icculus@10815
   680
    int i;
icculus@10815
   681
icculus@10815
   682
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
icculus@10815
   683
icculus@10815
   684
    /* Get dst aligned to 16 bytes */
icculus@10815
   685
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11991
   686
        const float sample = *src;
icculus@11991
   687
        if (sample >= 1.0f) {
icculus@11991
   688
            *dst = 32767;
icculus@11991
   689
        } else if (sample <= -1.0f) {
icculus@11991
   690
            *dst = -32768;
icculus@11991
   691
        } else {
icculus@11991
   692
            *dst = (Sint16)(sample * 32767.0f);
icculus@11991
   693
        }
icculus@10815
   694
    }
icculus@10815
   695
icculus@10815
   696
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   697
icculus@10815
   698
    /* Make sure src is aligned too. */
icculus@10815
   699
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   700
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11991
   701
        const __m128 one = _mm_set1_ps(1.0f);
icculus@11991
   702
        const __m128 negone = _mm_set1_ps(-1.0f);
icculus@10815
   703
        const __m128 mulby32767 = _mm_set1_ps(32767.0f);
icculus@10815
   704
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   705
        while (i >= 8) {   /* 8 * float32 */
icculus@11991
   706
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   707
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
icculus@10815
   708
            _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
icculus@10815
   709
            i -= 8; src += 8; mmdst++;
icculus@10815
   710
        }
icculus@10815
   711
        dst = (Sint16 *) mmdst;
icculus@10815
   712
    }
icculus@10815
   713
icculus@10815
   714
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   715
    while (i) {
icculus@11991
   716
        const float sample = *src;
icculus@11991
   717
        if (sample >= 1.0f) {
icculus@11991
   718
            *dst = 32767;
icculus@11991
   719
        } else if (sample <= -1.0f) {
icculus@11991
   720
            *dst = -32768;
icculus@11991
   721
        } else {
icculus@11991
   722
            *dst = (Sint16)(sample * 32767.0f);
icculus@11991
   723
        }
icculus@10815
   724
        i--; src++; dst++;
icculus@10815
   725
    }
icculus@10815
   726
icculus@10815
   727
    cvt->len_cvt /= 2;
icculus@10815
   728
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   729
        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
icculus@10815
   730
    }
icculus@10815
   731
}
icculus@10815
   732
icculus@10815
   733
static void SDLCALL
icculus@10815
   734
SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   735
{
icculus@10815
   736
    const float *src = (const float *) cvt->buf;
icculus@10815
   737
    Uint16 *dst = (Uint16 *) cvt->buf;
icculus@10815
   738
    int i;
icculus@10815
   739
icculus@10815
   740
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
icculus@10815
   741
icculus@10815
   742
    /* Get dst aligned to 16 bytes */
icculus@10815
   743
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11991
   744
        const float sample = *src;
icculus@11991
   745
        if (sample >= 1.0f) {
icculus@11991
   746
            *dst = 65535;
icculus@11991
   747
        } else if (sample <= -1.0f) {
icculus@11991
   748
            *dst = 0;
icculus@11991
   749
        } else {
icculus@11991
   750
            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
icculus@11991
   751
        }
icculus@10815
   752
    }
icculus@10815
   753
icculus@10815
   754
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   755
icculus@10815
   756
    /* Make sure src is aligned too. */
icculus@10815
   757
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   758
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   759
        /* This calculates differently than the scalar path because SSE2 can't
icculus@10815
   760
           pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
icculus@10815
   761
           saturation, so that would corrupt our data. _mm_packus_epi32 exists,
icculus@10815
   762
           but not before SSE 4.1. So we convert from float to sint16, packing
icculus@10815
   763
           that down with legit signed saturation, and then xor the top bit
icculus@10815
   764
           against 1. This results in the correct unsigned 16-bit value, even
icculus@10815
   765
           though it looks like dark magic. */
icculus@10815
   766
        const __m128 mulby32767 = _mm_set1_ps(32767.0f);
icculus@10815
   767
        const __m128i topbit = _mm_set1_epi16(-32768);
icculus@11991
   768
        const __m128 one = _mm_set1_ps(1.0f);
icculus@11991
   769
        const __m128 negone = _mm_set1_ps(-1.0f);
icculus@10815
   770
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   771
        while (i >= 8) {   /* 8 * float32 */
icculus@11991
   772
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   773
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
icculus@10815
   774
            _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
icculus@10815
   775
            i -= 8; src += 8; mmdst++;
icculus@10815
   776
        }
icculus@10815
   777
        dst = (Uint16 *) mmdst;
icculus@10815
   778
    }
icculus@10815
   779
icculus@10815
   780
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   781
    while (i) {
icculus@11991
   782
        const float sample = *src;
icculus@11991
   783
        if (sample >= 1.0f) {
icculus@11991
   784
            *dst = 65535;
icculus@11991
   785
        } else if (sample <= -1.0f) {
icculus@11991
   786
            *dst = 0;
icculus@11991
   787
        } else {
icculus@11991
   788
            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
icculus@11991
   789
        }
icculus@10815
   790
        i--; src++; dst++;
icculus@10815
   791
    }
icculus@10815
   792
icculus@10815
   793
    cvt->len_cvt /= 2;
icculus@10815
   794
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   795
        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
icculus@10815
   796
    }
icculus@10815
   797
}
icculus@10815
   798
icculus@10815
   799
static void SDLCALL
icculus@10815
   800
SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   801
{
icculus@10815
   802
    const float *src = (const float *) cvt->buf;
icculus@10815
   803
    Sint32 *dst = (Sint32 *) cvt->buf;
icculus@10815
   804
    int i;
icculus@10815
   805
icculus@10815
   806
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
icculus@10815
   807
icculus@10815
   808
    /* Get dst aligned to 16 bytes */
icculus@10815
   809
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11991
   810
        const float sample = *src;
icculus@11991
   811
        if (sample >= 1.0f) {
icculus@11991
   812
            *dst = 2147483647;
icculus@11991
   813
        } else if (sample <= -1.0f) {
icculus@11995
   814
            *dst = (Sint32) -2147483648LL;
icculus@11991
   815
        } else {
icculus@11991
   816
            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
icculus@11991
   817
        }
icculus@10815
   818
    }
icculus@10815
   819
icculus@10815
   820
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   821
    SDL_assert(!i || ((((size_t) src) & 15) == 0));
icculus@10815
   822
icculus@10815
   823
    {
icculus@10815
   824
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11991
   825
        const __m128 one = _mm_set1_ps(1.0f);
icculus@11991
   826
        const __m128 negone = _mm_set1_ps(-1.0f);
icculus@11987
   827
        const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
icculus@10815
   828
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   829
        while (i >= 4) {   /* 4 * float32 */
icculus@11991
   830
            _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8));  /* load 4 floats, clamp, convert to sint32 */
icculus@10815
   831
            i -= 4; src += 4; mmdst++;
icculus@10815
   832
        }
icculus@10815
   833
        dst = (Sint32 *) mmdst;
icculus@10815
   834
    }
icculus@10815
   835
icculus@10815
   836
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   837
    while (i) {
icculus@11991
   838
        const float sample = *src;
icculus@11991
   839
        if (sample >= 1.0f) {
icculus@11991
   840
            *dst = 2147483647;
icculus@11991
   841
        } else if (sample <= -1.0f) {
icculus@11995
   842
            *dst = (Sint32) -2147483648LL;
icculus@11991
   843
        } else {
icculus@11991
   844
            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
icculus@11991
   845
        }
icculus@10815
   846
        i--; src++; dst++;
icculus@10815
   847
    }
icculus@10815
   848
icculus@10815
   849
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   850
        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
icculus@10815
   851
    }
icculus@10815
   852
}
icculus@10815
   853
#endif
icculus@10815
   854
icculus@10815
   855
icculus@11992
   856
#if HAVE_NEON_INTRINSICS
icculus@11992
   857
static void SDLCALL
icculus@11992
   858
SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
   859
{
icculus@11992
   860
    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@11992
   861
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@11992
   862
    int i;
icculus@11992
   863
icculus@11992
   864
    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using NEON)");
icculus@11992
   865
icculus@11992
   866
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@11992
   867
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
icculus@11992
   868
        *dst = ((float) *src) * DIVBY128;
icculus@11992
   869
    }
icculus@11992
   870
icculus@11992
   871
    src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
icculus@11992
   872
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
   873
icculus@11992
   874
    /* Make sure src is aligned too. */
icculus@11992
   875
    if ((((size_t) src) & 15) == 0) {
icculus@11992
   876
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
   877
        const int8_t *mmsrc = (const int8_t *) src;
icculus@11992
   878
        const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
icculus@11992
   879
        while (i >= 16) {   /* 16 * 8-bit */
icculus@11992
   880
            const int8x16_t bytes = vld1q_s8(mmsrc);  /* get 16 sint8 into a NEON register. */
icculus@11992
   881
            const int16x8_t int16hi = vmovl_s8(vget_high_s8(bytes));  /* convert top 8 bytes to 8 int16 */
icculus@11992
   882
            const int16x8_t int16lo = vmovl_s8(vget_low_s8(bytes));   /* convert bottom 8 bytes to 8 int16 */
icculus@11992
   883
            /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
icculus@11992
   884
            vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16hi))), divby128));
icculus@11992
   885
            vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16hi))), divby128));
icculus@11992
   886
            vst1q_f32(dst+8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16lo))), divby128));
icculus@11992
   887
            vst1q_f32(dst+12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16lo))), divby128));
icculus@11992
   888
            i -= 16; mmsrc -= 16; dst -= 16;
icculus@11992
   889
        }
icculus@11992
   890
icculus@11992
   891
        src = (const Sint8 *) mmsrc;
icculus@11992
   892
    }
icculus@11992
   893
icculus@11992
   894
    src += 15; dst += 15;  /* adjust for any scalar finishing. */
icculus@11992
   895
icculus@11992
   896
    /* Finish off any leftovers with scalar operations. */
icculus@11992
   897
    while (i) {
icculus@11992
   898
        *dst = ((float) *src) * DIVBY128;
icculus@11992
   899
        i--; src--; dst--;
icculus@11992
   900
    }
icculus@11992
   901
icculus@11992
   902
    cvt->len_cvt *= 4;
icculus@11992
   903
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
   904
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@11992
   905
    }
icculus@11992
   906
}
icculus@11992
   907
icculus@11992
   908
static void SDLCALL
icculus@11992
   909
SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
   910
{
icculus@11992
   911
    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@11992
   912
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@11992
   913
    int i;
icculus@11992
   914
icculus@11992
   915
    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using NEON)");
icculus@11992
   916
icculus@11992
   917
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@11992
   918
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
icculus@11992
   919
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@11992
   920
    }
icculus@11992
   921
icculus@11992
   922
    src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
icculus@11992
   923
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
   924
icculus@11992
   925
    /* Make sure src is aligned too. */
icculus@11992
   926
    if ((((size_t) src) & 15) == 0) {
icculus@11992
   927
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
   928
        const uint8_t *mmsrc = (const uint8_t *) src;
icculus@11992
   929
        const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
icculus@11992
   930
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
   931
        while (i >= 16) {   /* 16 * 8-bit */
icculus@11992
   932
            const uint8x16_t bytes = vld1q_u8(mmsrc);  /* get 16 uint8 into a NEON register. */
icculus@11992
   933
            const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes));  /* convert top 8 bytes to 8 uint16 */
icculus@11992
   934
            const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes));   /* convert bottom 8 bytes to 8 uint16 */
icculus@11992
   935
            /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
icculus@11992
   936
            vst1q_f32(dst, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128, one));
icculus@11992
   937
            vst1q_f32(dst+4, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128, one));
icculus@11992
   938
            vst1q_f32(dst+8, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128, one));
icculus@11992
   939
            vst1q_f32(dst+12, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128, one));
icculus@11992
   940
            i -= 16; mmsrc -= 16; dst -= 16;
icculus@11992
   941
        }
icculus@11992
   942
icculus@11992
   943
        src = (const Uint8 *) mmsrc;
icculus@11992
   944
    }
icculus@11992
   945
icculus@11992
   946
    src += 15; dst += 15;  /* adjust for any scalar finishing. */
icculus@11992
   947
icculus@11992
   948
    /* Finish off any leftovers with scalar operations. */
icculus@11992
   949
    while (i) {
icculus@11992
   950
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@11992
   951
        i--; src--; dst--;
icculus@11992
   952
    }
icculus@11992
   953
icculus@11992
   954
    cvt->len_cvt *= 4;
icculus@11992
   955
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
   956
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@11992
   957
    }
icculus@11992
   958
}
icculus@11992
   959
icculus@11992
   960
static void SDLCALL
icculus@11992
   961
SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
   962
{
icculus@11992
   963
    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@11992
   964
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@11992
   965
    int i;
icculus@11992
   966
icculus@11992
   967
    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using NEON)");
icculus@11992
   968
icculus@11992
   969
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@11992
   970
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
icculus@11992
   971
        *dst = ((float) *src) * DIVBY32768;
icculus@11992
   972
    }
icculus@11992
   973
icculus@11992
   974
    src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
icculus@11992
   975
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
   976
icculus@11992
   977
    /* Make sure src is aligned too. */
icculus@11992
   978
    if ((((size_t) src) & 15) == 0) {
icculus@11992
   979
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
   980
        const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
icculus@11992
   981
        while (i >= 8) {   /* 8 * 16-bit */
icculus@11992
   982
            const int16x8_t ints = vld1q_s16((int16_t const *) src);  /* get 8 sint16 into a NEON register. */
icculus@11992
   983
            /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
icculus@11992
   984
            vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
icculus@11992
   985
            vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
icculus@11992
   986
            i -= 8; src -= 8; dst -= 8;
icculus@11992
   987
        }
icculus@11992
   988
    }
icculus@11992
   989
icculus@11992
   990
    src += 7; dst += 7;  /* adjust for any scalar finishing. */
icculus@11992
   991
icculus@11992
   992
    /* Finish off any leftovers with scalar operations. */
icculus@11992
   993
    while (i) {
icculus@11992
   994
        *dst = ((float) *src) * DIVBY32768;
icculus@11992
   995
        i--; src--; dst--;
icculus@11992
   996
    }
icculus@11992
   997
icculus@11992
   998
    cvt->len_cvt *= 2;
icculus@11992
   999
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1000
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@11992
  1001
    }
icculus@11992
  1002
}
icculus@11992
  1003
icculus@11992
  1004
static void SDLCALL
icculus@11992
  1005
SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1006
{
icculus@11992
  1007
    const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@11992
  1008
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@11992
  1009
    int i;
icculus@11992
  1010
icculus@11992
  1011
    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using NEON)");
icculus@11992
  1012
icculus@11992
  1013
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@11992
  1014
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
icculus@11992
  1015
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@11992
  1016
    }
icculus@11992
  1017
icculus@11992
  1018
    src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
icculus@11992
  1019
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1020
icculus@11992
  1021
    /* Make sure src is aligned too. */
icculus@11992
  1022
    if ((((size_t) src) & 15) == 0) {
icculus@11992
  1023
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1024
        const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
icculus@11992
  1025
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
  1026
        while (i >= 8) {   /* 8 * 16-bit */
icculus@11992
  1027
            const uint16x8_t uints = vld1q_u16((uint16_t const *) src);  /* get 8 uint16 into a NEON register. */
icculus@11992
  1028
            /* split uint16 to two int32, then convert to float, then multiply to normalize, subtract for sign, store. */
icculus@11992
  1029
            vst1q_f32(dst, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768));
icculus@11992
  1030
            vst1q_f32(dst+4, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768));
icculus@11992
  1031
            i -= 8; src -= 8; dst -= 8;
icculus@11992
  1032
        }
icculus@11992
  1033
    }
icculus@11992
  1034
icculus@11992
  1035
    src += 7; dst += 7;  /* adjust for any scalar finishing. */
icculus@11992
  1036
icculus@11992
  1037
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1038
    while (i) {
icculus@11992
  1039
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@11992
  1040
        i--; src--; dst--;
icculus@11992
  1041
    }
icculus@11992
  1042
icculus@11992
  1043
    cvt->len_cvt *= 2;
icculus@11992
  1044
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1045
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@11992
  1046
    }
icculus@11992
  1047
}
icculus@11992
  1048
icculus@11992
  1049
static void SDLCALL
icculus@11992
  1050
SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1051
{
icculus@11992
  1052
    const Sint32 *src = (const Sint32 *) cvt->buf;
icculus@11992
  1053
    float *dst = (float *) cvt->buf;
icculus@11992
  1054
    int i;
icculus@11992
  1055
icculus@11992
  1056
    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using NEON)");
icculus@11992
  1057
icculus@11992
  1058
    /* Get dst aligned to 16 bytes */
icculus@11992
  1059
    for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11992
  1060
        *dst = ((float) (*src>>8)) * DIVBY8388607;
icculus@11992
  1061
    }
icculus@11992
  1062
icculus@11992
  1063
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1064
flibitijibibo@12856
  1065
    /* Make sure src is aligned too. */
flibitijibibo@12856
  1066
    if ((((size_t) src) & 15) == 0) {
icculus@11992
  1067
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1068
        const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
icculus@11992
  1069
        const int32_t *mmsrc = (const int32_t *) src;
icculus@11992
  1070
        while (i >= 4) {   /* 4 * sint32 */
icculus@11992
  1071
            /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
icculus@11992
  1072
            vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
icculus@11992
  1073
            i -= 4; mmsrc += 4; dst += 4;
icculus@11992
  1074
        }
icculus@11992
  1075
        src = (const Sint32 *) mmsrc;
icculus@11992
  1076
    }
icculus@11992
  1077
icculus@11992
  1078
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1079
    while (i) {
icculus@11992
  1080
        *dst = ((float) (*src>>8)) * DIVBY8388607;
icculus@11992
  1081
        i--; src++; dst++;
icculus@11992
  1082
    }
icculus@11992
  1083
icculus@11992
  1084
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1085
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@11992
  1086
    }
icculus@11992
  1087
}
icculus@11992
  1088
icculus@11992
  1089
static void SDLCALL
icculus@11992
  1090
SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1091
{
icculus@11992
  1092
    const float *src = (const float *) cvt->buf;
icculus@11992
  1093
    Sint8 *dst = (Sint8 *) cvt->buf;
icculus@11992
  1094
    int i;
icculus@11992
  1095
icculus@11992
  1096
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using NEON)");
icculus@11992
  1097
icculus@11992
  1098
    /* Get dst aligned to 16 bytes */
icculus@11992
  1099
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11992
  1100
        const float sample = *src;
icculus@11992
  1101
        if (sample >= 1.0f) {
icculus@11992
  1102
            *dst = 127;
icculus@11992
  1103
        } else if (sample <= -1.0f) {
icculus@11992
  1104
            *dst = -128;
icculus@11992
  1105
        } else {
icculus@11992
  1106
            *dst = (Sint8)(sample * 127.0f);
icculus@11992
  1107
        }
icculus@11992
  1108
    }
icculus@11992
  1109
icculus@11992
  1110
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1111
icculus@11992
  1112
    /* Make sure src is aligned too. */
icculus@11992
  1113
    if ((((size_t) src) & 15) == 0) {
icculus@11992
  1114
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1115
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
  1116
        const float32x4_t negone = vdupq_n_f32(-1.0f);
icculus@11992
  1117
        const float32x4_t mulby127 = vdupq_n_f32(127.0f);
icculus@11992
  1118
        int8_t *mmdst = (int8_t *) dst;
icculus@11992
  1119
        while (i >= 16) {   /* 16 * float32 */
icculus@11992
  1120
            const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11992
  1121
            const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11992
  1122
            const int32x4_t ints3 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11992
  1123
            const int32x4_t ints4 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11992
  1124
            const int8x8_t i8lo = vmovn_s16(vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, narrow to sint8 */
icculus@11992
  1125
            const int8x8_t i8hi = vmovn_s16(vcombine_s16(vmovn_s32(ints3), vmovn_s32(ints4))); /* narrow to sint16, combine, narrow to sint8 */
icculus@11992
  1126
            vst1q_s8(mmdst, vcombine_s8(i8lo, i8hi));  /* combine to int8x16_t, store out */
icculus@11992
  1127
            i -= 16; src += 16; mmdst += 16;
icculus@11992
  1128
        }
icculus@11992
  1129
        dst = (Sint8 *) mmdst;
icculus@11992
  1130
    }
icculus@11992
  1131
icculus@11992
  1132
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1133
    while (i) {
icculus@11992
  1134
        const float sample = *src;
icculus@11992
  1135
        if (sample >= 1.0f) {
icculus@11992
  1136
            *dst = 127;
icculus@11992
  1137
        } else if (sample <= -1.0f) {
icculus@11992
  1138
            *dst = -128;
icculus@11992
  1139
        } else {
icculus@11992
  1140
            *dst = (Sint8)(sample * 127.0f);
icculus@11992
  1141
        }
icculus@11992
  1142
        i--; src++; dst++;
icculus@11992
  1143
    }
icculus@11992
  1144
icculus@11992
  1145
    cvt->len_cvt /= 4;
icculus@11992
  1146
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1147
        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
icculus@11992
  1148
    }
icculus@11992
  1149
}
icculus@11992
  1150
icculus@11992
  1151
static void SDLCALL
icculus@11992
  1152
SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1153
{
icculus@11992
  1154
    const float *src = (const float *) cvt->buf;
icculus@11992
  1155
    Uint8 *dst = (Uint8 *) cvt->buf;
icculus@11992
  1156
    int i;
icculus@11992
  1157
icculus@11992
  1158
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using NEON)");
icculus@11992
  1159
icculus@11992
  1160
    /* Get dst aligned to 16 bytes */
icculus@11992
  1161
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11992
  1162
        const float sample = *src;
icculus@11992
  1163
        if (sample >= 1.0f) {
icculus@11992
  1164
            *dst = 255;
icculus@11992
  1165
        } else if (sample <= -1.0f) {
icculus@11992
  1166
            *dst = 0;
icculus@11992
  1167
        } else {
icculus@11992
  1168
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
icculus@11992
  1169
        }
icculus@11992
  1170
    }
icculus@11992
  1171
icculus@11992
  1172
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1173
icculus@11992
  1174
    /* Make sure src is aligned too. */
icculus@11992
  1175
    if ((((size_t) src) & 15) == 0) {
icculus@11992
  1176
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1177
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
  1178
        const float32x4_t negone = vdupq_n_f32(-1.0f);
icculus@11992
  1179
        const float32x4_t mulby127 = vdupq_n_f32(127.0f);
icculus@11992
  1180
        uint8_t *mmdst = (uint8_t *) dst;
icculus@11992
  1181
        while (i >= 16) {   /* 16 * float32 */
icculus@11992
  1182
            const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
icculus@11992
  1183
            const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
icculus@11992
  1184
            const uint32x4_t uints3 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
icculus@11992
  1185
            const uint32x4_t uints4 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
icculus@11992
  1186
            const uint8x8_t ui8lo = vmovn_u16(vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, narrow to uint8 */
icculus@11992
  1187
            const uint8x8_t ui8hi = vmovn_u16(vcombine_u16(vmovn_u32(uints3), vmovn_u32(uints4))); /* narrow to uint16, combine, narrow to uint8 */
icculus@11992
  1188
            vst1q_u8(mmdst, vcombine_u8(ui8lo, ui8hi));  /* combine to uint8x16_t, store out */
icculus@11992
  1189
            i -= 16; src += 16; mmdst += 16;
icculus@11992
  1190
        }
icculus@11992
  1191
icculus@11992
  1192
        dst = (Uint8 *) mmdst;
icculus@11992
  1193
    }
icculus@11992
  1194
icculus@11992
  1195
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1196
    while (i) {
icculus@11992
  1197
        const float sample = *src;
icculus@11992
  1198
        if (sample >= 1.0f) {
icculus@11992
  1199
            *dst = 255;
icculus@11992
  1200
        } else if (sample <= -1.0f) {
icculus@11992
  1201
            *dst = 0;
icculus@11992
  1202
        } else {
icculus@11992
  1203
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
icculus@11992
  1204
        }
icculus@11992
  1205
        i--; src++; dst++;
icculus@11992
  1206
    }
icculus@11992
  1207
icculus@11992
  1208
    cvt->len_cvt /= 4;
icculus@11992
  1209
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1210
        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
icculus@11992
  1211
    }
icculus@11992
  1212
}
icculus@11992
  1213
icculus@11992
  1214
static void SDLCALL
icculus@11992
  1215
SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1216
{
icculus@11992
  1217
    const float *src = (const float *) cvt->buf;
icculus@11992
  1218
    Sint16 *dst = (Sint16 *) cvt->buf;
icculus@11992
  1219
    int i;
icculus@11992
  1220
icculus@11992
  1221
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using NEON)");
icculus@11992
  1222
icculus@11992
  1223
    /* Get dst aligned to 16 bytes */
icculus@11992
  1224
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11992
  1225
        const float sample = *src;
icculus@11992
  1226
        if (sample >= 1.0f) {
icculus@11992
  1227
            *dst = 32767;
icculus@11992
  1228
        } else if (sample <= -1.0f) {
icculus@11992
  1229
            *dst = -32768;
icculus@11992
  1230
        } else {
icculus@11992
  1231
            *dst = (Sint16)(sample * 32767.0f);
icculus@11992
  1232
        }
icculus@11992
  1233
    }
icculus@11992
  1234
icculus@11992
  1235
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1236
icculus@11992
  1237
    /* Make sure src is aligned too. */
icculus@11992
  1238
    if ((((size_t) src) & 15) == 0) {
icculus@11992
  1239
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1240
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
  1241
        const float32x4_t negone = vdupq_n_f32(-1.0f);
icculus@11992
  1242
        const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
icculus@11992
  1243
        int16_t *mmdst = (int16_t *) dst;
icculus@11992
  1244
        while (i >= 8) {   /* 8 * float32 */
icculus@11992
  1245
            const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
icculus@11992
  1246
            const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
icculus@11992
  1247
            vst1q_s16(mmdst, vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2)));  /* narrow to sint16, combine, store out. */
icculus@11992
  1248
            i -= 8; src += 8; mmdst += 8;
icculus@11992
  1249
        }
icculus@11992
  1250
        dst = (Sint16 *) mmdst;
icculus@11992
  1251
    }
icculus@11992
  1252
icculus@11992
  1253
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1254
    while (i) {
icculus@11992
  1255
        const float sample = *src;
icculus@11992
  1256
        if (sample >= 1.0f) {
icculus@11992
  1257
            *dst = 32767;
icculus@11992
  1258
        } else if (sample <= -1.0f) {
icculus@11992
  1259
            *dst = -32768;
icculus@11992
  1260
        } else {
icculus@11992
  1261
            *dst = (Sint16)(sample * 32767.0f);
icculus@11992
  1262
        }
icculus@11992
  1263
        i--; src++; dst++;
icculus@11992
  1264
    }
icculus@11992
  1265
icculus@11992
  1266
    cvt->len_cvt /= 2;
icculus@11992
  1267
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1268
        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
icculus@11992
  1269
    }
icculus@11992
  1270
}
icculus@11992
  1271
icculus@11992
  1272
static void SDLCALL
icculus@11992
  1273
SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1274
{
icculus@11992
  1275
    const float *src = (const float *) cvt->buf;
icculus@11992
  1276
    Uint16 *dst = (Uint16 *) cvt->buf;
icculus@11992
  1277
    int i;
icculus@11992
  1278
icculus@11992
  1279
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using NEON)");
icculus@11992
  1280
icculus@11992
  1281
    /* Get dst aligned to 16 bytes */
icculus@11992
  1282
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11992
  1283
        const float sample = *src;
icculus@11992
  1284
        if (sample >= 1.0f) {
icculus@11992
  1285
            *dst = 65535;
icculus@11992
  1286
        } else if (sample <= -1.0f) {
icculus@11992
  1287
            *dst = 0;
icculus@11992
  1288
        } else {
icculus@11992
  1289
            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
icculus@11992
  1290
        }
icculus@11992
  1291
    }
icculus@11992
  1292
icculus@11992
  1293
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1294
icculus@11992
  1295
    /* Make sure src is aligned too. */
icculus@11992
  1296
    if ((((size_t) src) & 15) == 0) {
icculus@11992
  1297
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1298
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
  1299
        const float32x4_t negone = vdupq_n_f32(-1.0f);
icculus@11992
  1300
        const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
icculus@11992
  1301
        uint16_t *mmdst = (uint16_t *) dst;
icculus@11992
  1302
        while (i >= 8) {   /* 8 * float32 */
icculus@11992
  1303
            const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby32767));  /* load 4 floats, clamp, convert to uint32 */
icculus@11992
  1304
            const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby32767));  /* load 4 floats, clamp, convert to uint32 */
icculus@11992
  1305
            vst1q_u16(mmdst, vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2)));  /* narrow to uint16, combine, store out. */
icculus@11992
  1306
            i -= 8; src += 8; mmdst += 8;
icculus@11992
  1307
        }
icculus@11992
  1308
        dst = (Uint16 *) mmdst;
icculus@11992
  1309
    }
icculus@11992
  1310
icculus@11992
  1311
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1312
    while (i) {
icculus@11992
  1313
        const float sample = *src;
icculus@11992
  1314
        if (sample >= 1.0f) {
icculus@11992
  1315
            *dst = 65535;
icculus@11992
  1316
        } else if (sample <= -1.0f) {
icculus@11992
  1317
            *dst = 0;
icculus@11992
  1318
        } else {
icculus@11992
  1319
            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
icculus@11992
  1320
        }
icculus@11992
  1321
        i--; src++; dst++;
icculus@11992
  1322
    }
icculus@11992
  1323
icculus@11992
  1324
    cvt->len_cvt /= 2;
icculus@11992
  1325
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1326
        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
icculus@11992
  1327
    }
icculus@11992
  1328
}
icculus@11992
  1329
icculus@11992
  1330
static void SDLCALL
icculus@11992
  1331
SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1332
{
icculus@11992
  1333
    const float *src = (const float *) cvt->buf;
icculus@11992
  1334
    Sint32 *dst = (Sint32 *) cvt->buf;
icculus@11992
  1335
    int i;
icculus@11992
  1336
icculus@11992
  1337
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using NEON)");
icculus@11992
  1338
icculus@11992
  1339
    /* Get dst aligned to 16 bytes */
icculus@11992
  1340
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11992
  1341
        const float sample = *src;
icculus@11992
  1342
        if (sample >= 1.0f) {
icculus@11992
  1343
            *dst = 2147483647;
icculus@11992
  1344
        } else if (sample <= -1.0f) {
icculus@11992
  1345
            *dst = -2147483648;
icculus@11992
  1346
        } else {
icculus@11992
  1347
            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
icculus@11992
  1348
        }
icculus@11992
  1349
    }
icculus@11992
  1350
icculus@11992
  1351
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1352
    SDL_assert(!i || ((((size_t) src) & 15) == 0));
icculus@11992
  1353
icculus@11992
  1354
    {
icculus@11992
  1355
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1356
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
  1357
        const float32x4_t negone = vdupq_n_f32(-1.0f);
icculus@11992
  1358
        const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f);
icculus@11992
  1359
        int32_t *mmdst = (int32_t *) dst;
icculus@11992
  1360
        while (i >= 4) {   /* 4 * float32 */
icculus@11992
  1361
            vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8));
icculus@11992
  1362
            i -= 4; src += 4; mmdst += 4;
icculus@11992
  1363
        }
icculus@11992
  1364
        dst = (Sint32 *) mmdst;
icculus@11992
  1365
    }
icculus@11992
  1366
icculus@11992
  1367
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1368
    while (i) {
icculus@11992
  1369
        const float sample = *src;
icculus@11992
  1370
        if (sample >= 1.0f) {
icculus@11992
  1371
            *dst = 2147483647;
icculus@11992
  1372
        } else if (sample <= -1.0f) {
icculus@11992
  1373
            *dst = -2147483648;
icculus@11992
  1374
        } else {
icculus@11992
  1375
            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
icculus@11992
  1376
        }
icculus@11992
  1377
        i--; src++; dst++;
icculus@11992
  1378
    }
icculus@11992
  1379
icculus@11992
  1380
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1381
        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
icculus@11992
  1382
    }
icculus@11992
  1383
}
icculus@11992
  1384
#endif
icculus@11992
  1385
icculus@11992
  1386
icculus@11992
  1387
icculus@10815
  1388
void SDL_ChooseAudioConverters(void)
icculus@10815
  1389
{
icculus@10815
  1390
    static SDL_bool converters_chosen = SDL_FALSE;
icculus@10815
  1391
icculus@10815
  1392
    if (converters_chosen) {
icculus@10815
  1393
        return;
icculus@10815
  1394
    }
icculus@10815
  1395
slouken@11406
  1396
#define SET_CONVERTER_FUNCS(fntype) \
icculus@10815
  1397
        SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
icculus@10815
  1398
        SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
icculus@10815
  1399
        SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
icculus@10815
  1400
        SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
icculus@10815
  1401
        SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
icculus@10815
  1402
        SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
icculus@10815
  1403
        SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
icculus@10815
  1404
        SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
icculus@10815
  1405
        SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
icculus@10815
  1406
        SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
icculus@10815
  1407
        converters_chosen = SDL_TRUE
icculus@10815
  1408
slouken@11406
  1409
#if HAVE_SSE2_INTRINSICS
icculus@10815
  1410
    if (SDL_HasSSE2()) {
icculus@10815
  1411
        SET_CONVERTER_FUNCS(SSE2);
icculus@10815
  1412
        return;
icculus@10815
  1413
    }
slouken@11406
  1414
#endif
icculus@10815
  1415
icculus@11992
  1416
#if HAVE_NEON_INTRINSICS
icculus@11992
  1417
    if (SDL_HasNEON()) {
icculus@11992
  1418
        SET_CONVERTER_FUNCS(NEON);
icculus@11992
  1419
        return;
icculus@11992
  1420
    }
icculus@11992
  1421
#endif
icculus@11992
  1422
slouken@11406
  1423
#if NEED_SCALAR_CONVERTER_FALLBACKS
icculus@10815
  1424
    SET_CONVERTER_FUNCS(Scalar);
slouken@11406
  1425
#endif
icculus@10815
  1426
slouken@11406
  1427
#undef SET_CONVERTER_FUNCS
icculus@10815
  1428
icculus@10815
  1429
    SDL_assert(converters_chosen == SDL_TRUE);
icculus@10815
  1430
}
icculus@1982
  1431
slouken@1985
  1432
/* vi: set ts=4 sw=4 expandtab: */