src/audio/SDL_audiotypecvt.c
author Ryan C. Gordon <icculus@icculus.org>
Mon, 21 May 2018 11:54:09 -0400
changeset 11993 fdf104726ced
parent 11992 08c415f14810
child 11995 b34d86386ee1
permissions -rw-r--r--
audio: Patched to compile on Visual Studio.

(It gets upset at the -2147483648, thinking this should be an unsigned value
because 2147483648 is too large for an int32, so the negative sign upsets the
compiler.)
icculus@1982
     1
/*
slouken@5535
     2
  Simple DirectMedia Layer
slouken@11811
     3
  Copyright (C) 1997-2018 Sam Lantinga <slouken@libsdl.org>
slouken@5535
     4
slouken@5535
     5
  This software is provided 'as-is', without any express or implied
slouken@5535
     6
  warranty.  In no event will the authors be held liable for any damages
slouken@5535
     7
  arising from the use of this software.
slouken@5535
     8
slouken@5535
     9
  Permission is granted to anyone to use this software for any purpose,
slouken@5535
    10
  including commercial applications, and to alter it and redistribute it
slouken@5535
    11
  freely, subject to the following restrictions:
slouken@5535
    12
slouken@5535
    13
  1. The origin of this software must not be misrepresented; you must not
slouken@5535
    14
     claim that you wrote the original software. If you use this software
slouken@5535
    15
     in a product, an acknowledgment in the product documentation would be
slouken@5535
    16
     appreciated but is not required.
slouken@5535
    17
  2. Altered source versions must be plainly marked as such, and must not be
slouken@5535
    18
     misrepresented as being the original software.
slouken@5535
    19
  3. This notice may not be removed or altered from any source distribution.
icculus@1982
    20
*/
icculus@1982
    21
icculus@8093
    22
#include "../SDL_internal.h"
icculus@1982
    23
#include "SDL_audio.h"
icculus@1982
    24
#include "SDL_audio_c.h"
icculus@10815
    25
#include "SDL_cpuinfo.h"
icculus@10575
    26
#include "SDL_assert.h"
icculus@1982
    27
icculus@11992
    28
/* Record which SIMD intrinsic families the compiler is targeting. */
#if defined(__ARM_NEON__)
#define HAVE_NEON_INTRINSICS 1
#endif

#if defined(__SSE2__)
#define HAVE_SSE2_INTRINSICS 1
#endif

/*
 * NEED_SCALAR_CONVERTER_FALLBACKS is set to zero on targets where a SIMD
 * codepath is guaranteed to be usable. Any target not matched by one of the
 * cases below falls through to the default of one further down.
 */
#if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
#elif __MACOSX__ && HAVE_SSE2_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
#elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
#endif

/* Default: keep the scalar converters unless a case above ruled them out. */
#if !defined(NEED_SCALAR_CONVERTER_FALLBACKS)
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif
icculus@10815
    50
icculus@10815
    51
/* Function pointers set to a CPU-specific implementation. */
icculus@10815
    52
SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
icculus@10815
    53
SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
icculus@10815
    54
SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
icculus@10815
    55
SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
icculus@10815
    56
SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
icculus@10815
    57
SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
icculus@10815
    58
SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
icculus@10815
    59
SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
icculus@10815
    60
SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
icculus@10815
    61
SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
icculus@10815
    62
icculus@10815
    63
icculus@11403
    64
/*
 * Multipliers used to scale integer samples into float range.
 * 1/128 and 1/32768 are exact powers of two, so the quotient form yields
 * exactly the same float as the decimal literal. 1/8388607 is kept as a
 * literal.
 */
#define DIVBY128 (1.0f / 128.0f)
#define DIVBY32768 (1.0f / 32768.0f)
#define DIVBY8388607 (0.00000011920930376163766f)
icculus@1982
    67
icculus@10815
    68
icculus@10815
    69
#if NEED_SCALAR_CONVERTER_FALLBACKS
icculus@10815
    70
static void SDLCALL
icculus@10815
    71
SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10575
    72
{
icculus@10814
    73
    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10575
    74
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@10575
    75
    int i;
icculus@3021
    76
icculus@10575
    77
    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
icculus@1982
    78
icculus@10814
    79
    for (i = cvt->len_cvt; i; --i, --src, --dst) {
icculus@11403
    80
        *dst = ((float) *src) * DIVBY128;
icculus@1982
    81
    }
icculus@1982
    82
icculus@10575
    83
    cvt->len_cvt *= 4;
icculus@1982
    84
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
    85
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@1982
    86
    }
icculus@1982
    87
}
icculus@1982
    88
icculus@10815
    89
static void SDLCALL
icculus@10815
    90
SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
    91
{
icculus@10575
    92
    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10575
    93
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@1982
    94
    int i;
icculus@1982
    95
icculus@10575
    96
    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
icculus@1982
    97
icculus@10814
    98
    for (i = cvt->len_cvt; i; --i, --src, --dst) {
icculus@11403
    99
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@10575
   100
    }
icculus@10575
   101
icculus@10575
   102
    cvt->len_cvt *= 4;
icculus@10575
   103
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   104
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10575
   105
    }
icculus@10575
   106
}
icculus@10575
   107
icculus@10815
   108
static void SDLCALL
icculus@10815
   109
SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10575
   110
{
icculus@10575
   111
    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10575
   112
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@10575
   113
    int i;
icculus@10575
   114
icculus@10575
   115
    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
icculus@10575
   116
icculus@10575
   117
    for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
icculus@11403
   118
        *dst = ((float) *src) * DIVBY32768;
icculus@1982
   119
    }
icculus@1982
   120
icculus@1982
   121
    cvt->len_cvt *= 2;
icculus@1982
   122
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   123
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@1982
   124
    }
icculus@1982
   125
}
icculus@1982
   126
icculus@10815
   127
static void SDLCALL
icculus@10815
   128
SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   129
{
icculus@10575
   130
    const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10575
   131
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@1982
   132
    int i;
icculus@1982
   133
icculus@10575
   134
    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
icculus@1982
   135
icculus@10575
   136
    for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
icculus@11403
   137
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@1982
   138
    }
icculus@1982
   139
icculus@1982
   140
    cvt->len_cvt *= 2;
icculus@1982
   141
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   142
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@1982
   143
    }
icculus@1982
   144
}
icculus@1982
   145
icculus@10815
   146
static void SDLCALL
icculus@10815
   147
SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   148
{
icculus@10814
   149
    const Sint32 *src = (const Sint32 *) cvt->buf;
icculus@10575
   150
    float *dst = (float *) cvt->buf;
icculus@1982
   151
    int i;
icculus@1982
   152
icculus@10575
   153
    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
icculus@1982
   154
icculus@10575
   155
    for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
icculus@11987
   156
        *dst = ((float) (*src>>8)) * DIVBY8388607;
icculus@1982
   157
    }
icculus@1982
   158
icculus@1982
   159
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   160
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@1982
   161
    }
icculus@1982
   162
}
icculus@1982
   163
icculus@10815
   164
static void SDLCALL
icculus@10815
   165
SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   166
{
icculus@10575
   167
    const float *src = (const float *) cvt->buf;
icculus@10575
   168
    Sint8 *dst = (Sint8 *) cvt->buf;
icculus@1982
   169
    int i;
icculus@1982
   170
icculus@10575
   171
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
icculus@1982
   172
icculus@10575
   173
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   174
        const float sample = *src;
icculus@11989
   175
        if (sample >= 1.0f) {
slouken@11633
   176
            *dst = 127;
icculus@11989
   177
        } else if (sample <= -1.0f) {
icculus@11990
   178
            *dst = -128;
slouken@11633
   179
        } else {
slouken@11633
   180
            *dst = (Sint8)(sample * 127.0f);
slouken@11633
   181
        }
icculus@1982
   182
    }
icculus@1982
   183
icculus@10575
   184
    cvt->len_cvt /= 4;
icculus@1982
   185
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   186
        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
icculus@1982
   187
    }
icculus@1982
   188
}
icculus@1982
   189
icculus@10815
   190
static void SDLCALL
icculus@10815
   191
SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   192
{
icculus@10575
   193
    const float *src = (const float *) cvt->buf;
icculus@10575
   194
    Uint8 *dst = (Uint8 *) cvt->buf;
icculus@1982
   195
    int i;
icculus@1982
   196
icculus@10575
   197
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
icculus@1982
   198
icculus@10575
   199
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   200
        const float sample = *src;
icculus@11989
   201
        if (sample >= 1.0f) {
slouken@11633
   202
            *dst = 255;
icculus@11989
   203
        } else if (sample <= -1.0f) {
slouken@11633
   204
            *dst = 0;
slouken@11633
   205
        } else {
slouken@11633
   206
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
slouken@11633
   207
        }
icculus@1982
   208
    }
icculus@1982
   209
icculus@10575
   210
    cvt->len_cvt /= 4;
icculus@1982
   211
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   212
        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
icculus@1982
   213
    }
icculus@1982
   214
}
icculus@1982
   215
icculus@10815
   216
static void SDLCALL
icculus@10815
   217
SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   218
{
icculus@10575
   219
    const float *src = (const float *) cvt->buf;
icculus@10575
   220
    Sint16 *dst = (Sint16 *) cvt->buf;
icculus@1982
   221
    int i;
icculus@1982
   222
icculus@10575
   223
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
icculus@1982
   224
icculus@10575
   225
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   226
        const float sample = *src;
icculus@11989
   227
        if (sample >= 1.0f) {
slouken@11633
   228
            *dst = 32767;
icculus@11989
   229
        } else if (sample <= -1.0f) {
icculus@11990
   230
            *dst = -32768;
slouken@11633
   231
        } else {
slouken@11633
   232
            *dst = (Sint16)(sample * 32767.0f);
slouken@11633
   233
        }
icculus@1982
   234
    }
icculus@1982
   235
icculus@1982
   236
    cvt->len_cvt /= 2;
icculus@1982
   237
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   238
        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
icculus@1982
   239
    }
icculus@1982
   240
}
icculus@1982
   241
icculus@10815
   242
static void SDLCALL
icculus@10815
   243
SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   244
{
icculus@10575
   245
    const float *src = (const float *) cvt->buf;
icculus@10575
   246
    Uint16 *dst = (Uint16 *) cvt->buf;
icculus@1982
   247
    int i;
icculus@1982
   248
icculus@10575
   249
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
icculus@1982
   250
icculus@10575
   251
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   252
        const float sample = *src;
icculus@11989
   253
        if (sample >= 1.0f) {
icculus@11990
   254
            *dst = 65535;
icculus@11989
   255
        } else if (sample <= -1.0f) {
slouken@11633
   256
            *dst = 0;
slouken@11633
   257
        } else {
slouken@11633
   258
            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
slouken@11633
   259
        }
icculus@1982
   260
    }
icculus@1982
   261
icculus@1982
   262
    cvt->len_cvt /= 2;
icculus@1982
   263
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   264
        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
icculus@1982
   265
    }
icculus@1982
   266
}
icculus@1982
   267
icculus@10815
   268
static void SDLCALL
icculus@10815
   269
SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   270
{
icculus@10575
   271
    const float *src = (const float *) cvt->buf;
icculus@10575
   272
    Sint32 *dst = (Sint32 *) cvt->buf;
icculus@1982
   273
    int i;
icculus@1982
   274
icculus@10575
   275
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
icculus@1982
   276
icculus@10575
   277
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   278
        const float sample = *src;
icculus@11989
   279
        if (sample >= 1.0f) {
slouken@11633
   280
            *dst = 2147483647;
icculus@11989
   281
        } else if (sample <= -1.0f) {
icculus@11993
   282
            *dst = (Sint32) -2147483648LL;
slouken@11633
   283
        } else {
icculus@11987
   284
            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
slouken@11633
   285
        }
icculus@1982
   286
    }
icculus@1982
   287
icculus@1982
   288
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   289
        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
icculus@1982
   290
    }
icculus@1982
   291
}
icculus@10815
   292
#endif
icculus@10815
   293
icculus@10815
   294
icculus@10815
   295
#if HAVE_SSE2_INTRINSICS
icculus@10815
   296
static void SDLCALL
icculus@10815
   297
SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   298
{
icculus@10815
   299
    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10815
   300
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@10815
   301
    int i;
icculus@10815
   302
icculus@10815
   303
    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
icculus@10815
   304
icculus@10815
   305
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@10815
   306
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
icculus@11403
   307
        *dst = ((float) *src) * DIVBY128;
icculus@10815
   308
    }
icculus@10815
   309
icculus@10815
   310
    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
icculus@10815
   311
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   312
icculus@10815
   313
    /* Make sure src is aligned too. */
icculus@10815
   314
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   315
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   316
        const __m128i *mmsrc = (const __m128i *) src;
icculus@10815
   317
        const __m128i zero = _mm_setzero_si128();
icculus@11403
   318
        const __m128 divby128 = _mm_set1_ps(DIVBY128);
icculus@10815
   319
        while (i >= 16) {   /* 16 * 8-bit */
icculus@10815
   320
            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
icculus@10815
   321
            /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
icculus@10815
   322
            const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
icculus@10815
   323
            /* right-shift-sign-extend gets us sint16 with the other set of values. */
icculus@10815
   324
            const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
icculus@10815
   325
            /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
icculus@11403
   326
            const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
icculus@11403
   327
            const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
icculus@11403
   328
            const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
icculus@11403
   329
            const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
icculus@10815
   330
            /* Interleave back into correct order, store. */
icculus@10815
   331
            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
icculus@10815
   332
            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
icculus@10815
   333
            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
icculus@10815
   334
            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
icculus@10815
   335
            i -= 16; mmsrc--; dst -= 16;
icculus@10815
   336
        }
icculus@10815
   337
icculus@10815
   338
        src = (const Sint8 *) mmsrc;
icculus@10815
   339
    }
icculus@10815
   340
icculus@10815
   341
    src += 15; dst += 15;  /* adjust for any scalar finishing. */
icculus@10815
   342
icculus@10815
   343
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   344
    while (i) {
icculus@11403
   345
        *dst = ((float) *src) * DIVBY128;
icculus@10815
   346
        i--; src--; dst--;
icculus@10815
   347
    }
icculus@10815
   348
icculus@10815
   349
    cvt->len_cvt *= 4;
icculus@10815
   350
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   351
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   352
    }
icculus@10815
   353
}
icculus@10815
   354
icculus@10815
   355
static void SDLCALL
icculus@10815
   356
SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   357
{
icculus@10815
   358
    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10815
   359
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@10815
   360
    int i;
icculus@10815
   361
icculus@10815
   362
    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
icculus@10815
   363
icculus@10815
   364
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@10815
   365
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
icculus@11403
   366
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@10815
   367
    }
icculus@10815
   368
icculus@10815
   369
    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
icculus@10815
   370
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   371
icculus@10815
   372
    /* Make sure src is aligned too. */
icculus@10815
   373
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   374
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   375
        const __m128i *mmsrc = (const __m128i *) src;
icculus@10815
   376
        const __m128i zero = _mm_setzero_si128();
icculus@11403
   377
        const __m128 divby128 = _mm_set1_ps(DIVBY128);
icculus@10815
   378
        const __m128 minus1 = _mm_set1_ps(-1.0f);
icculus@10815
   379
        while (i >= 16) {   /* 16 * 8-bit */
icculus@10815
   380
            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
icculus@10815
   381
            /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
icculus@10815
   382
            const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
icculus@10815
   383
            /* right-shift-zero-extend gets us uint16 with the other set of values. */
icculus@10815
   384
            const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
icculus@10815
   385
            /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
icculus@10815
   386
            /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
icculus@11403
   387
            const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
icculus@11403
   388
            const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
icculus@11403
   389
            const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
icculus@11403
   390
            const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
icculus@10815
   391
            /* Interleave back into correct order, store. */
icculus@10815
   392
            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
icculus@10815
   393
            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
icculus@10815
   394
            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
icculus@10815
   395
            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
icculus@10815
   396
            i -= 16; mmsrc--; dst -= 16;
icculus@10815
   397
        }
icculus@10815
   398
icculus@10815
   399
        src = (const Uint8 *) mmsrc;
icculus@10815
   400
    }
icculus@10815
   401
icculus@10815
   402
    src += 15; dst += 15;  /* adjust for any scalar finishing. */
icculus@10815
   403
icculus@10815
   404
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   405
    while (i) {
icculus@11403
   406
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@10815
   407
        i--; src--; dst--;
icculus@10815
   408
    }
icculus@10815
   409
icculus@10815
   410
    cvt->len_cvt *= 4;
icculus@10815
   411
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   412
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   413
    }
icculus@10815
   414
}
icculus@10815
   415
icculus@10815
   416
static void SDLCALL
icculus@10815
   417
SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   418
{
icculus@10815
   419
    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10815
   420
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@10815
   421
    int i;
icculus@10815
   422
icculus@10815
   423
    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
icculus@10815
   424
icculus@10815
   425
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@10815
   426
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
icculus@11403
   427
        *dst = ((float) *src) * DIVBY32768;
icculus@10815
   428
    }
icculus@10815
   429
icculus@10815
   430
    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
icculus@10815
   431
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   432
icculus@10815
   433
    /* Make sure src is aligned too. */
icculus@10815
   434
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   435
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11403
   436
        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
icculus@10815
   437
        while (i >= 8) {   /* 8 * 16-bit */
icculus@10815
   438
            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
icculus@10815
   439
            /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
icculus@10815
   440
            const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
icculus@10815
   441
            /* right-shift-sign-extend gets us sint32 with the other set of values. */
icculus@10815
   442
            const __m128i b = _mm_srai_epi32(ints, 16);
icculus@10815
   443
            /* Interleave these back into the right order, convert to float, multiply, store. */
icculus@11403
   444
            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
icculus@11403
   445
            _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
icculus@10815
   446
            i -= 8; src -= 8; dst -= 8;
icculus@10815
   447
        }
icculus@10815
   448
    }
icculus@10815
   449
icculus@10815
   450
    src += 7; dst += 7;  /* adjust for any scalar finishing. */
icculus@10815
   451
icculus@10815
   452
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   453
    while (i) {
icculus@11403
   454
        *dst = ((float) *src) * DIVBY32768;
icculus@10815
   455
        i--; src--; dst--;
icculus@10815
   456
    }
icculus@10815
   457
icculus@10815
   458
    cvt->len_cvt *= 2;
icculus@10815
   459
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   460
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   461
    }
icculus@10815
   462
}
icculus@10815
   463
icculus@10815
   464
static void SDLCALL
icculus@10815
   465
SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   466
{
icculus@10815
   467
    const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10815
   468
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@10815
   469
    int i;
icculus@10815
   470
icculus@10815
   471
    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
icculus@10815
   472
icculus@10815
   473
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@10815
   474
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
icculus@11403
   475
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@10815
   476
    }
icculus@10815
   477
icculus@10815
   478
    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
icculus@10815
   479
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   480
icculus@10815
   481
    /* Make sure src is aligned too. */
icculus@10815
   482
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   483
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11403
   484
        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
icculus@10815
   485
        const __m128 minus1 = _mm_set1_ps(1.0f);
icculus@10815
   486
        while (i >= 8) {   /* 8 * 16-bit */
icculus@10815
   487
            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
icculus@10815
   488
            /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
icculus@10815
   489
            const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
icculus@10815
   490
            /* right-shift-sign-extend gets us sint32 with the other set of values. */
icculus@10815
   491
            const __m128i b = _mm_srli_epi32(ints, 16);
icculus@10815
   492
            /* Interleave these back into the right order, convert to float, multiply, store. */
icculus@11403
   493
            _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
icculus@11403
   494
            _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
icculus@10815
   495
            i -= 8; src -= 8; dst -= 8;
icculus@10815
   496
        }
icculus@10815
   497
    }
icculus@10815
   498
icculus@10815
   499
    src += 7; dst += 7;  /* adjust for any scalar finishing. */
icculus@10815
   500
icculus@10815
   501
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   502
    while (i) {
icculus@11403
   503
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@10815
   504
        i--; src--; dst--;
icculus@10815
   505
    }
icculus@10815
   506
icculus@10815
   507
    cvt->len_cvt *= 2;
icculus@10815
   508
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   509
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   510
    }
icculus@10815
   511
}
icculus@10815
   512
icculus@10815
   513
static void SDLCALL
icculus@10815
   514
SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   515
{
icculus@10815
   516
    const Sint32 *src = (const Sint32 *) cvt->buf;
icculus@10815
   517
    float *dst = (float *) cvt->buf;
icculus@10815
   518
    int i;
icculus@10815
   519
icculus@10815
   520
    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
icculus@10815
   521
icculus@10815
   522
    /* Get dst aligned to 16 bytes */
icculus@10815
   523
    for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11987
   524
        *dst = ((float) (*src>>8)) * DIVBY8388607;
icculus@10815
   525
    }
icculus@10815
   526
icculus@10815
   527
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   528
    SDL_assert(!i || ((((size_t) src) & 15) == 0));
icculus@10815
   529
icculus@10815
   530
    {
icculus@10815
   531
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11987
   532
        const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
icculus@10815
   533
        const __m128i *mmsrc = (const __m128i *) src;
icculus@10815
   534
        while (i >= 4) {   /* 4 * sint32 */
icculus@11987
   535
            /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
icculus@11987
   536
            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srli_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
icculus@10815
   537
            i -= 4; mmsrc++; dst += 4;
icculus@10815
   538
        }
icculus@10815
   539
        src = (const Sint32 *) mmsrc;
icculus@10815
   540
    }
icculus@10815
   541
icculus@10815
   542
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   543
    while (i) {
icculus@11987
   544
        *dst = ((float) (*src>>8)) * DIVBY8388607;
icculus@10815
   545
        i--; src++; dst++;
icculus@10815
   546
    }
icculus@10815
   547
icculus@10815
   548
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   549
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   550
    }
icculus@10815
   551
}
icculus@10815
   552
icculus@10815
   553
static void SDLCALL
icculus@10815
   554
SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   555
{
icculus@10815
   556
    const float *src = (const float *) cvt->buf;
icculus@10815
   557
    Sint8 *dst = (Sint8 *) cvt->buf;
icculus@10815
   558
    int i;
icculus@10815
   559
icculus@10815
   560
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
icculus@10815
   561
icculus@10815
   562
    /* Get dst aligned to 16 bytes */
icculus@10815
   563
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11991
   564
        const float sample = *src;
icculus@11991
   565
        if (sample >= 1.0f) {
icculus@11991
   566
            *dst = 127;
icculus@11991
   567
        } else if (sample <= -1.0f) {
icculus@11991
   568
            *dst = -128;
icculus@11991
   569
        } else {
icculus@11991
   570
            *dst = (Sint8)(sample * 127.0f);
icculus@11991
   571
        }
icculus@10815
   572
    }
icculus@10815
   573
icculus@10815
   574
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   575
icculus@10815
   576
    /* Make sure src is aligned too. */
icculus@10815
   577
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   578
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11991
   579
        const __m128 one = _mm_set1_ps(1.0f);
icculus@11991
   580
        const __m128 negone = _mm_set1_ps(-1.0f);
icculus@10815
   581
        const __m128 mulby127 = _mm_set1_ps(127.0f);
icculus@10815
   582
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   583
        while (i >= 16) {   /* 16 * float32 */
icculus@11991
   584
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   585
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   586
            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   587
            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@10815
   588
            _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
icculus@10815
   589
            i -= 16; src += 16; mmdst++;
icculus@10815
   590
        }
icculus@10815
   591
        dst = (Sint8 *) mmdst;
icculus@10815
   592
    }
icculus@10815
   593
icculus@10815
   594
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   595
    while (i) {
icculus@11991
   596
        const float sample = *src;
icculus@11991
   597
        if (sample >= 1.0f) {
icculus@11991
   598
            *dst = 127;
icculus@11991
   599
        } else if (sample <= -1.0f) {
icculus@11991
   600
            *dst = -128;
icculus@11991
   601
        } else {
icculus@11991
   602
            *dst = (Sint8)(sample * 127.0f);
icculus@11991
   603
        }
icculus@10815
   604
        i--; src++; dst++;
icculus@10815
   605
    }
icculus@10815
   606
icculus@10815
   607
    cvt->len_cvt /= 4;
icculus@10815
   608
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   609
        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
icculus@10815
   610
    }
icculus@10815
   611
}
icculus@10815
   612
icculus@10815
   613
static void SDLCALL
icculus@10815
   614
SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   615
{
icculus@10815
   616
    const float *src = (const float *) cvt->buf;
icculus@10815
   617
    Uint8 *dst = (Uint8 *) cvt->buf;
icculus@10815
   618
    int i;
icculus@10815
   619
icculus@10815
   620
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
icculus@10815
   621
icculus@10815
   622
    /* Get dst aligned to 16 bytes */
icculus@10815
   623
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11991
   624
        const float sample = *src;
icculus@11991
   625
        if (sample >= 1.0f) {
icculus@11991
   626
            *dst = 255;
icculus@11991
   627
        } else if (sample <= -1.0f) {
icculus@11991
   628
            *dst = 0;
icculus@11991
   629
        } else {
icculus@11991
   630
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
icculus@11991
   631
        }
icculus@10815
   632
    }
icculus@10815
   633
icculus@10815
   634
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   635
icculus@10815
   636
    /* Make sure src is aligned too. */
icculus@10815
   637
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   638
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11991
   639
        const __m128 one = _mm_set1_ps(1.0f);
icculus@11991
   640
        const __m128 negone = _mm_set1_ps(-1.0f);
icculus@10815
   641
        const __m128 mulby127 = _mm_set1_ps(127.0f);
icculus@10815
   642
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   643
        while (i >= 16) {   /* 16 * float32 */
icculus@11991
   644
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   645
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   646
            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   647
            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@10815
   648
            _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
icculus@10815
   649
            i -= 16; src += 16; mmdst++;
icculus@10815
   650
        }
icculus@10815
   651
        dst = (Uint8 *) mmdst;
icculus@10815
   652
    }
icculus@10815
   653
icculus@10815
   654
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   655
    while (i) {
icculus@11991
   656
        const float sample = *src;
icculus@11991
   657
        if (sample >= 1.0f) {
icculus@11991
   658
            *dst = 255;
icculus@11991
   659
        } else if (sample <= -1.0f) {
icculus@11991
   660
            *dst = 0;
icculus@11991
   661
        } else {
icculus@11991
   662
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
icculus@11991
   663
        }
icculus@10815
   664
        i--; src++; dst++;
icculus@10815
   665
    }
icculus@10815
   666
icculus@10815
   667
    cvt->len_cvt /= 4;
icculus@10815
   668
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   669
        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
icculus@10815
   670
    }
icculus@10815
   671
}
icculus@10815
   672
icculus@10815
   673
static void SDLCALL
icculus@10815
   674
SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   675
{
icculus@10815
   676
    const float *src = (const float *) cvt->buf;
icculus@10815
   677
    Sint16 *dst = (Sint16 *) cvt->buf;
icculus@10815
   678
    int i;
icculus@10815
   679
icculus@10815
   680
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
icculus@10815
   681
icculus@10815
   682
    /* Get dst aligned to 16 bytes */
icculus@10815
   683
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11991
   684
        const float sample = *src;
icculus@11991
   685
        if (sample >= 1.0f) {
icculus@11991
   686
            *dst = 32767;
icculus@11991
   687
        } else if (sample <= -1.0f) {
icculus@11991
   688
            *dst = -32768;
icculus@11991
   689
        } else {
icculus@11991
   690
            *dst = (Sint16)(sample * 32767.0f);
icculus@11991
   691
        }
icculus@10815
   692
    }
icculus@10815
   693
icculus@10815
   694
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   695
icculus@10815
   696
    /* Make sure src is aligned too. */
icculus@10815
   697
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   698
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11991
   699
        const __m128 one = _mm_set1_ps(1.0f);
icculus@11991
   700
        const __m128 negone = _mm_set1_ps(-1.0f);
icculus@10815
   701
        const __m128 mulby32767 = _mm_set1_ps(32767.0f);
icculus@10815
   702
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   703
        while (i >= 8) {   /* 8 * float32 */
icculus@11991
   704
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   705
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
icculus@10815
   706
            _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
icculus@10815
   707
            i -= 8; src += 8; mmdst++;
icculus@10815
   708
        }
icculus@10815
   709
        dst = (Sint16 *) mmdst;
icculus@10815
   710
    }
icculus@10815
   711
icculus@10815
   712
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   713
    while (i) {
icculus@11991
   714
        const float sample = *src;
icculus@11991
   715
        if (sample >= 1.0f) {
icculus@11991
   716
            *dst = 32767;
icculus@11991
   717
        } else if (sample <= -1.0f) {
icculus@11991
   718
            *dst = -32768;
icculus@11991
   719
        } else {
icculus@11991
   720
            *dst = (Sint16)(sample * 32767.0f);
icculus@11991
   721
        }
icculus@10815
   722
        i--; src++; dst++;
icculus@10815
   723
    }
icculus@10815
   724
icculus@10815
   725
    cvt->len_cvt /= 2;
icculus@10815
   726
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   727
        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
icculus@10815
   728
    }
icculus@10815
   729
}
icculus@10815
   730
icculus@10815
   731
static void SDLCALL
icculus@10815
   732
SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   733
{
icculus@10815
   734
    const float *src = (const float *) cvt->buf;
icculus@10815
   735
    Uint16 *dst = (Uint16 *) cvt->buf;
icculus@10815
   736
    int i;
icculus@10815
   737
icculus@10815
   738
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
icculus@10815
   739
icculus@10815
   740
    /* Get dst aligned to 16 bytes */
icculus@10815
   741
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11991
   742
        const float sample = *src;
icculus@11991
   743
        if (sample >= 1.0f) {
icculus@11991
   744
            *dst = 65535;
icculus@11991
   745
        } else if (sample <= -1.0f) {
icculus@11991
   746
            *dst = 0;
icculus@11991
   747
        } else {
icculus@11991
   748
            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
icculus@11991
   749
        }
icculus@10815
   750
    }
icculus@10815
   751
icculus@10815
   752
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   753
icculus@10815
   754
    /* Make sure src is aligned too. */
icculus@10815
   755
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   756
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   757
        /* This calculates differently than the scalar path because SSE2 can't
icculus@10815
   758
           pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
icculus@10815
   759
           saturation, so that would corrupt our data. _mm_packus_epi32 exists,
icculus@10815
   760
           but not before SSE 4.1. So we convert from float to sint16, packing
icculus@10815
   761
           that down with legit signed saturation, and then xor the top bit
icculus@10815
   762
           against 1. This results in the correct unsigned 16-bit value, even
icculus@10815
   763
           though it looks like dark magic. */
icculus@10815
   764
        const __m128 mulby32767 = _mm_set1_ps(32767.0f);
icculus@10815
   765
        const __m128i topbit = _mm_set1_epi16(-32768);
icculus@11991
   766
        const __m128 one = _mm_set1_ps(1.0f);
icculus@11991
   767
        const __m128 negone = _mm_set1_ps(-1.0f);
icculus@10815
   768
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   769
        while (i >= 8) {   /* 8 * float32 */
icculus@11991
   770
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
icculus@11991
   771
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
icculus@10815
   772
            _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
icculus@10815
   773
            i -= 8; src += 8; mmdst++;
icculus@10815
   774
        }
icculus@10815
   775
        dst = (Uint16 *) mmdst;
icculus@10815
   776
    }
icculus@10815
   777
icculus@10815
   778
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   779
    while (i) {
icculus@11991
   780
        const float sample = *src;
icculus@11991
   781
        if (sample >= 1.0f) {
icculus@11991
   782
            *dst = 65535;
icculus@11991
   783
        } else if (sample <= -1.0f) {
icculus@11991
   784
            *dst = 0;
icculus@11991
   785
        } else {
icculus@11991
   786
            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
icculus@11991
   787
        }
icculus@10815
   788
        i--; src++; dst++;
icculus@10815
   789
    }
icculus@10815
   790
icculus@10815
   791
    cvt->len_cvt /= 2;
icculus@10815
   792
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   793
        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
icculus@10815
   794
    }
icculus@10815
   795
}
icculus@10815
   796
icculus@10815
   797
static void SDLCALL
icculus@10815
   798
SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   799
{
icculus@10815
   800
    const float *src = (const float *) cvt->buf;
icculus@10815
   801
    Sint32 *dst = (Sint32 *) cvt->buf;
icculus@10815
   802
    int i;
icculus@10815
   803
icculus@10815
   804
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
icculus@10815
   805
icculus@10815
   806
    /* Get dst aligned to 16 bytes */
icculus@10815
   807
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11991
   808
        const float sample = *src;
icculus@11991
   809
        if (sample >= 1.0f) {
icculus@11991
   810
            *dst = 2147483647;
icculus@11991
   811
        } else if (sample <= -1.0f) {
icculus@11991
   812
            *dst = -2147483648;
icculus@11991
   813
        } else {
icculus@11991
   814
            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
icculus@11991
   815
        }
icculus@10815
   816
    }
icculus@10815
   817
icculus@10815
   818
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   819
    SDL_assert(!i || ((((size_t) src) & 15) == 0));
icculus@10815
   820
icculus@10815
   821
    {
icculus@10815
   822
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11991
   823
        const __m128 one = _mm_set1_ps(1.0f);
icculus@11991
   824
        const __m128 negone = _mm_set1_ps(-1.0f);
icculus@11987
   825
        const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
icculus@10815
   826
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   827
        while (i >= 4) {   /* 4 * float32 */
icculus@11991
   828
            _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8));  /* load 4 floats, clamp, convert to sint32 */
icculus@10815
   829
            i -= 4; src += 4; mmdst++;
icculus@10815
   830
        }
icculus@10815
   831
        dst = (Sint32 *) mmdst;
icculus@10815
   832
    }
icculus@10815
   833
icculus@10815
   834
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   835
    while (i) {
icculus@11991
   836
        const float sample = *src;
icculus@11991
   837
        if (sample >= 1.0f) {
icculus@11991
   838
            *dst = 2147483647;
icculus@11991
   839
        } else if (sample <= -1.0f) {
icculus@11991
   840
            *dst = -2147483648;
icculus@11991
   841
        } else {
icculus@11991
   842
            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
icculus@11991
   843
        }
icculus@10815
   844
        i--; src++; dst++;
icculus@10815
   845
    }
icculus@10815
   846
icculus@10815
   847
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   848
        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
icculus@10815
   849
    }
icculus@10815
   850
}
icculus@10815
   851
#endif
icculus@10815
   852
icculus@10815
   853
icculus@11992
   854
#if HAVE_NEON_INTRINSICS
icculus@11992
   855
static void SDLCALL
icculus@11992
   856
SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
   857
{
icculus@11992
   858
    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@11992
   859
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@11992
   860
    int i;
icculus@11992
   861
icculus@11992
   862
    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using NEON)");
icculus@11992
   863
icculus@11992
   864
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@11992
   865
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
icculus@11992
   866
        *dst = ((float) *src) * DIVBY128;
icculus@11992
   867
    }
icculus@11992
   868
icculus@11992
   869
    src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
icculus@11992
   870
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
   871
icculus@11992
   872
    /* Make sure src is aligned too. */
icculus@11992
   873
    if ((((size_t) src) & 15) == 0) {
icculus@11992
   874
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
   875
        const int8_t *mmsrc = (const int8_t *) src;
icculus@11992
   876
        const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
icculus@11992
   877
        while (i >= 16) {   /* 16 * 8-bit */
icculus@11992
   878
            const int8x16_t bytes = vld1q_s8(mmsrc);  /* get 16 sint8 into a NEON register. */
icculus@11992
   879
            const int16x8_t int16hi = vmovl_s8(vget_high_s8(bytes));  /* convert top 8 bytes to 8 int16 */
icculus@11992
   880
            const int16x8_t int16lo = vmovl_s8(vget_low_s8(bytes));   /* convert bottom 8 bytes to 8 int16 */
icculus@11992
   881
            /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
icculus@11992
   882
            vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16hi))), divby128));
icculus@11992
   883
            vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16hi))), divby128));
icculus@11992
   884
            vst1q_f32(dst+8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16lo))), divby128));
icculus@11992
   885
            vst1q_f32(dst+12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16lo))), divby128));
icculus@11992
   886
            i -= 16; mmsrc -= 16; dst -= 16;
icculus@11992
   887
        }
icculus@11992
   888
icculus@11992
   889
        src = (const Sint8 *) mmsrc;
icculus@11992
   890
    }
icculus@11992
   891
icculus@11992
   892
    src += 15; dst += 15;  /* adjust for any scalar finishing. */
icculus@11992
   893
icculus@11992
   894
    /* Finish off any leftovers with scalar operations. */
icculus@11992
   895
    while (i) {
icculus@11992
   896
        *dst = ((float) *src) * DIVBY128;
icculus@11992
   897
        i--; src--; dst--;
icculus@11992
   898
    }
icculus@11992
   899
icculus@11992
   900
    cvt->len_cvt *= 4;
icculus@11992
   901
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
   902
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@11992
   903
    }
icculus@11992
   904
}
icculus@11992
   905
icculus@11992
   906
static void SDLCALL
icculus@11992
   907
SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
   908
{
icculus@11992
   909
    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@11992
   910
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@11992
   911
    int i;
icculus@11992
   912
icculus@11992
   913
    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using NEON)");
icculus@11992
   914
icculus@11992
   915
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@11992
   916
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
icculus@11992
   917
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@11992
   918
    }
icculus@11992
   919
icculus@11992
   920
    src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
icculus@11992
   921
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
   922
icculus@11992
   923
    /* Make sure src is aligned too. */
icculus@11992
   924
    if ((((size_t) src) & 15) == 0) {
icculus@11992
   925
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
   926
        const uint8_t *mmsrc = (const uint8_t *) src;
icculus@11992
   927
        const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
icculus@11992
   928
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
   929
        while (i >= 16) {   /* 16 * 8-bit */
icculus@11992
   930
            const uint8x16_t bytes = vld1q_u8(mmsrc);  /* get 16 uint8 into a NEON register. */
icculus@11992
   931
            const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes));  /* convert top 8 bytes to 8 uint16 */
icculus@11992
   932
            const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes));   /* convert bottom 8 bytes to 8 uint16 */
icculus@11992
   933
            /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
icculus@11992
   934
            vst1q_f32(dst, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128, one));
icculus@11992
   935
            vst1q_f32(dst+4, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128, one));
icculus@11992
   936
            vst1q_f32(dst+8, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128, one));
icculus@11992
   937
            vst1q_f32(dst+12, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128, one));
icculus@11992
   938
            i -= 16; mmsrc -= 16; dst -= 16;
icculus@11992
   939
        }
icculus@11992
   940
icculus@11992
   941
        src = (const Uint8 *) mmsrc;
icculus@11992
   942
    }
icculus@11992
   943
icculus@11992
   944
    src += 15; dst += 15;  /* adjust for any scalar finishing. */
icculus@11992
   945
icculus@11992
   946
    /* Finish off any leftovers with scalar operations. */
icculus@11992
   947
    while (i) {
icculus@11992
   948
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@11992
   949
        i--; src--; dst--;
icculus@11992
   950
    }
icculus@11992
   951
icculus@11992
   952
    cvt->len_cvt *= 4;
icculus@11992
   953
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
   954
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@11992
   955
    }
icculus@11992
   956
}
icculus@11992
   957
icculus@11992
   958
static void SDLCALL
icculus@11992
   959
SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
   960
{
icculus@11992
   961
    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@11992
   962
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@11992
   963
    int i;
icculus@11992
   964
icculus@11992
   965
    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using NEON)");
icculus@11992
   966
icculus@11992
   967
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@11992
   968
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
icculus@11992
   969
        *dst = ((float) *src) * DIVBY32768;
icculus@11992
   970
    }
icculus@11992
   971
icculus@11992
   972
    src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
icculus@11992
   973
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
   974
icculus@11992
   975
    /* Make sure src is aligned too. */
icculus@11992
   976
    if ((((size_t) src) & 15) == 0) {
icculus@11992
   977
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
   978
        const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
icculus@11992
   979
        while (i >= 8) {   /* 8 * 16-bit */
icculus@11992
   980
            const int16x8_t ints = vld1q_s16((int16_t const *) src);  /* get 8 sint16 into a NEON register. */
icculus@11992
   981
            /* split int16 to two int32, then convert to float, then multiply to normalize, store. */
icculus@11992
   982
            vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
icculus@11992
   983
            vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
icculus@11992
   984
            i -= 8; src -= 8; dst -= 8;
icculus@11992
   985
        }
icculus@11992
   986
    }
icculus@11992
   987
icculus@11992
   988
    src += 7; dst += 7;  /* adjust for any scalar finishing. */
icculus@11992
   989
icculus@11992
   990
    /* Finish off any leftovers with scalar operations. */
icculus@11992
   991
    while (i) {
icculus@11992
   992
        *dst = ((float) *src) * DIVBY32768;
icculus@11992
   993
        i--; src--; dst--;
icculus@11992
   994
    }
icculus@11992
   995
icculus@11992
   996
    cvt->len_cvt *= 2;
icculus@11992
   997
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
   998
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@11992
   999
    }
icculus@11992
  1000
}
icculus@11992
  1001
icculus@11992
  1002
static void SDLCALL
icculus@11992
  1003
SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1004
{
icculus@11992
  1005
    const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@11992
  1006
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@11992
  1007
    int i;
icculus@11992
  1008
icculus@11992
  1009
    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using NEON)");
icculus@11992
  1010
icculus@11992
  1011
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@11992
  1012
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
icculus@11992
  1013
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@11992
  1014
    }
icculus@11992
  1015
icculus@11992
  1016
    src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
icculus@11992
  1017
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1018
icculus@11992
  1019
    /* Make sure src is aligned too. */
icculus@11992
  1020
    if ((((size_t) src) & 15) == 0) {
icculus@11992
  1021
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1022
        const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
icculus@11992
  1023
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
  1024
        while (i >= 8) {   /* 8 * 16-bit */
icculus@11992
  1025
            const uint16x8_t uints = vld1q_u16((uint16_t const *) src);  /* get 8 uint16 into a NEON register. */
icculus@11992
  1026
            /* split uint16 to two int32, then convert to float, then multiply to normalize, subtract for sign, store. */
icculus@11992
  1027
            vst1q_f32(dst, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768));
icculus@11992
  1028
            vst1q_f32(dst+4, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768));
icculus@11992
  1029
            i -= 8; src -= 8; dst -= 8;
icculus@11992
  1030
        }
icculus@11992
  1031
    }
icculus@11992
  1032
icculus@11992
  1033
    src += 7; dst += 7;  /* adjust for any scalar finishing. */
icculus@11992
  1034
icculus@11992
  1035
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1036
    while (i) {
icculus@11992
  1037
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@11992
  1038
        i--; src--; dst--;
icculus@11992
  1039
    }
icculus@11992
  1040
icculus@11992
  1041
    cvt->len_cvt *= 2;
icculus@11992
  1042
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1043
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@11992
  1044
    }
icculus@11992
  1045
}
icculus@11992
  1046
icculus@11992
  1047
static void SDLCALL
icculus@11992
  1048
SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1049
{
icculus@11992
  1050
    const Sint32 *src = (const Sint32 *) cvt->buf;
icculus@11992
  1051
    float *dst = (float *) cvt->buf;
icculus@11992
  1052
    int i;
icculus@11992
  1053
icculus@11992
  1054
    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using NEON)");
icculus@11992
  1055
icculus@11992
  1056
    /* Get dst aligned to 16 bytes */
icculus@11992
  1057
    for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11992
  1058
        *dst = ((float) (*src>>8)) * DIVBY8388607;
icculus@11992
  1059
    }
icculus@11992
  1060
icculus@11992
  1061
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1062
    SDL_assert(!i || ((((size_t) src) & 15) == 0));
icculus@11992
  1063
icculus@11992
  1064
    {
icculus@11992
  1065
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1066
        const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
icculus@11992
  1067
        const int32_t *mmsrc = (const int32_t *) src;
icculus@11992
  1068
        while (i >= 4) {   /* 4 * sint32 */
icculus@11992
  1069
            /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
icculus@11992
  1070
            vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
icculus@11992
  1071
            i -= 4; mmsrc += 4; dst += 4;
icculus@11992
  1072
        }
icculus@11992
  1073
        src = (const Sint32 *) mmsrc;
icculus@11992
  1074
    }
icculus@11992
  1075
icculus@11992
  1076
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1077
    while (i) {
icculus@11992
  1078
        *dst = ((float) (*src>>8)) * DIVBY8388607;
icculus@11992
  1079
        i--; src++; dst++;
icculus@11992
  1080
    }
icculus@11992
  1081
icculus@11992
  1082
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1083
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@11992
  1084
    }
icculus@11992
  1085
}
icculus@11992
  1086
icculus@11992
  1087
static void SDLCALL
icculus@11992
  1088
SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1089
{
icculus@11992
  1090
    const float *src = (const float *) cvt->buf;
icculus@11992
  1091
    Sint8 *dst = (Sint8 *) cvt->buf;
icculus@11992
  1092
    int i;
icculus@11992
  1093
icculus@11992
  1094
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using NEON)");
icculus@11992
  1095
icculus@11992
  1096
    /* Get dst aligned to 16 bytes */
icculus@11992
  1097
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11992
  1098
        const float sample = *src;
icculus@11992
  1099
        if (sample >= 1.0f) {
icculus@11992
  1100
            *dst = 127;
icculus@11992
  1101
        } else if (sample <= -1.0f) {
icculus@11992
  1102
            *dst = -128;
icculus@11992
  1103
        } else {
icculus@11992
  1104
            *dst = (Sint8)(sample * 127.0f);
icculus@11992
  1105
        }
icculus@11992
  1106
    }
icculus@11992
  1107
icculus@11992
  1108
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1109
icculus@11992
  1110
    /* Make sure src is aligned too. */
icculus@11992
  1111
    if ((((size_t) src) & 15) == 0) {
icculus@11992
  1112
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1113
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
  1114
        const float32x4_t negone = vdupq_n_f32(-1.0f);
icculus@11992
  1115
        const float32x4_t mulby127 = vdupq_n_f32(127.0f);
icculus@11992
  1116
        int8_t *mmdst = (int8_t *) dst;
icculus@11992
  1117
        while (i >= 16) {   /* 16 * float32 */
icculus@11992
  1118
            const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11992
  1119
            const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11992
  1120
            const int32x4_t ints3 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11992
  1121
            const int32x4_t ints4 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
icculus@11992
  1122
            const int8x8_t i8lo = vmovn_s16(vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, narrow to sint8 */
icculus@11992
  1123
            const int8x8_t i8hi = vmovn_s16(vcombine_s16(vmovn_s32(ints3), vmovn_s32(ints4))); /* narrow to sint16, combine, narrow to sint8 */
icculus@11992
  1124
            vst1q_s8(mmdst, vcombine_s8(i8lo, i8hi));  /* combine to int8x16_t, store out */
icculus@11992
  1125
            i -= 16; src += 16; mmdst += 16;
icculus@11992
  1126
        }
icculus@11992
  1127
        dst = (Sint8 *) mmdst;
icculus@11992
  1128
    }
icculus@11992
  1129
icculus@11992
  1130
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1131
    while (i) {
icculus@11992
  1132
        const float sample = *src;
icculus@11992
  1133
        if (sample >= 1.0f) {
icculus@11992
  1134
            *dst = 127;
icculus@11992
  1135
        } else if (sample <= -1.0f) {
icculus@11992
  1136
            *dst = -128;
icculus@11992
  1137
        } else {
icculus@11992
  1138
            *dst = (Sint8)(sample * 127.0f);
icculus@11992
  1139
        }
icculus@11992
  1140
        i--; src++; dst++;
icculus@11992
  1141
    }
icculus@11992
  1142
icculus@11992
  1143
    cvt->len_cvt /= 4;
icculus@11992
  1144
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1145
        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
icculus@11992
  1146
    }
icculus@11992
  1147
}
icculus@11992
  1148
icculus@11992
  1149
static void SDLCALL
icculus@11992
  1150
SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1151
{
icculus@11992
  1152
    const float *src = (const float *) cvt->buf;
icculus@11992
  1153
    Uint8 *dst = (Uint8 *) cvt->buf;
icculus@11992
  1154
    int i;
icculus@11992
  1155
icculus@11992
  1156
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using NEON)");
icculus@11992
  1157
icculus@11992
  1158
    /* Get dst aligned to 16 bytes */
icculus@11992
  1159
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11992
  1160
        const float sample = *src;
icculus@11992
  1161
        if (sample >= 1.0f) {
icculus@11992
  1162
            *dst = 255;
icculus@11992
  1163
        } else if (sample <= -1.0f) {
icculus@11992
  1164
            *dst = 0;
icculus@11992
  1165
        } else {
icculus@11992
  1166
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
icculus@11992
  1167
        }
icculus@11992
  1168
    }
icculus@11992
  1169
icculus@11992
  1170
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1171
icculus@11992
  1172
    /* Make sure src is aligned too. */
icculus@11992
  1173
    if ((((size_t) src) & 15) == 0) {
icculus@11992
  1174
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1175
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
  1176
        const float32x4_t negone = vdupq_n_f32(-1.0f);
icculus@11992
  1177
        const float32x4_t mulby127 = vdupq_n_f32(127.0f);
icculus@11992
  1178
        uint8_t *mmdst = (uint8_t *) dst;
icculus@11992
  1179
        while (i >= 16) {   /* 16 * float32 */
icculus@11992
  1180
            const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
icculus@11992
  1181
            const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
icculus@11992
  1182
            const uint32x4_t uints3 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
icculus@11992
  1183
            const uint32x4_t uints4 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to uint32 */
icculus@11992
  1184
            const uint8x8_t ui8lo = vmovn_u16(vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, narrow to uint8 */
icculus@11992
  1185
            const uint8x8_t ui8hi = vmovn_u16(vcombine_u16(vmovn_u32(uints3), vmovn_u32(uints4))); /* narrow to uint16, combine, narrow to uint8 */
icculus@11992
  1186
            vst1q_u8(mmdst, vcombine_u8(ui8lo, ui8hi));  /* combine to uint8x16_t, store out */
icculus@11992
  1187
            i -= 16; src += 16; mmdst += 16;
icculus@11992
  1188
        }
icculus@11992
  1189
icculus@11992
  1190
        dst = (Uint8 *) mmdst;
icculus@11992
  1191
    }
icculus@11992
  1192
icculus@11992
  1193
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1194
    while (i) {
icculus@11992
  1195
        const float sample = *src;
icculus@11992
  1196
        if (sample >= 1.0f) {
icculus@11992
  1197
            *dst = 255;
icculus@11992
  1198
        } else if (sample <= -1.0f) {
icculus@11992
  1199
            *dst = 0;
icculus@11992
  1200
        } else {
icculus@11992
  1201
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
icculus@11992
  1202
        }
icculus@11992
  1203
        i--; src++; dst++;
icculus@11992
  1204
    }
icculus@11992
  1205
icculus@11992
  1206
    cvt->len_cvt /= 4;
icculus@11992
  1207
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1208
        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
icculus@11992
  1209
    }
icculus@11992
  1210
}
icculus@11992
  1211
icculus@11992
  1212
static void SDLCALL
icculus@11992
  1213
SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1214
{
icculus@11992
  1215
    const float *src = (const float *) cvt->buf;
icculus@11992
  1216
    Sint16 *dst = (Sint16 *) cvt->buf;
icculus@11992
  1217
    int i;
icculus@11992
  1218
icculus@11992
  1219
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using NEON)");
icculus@11992
  1220
icculus@11992
  1221
    /* Get dst aligned to 16 bytes */
icculus@11992
  1222
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11992
  1223
        const float sample = *src;
icculus@11992
  1224
        if (sample >= 1.0f) {
icculus@11992
  1225
            *dst = 32767;
icculus@11992
  1226
        } else if (sample <= -1.0f) {
icculus@11992
  1227
            *dst = -32768;
icculus@11992
  1228
        } else {
icculus@11992
  1229
            *dst = (Sint16)(sample * 32767.0f);
icculus@11992
  1230
        }
icculus@11992
  1231
    }
icculus@11992
  1232
icculus@11992
  1233
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1234
icculus@11992
  1235
    /* Make sure src is aligned too. */
icculus@11992
  1236
    if ((((size_t) src) & 15) == 0) {
icculus@11992
  1237
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1238
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
  1239
        const float32x4_t negone = vdupq_n_f32(-1.0f);
icculus@11992
  1240
        const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
icculus@11992
  1241
        int16_t *mmdst = (int16_t *) dst;
icculus@11992
  1242
        while (i >= 8) {   /* 8 * float32 */
icculus@11992
  1243
            const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
icculus@11992
  1244
            const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
icculus@11992
  1245
            vst1q_s16(mmdst, vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2)));  /* narrow to sint16, combine, store out. */
icculus@11992
  1246
            i -= 8; src += 8; mmdst += 8;
icculus@11992
  1247
        }
icculus@11992
  1248
        dst = (Sint16 *) mmdst;
icculus@11992
  1249
    }
icculus@11992
  1250
icculus@11992
  1251
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1252
    while (i) {
icculus@11992
  1253
        const float sample = *src;
icculus@11992
  1254
        if (sample >= 1.0f) {
icculus@11992
  1255
            *dst = 32767;
icculus@11992
  1256
        } else if (sample <= -1.0f) {
icculus@11992
  1257
            *dst = -32768;
icculus@11992
  1258
        } else {
icculus@11992
  1259
            *dst = (Sint16)(sample * 32767.0f);
icculus@11992
  1260
        }
icculus@11992
  1261
        i--; src++; dst++;
icculus@11992
  1262
    }
icculus@11992
  1263
icculus@11992
  1264
    cvt->len_cvt /= 2;
icculus@11992
  1265
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1266
        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
icculus@11992
  1267
    }
icculus@11992
  1268
}
icculus@11992
  1269
icculus@11992
  1270
static void SDLCALL
icculus@11992
  1271
SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1272
{
icculus@11992
  1273
    const float *src = (const float *) cvt->buf;
icculus@11992
  1274
    Uint16 *dst = (Uint16 *) cvt->buf;
icculus@11992
  1275
    int i;
icculus@11992
  1276
icculus@11992
  1277
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using NEON)");
icculus@11992
  1278
icculus@11992
  1279
    /* Get dst aligned to 16 bytes */
icculus@11992
  1280
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11992
  1281
        const float sample = *src;
icculus@11992
  1282
        if (sample >= 1.0f) {
icculus@11992
  1283
            *dst = 65535;
icculus@11992
  1284
        } else if (sample <= -1.0f) {
icculus@11992
  1285
            *dst = 0;
icculus@11992
  1286
        } else {
icculus@11992
  1287
            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
icculus@11992
  1288
        }
icculus@11992
  1289
    }
icculus@11992
  1290
icculus@11992
  1291
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1292
icculus@11992
  1293
    /* Make sure src is aligned too. */
icculus@11992
  1294
    if ((((size_t) src) & 15) == 0) {
icculus@11992
  1295
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1296
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
  1297
        const float32x4_t negone = vdupq_n_f32(-1.0f);
icculus@11992
  1298
        const float32x4_t mulby32767 = vdupq_n_f32(32767.0f);
icculus@11992
  1299
        uint16_t *mmdst = (uint16_t *) dst;
icculus@11992
  1300
        while (i >= 8) {   /* 8 * float32 */
icculus@11992
  1301
            const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby32767));  /* load 4 floats, clamp, convert to uint32 */
icculus@11992
  1302
            const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby32767));  /* load 4 floats, clamp, convert to uint32 */
icculus@11992
  1303
            vst1q_u16(mmdst, vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2)));  /* narrow to uint16, combine, store out. */
icculus@11992
  1304
            i -= 8; src += 8; mmdst += 8;
icculus@11992
  1305
        }
icculus@11992
  1306
        dst = (Uint16 *) mmdst;
icculus@11992
  1307
    }
icculus@11992
  1308
icculus@11992
  1309
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1310
    while (i) {
icculus@11992
  1311
        const float sample = *src;
icculus@11992
  1312
        if (sample >= 1.0f) {
icculus@11992
  1313
            *dst = 65535;
icculus@11992
  1314
        } else if (sample <= -1.0f) {
icculus@11992
  1315
            *dst = 0;
icculus@11992
  1316
        } else {
icculus@11992
  1317
            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
icculus@11992
  1318
        }
icculus@11992
  1319
        i--; src++; dst++;
icculus@11992
  1320
    }
icculus@11992
  1321
icculus@11992
  1322
    cvt->len_cvt /= 2;
icculus@11992
  1323
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1324
        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
icculus@11992
  1325
    }
icculus@11992
  1326
}
icculus@11992
  1327
icculus@11992
  1328
static void SDLCALL
icculus@11992
  1329
SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@11992
  1330
{
icculus@11992
  1331
    const float *src = (const float *) cvt->buf;
icculus@11992
  1332
    Sint32 *dst = (Sint32 *) cvt->buf;
icculus@11992
  1333
    int i;
icculus@11992
  1334
icculus@11992
  1335
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using NEON)");
icculus@11992
  1336
icculus@11992
  1337
    /* Get dst aligned to 16 bytes */
icculus@11992
  1338
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11992
  1339
        const float sample = *src;
icculus@11992
  1340
        if (sample >= 1.0f) {
icculus@11992
  1341
            *dst = 2147483647;
icculus@11992
  1342
        } else if (sample <= -1.0f) {
icculus@11992
  1343
            *dst = -2147483648;
icculus@11992
  1344
        } else {
icculus@11992
  1345
            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
icculus@11992
  1346
        }
icculus@11992
  1347
    }
icculus@11992
  1348
icculus@11992
  1349
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@11992
  1350
    SDL_assert(!i || ((((size_t) src) & 15) == 0));
icculus@11992
  1351
icculus@11992
  1352
    {
icculus@11992
  1353
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
icculus@11992
  1354
        const float32x4_t one = vdupq_n_f32(1.0f);
icculus@11992
  1355
        const float32x4_t negone = vdupq_n_f32(-1.0f);
icculus@11992
  1356
        const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f);
icculus@11992
  1357
        int32_t *mmdst = (int32_t *) dst;
icculus@11992
  1358
        while (i >= 4) {   /* 4 * float32 */
icculus@11992
  1359
            vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8));
icculus@11992
  1360
            i -= 4; src += 4; mmdst += 4;
icculus@11992
  1361
        }
icculus@11992
  1362
        dst = (Sint32 *) mmdst;
icculus@11992
  1363
    }
icculus@11992
  1364
icculus@11992
  1365
    /* Finish off any leftovers with scalar operations. */
icculus@11992
  1366
    while (i) {
icculus@11992
  1367
        const float sample = *src;
icculus@11992
  1368
        if (sample >= 1.0f) {
icculus@11992
  1369
            *dst = 2147483647;
icculus@11992
  1370
        } else if (sample <= -1.0f) {
icculus@11992
  1371
            *dst = -2147483648;
icculus@11992
  1372
        } else {
icculus@11992
  1373
            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
icculus@11992
  1374
        }
icculus@11992
  1375
        i--; src++; dst++;
icculus@11992
  1376
    }
icculus@11992
  1377
icculus@11992
  1378
    if (cvt->filters[++cvt->filter_index]) {
icculus@11992
  1379
        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
icculus@11992
  1380
    }
icculus@11992
  1381
}
icculus@11992
  1382
#endif
icculus@11992
  1383
icculus@11992
  1384
icculus@11992
  1385
icculus@10815
  1386
void SDL_ChooseAudioConverters(void)
icculus@10815
  1387
{
icculus@10815
  1388
    static SDL_bool converters_chosen = SDL_FALSE;
icculus@10815
  1389
icculus@10815
  1390
    if (converters_chosen) {
icculus@10815
  1391
        return;
icculus@10815
  1392
    }
icculus@10815
  1393
slouken@11406
  1394
#define SET_CONVERTER_FUNCS(fntype) \
icculus@10815
  1395
        SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
icculus@10815
  1396
        SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
icculus@10815
  1397
        SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
icculus@10815
  1398
        SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
icculus@10815
  1399
        SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
icculus@10815
  1400
        SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
icculus@10815
  1401
        SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
icculus@10815
  1402
        SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
icculus@10815
  1403
        SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
icculus@10815
  1404
        SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
icculus@10815
  1405
        converters_chosen = SDL_TRUE
icculus@10815
  1406
slouken@11406
  1407
#if HAVE_SSE2_INTRINSICS
icculus@10815
  1408
    if (SDL_HasSSE2()) {
icculus@10815
  1409
        SET_CONVERTER_FUNCS(SSE2);
icculus@10815
  1410
        return;
icculus@10815
  1411
    }
slouken@11406
  1412
#endif
icculus@10815
  1413
icculus@11992
  1414
#if HAVE_NEON_INTRINSICS
icculus@11992
  1415
    if (SDL_HasNEON()) {
icculus@11992
  1416
        SET_CONVERTER_FUNCS(NEON);
icculus@11992
  1417
        return;
icculus@11992
  1418
    }
icculus@11992
  1419
#endif
icculus@11992
  1420
slouken@11406
  1421
#if NEED_SCALAR_CONVERTER_FALLBACKS
icculus@10815
  1422
    SET_CONVERTER_FUNCS(Scalar);
slouken@11406
  1423
#endif
icculus@10815
  1424
slouken@11406
  1425
#undef SET_CONVERTER_FUNCS
icculus@10815
  1426
icculus@10815
  1427
    SDL_assert(converters_chosen == SDL_TRUE);
icculus@10815
  1428
}
icculus@1982
  1429
slouken@1985
  1430
/* vi: set ts=4 sw=4 expandtab: */