src/audio/SDL_audiotypecvt.c
author Sam Lantinga <slouken@libsdl.org>
Wed, 18 Oct 2017 19:30:47 -0700
changeset 11633 37aca00967db
parent 11406 f40c2dedaded
child 11811 5d94cb6b24d3
permissions -rw-r--r--
Fixed bug 3876 - Resampling of certain sounds adds heavy distortion

Simon Hug

Patch that adds [-1, 1] clamping to the scalar audio type conversions.

This may come from the SDL_Convert_F32_to_X_Scalar functions. They don't clamp the float value to [-1, 1] and when they cast it to the target integer it may be too large or too small for the type and get truncated, causing horrible noise.

The attached patch throws clamping in, but I don't know if that's the preferred way to fix this. For x86 (without SSE) the compiler (I tested MSVC) seems to throw a horrible amount of x87 code in it. It's a bit better with SSE, but probably still quite the performance hit. And SSE2 uses a branchless approach with maxss and minss.
icculus@1982
     1
/*
slouken@5535
     2
  Simple DirectMedia Layer
slouken@10737
     3
  Copyright (C) 1997-2017 Sam Lantinga <slouken@libsdl.org>
slouken@5535
     4
slouken@5535
     5
  This software is provided 'as-is', without any express or implied
slouken@5535
     6
  warranty.  In no event will the authors be held liable for any damages
slouken@5535
     7
  arising from the use of this software.
slouken@5535
     8
slouken@5535
     9
  Permission is granted to anyone to use this software for any purpose,
slouken@5535
    10
  including commercial applications, and to alter it and redistribute it
slouken@5535
    11
  freely, subject to the following restrictions:
slouken@5535
    12
slouken@5535
    13
  1. The origin of this software must not be misrepresented; you must not
slouken@5535
    14
     claim that you wrote the original software. If you use this software
slouken@5535
    15
     in a product, an acknowledgment in the product documentation would be
slouken@5535
    16
     appreciated but is not required.
slouken@5535
    17
  2. Altered source versions must be plainly marked as such, and must not be
slouken@5535
    18
     misrepresented as being the original software.
slouken@5535
    19
  3. This notice may not be removed or altered from any source distribution.
icculus@1982
    20
*/
icculus@1982
    21
icculus@8093
    22
#include "../SDL_internal.h"
icculus@1982
    23
#include "SDL_audio.h"
icculus@1982
    24
#include "SDL_audio_c.h"
icculus@10815
    25
#include "SDL_cpuinfo.h"
icculus@10575
    26
#include "SDL_assert.h"
icculus@1982
    27
icculus@10815
    28
/* !!! FIXME: write NEON code. */
/* HAVE_NEON_INTRINSICS is fixed at 0, so the two ARM #elif tests in the chain
   below (which both require it) always evaluate false. */
icculus@10815
    29
#define HAVE_NEON_INTRINSICS 0
icculus@10815
    30
icculus@10835
    31
/* HAVE_SSE2_INTRINSICS is defined only when the compiler predefines __SSE2__.
   When it is left undefined, the preprocessor evaluates it as 0 in the
   #if/#elif expressions that test it. */
#ifdef __SSE2__
icculus@10835
    32
#define HAVE_SSE2_INTRINSICS 1
icculus@10815
    33
#endif
icculus@10815
    34
icculus@10815
    35
/* Platforms listed here define NEED_SCALAR_CONVERTER_FALLBACKS as 0, which
   removes the *_Scalar converters guarded by that macro from the build. */
#if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
icculus@10815
    36
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
icculus@10815
    37
#elif __MACOSX__ && HAVE_SSE2_INTRINSICS
icculus@10815
    38
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
icculus@10815
    39
#elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
icculus@10815
    40
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
icculus@10815
    41
#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
icculus@10815
    42
#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
icculus@10815
    43
#endif
icculus@10815
    44
icculus@10815
    45
/* Set to zero if platform is guaranteed to use a SIMD codepath here. */
/* Default: no branch above matched, so the scalar converters are compiled in. */
icculus@10815
    46
#ifndef NEED_SCALAR_CONVERTER_FALLBACKS
icculus@10815
    47
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
icculus@10815
    48
#endif
icculus@10815
    49
icculus@10815
    50
/* Function pointers set to a CPU-specific implementation.
   One pointer per conversion direction between float32 and each integer
   sample format. All of them start out NULL here.
   NOTE(review): presumably assigned during audio init by code that picks
   between the *_Scalar and *_SSE2 variants in this file - confirm. */
icculus@10815
    51
SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
icculus@10815
    52
SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
icculus@10815
    53
SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
icculus@10815
    54
SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
icculus@10815
    55
SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
icculus@10815
    56
SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
icculus@10815
    57
SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
icculus@10815
    58
SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
icculus@10815
    59
SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
icculus@10815
    60
SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
icculus@10815
    61
icculus@10815
    62
icculus@11403
    63
/* Reciprocal scale factors used by the integer-to-float converters:
   1/128, 1/32768 and 1/2147483648. Multiplying by these replaces a divide.
   The first two are float constants; DIVBY2147483648 has no 'f' suffix and is
   therefore a double, matching the double-precision math in the S32 paths. */
#define DIVBY128 0.0078125f
icculus@11403
    64
#define DIVBY32768 0.000030517578125f
icculus@11403
    65
#define DIVBY2147483648 0.00000000046566128730773926
icculus@1982
    66
icculus@10815
    67
icculus@10815
    68
#if NEED_SCALAR_CONVERTER_FALLBACKS
icculus@10815
    69
static void SDLCALL
icculus@10815
    70
SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10575
    71
{
icculus@10814
    72
    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10575
    73
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@10575
    74
    int i;
icculus@3021
    75
icculus@10575
    76
    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
icculus@1982
    77
icculus@10814
    78
    for (i = cvt->len_cvt; i; --i, --src, --dst) {
icculus@11403
    79
        *dst = ((float) *src) * DIVBY128;
icculus@1982
    80
    }
icculus@1982
    81
icculus@10575
    82
    cvt->len_cvt *= 4;
icculus@1982
    83
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
    84
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@1982
    85
    }
icculus@1982
    86
}
icculus@1982
    87
icculus@10815
    88
static void SDLCALL
icculus@10815
    89
SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
    90
{
icculus@10575
    91
    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10575
    92
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@1982
    93
    int i;
icculus@1982
    94
icculus@10575
    95
    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
icculus@1982
    96
icculus@10814
    97
    for (i = cvt->len_cvt; i; --i, --src, --dst) {
icculus@11403
    98
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@10575
    99
    }
icculus@10575
   100
icculus@10575
   101
    cvt->len_cvt *= 4;
icculus@10575
   102
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   103
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10575
   104
    }
icculus@10575
   105
}
icculus@10575
   106
icculus@10815
   107
static void SDLCALL
icculus@10815
   108
SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10575
   109
{
icculus@10575
   110
    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10575
   111
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@10575
   112
    int i;
icculus@10575
   113
icculus@10575
   114
    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
icculus@10575
   115
icculus@10575
   116
    for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
icculus@11403
   117
        *dst = ((float) *src) * DIVBY32768;
icculus@1982
   118
    }
icculus@1982
   119
icculus@1982
   120
    cvt->len_cvt *= 2;
icculus@1982
   121
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   122
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@1982
   123
    }
icculus@1982
   124
}
icculus@1982
   125
icculus@10815
   126
static void SDLCALL
icculus@10815
   127
SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   128
{
icculus@10575
   129
    const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10575
   130
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@1982
   131
    int i;
icculus@1982
   132
icculus@10575
   133
    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
icculus@1982
   134
icculus@10575
   135
    for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
icculus@11403
   136
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@1982
   137
    }
icculus@1982
   138
icculus@1982
   139
    cvt->len_cvt *= 2;
icculus@1982
   140
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   141
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@1982
   142
    }
icculus@1982
   143
}
icculus@1982
   144
icculus@10815
   145
static void SDLCALL
icculus@10815
   146
SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   147
{
icculus@10814
   148
    const Sint32 *src = (const Sint32 *) cvt->buf;
icculus@10575
   149
    float *dst = (float *) cvt->buf;
icculus@1982
   150
    int i;
icculus@1982
   151
icculus@10575
   152
    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
icculus@1982
   153
icculus@10575
   154
    for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
icculus@11403
   155
        *dst = (float) (((double) *src) * DIVBY2147483648);
icculus@1982
   156
    }
icculus@1982
   157
icculus@1982
   158
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   159
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@1982
   160
    }
icculus@1982
   161
}
icculus@1982
   162
icculus@10815
   163
static void SDLCALL
icculus@10815
   164
SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   165
{
icculus@10575
   166
    const float *src = (const float *) cvt->buf;
icculus@10575
   167
    Sint8 *dst = (Sint8 *) cvt->buf;
icculus@1982
   168
    int i;
icculus@1982
   169
icculus@10575
   170
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
icculus@1982
   171
icculus@10575
   172
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   173
        const float sample = *src;
slouken@11633
   174
        if (sample > 1.0f) {
slouken@11633
   175
            *dst = 127;
slouken@11633
   176
        } else if (sample < -1.0f) {
slouken@11633
   177
            *dst = -127;
slouken@11633
   178
        } else {
slouken@11633
   179
            *dst = (Sint8)(sample * 127.0f);
slouken@11633
   180
        }
icculus@1982
   181
    }
icculus@1982
   182
icculus@10575
   183
    cvt->len_cvt /= 4;
icculus@1982
   184
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   185
        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
icculus@1982
   186
    }
icculus@1982
   187
}
icculus@1982
   188
icculus@10815
   189
static void SDLCALL
icculus@10815
   190
SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   191
{
icculus@10575
   192
    const float *src = (const float *) cvt->buf;
icculus@10575
   193
    Uint8 *dst = (Uint8 *) cvt->buf;
icculus@1982
   194
    int i;
icculus@1982
   195
icculus@10575
   196
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
icculus@1982
   197
icculus@10575
   198
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   199
        const float sample = *src;
slouken@11633
   200
        if (sample > 1.0f) {
slouken@11633
   201
            *dst = 255;
slouken@11633
   202
        } else if (sample < -1.0f) {
slouken@11633
   203
            *dst = 0;
slouken@11633
   204
        } else {
slouken@11633
   205
            *dst = (Uint8)((sample + 1.0f) * 127.0f);
slouken@11633
   206
        }
icculus@1982
   207
    }
icculus@1982
   208
icculus@10575
   209
    cvt->len_cvt /= 4;
icculus@1982
   210
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   211
        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
icculus@1982
   212
    }
icculus@1982
   213
}
icculus@1982
   214
icculus@10815
   215
static void SDLCALL
icculus@10815
   216
SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   217
{
icculus@10575
   218
    const float *src = (const float *) cvt->buf;
icculus@10575
   219
    Sint16 *dst = (Sint16 *) cvt->buf;
icculus@1982
   220
    int i;
icculus@1982
   221
icculus@10575
   222
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
icculus@1982
   223
icculus@10575
   224
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   225
        const float sample = *src;
slouken@11633
   226
        if (sample > 1.0f) {
slouken@11633
   227
            *dst = 32767;
slouken@11633
   228
        } else if (sample < -1.0f) {
slouken@11633
   229
            *dst = -32767;
slouken@11633
   230
        } else {
slouken@11633
   231
            *dst = (Sint16)(sample * 32767.0f);
slouken@11633
   232
        }
icculus@1982
   233
    }
icculus@1982
   234
icculus@1982
   235
    cvt->len_cvt /= 2;
icculus@1982
   236
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   237
        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
icculus@1982
   238
    }
icculus@1982
   239
}
icculus@1982
   240
icculus@10815
   241
static void SDLCALL
icculus@10815
   242
SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   243
{
icculus@10575
   244
    const float *src = (const float *) cvt->buf;
icculus@10575
   245
    Uint16 *dst = (Uint16 *) cvt->buf;
icculus@1982
   246
    int i;
icculus@1982
   247
icculus@10575
   248
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
icculus@1982
   249
icculus@10575
   250
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   251
        const float sample = *src;
slouken@11633
   252
        if (sample > 1.0f) {
slouken@11633
   253
            *dst = 65534;
slouken@11633
   254
        } else if (sample < -1.0f) {
slouken@11633
   255
            *dst = 0;
slouken@11633
   256
        } else {
slouken@11633
   257
            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
slouken@11633
   258
        }
icculus@1982
   259
    }
icculus@1982
   260
icculus@1982
   261
    cvt->len_cvt /= 2;
icculus@1982
   262
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   263
        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
icculus@1982
   264
    }
icculus@1982
   265
}
icculus@1982
   266
icculus@10815
   267
static void SDLCALL
icculus@10815
   268
SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@1982
   269
{
icculus@10575
   270
    const float *src = (const float *) cvt->buf;
icculus@10575
   271
    Sint32 *dst = (Sint32 *) cvt->buf;
icculus@1982
   272
    int i;
icculus@1982
   273
icculus@10575
   274
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
icculus@1982
   275
icculus@10575
   276
    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
slouken@11633
   277
        const float sample = *src;
slouken@11633
   278
        if (sample > 1.0f) {
slouken@11633
   279
            *dst = 2147483647;
slouken@11633
   280
        } else if (sample < -1.0f) {
slouken@11633
   281
            *dst = -2147483647;
slouken@11633
   282
        } else {
slouken@11633
   283
            *dst = (Sint32)((double)sample * 2147483647.0);
slouken@11633
   284
        }
icculus@1982
   285
    }
icculus@1982
   286
icculus@1982
   287
    if (cvt->filters[++cvt->filter_index]) {
icculus@10575
   288
        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
icculus@1982
   289
    }
icculus@1982
   290
}
icculus@10815
   291
#endif
icculus@10815
   292
icculus@10815
   293
icculus@10815
   294
#if HAVE_SSE2_INTRINSICS
icculus@10815
   295
static void SDLCALL
icculus@10815
   296
SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   297
{
icculus@10815
   298
    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10815
   299
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@10815
   300
    int i;
icculus@10815
   301
icculus@10815
   302
    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
icculus@10815
   303
icculus@10815
   304
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@10815
   305
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
icculus@11403
   306
        *dst = ((float) *src) * DIVBY128;
icculus@10815
   307
    }
icculus@10815
   308
icculus@10815
   309
    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
icculus@10815
   310
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   311
icculus@10815
   312
    /* Make sure src is aligned too. */
icculus@10815
   313
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   314
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   315
        const __m128i *mmsrc = (const __m128i *) src;
icculus@10815
   316
        const __m128i zero = _mm_setzero_si128();
icculus@11403
   317
        const __m128 divby128 = _mm_set1_ps(DIVBY128);
icculus@10815
   318
        while (i >= 16) {   /* 16 * 8-bit */
icculus@10815
   319
            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
icculus@10815
   320
            /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
icculus@10815
   321
            const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
icculus@10815
   322
            /* right-shift-sign-extend gets us sint16 with the other set of values. */
icculus@10815
   323
            const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
icculus@10815
   324
            /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
icculus@11403
   325
            const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
icculus@11403
   326
            const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
icculus@11403
   327
            const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
icculus@11403
   328
            const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
icculus@10815
   329
            /* Interleave back into correct order, store. */
icculus@10815
   330
            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
icculus@10815
   331
            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
icculus@10815
   332
            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
icculus@10815
   333
            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
icculus@10815
   334
            i -= 16; mmsrc--; dst -= 16;
icculus@10815
   335
        }
icculus@10815
   336
icculus@10815
   337
        src = (const Sint8 *) mmsrc;
icculus@10815
   338
    }
icculus@10815
   339
icculus@10815
   340
    src += 15; dst += 15;  /* adjust for any scalar finishing. */
icculus@10815
   341
icculus@10815
   342
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   343
    while (i) {
icculus@11403
   344
        *dst = ((float) *src) * DIVBY128;
icculus@10815
   345
        i--; src--; dst--;
icculus@10815
   346
    }
icculus@10815
   347
icculus@10815
   348
    cvt->len_cvt *= 4;
icculus@10815
   349
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   350
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   351
    }
icculus@10815
   352
}
icculus@10815
   353
icculus@10815
   354
static void SDLCALL
icculus@10815
   355
SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   356
{
icculus@10815
   357
    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10815
   358
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
icculus@10815
   359
    int i;
icculus@10815
   360
icculus@10815
   361
    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
icculus@10815
   362
icculus@10815
   363
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@10815
   364
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
icculus@11403
   365
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@10815
   366
    }
icculus@10815
   367
icculus@10815
   368
    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
icculus@10815
   369
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   370
icculus@10815
   371
    /* Make sure src is aligned too. */
icculus@10815
   372
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   373
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   374
        const __m128i *mmsrc = (const __m128i *) src;
icculus@10815
   375
        const __m128i zero = _mm_setzero_si128();
icculus@11403
   376
        const __m128 divby128 = _mm_set1_ps(DIVBY128);
icculus@10815
   377
        const __m128 minus1 = _mm_set1_ps(-1.0f);
icculus@10815
   378
        while (i >= 16) {   /* 16 * 8-bit */
icculus@10815
   379
            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
icculus@10815
   380
            /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
icculus@10815
   381
            const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
icculus@10815
   382
            /* right-shift-zero-extend gets us uint16 with the other set of values. */
icculus@10815
   383
            const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
icculus@10815
   384
            /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
icculus@10815
   385
            /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
icculus@11403
   386
            const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
icculus@11403
   387
            const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
icculus@11403
   388
            const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
icculus@11403
   389
            const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
icculus@10815
   390
            /* Interleave back into correct order, store. */
icculus@10815
   391
            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
icculus@10815
   392
            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
icculus@10815
   393
            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
icculus@10815
   394
            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
icculus@10815
   395
            i -= 16; mmsrc--; dst -= 16;
icculus@10815
   396
        }
icculus@10815
   397
icculus@10815
   398
        src = (const Uint8 *) mmsrc;
icculus@10815
   399
    }
icculus@10815
   400
icculus@10815
   401
    src += 15; dst += 15;  /* adjust for any scalar finishing. */
icculus@10815
   402
icculus@10815
   403
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   404
    while (i) {
icculus@11403
   405
        *dst = (((float) *src) * DIVBY128) - 1.0f;
icculus@10815
   406
        i--; src--; dst--;
icculus@10815
   407
    }
icculus@10815
   408
icculus@10815
   409
    cvt->len_cvt *= 4;
icculus@10815
   410
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   411
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   412
    }
icculus@10815
   413
}
icculus@10815
   414
icculus@10815
   415
static void SDLCALL
icculus@10815
   416
SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   417
{
icculus@10815
   418
    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10815
   419
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@10815
   420
    int i;
icculus@10815
   421
icculus@10815
   422
    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
icculus@10815
   423
icculus@10815
   424
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@10815
   425
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
icculus@11403
   426
        *dst = ((float) *src) * DIVBY32768;
icculus@10815
   427
    }
icculus@10815
   428
icculus@10815
   429
    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
icculus@10815
   430
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   431
icculus@10815
   432
    /* Make sure src is aligned too. */
icculus@10815
   433
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   434
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11403
   435
        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
icculus@10815
   436
        while (i >= 8) {   /* 8 * 16-bit */
icculus@10815
   437
            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
icculus@10815
   438
            /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
icculus@10815
   439
            const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
icculus@10815
   440
            /* right-shift-sign-extend gets us sint32 with the other set of values. */
icculus@10815
   441
            const __m128i b = _mm_srai_epi32(ints, 16);
icculus@10815
   442
            /* Interleave these back into the right order, convert to float, multiply, store. */
icculus@11403
   443
            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
icculus@11403
   444
            _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
icculus@10815
   445
            i -= 8; src -= 8; dst -= 8;
icculus@10815
   446
        }
icculus@10815
   447
    }
icculus@10815
   448
icculus@10815
   449
    src += 7; dst += 7;  /* adjust for any scalar finishing. */
icculus@10815
   450
icculus@10815
   451
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   452
    while (i) {
icculus@11403
   453
        *dst = ((float) *src) * DIVBY32768;
icculus@10815
   454
        i--; src--; dst--;
icculus@10815
   455
    }
icculus@10815
   456
icculus@10815
   457
    cvt->len_cvt *= 2;
icculus@10815
   458
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   459
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   460
    }
icculus@10815
   461
}
icculus@10815
   462
icculus@10815
   463
static void SDLCALL
icculus@10815
   464
SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   465
{
icculus@10815
   466
    const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
icculus@10815
   467
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
icculus@10815
   468
    int i;
icculus@10815
   469
icculus@10815
   470
    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
icculus@10815
   471
icculus@10815
   472
    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
icculus@10815
   473
    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
icculus@11403
   474
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@10815
   475
    }
icculus@10815
   476
icculus@10815
   477
    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
icculus@10815
   478
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   479
icculus@10815
   480
    /* Make sure src is aligned too. */
icculus@10815
   481
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   482
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11403
   483
        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
icculus@10815
   484
        const __m128 minus1 = _mm_set1_ps(1.0f);
icculus@10815
   485
        while (i >= 8) {   /* 8 * 16-bit */
icculus@10815
   486
            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
icculus@10815
   487
            /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
icculus@10815
   488
            const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
icculus@10815
   489
            /* right-shift-sign-extend gets us sint32 with the other set of values. */
icculus@10815
   490
            const __m128i b = _mm_srli_epi32(ints, 16);
icculus@10815
   491
            /* Interleave these back into the right order, convert to float, multiply, store. */
icculus@11403
   492
            _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
icculus@11403
   493
            _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
icculus@10815
   494
            i -= 8; src -= 8; dst -= 8;
icculus@10815
   495
        }
icculus@10815
   496
    }
icculus@10815
   497
icculus@10815
   498
    src += 7; dst += 7;  /* adjust for any scalar finishing. */
icculus@10815
   499
icculus@10815
   500
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   501
    while (i) {
icculus@11403
   502
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
icculus@10815
   503
        i--; src--; dst--;
icculus@10815
   504
    }
icculus@10815
   505
icculus@10815
   506
    cvt->len_cvt *= 2;
icculus@10815
   507
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   508
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   509
    }
icculus@10815
   510
}
icculus@10815
   511
icculus@10815
   512
static void SDLCALL
icculus@10815
   513
SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   514
{
icculus@10815
   515
    const Sint32 *src = (const Sint32 *) cvt->buf;
icculus@10815
   516
    float *dst = (float *) cvt->buf;
icculus@10815
   517
    int i;
icculus@10815
   518
icculus@10815
   519
    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
icculus@10815
   520
icculus@10815
   521
    /* Get dst aligned to 16 bytes */
icculus@10815
   522
    for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@11403
   523
        *dst = (float) (((double) *src) * DIVBY2147483648);
icculus@10815
   524
    }
icculus@10815
   525
icculus@10815
   526
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   527
    SDL_assert(!i || ((((size_t) src) & 15) == 0));
icculus@10815
   528
icculus@10815
   529
    {
icculus@10815
   530
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@11403
   531
        const __m128d divby2147483648 = _mm_set1_pd(DIVBY2147483648);
icculus@10815
   532
        const __m128i *mmsrc = (const __m128i *) src;
icculus@10815
   533
        while (i >= 4) {   /* 4 * sint32 */
icculus@10815
   534
            const __m128i ints = _mm_load_si128(mmsrc);
icculus@10815
   535
            /* bitshift the whole register over, so _mm_cvtepi32_pd can read the top ints in the bottom of the vector. */
icculus@11403
   536
            const __m128d doubles1 = _mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(ints, 8)), divby2147483648);
icculus@11403
   537
            const __m128d doubles2 = _mm_mul_pd(_mm_cvtepi32_pd(ints), divby2147483648);
icculus@10815
   538
            /* convert to float32, bitshift/or to get these into a vector to store. */
icculus@10837
   539
            _mm_store_ps(dst, _mm_castsi128_ps(_mm_or_si128(_mm_slli_si128(_mm_castps_si128(_mm_cvtpd_ps(doubles1)), 8), _mm_castps_si128(_mm_cvtpd_ps(doubles2)))));
icculus@10815
   540
            i -= 4; mmsrc++; dst += 4;
icculus@10815
   541
        }
icculus@10815
   542
        src = (const Sint32 *) mmsrc;
icculus@10815
   543
    }
icculus@10815
   544
icculus@10815
   545
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   546
    while (i) {
icculus@11403
   547
        *dst = (float) (((double) *src) * DIVBY2147483648);
icculus@10815
   548
        i--; src++; dst++;
icculus@10815
   549
    }
icculus@10815
   550
icculus@10815
   551
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   552
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
icculus@10815
   553
    }
icculus@10815
   554
}
icculus@10815
   555
icculus@10815
   556
static void SDLCALL
icculus@10815
   557
SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   558
{
icculus@10815
   559
    const float *src = (const float *) cvt->buf;
icculus@10815
   560
    Sint8 *dst = (Sint8 *) cvt->buf;
icculus@10815
   561
    int i;
icculus@10815
   562
icculus@10815
   563
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
icculus@10815
   564
icculus@10815
   565
    /* Get dst aligned to 16 bytes */
icculus@10815
   566
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@10815
   567
        *dst = (Sint8) (*src * 127.0f);
icculus@10815
   568
    }
icculus@10815
   569
icculus@10815
   570
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   571
icculus@10815
   572
    /* Make sure src is aligned too. */
icculus@10815
   573
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   574
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   575
        const __m128 mulby127 = _mm_set1_ps(127.0f);
icculus@10815
   576
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   577
        while (i >= 16) {   /* 16 * float32 */
icculus@10815
   578
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby127));  /* load 4 floats, convert to sint32 */
icculus@10815
   579
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby127));  /* load 4 floats, convert to sint32 */
icculus@10815
   580
            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+8), mulby127));  /* load 4 floats, convert to sint32 */
icculus@10815
   581
            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+12), mulby127));  /* load 4 floats, convert to sint32 */
icculus@10815
   582
            _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
icculus@10815
   583
            i -= 16; src += 16; mmdst++;
icculus@10815
   584
        }
icculus@10815
   585
        dst = (Sint8 *) mmdst;
icculus@10815
   586
    }
icculus@10815
   587
icculus@10815
   588
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   589
    while (i) {
icculus@10815
   590
        *dst = (Sint8) (*src * 127.0f);
icculus@10815
   591
        i--; src++; dst++;
icculus@10815
   592
    }
icculus@10815
   593
icculus@10815
   594
    cvt->len_cvt /= 4;
icculus@10815
   595
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   596
        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
icculus@10815
   597
    }
icculus@10815
   598
}
icculus@10815
   599
icculus@10815
   600
static void SDLCALL
icculus@10815
   601
SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   602
{
icculus@10815
   603
    const float *src = (const float *) cvt->buf;
icculus@10815
   604
    Uint8 *dst = (Uint8 *) cvt->buf;
icculus@10815
   605
    int i;
icculus@10815
   606
icculus@10815
   607
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
icculus@10815
   608
icculus@10815
   609
    /* Get dst aligned to 16 bytes */
icculus@10815
   610
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@10815
   611
        *dst = (Uint8) ((*src + 1.0f) * 127.0f);
icculus@10815
   612
    }
icculus@10815
   613
icculus@10815
   614
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   615
icculus@10815
   616
    /* Make sure src is aligned too. */
icculus@10815
   617
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   618
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   619
        const __m128 add1 = _mm_set1_ps(1.0f);
icculus@10815
   620
        const __m128 mulby127 = _mm_set1_ps(127.0f);
icculus@10815
   621
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   622
        while (i >= 16) {   /* 16 * float32 */
icculus@10815
   623
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src), add1), mulby127));  /* load 4 floats, convert to sint32 */
icculus@10815
   624
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+4), add1), mulby127));  /* load 4 floats, convert to sint32 */
icculus@10815
   625
            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+8), add1), mulby127));  /* load 4 floats, convert to sint32 */
icculus@10815
   626
            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_load_ps(src+12), add1), mulby127));  /* load 4 floats, convert to sint32 */
icculus@10815
   627
            _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
icculus@10815
   628
            i -= 16; src += 16; mmdst++;
icculus@10815
   629
        }
icculus@10815
   630
        dst = (Uint8 *) mmdst;
icculus@10815
   631
    }
icculus@10815
   632
icculus@10815
   633
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   634
    while (i) {
icculus@10815
   635
        *dst = (Uint8) ((*src + 1.0f) * 127.0f);
icculus@10815
   636
        i--; src++; dst++;
icculus@10815
   637
    }
icculus@10815
   638
icculus@10815
   639
    cvt->len_cvt /= 4;
icculus@10815
   640
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   641
        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
icculus@10815
   642
    }
icculus@10815
   643
}
icculus@10815
   644
icculus@10815
   645
static void SDLCALL
icculus@10815
   646
SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   647
{
icculus@10815
   648
    const float *src = (const float *) cvt->buf;
icculus@10815
   649
    Sint16 *dst = (Sint16 *) cvt->buf;
icculus@10815
   650
    int i;
icculus@10815
   651
icculus@10815
   652
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
icculus@10815
   653
icculus@10815
   654
    /* Get dst aligned to 16 bytes */
icculus@10815
   655
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@10815
   656
        *dst = (Sint16) (*src * 32767.0f);
icculus@10815
   657
    }
icculus@10815
   658
icculus@10815
   659
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   660
icculus@10815
   661
    /* Make sure src is aligned too. */
icculus@10815
   662
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   663
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   664
        const __m128 mulby32767 = _mm_set1_ps(32767.0f);
icculus@10815
   665
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   666
        while (i >= 8) {   /* 8 * float32 */
icculus@10815
   667
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767));  /* load 4 floats, convert to sint32 */
icculus@10815
   668
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767));  /* load 4 floats, convert to sint32 */
icculus@10815
   669
            _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
icculus@10815
   670
            i -= 8; src += 8; mmdst++;
icculus@10815
   671
        }
icculus@10815
   672
        dst = (Sint16 *) mmdst;
icculus@10815
   673
    }
icculus@10815
   674
icculus@10815
   675
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   676
    while (i) {
icculus@10838
   677
        *dst = (Sint16) (*src * 32767.0f);
icculus@10815
   678
        i--; src++; dst++;
icculus@10815
   679
    }
icculus@10815
   680
icculus@10815
   681
    cvt->len_cvt /= 2;
icculus@10815
   682
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   683
        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
icculus@10815
   684
    }
icculus@10815
   685
}
icculus@10815
   686
icculus@10815
   687
static void SDLCALL
icculus@10815
   688
SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   689
{
icculus@10815
   690
    const float *src = (const float *) cvt->buf;
icculus@10815
   691
    Uint16 *dst = (Uint16 *) cvt->buf;
icculus@10815
   692
    int i;
icculus@10815
   693
icculus@10815
   694
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
icculus@10815
   695
icculus@10815
   696
    /* Get dst aligned to 16 bytes */
icculus@10815
   697
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@10815
   698
        *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
icculus@10815
   699
    }
icculus@10815
   700
icculus@10815
   701
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   702
icculus@10815
   703
    /* Make sure src is aligned too. */
icculus@10815
   704
    if ((((size_t) src) & 15) == 0) {
icculus@10815
   705
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   706
        /* This calculates differently than the scalar path because SSE2 can't
icculus@10815
   707
           pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
icculus@10815
   708
           saturation, so that would corrupt our data. _mm_packus_epi32 exists,
icculus@10815
   709
           but not before SSE 4.1. So we convert from float to sint16, packing
icculus@10815
   710
           that down with legit signed saturation, and then xor the top bit
icculus@10815
   711
           against 1. This results in the correct unsigned 16-bit value, even
icculus@10815
   712
           though it looks like dark magic. */
icculus@10815
   713
        const __m128 mulby32767 = _mm_set1_ps(32767.0f);
icculus@10815
   714
        const __m128i topbit = _mm_set1_epi16(-32768);
icculus@10815
   715
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   716
        while (i >= 8) {   /* 8 * float32 */
icculus@10815
   717
            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src), mulby32767));  /* load 4 floats, convert to sint32 */
icculus@10815
   718
            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(src+4), mulby32767));  /* load 4 floats, convert to sint32 */
icculus@10815
   719
            _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
icculus@10815
   720
            i -= 8; src += 8; mmdst++;
icculus@10815
   721
        }
icculus@10815
   722
        dst = (Uint16 *) mmdst;
icculus@10815
   723
    }
icculus@10815
   724
icculus@10815
   725
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   726
    while (i) {
icculus@10815
   727
        *dst = (Uint16) ((*src + 1.0f) * 32767.0f);
icculus@10815
   728
        i--; src++; dst++;
icculus@10815
   729
    }
icculus@10815
   730
icculus@10815
   731
    cvt->len_cvt /= 2;
icculus@10815
   732
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   733
        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
icculus@10815
   734
    }
icculus@10815
   735
}
icculus@10815
   736
icculus@10815
   737
static void SDLCALL
icculus@10815
   738
SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
icculus@10815
   739
{
icculus@10815
   740
    const float *src = (const float *) cvt->buf;
icculus@10815
   741
    Sint32 *dst = (Sint32 *) cvt->buf;
icculus@10815
   742
    int i;
icculus@10815
   743
icculus@10815
   744
    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
icculus@10815
   745
icculus@10815
   746
    /* Get dst aligned to 16 bytes */
icculus@10815
   747
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
icculus@10815
   748
        *dst = (Sint32) (((double) *src) * 2147483647.0);
icculus@10815
   749
    }
icculus@10815
   750
icculus@10815
   751
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
icculus@10815
   752
    SDL_assert(!i || ((((size_t) src) & 15) == 0));
icculus@10815
   753
icculus@10815
   754
    {
icculus@10815
   755
        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
icculus@10815
   756
        const __m128d mulby2147483647 = _mm_set1_pd(2147483647.0);
icculus@10815
   757
        __m128i *mmdst = (__m128i *) dst;
icculus@10815
   758
        while (i >= 4) {   /* 4 * float32 */
icculus@10815
   759
            const __m128 floats = _mm_load_ps(src);
icculus@10815
   760
            /* bitshift the whole register over, so _mm_cvtps_pd can read the top floats in the bottom of the vector. */
icculus@10836
   761
            const __m128d doubles1 = _mm_mul_pd(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(floats), 8))), mulby2147483647);
icculus@10815
   762
            const __m128d doubles2 = _mm_mul_pd(_mm_cvtps_pd(floats), mulby2147483647);
icculus@10837
   763
            _mm_store_si128(mmdst, _mm_or_si128(_mm_slli_si128(_mm_cvtpd_epi32(doubles1), 8), _mm_cvtpd_epi32(doubles2)));
icculus@10815
   764
            i -= 4; src += 4; mmdst++;
icculus@10815
   765
        }
icculus@10815
   766
        dst = (Sint32 *) mmdst;
icculus@10815
   767
    }
icculus@10815
   768
icculus@10815
   769
    /* Finish off any leftovers with scalar operations. */
icculus@10815
   770
    while (i) {
icculus@10815
   771
        *dst = (Sint32) (((double) *src) * 2147483647.0);
icculus@10815
   772
        i--; src++; dst++;
icculus@10815
   773
    }
icculus@10815
   774
icculus@10815
   775
    if (cvt->filters[++cvt->filter_index]) {
icculus@10815
   776
        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
icculus@10815
   777
    }
icculus@10815
   778
}
icculus@10815
   779
#endif
icculus@10815
   780
icculus@10815
   781
icculus@10815
   782
void SDL_ChooseAudioConverters(void)
icculus@10815
   783
{
icculus@10815
   784
    static SDL_bool converters_chosen = SDL_FALSE;
icculus@10815
   785
icculus@10815
   786
    if (converters_chosen) {
icculus@10815
   787
        return;
icculus@10815
   788
    }
icculus@10815
   789
slouken@11406
   790
#define SET_CONVERTER_FUNCS(fntype) \
icculus@10815
   791
        SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
icculus@10815
   792
        SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
icculus@10815
   793
        SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
icculus@10815
   794
        SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
icculus@10815
   795
        SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
icculus@10815
   796
        SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
icculus@10815
   797
        SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
icculus@10815
   798
        SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
icculus@10815
   799
        SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
icculus@10815
   800
        SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
icculus@10815
   801
        converters_chosen = SDL_TRUE
icculus@10815
   802
slouken@11406
   803
#if HAVE_SSE2_INTRINSICS
icculus@10815
   804
    if (SDL_HasSSE2()) {
icculus@10815
   805
        SET_CONVERTER_FUNCS(SSE2);
icculus@10815
   806
        return;
icculus@10815
   807
    }
slouken@11406
   808
#endif
icculus@10815
   809
slouken@11406
   810
#if NEED_SCALAR_CONVERTER_FALLBACKS
icculus@10815
   811
    SET_CONVERTER_FUNCS(Scalar);
slouken@11406
   812
#endif
icculus@10815
   813
slouken@11406
   814
#undef SET_CONVERTER_FUNCS
icculus@10815
   815
icculus@10815
   816
    SDL_assert(converters_chosen == SDL_TRUE);
icculus@10815
   817
}
icculus@1982
   818
slouken@1985
   819
/* vi: set ts=4 sw=4 expandtab: */