src/audio/SDL_audiotypecvt.c
changeset 12858 30204ee1c7c5
parent 12857 93cc810247aa
child 12859 6a71489fe580
     1.1 --- a/src/audio/SDL_audiotypecvt.c	Fri Jun 14 15:47:32 2019 -0400
     1.2 +++ b/src/audio/SDL_audiotypecvt.c	Fri Jun 14 15:52:48 2019 -0400
     1.3 @@ -25,8 +25,7 @@
     1.4  #include "SDL_cpuinfo.h"
     1.5  #include "SDL_assert.h"
     1.6  
     1.7 -/* !!! FIXME: disabled until we fix https://bugzilla.libsdl.org/show_bug.cgi?id=4186 */
     1.8 -#if 0 /*def __ARM_NEON */
     1.9 +#ifdef __ARM_NEON
    1.10  #define HAVE_NEON_INTRINSICS 1
    1.11  #endif
    1.12  
    1.13 @@ -926,16 +925,16 @@
    1.14          /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
    1.15          const uint8_t *mmsrc = (const uint8_t *) src;
    1.16          const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
    1.17 -        const float32x4_t one = vdupq_n_f32(1.0f);
    1.18 +        const float32x4_t negone = vdupq_n_f32(-1.0f);
    1.19          while (i >= 16) {   /* 16 * 8-bit */
    1.20              const uint8x16_t bytes = vld1q_u8(mmsrc);  /* get 16 uint8 into a NEON register. */
    1.21              const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes));  /* convert top 8 bytes to 8 uint16 */
    1.22              const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes));   /* convert bottom 8 bytes to 8 uint16 */
    1.23              /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
    1.24 -            vst1q_f32(dst, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128, one));
    1.25 -            vst1q_f32(dst+4, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128, one));
    1.26 -            vst1q_f32(dst+8, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128, one));
    1.27 -            vst1q_f32(dst+12, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128, one));
    1.28 +            vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
    1.29 +            vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
    1.30 +            vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
    1.31 +            vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
    1.32              i -= 16; mmsrc -= 16; dst -= 16;
    1.33          }
    1.34  
    1.35 @@ -1021,12 +1020,12 @@
    1.36      if ((((size_t) src) & 15) == 0) {
    1.37          /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
    1.38          const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
    1.39 -        const float32x4_t one = vdupq_n_f32(1.0f);
    1.40 +        const float32x4_t negone = vdupq_n_f32(-1.0f);
    1.41          while (i >= 8) {   /* 8 * 16-bit */
    1.42              const uint16x8_t uints = vld1q_u16((uint16_t const *) src);  /* get 8 uint16 into a NEON register. */
    1.43              /* split uint16 to two int32, then convert to float, then multiply to normalize, subtract for sign, store. */
    1.44 -            vst1q_f32(dst, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768));
    1.45 -            vst1q_f32(dst+4, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768));
    1.46 +            vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768));
    1.47 +            vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768));
    1.48              i -= 8; src -= 8; dst -= 8;
    1.49          }
    1.50      }