src/audio/SDL_audiotypecvt.c
changeset 12858 30204ee1c7c5
parent 12857 93cc810247aa
child 12859 6a71489fe580
equal deleted inserted replaced
12857:93cc810247aa 12858:30204ee1c7c5
    23 #include "SDL_audio.h"
    23 #include "SDL_audio.h"
    24 #include "SDL_audio_c.h"
    24 #include "SDL_audio_c.h"
    25 #include "SDL_cpuinfo.h"
    25 #include "SDL_cpuinfo.h"
    26 #include "SDL_assert.h"
    26 #include "SDL_assert.h"
    27 
    27 
    28 /* !!! FIXME: disabled until we fix https://bugzilla.libsdl.org/show_bug.cgi?id=4186 */
    28 #ifdef __ARM_NEON
    29 #if 0 /*def __ARM_NEON */
       
    30 #define HAVE_NEON_INTRINSICS 1
    29 #define HAVE_NEON_INTRINSICS 1
    31 #endif
    30 #endif
    32 
    31 
    33 #ifdef __SSE2__
    32 #ifdef __SSE2__
    34 #define HAVE_SSE2_INTRINSICS 1
    33 #define HAVE_SSE2_INTRINSICS 1
   924     /* Make sure src is aligned too. */
   923     /* Make sure src is aligned too. */
   925     if ((((size_t) src) & 15) == 0) {
   924     if ((((size_t) src) & 15) == 0) {
   926         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   925         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
   927         const uint8_t *mmsrc = (const uint8_t *) src;
   926         const uint8_t *mmsrc = (const uint8_t *) src;
   928         const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
   927         const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
   929         const float32x4_t one = vdupq_n_f32(1.0f);
   928         const float32x4_t negone = vdupq_n_f32(-1.0f);
   930         while (i >= 16) {   /* 16 * 8-bit */
   929         while (i >= 16) {   /* 16 * 8-bit */
   931             const uint8x16_t bytes = vld1q_u8(mmsrc);  /* get 16 uint8 into a NEON register. */
   930             const uint8x16_t bytes = vld1q_u8(mmsrc);  /* get 16 uint8 into a NEON register. */
   932             const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes));  /* convert top 8 bytes to 8 uint16 */
   931             const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes));  /* convert top 8 bytes to 8 uint16 */
   933             const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes));   /* convert bottom 8 bytes to 8 uint16 */
   932             const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes));   /* convert bottom 8 bytes to 8 uint16 */
   934             /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
   933             /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
   935             vst1q_f32(dst, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128, one));
   934             vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
   936             vst1q_f32(dst+4, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128, one));
   935             vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
   937             vst1q_f32(dst+8, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128, one));
   936             vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
   938             vst1q_f32(dst+12, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128, one));
   937             vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
   939             i -= 16; mmsrc -= 16; dst -= 16;
   938             i -= 16; mmsrc -= 16; dst -= 16;
   940         }
   939         }
   941 
   940 
   942         src = (const Uint8 *) mmsrc;
   941         src = (const Uint8 *) mmsrc;
   943     }
   942     }
  1019 
  1018 
  1020     /* Make sure src is aligned too. */
  1019     /* Make sure src is aligned too. */
  1021     if ((((size_t) src) & 15) == 0) {
  1020     if ((((size_t) src) & 15) == 0) {
  1022         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1021         /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
  1023         const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
  1022         const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
  1024         const float32x4_t one = vdupq_n_f32(1.0f);
  1023         const float32x4_t negone = vdupq_n_f32(-1.0f);
  1025         while (i >= 8) {   /* 8 * 16-bit */
  1024         while (i >= 8) {   /* 8 * 16-bit */
  1026             const uint16x8_t uints = vld1q_u16((uint16_t const *) src);  /* get 8 uint16 into a NEON register. */
  1025             const uint16x8_t uints = vld1q_u16((uint16_t const *) src);  /* get 8 uint16 into a NEON register. */
  1027             /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract for sign, store. */
  1026             /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract for sign, store. */
  1028             vst1q_f32(dst, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768));
  1027             vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768));
  1029             vst1q_f32(dst+4, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768));
  1028             vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768));
  1030             i -= 8; src -= 8; dst -= 8;
  1029             i -= 8; src -= 8; dst -= 8;
  1031         }
  1030         }
  1032     }
  1031     }
  1033 
  1032 
  1034     src += 7; dst += 7;  /* adjust for any scalar finishing. */
  1033     src += 7; dst += 7;  /* adjust for any scalar finishing. */