diff --git a/libs/ardour/ardour/minimp3.h b/libs/ardour/ardour/minimp3.h index 23b44803d7..b2c289928e 100644 --- a/libs/ardour/ardour/minimp3.h +++ b/libs/ardour/ardour/minimp3.h @@ -14,7 +14,7 @@ typedef struct { - int frame_bytes, channels, hz, layer, bitrate_kbps; + int frame_bytes, frame_offset, channels, hz, layer, bitrate_kbps; } mp3dec_frame_info_t; typedef struct @@ -88,7 +88,7 @@ int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_s #if !defined(MINIMP3_NO_SIMD) -#if !defined(MINIMP3_ONLY_SIMD) && (defined(_M_X64) || defined(_M_ARM64) || defined(__x86_64__) || defined(__aarch64__)) +#if !defined(MINIMP3_ONLY_SIMD) && (defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_ARM64)) /* x64 always have SSE2, arm64 always have neon, no need for generic code */ #define MINIMP3_ONLY_SIMD #endif /* SIMD checks... */ @@ -138,7 +138,7 @@ static __inline__ __attribute__((always_inline)) void minimp3_cpuid(int CPUInfo[ #endif /* defined(__PIC__)*/ } #endif /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */ -static int have_simd() +static int have_simd(void) { #ifdef MINIMP3_ONLY_SIMD return 1; @@ -163,7 +163,7 @@ end: return g_have_simd - 1; #endif /* MINIMP3_ONLY_SIMD */ } -#elif defined(__ARM_NEON) || defined(__aarch64__) +#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64) #include #define HAVE_SSE 0 #define HAVE_SIMD 1 @@ -193,6 +193,18 @@ static int have_simd() #define HAVE_SIMD 0 #endif /* !defined(MINIMP3_NO_SIMD) */ +#if defined(__ARM_ARCH) && (__ARM_ARCH >= 6) && !defined(__aarch64__) && !defined(_M_ARM64) +#define HAVE_ARMV6 1 +static __inline__ __attribute__((always_inline)) int32_t minimp3_clip_int16_arm(int32_t a) +{ + int32_t x = 0; + __asm__ ("ssat %0, #16, %1" : "=r"(x) : "r"(a)); + return x; +} +#else +#define HAVE_ARMV6 0 +#endif + typedef struct { const uint8_t *buf; @@ -871,12 +883,22 @@ static void L3_midside_stereo(float *left, int n) int i = 0; float *right = left + 576; #if HAVE_SIMD - if (have_simd()) for (; i < n - 3; i += 4) + if (have_simd()) { - f4 vl = VLD(left + i); - f4 vr = VLD(right + i); - VSTORE(left + i, VADD(vl, vr)); - VSTORE(right + i, VSUB(vl, vr)); + for (; i < n - 3; i += 4) + { + f4 vl = VLD(left + i); + f4 vr = VLD(right + i); + VSTORE(left + i, VADD(vl, vr)); + VSTORE(right + i, VSUB(vl, vr)); + } +#ifdef __GNUC__ + /* Workaround for spurious -Waggressive-loop-optimizations warning from gcc. + * For more info see: https://github.com/lieff/minimp3/issues/88 + */ + if (__builtin_constant_p(n % 4 == 0) && n % 4 == 0) + return; +#endif } #endif /* HAVE_SIMD */ for (; i < n; i++) @@ -1343,7 +1365,7 @@ static void mp3d_DCT_II(float *grbuf, int n) } else #endif /* HAVE_SIMD */ #ifdef MINIMP3_ONLY_SIMD - {} + {} /* for HAVE_SIMD=1, MINIMP3_ONLY_SIMD=1 case we do not need non-intrinsic "else" branch */ #else /* MINIMP3_ONLY_SIMD */ for (; k < n; k++) { @@ -1409,10 +1431,16 @@ static void mp3d_DCT_II(float *grbuf, int n) #ifndef MINIMP3_FLOAT_OUTPUT static int16_t mp3d_scale_pcm(float sample) { +#if HAVE_ARMV6 + int32_t s32 = (int32_t)(sample + .5f); + s32 -= (s32 < 0); + int16_t s = (int16_t)minimp3_clip_int16_arm(s32); +#else if (sample >= 32766.5) return (int16_t) 32767; if (sample <= -32767.5) return (int16_t)-32768; int16_t s = (int16_t)(sample + .5f); s -= (s < 0); /* away from zero, to be compliant */ +#endif return s; } #else /* MINIMP3_FLOAT_OUTPUT */ @@ -1567,7 +1595,7 @@ static void mp3d_synth(float *xl, mp3d_sample_t *dstl, int nch, float *lins) } else #endif /* HAVE_SIMD */ #ifdef MINIMP3_ONLY_SIMD - {} + {} /* for HAVE_SIMD=1, MINIMP3_ONLY_SIMD=1 case we do not need non-intrinsic "else" branch */ #else /* MINIMP3_ONLY_SIMD */ for (i = 14; i >= 0; i--) { @@ -1676,7 +1704,7 @@ static int mp3d_find_frame(const uint8_t *mp3, int mp3_bytes, int *free_format_b } } *ptr_frame_bytes = 0; - return i; + return mp3_bytes; } void mp3dec_init(mp3dec_t *dec) @@ -1713,6 +1741,7 @@ int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_s hdr = mp3 + i; memcpy(dec->header, hdr, HDR_SIZE); info->frame_bytes = i + frame_size; + info->frame_offset = i; info->channels = HDR_IS_MONO(hdr) ? 1 : 2; info->hz = hdr_sample_rate_hz(hdr); info->layer = 4 - HDR_GET_LAYER(hdr); @@ -1781,60 +1810,56 @@ int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_s #ifdef MINIMP3_FLOAT_OUTPUT void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples) { - if(num_samples > 0) - { - int i = 0; + int i = 0; #if HAVE_SIMD - int aligned_count = num_samples & ~7; - - for(;i < aligned_count;i+=8) - { - static const f4 g_scale = { 32768.0f, 32768.0f, 32768.0f, 32768.0f }; - f4 a = VMUL(VLD(&in[i ]), g_scale); - f4 b = VMUL(VLD(&in[i+4]), g_scale); + int aligned_count = num_samples & ~7; + for(; i < aligned_count; i += 8) + { + static const f4 g_scale = { 32768.0f, 32768.0f, 32768.0f, 32768.0f }; + f4 a = VMUL(VLD(&in[i ]), g_scale); + f4 b = VMUL(VLD(&in[i+4]), g_scale); #if HAVE_SSE - static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f }; - static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f }; - __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)), - _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min))); - out[i ] = _mm_extract_epi16(pcm8, 0); - out[i+1] = _mm_extract_epi16(pcm8, 1); - out[i+2] = _mm_extract_epi16(pcm8, 2); - out[i+3] = _mm_extract_epi16(pcm8, 3); - out[i+4] = _mm_extract_epi16(pcm8, 4); - out[i+5] = _mm_extract_epi16(pcm8, 5); - out[i+6] = _mm_extract_epi16(pcm8, 6); - out[i+7] = _mm_extract_epi16(pcm8, 7); + static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f }; + static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f }; + __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)), + _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min))); + out[i ] = _mm_extract_epi16(pcm8, 0); + out[i+1] = _mm_extract_epi16(pcm8, 1); + out[i+2] = _mm_extract_epi16(pcm8, 2); + out[i+3] = _mm_extract_epi16(pcm8, 3); + out[i+4] = _mm_extract_epi16(pcm8, 4); + out[i+5] = _mm_extract_epi16(pcm8, 5); + out[i+6] = _mm_extract_epi16(pcm8, 6); + out[i+7] = _mm_extract_epi16(pcm8, 7); #else /* HAVE_SSE */ - int16x4_t pcma, pcmb; - a = VADD(a, VSET(0.5f)); - b = VADD(b, VSET(0.5f)); - pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0))))); - pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0))))); - vst1_lane_s16(out+i , pcma, 0); - vst1_lane_s16(out+i+1, pcma, 1); - vst1_lane_s16(out+i+2, pcma, 2); - vst1_lane_s16(out+i+3, pcma, 3); - vst1_lane_s16(out+i+4, pcmb, 0); - vst1_lane_s16(out+i+5, pcmb, 1); - vst1_lane_s16(out+i+6, pcmb, 2); - vst1_lane_s16(out+i+7, pcmb, 3); + int16x4_t pcma, pcmb; + a = VADD(a, VSET(0.5f)); + b = VADD(b, VSET(0.5f)); + pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0))))); + pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0))))); + vst1_lane_s16(out+i , pcma, 0); + vst1_lane_s16(out+i+1, pcma, 1); + vst1_lane_s16(out+i+2, pcma, 2); + vst1_lane_s16(out+i+3, pcma, 3); + vst1_lane_s16(out+i+4, pcmb, 0); + vst1_lane_s16(out+i+5, pcmb, 1); + vst1_lane_s16(out+i+6, pcmb, 2); + vst1_lane_s16(out+i+7, pcmb, 3); #endif /* HAVE_SSE */ - } + } #endif /* HAVE_SIMD */ - for(; i < num_samples; i++) + for(; i < num_samples; i++) + { + float sample = in[i] * 32768.0f; + if (sample >= 32766.5) + out[i] = (int16_t) 32767; + else if (sample <= -32767.5) + out[i] = (int16_t)-32768; + else { - float sample = in[i] * 32768.0f; - if (sample >= 32766.5) - out[i] = (int16_t) 32767; - else if (sample <= -32767.5) - out[i] = (int16_t)-32768; - else - { - int16_t s = (int16_t)(sample + .5f); - s -= (s < 0); /* away from zero, to be compliant */ - out[i] = s; - } + int16_t s = (int16_t)(sample + .5f); + s -= (s < 0); /* away from zero, to be compliant */ + out[i] = s; } } }