diff --git a/libs/ardour/x86_functions_avx512f.cc b/libs/ardour/x86_functions_avx512f.cc index dc0f5d02ac..ddbe95173b 100644 --- a/libs/ardour/x86_functions_avx512f.cc +++ b/libs/ardour/x86_functions_avx512f.cc @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Ayan Shafqat + * Copyright (C) 2023 Ayan Shafqat * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -17,58 +17,19 @@ */ #ifdef FPU_AVX512F_SUPPORT -/** - * ================= THIS IS WoRK IN PROGRESS ======================== - * - * The functions below are not optimized to AVX512F, yet. This is here - * to integrate with Ardour's build system and integration tests. - * - */ - #include "ardour/mix.h" #include -#include + +#define IS_ALIGNED_TO(ptr, bytes) \ + (reinterpret_cast(ptr) % (bytes) == 0) #ifndef __AVX512F__ #error "__AVX512F__ must be enabled for this module to work" #endif -#define IS_ALIGNED_TO(ptr, bytes) (reinterpret_cast(ptr) % (bytes) == 0) - -#if defined(__GNUC__) -#define IS_NOT_ALIGNED_TO(ptr, bytes) \ - __builtin_expect(!!(reinterpret_cast(ptr) % (bytes)), 0) -#else -#define IS_NOT_ALIGNED_TO(ptr, bytes) \ - (!!(reinterpret_cast(ptr) % (bytes))) -#endif - /** - * Local functions - */ - -static inline __m256 avx_getmax_ps(__m256 vmax); -static inline __m256 avx_getmin_ps(__m256 vmin); - -static void -x86_avx512f_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32_t nframes, float gain); - -static void -x86_avx512f_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t nframes, float gain); - -static void -x86_avx512f_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t nframes); - -static void -x86_avx512f_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t nframes); - -/** - * Module implementation - */ - -/** - * @brief x86-64 AVX optimized routine for compute peak procedure + * @brief x86-64 AVX-512F optimized routine for compute peak procedure * @param src Pointer to source buffer * @param nframes Number of frames to process * @param current Current peak value @@ -77,99 +38,223 @@ x86_avx512f_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t n float x86_avx512f_compute_peak(const float *src, uint32_t nframes, float current) { - // If src is null then skip processing - if ((src == nullptr) || (nframes == 0)) - { - return current; - } + // Convert to signed integer to prevent any arithmetic overflow errors + int32_t frames = static_cast(nframes); - // Broadcast mask to compute absolute value - const uint32_t f32_nan = UINT32_C(0x7FFFFFFF); - const __m256 ABS_MASK = - _mm256_broadcast_ss(reinterpret_cast(&f32_nan)); + // Broadcast the current max values to all elements of the ZMM register + __m512 zmax = _mm512_set1_ps(current); - // Broadcast the current max value to all elements of the YMM register - __m256 vmax = _mm256_set1_ps(current); + // Compute single/4/8 min/max of unaligned portion until alignment is reached + while (frames > 0) { + if (IS_ALIGNED_TO(src, sizeof(__m512))) { + break; + } - // Compute single min/max of unaligned portion until alignment is reached - while (IS_NOT_ALIGNED_TO(src, sizeof(__m256)) && (nframes > 0)) - { - __m256 vsrc; + if (IS_ALIGNED_TO(src, sizeof(__m256))) { + __m512 x = _mm512_castps256_ps512(_mm256_load_ps(src)); - vsrc = _mm256_broadcast_ss(src); - vsrc = _mm256_and_ps(ABS_MASK, vsrc); - vmax = _mm256_max_ps(vmax, vsrc); + x = _mm512_abs_ps(x); + zmax = _mm512_max_ps(zmax, x); + + src += 8; + frames -= 8; + continue; + } + + if (IS_ALIGNED_TO(src, sizeof(__m128))) { + __m512 x = _mm512_castps128_ps512(_mm_load_ps(src)); + + x = _mm512_abs_ps(x); + zmax = _mm512_max_ps(zmax, x); + + src += 4; + frames -= 4; + continue; + } + + // Pointers are aligned to float boundaries (4 bytes) + __m512 x = _mm512_castps128_ps512(_mm_load_ss(src)); + + x = _mm512_abs_ps(x); + zmax = _mm512_max_ps(zmax, x); ++src; - --nframes; + --frames; } - // Process the aligned portion 32 samples at a time - while (nframes >= 32) - { -#ifdef _WIN32 - _mm_prefetch(reinterpret_cast(src + 32), _mm_hint(0)); -#else - __builtin_prefetch(reinterpret_cast(src + 32), 0, 0); -#endif - __m256 t0 = _mm256_load_ps(src + 0); - __m256 t1 = _mm256_load_ps(src + 8); - __m256 t2 = _mm256_load_ps(src + 16); - __m256 t3 = _mm256_load_ps(src + 24); + while (frames >= 256) { + _mm_prefetch(reinterpret_cast(src + 256), _mm_hint(0)); - t0 = _mm256_and_ps(ABS_MASK, t0); - t1 = _mm256_and_ps(ABS_MASK, t1); - t2 = _mm256_and_ps(ABS_MASK, t2); - t3 = _mm256_and_ps(ABS_MASK, t3); + __m512 x0 = _mm512_load_ps(src + 0); + __m512 x1 = _mm512_load_ps(src + 16); + __m512 x2 = _mm512_load_ps(src + 32); + __m512 x3 = _mm512_load_ps(src + 48); + __m512 x4 = _mm512_load_ps(src + 64); + __m512 x5 = _mm512_load_ps(src + 80); + __m512 x6 = _mm512_load_ps(src + 96); + __m512 x7 = _mm512_load_ps(src + 112); - vmax = _mm256_max_ps(vmax, t0); - vmax = _mm256_max_ps(vmax, t1); - vmax = _mm256_max_ps(vmax, t2); - vmax = _mm256_max_ps(vmax, t3); + __m512 x8 = _mm512_load_ps(src + 128); + __m512 x9 = _mm512_load_ps(src + 144); + __m512 x10 = _mm512_load_ps(src + 160); + __m512 x11 = _mm512_load_ps(src + 176); + __m512 x12 = _mm512_load_ps(src + 192); + __m512 x13 = _mm512_load_ps(src + 208); + __m512 x14 = _mm512_load_ps(src + 224); + __m512 x15 = _mm512_load_ps(src + 240); - src += 32; - nframes -= 32; + x0 = _mm512_abs_ps(x0); + x1 = _mm512_abs_ps(x1); + x2 = _mm512_abs_ps(x2); + x3 = _mm512_abs_ps(x3); + x4 = _mm512_abs_ps(x4); + x5 = _mm512_abs_ps(x5); + x6 = _mm512_abs_ps(x6); + x7 = _mm512_abs_ps(x7); + x8 = _mm512_abs_ps(x8); + x9 = _mm512_abs_ps(x9); + x10 = _mm512_abs_ps(x10); + x11 = _mm512_abs_ps(x11); + x12 = _mm512_abs_ps(x12); + x13 = _mm512_abs_ps(x13); + x14 = _mm512_abs_ps(x14); + x15 = _mm512_abs_ps(x15); + + zmax = _mm512_max_ps(zmax, x0); + zmax = _mm512_max_ps(zmax, x1); + zmax = _mm512_max_ps(zmax, x2); + zmax = _mm512_max_ps(zmax, x3); + zmax = _mm512_max_ps(zmax, x4); + zmax = _mm512_max_ps(zmax, x5); + zmax = _mm512_max_ps(zmax, x6); + zmax = _mm512_max_ps(zmax, x7); + zmax = _mm512_max_ps(zmax, x8); + zmax = _mm512_max_ps(zmax, x9); + zmax = _mm512_max_ps(zmax, x10); + zmax = _mm512_max_ps(zmax, x11); + zmax = _mm512_max_ps(zmax, x12); + zmax = _mm512_max_ps(zmax, x13); + zmax = _mm512_max_ps(zmax, x14); + zmax = _mm512_max_ps(zmax, x15); + + src += 256; + frames -= 256; + } + + while (frames >= 128) { + _mm_prefetch(reinterpret_cast(src + 128), _mm_hint(0)); + + __m512 x0 = _mm512_load_ps(src + 0); + __m512 x1 = _mm512_load_ps(src + 16); + __m512 x2 = _mm512_load_ps(src + 32); + __m512 x3 = _mm512_load_ps(src + 48); + __m512 x4 = _mm512_load_ps(src + 64); + __m512 x5 = _mm512_load_ps(src + 80); + __m512 x6 = _mm512_load_ps(src + 96); + __m512 x7 = _mm512_load_ps(src + 112); + + x0 = _mm512_abs_ps(x0); + x1 = _mm512_abs_ps(x1); + x2 = _mm512_abs_ps(x2); + x3 = _mm512_abs_ps(x3); + x4 = _mm512_abs_ps(x4); + x5 = _mm512_abs_ps(x5); + x6 = _mm512_abs_ps(x6); + x7 = _mm512_abs_ps(x7); + + zmax = _mm512_max_ps(zmax, x0); + zmax = _mm512_max_ps(zmax, x1); + zmax = _mm512_max_ps(zmax, x2); + zmax = _mm512_max_ps(zmax, x3); + zmax = _mm512_max_ps(zmax, x4); + zmax = _mm512_max_ps(zmax, x5); + zmax = _mm512_max_ps(zmax, x6); + zmax = _mm512_max_ps(zmax, x7); + + src += 128; + frames -= 128; + } + + while (frames >= 64) { + _mm_prefetch(reinterpret_cast(src + 64), _mm_hint(0)); + + __m512 x0 = _mm512_load_ps(src + 0); + __m512 x1 = _mm512_load_ps(src + 16); + __m512 x2 = _mm512_load_ps(src + 32); + __m512 x3 = _mm512_load_ps(src + 48); + + x0 = _mm512_abs_ps(x0); + x1 = _mm512_abs_ps(x1); + x2 = _mm512_abs_ps(x2); + x3 = _mm512_abs_ps(x3); + + zmax = _mm512_max_ps(zmax, x0); + zmax = _mm512_max_ps(zmax, x1); + zmax = _mm512_max_ps(zmax, x2); + zmax = _mm512_max_ps(zmax, x3); + + src += 64; + frames -= 64; + } + + // Process the remaining samples 16 at a time + while (frames >= 16) { + __m512 x = _mm512_load_ps(src); + + x = _mm512_abs_ps(x); + zmax = _mm512_max_ps(zmax, x); + + src += 16; + frames -= 16; } // Process the remaining samples 8 at a time - while (nframes >= 8) - { - __m256 vsrc; + while (frames >= 8) { + __m512 x = _mm512_castps256_ps512(_mm256_load_ps(src)); - vsrc = _mm256_load_ps(src); - vsrc = _mm256_and_ps(ABS_MASK, vsrc); - vmax = _mm256_max_ps(vmax, vsrc); + x = _mm512_abs_ps(x); + zmax = _mm512_max_ps(zmax, x); src += 8; - nframes -= 8; + frames -= 8; } - // If there are still some left 4 to 8 samples, process them below - while (nframes > 0) - { - __m256 vsrc; + // Process the remaining samples 4 at a time + while (frames >= 4) { + __m512 x = _mm512_castps128_ps512(_mm_load_ps(src)); - vsrc = _mm256_broadcast_ss(src); - vsrc = _mm256_and_ps(ABS_MASK, vsrc); - vmax = _mm256_max_ps(vmax, vsrc); + x = _mm512_abs_ps(x); + zmax = _mm512_max_ps(zmax, x); + + src += 4; + frames -= 4; + } + + // If there are still some left 2-4 samples, process them one at a time. + while (frames > 0) { + __m512 x = _mm512_castps128_ps512(_mm_load_ss(src)); + + x = _mm512_abs_ps(x); + zmax = _mm512_max_ps(zmax, x); ++src; - --nframes; + --frames; } - vmax = avx_getmax_ps(vmax); + // Get the max of the ZMM registers + current = _mm512_reduce_max_ps(zmax); -#if defined(__GNUC__) && (__GNUC__ < 5) - return *((float *)&vmax); -#elif defined(__GNUC__) && (__GNUC__ < 8) - return vmax[0]; -#else - return _mm256_cvtss_f32(vmax); -#endif + // There's a penalty going from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + + _mm256_zeroupper(); // zeros the upper portion of YMM register + + return current; } /** - * @brief x86-64 AVX optimized routine for find peak procedure + * @brief x86-64 AVX-512F optimized routine for find peak procedure * @param src Pointer to source buffer * @param nframes Number of frames to process * @param[in,out] minf Current minimum value, updated @@ -178,83 +263,226 @@ x86_avx512f_compute_peak(const float *src, uint32_t nframes, float current) void x86_avx512f_find_peaks(const float *src, uint32_t nframes, float *minf, float *maxf) { - // Broadcast the current min and max values to all elements of the YMM register - __m256 vmin = _mm256_broadcast_ss(minf); - __m256 vmax = _mm256_broadcast_ss(maxf); + // Convert to signed integer to prevent any arithmetic overflow errors + int32_t frames = static_cast(nframes); - // Compute single min/max of unaligned portion until alignment is reached - while (IS_NOT_ALIGNED_TO(src, sizeof(__m256)) && nframes > 0) { - __m256 vsrc; + // Broadcast the current min and max values to all elements of the ZMM register + __m512 zmin = _mm512_set1_ps(*minf); + __m512 zmax = _mm512_set1_ps(*maxf); - vsrc = _mm256_broadcast_ss(src); - vmax = _mm256_max_ps(vmax, vsrc); - vmin = _mm256_min_ps(vmin, vsrc); + + // Compute single/4/8 min/max of unaligned portion until alignment is reached + while (frames > 0) { + if (IS_ALIGNED_TO(src, sizeof(__m512))) { + break; + } + + if (IS_ALIGNED_TO(src, sizeof(__m256))) { + __m512 x = _mm512_castps256_ps512(_mm256_load_ps(src)); + + zmin = _mm512_min_ps(zmin, x); + zmax = _mm512_max_ps(zmax, x); + + src += 8; + frames -= 8; + continue; + } + + if (IS_ALIGNED_TO(src, sizeof(__m128))) { + __m512 x = _mm512_castps128_ps512(_mm_load_ps(src)); + + zmin = _mm512_min_ps(zmin, x); + zmax = _mm512_max_ps(zmax, x); + + src += 4; + frames -= 4; + continue; + } + + // Pointers are aligned to float boundaries (4 bytes) + __m512 x = _mm512_castps128_ps512(_mm_load_ss(src)); + + zmin = _mm512_min_ps(zmin, x); + zmax = _mm512_max_ps(zmax, x); ++src; - --nframes; + --frames; } - // Process the aligned portion 32 samples at a time - while (nframes >= 32) - { -#ifdef _WIN32 - _mm_prefetch(reinterpret_cast(src + 32), _mm_hint(0)); -#else - __builtin_prefetch(reinterpret_cast(src + 32), 0, 0); -#endif - __m256 t0 = _mm256_load_ps(src + 0); - __m256 t1 = _mm256_load_ps(src + 8); - __m256 t2 = _mm256_load_ps(src + 16); - __m256 t3 = _mm256_load_ps(src + 24); + while (frames >= 256) { + _mm_prefetch(reinterpret_cast(src + 256), _mm_hint(0)); - vmax = _mm256_max_ps(vmax, t0); - vmax = _mm256_max_ps(vmax, t1); - vmax = _mm256_max_ps(vmax, t2); - vmax = _mm256_max_ps(vmax, t3); + __m512 x0 = _mm512_load_ps(src + 0); + __m512 x1 = _mm512_load_ps(src + 16); + __m512 x2 = _mm512_load_ps(src + 32); + __m512 x3 = _mm512_load_ps(src + 48); + __m512 x4 = _mm512_load_ps(src + 64); + __m512 x5 = _mm512_load_ps(src + 80); + __m512 x6 = _mm512_load_ps(src + 96); + __m512 x7 = _mm512_load_ps(src + 112); - vmin = _mm256_min_ps(vmin, t0); - vmin = _mm256_min_ps(vmin, t1); - vmin = _mm256_min_ps(vmin, t2); - vmin = _mm256_min_ps(vmin, t3); + __m512 x8 = _mm512_load_ps(src + 128); + __m512 x9 = _mm512_load_ps(src + 144); + __m512 x10 = _mm512_load_ps(src + 160); + __m512 x11 = _mm512_load_ps(src + 176); + __m512 x12 = _mm512_load_ps(src + 192); + __m512 x13 = _mm512_load_ps(src + 208); + __m512 x14 = _mm512_load_ps(src + 224); + __m512 x15 = _mm512_load_ps(src + 240); - src += 32; - nframes -= 32; + zmin = _mm512_min_ps(zmin, x0); + zmin = _mm512_min_ps(zmin, x1); + zmin = _mm512_min_ps(zmin, x2); + zmin = _mm512_min_ps(zmin, x3); + zmin = _mm512_min_ps(zmin, x4); + zmin = _mm512_min_ps(zmin, x5); + zmin = _mm512_min_ps(zmin, x6); + zmin = _mm512_min_ps(zmin, x7); + + zmin = _mm512_min_ps(zmin, x8); + zmin = _mm512_min_ps(zmin, x9); + zmin = _mm512_min_ps(zmin, x10); + zmin = _mm512_min_ps(zmin, x11); + zmin = _mm512_min_ps(zmin, x12); + zmin = _mm512_min_ps(zmin, x13); + zmin = _mm512_min_ps(zmin, x14); + zmin = _mm512_min_ps(zmin, x15); + + zmax = _mm512_max_ps(zmax, x0); + zmax = _mm512_max_ps(zmax, x1); + zmax = _mm512_max_ps(zmax, x2); + zmax = _mm512_max_ps(zmax, x3); + zmax = _mm512_max_ps(zmax, x4); + zmax = _mm512_max_ps(zmax, x5); + zmax = _mm512_max_ps(zmax, x6); + zmax = _mm512_max_ps(zmax, x7); + + zmax = _mm512_max_ps(zmax, x8); + zmax = _mm512_max_ps(zmax, x9); + zmax = _mm512_max_ps(zmax, x10); + zmax = _mm512_max_ps(zmax, x11); + zmax = _mm512_max_ps(zmax, x12); + zmax = _mm512_max_ps(zmax, x13); + zmax = _mm512_max_ps(zmax, x14); + zmax = _mm512_max_ps(zmax, x15); + + src += 256; + frames -= 256; + } + + while (frames >= 128) { + _mm_prefetch(reinterpret_cast(src + 128), _mm_hint(0)); + + __m512 x0 = _mm512_load_ps(src + 0); + __m512 x1 = _mm512_load_ps(src + 16); + __m512 x2 = _mm512_load_ps(src + 32); + __m512 x3 = _mm512_load_ps(src + 48); + __m512 x4 = _mm512_load_ps(src + 64); + __m512 x5 = _mm512_load_ps(src + 80); + __m512 x6 = _mm512_load_ps(src + 96); + __m512 x7 = _mm512_load_ps(src + 112); + + zmin = _mm512_min_ps(zmin, x0); + zmin = _mm512_min_ps(zmin, x1); + zmin = _mm512_min_ps(zmin, x2); + zmin = _mm512_min_ps(zmin, x3); + zmin = _mm512_min_ps(zmin, x4); + zmin = _mm512_min_ps(zmin, x5); + zmin = _mm512_min_ps(zmin, x6); + zmin = _mm512_min_ps(zmin, x7); + + zmax = _mm512_max_ps(zmax, x0); + zmax = _mm512_max_ps(zmax, x1); + zmax = _mm512_max_ps(zmax, x2); + zmax = _mm512_max_ps(zmax, x3); + zmax = _mm512_max_ps(zmax, x4); + zmax = _mm512_max_ps(zmax, x5); + zmax = _mm512_max_ps(zmax, x6); + zmax = _mm512_max_ps(zmax, x7); + + src += 128; + frames -= 128; + } + + while (frames >= 64) { + _mm_prefetch(reinterpret_cast(src + 64), _mm_hint(0)); + + __m512 x0 = _mm512_load_ps(src + 0); + __m512 x1 = _mm512_load_ps(src + 16); + __m512 x2 = _mm512_load_ps(src + 32); + __m512 x3 = _mm512_load_ps(src + 48); + + zmin = _mm512_min_ps(zmin, x0); + zmin = _mm512_min_ps(zmin, x1); + zmin = _mm512_min_ps(zmin, x2); + zmin = _mm512_min_ps(zmin, x3); + + zmax = _mm512_max_ps(zmax, x0); + zmax = _mm512_max_ps(zmax, x1); + zmax = _mm512_max_ps(zmax, x2); + zmax = _mm512_max_ps(zmax, x3); + + src += 64; + frames -= 64; + } + + // Process the remaining samples 16 at a time + while (frames >= 16) { + __m512 x = _mm512_load_ps(src); + + zmin = _mm512_min_ps(zmin, x); + zmax = _mm512_max_ps(zmax, x); + + src += 16; + frames -= 16; } // Process the remaining samples 8 at a time - while (nframes >= 8) { - __m256 vsrc; + while (frames >= 8) { + __m512 x = _mm512_castps256_ps512(_mm256_load_ps(src)); - vsrc = _mm256_load_ps(src); - vmax = _mm256_max_ps(vmax, vsrc); - vmin = _mm256_min_ps(vmin, vsrc); + zmin = _mm512_min_ps(zmin, x); + zmax = _mm512_max_ps(zmax, x); src += 8; - nframes -= 8; + frames -= 8; } - // If there are still some left 4 to 8 samples, process them one at a time. - while (nframes > 0) { - __m256 vsrc; + // Process the remaining samples 4 at a time + while (frames >= 4) { + __m512 x = _mm512_castps128_ps512(_mm_load_ps(src)); - vsrc = _mm256_broadcast_ss(src); - vmax = _mm256_max_ps(vmax, vsrc); - vmin = _mm256_min_ps(vmin, vsrc); + zmin = _mm512_min_ps(zmin, x); + zmax = _mm512_max_ps(zmax, x); + + src += 4; + frames -= 4; + } + + // If there are still some left 2-4 samples, process them one at a time. + while (frames > 0) { + __m512 x = _mm512_castps128_ps512(_mm_load_ss(src)); + + zmin = _mm512_min_ps(zmin, x); + zmax = _mm512_max_ps(zmax, x); ++src; - --nframes; + --frames; } - // Get min and max of the YMM registers - vmin = avx_getmin_ps(vmin); - vmax = avx_getmax_ps(vmax); + // Get min and max of the ZMM registers + *minf = _mm512_reduce_min_ps(zmin); + *maxf = _mm512_reduce_max_ps(zmax); - _mm_store_ss(minf, _mm256_castps256_ps128(vmin)); - _mm_store_ss(maxf, _mm256_castps256_ps128(vmax)); + // There's a penalty going from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + + _mm256_zeroupper(); // zeros the upper portion of YMM register } /** - * @brief x86-64 AVX optimized routine for apply gain routine + * @brief x86-64 AVX-512F optimized routine for apply gain routine * @param[in,out] dst Pointer to the destination buffer, which gets updated * @param nframes Number of frames (or samples) to process * @param gain Gain to apply @@ -263,66 +491,128 @@ void x86_avx512f_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain) { // Convert to signed integer to prevent any arithmetic overflow errors - int32_t frames = (int32_t)nframes; - // Load gain vector to all elements of YMM register - __m256 vgain = _mm256_set1_ps(gain); + int32_t frames = static_cast(nframes); - do { - __m128 g0 = _mm256_castps256_ps128(vgain); - while (!IS_ALIGNED_TO(dst, sizeof(__m256)) && (frames > 0)) { - _mm_store_ss(dst, _mm_mul_ps(g0, _mm_load_ss(dst))); - ++dst; - --frames; + // Load gain vector to all elements of XMM, YMM, and ZMM register + // It's the same register, but used for SSE, AVX, and AVX512 calculation + __m512 zgain = _mm512_set1_ps(gain); + __m256 ygain = _mm512_castps512_ps256(zgain); + __m128 xgain = _mm512_castps512_ps128(zgain); + + while (frames > 0) { + if (IS_ALIGNED_TO(dst, sizeof(__m512))) { + break; } - } while (0); + + if (IS_ALIGNED_TO(dst, sizeof(__m256))) { + __m256 x = _mm256_load_ps(dst); + __m256 y = _mm256_mul_ps(ygain, x); + _mm256_store_ps(dst, y); + dst += 8; + frames -= 8; + continue; + } + + if (IS_ALIGNED_TO(dst, sizeof(__m128))) { + __m128 x = _mm_load_ps(dst); + __m128 y = _mm_mul_ps(xgain, x); + _mm_store_ps(dst, y); + dst += 4; + frames -= 4; + continue; + } + + // Pointers are aligned to float boundaries (4 bytes) + __m128 x = _mm_load_ss(dst); + __m128 y = _mm_mul_ss(xgain, x); + _mm_store_ss(dst, y); + ++dst; + --frames; + } + + // Process the remaining samples 128 at a time + while (frames >= 128) { + _mm_prefetch(reinterpret_cast(dst + 128), _mm_hint(0)); + + __m512 x0 = _mm512_load_ps(dst + 0); + __m512 x1 = _mm512_load_ps(dst + 16); + __m512 x2 = _mm512_load_ps(dst + 32); + __m512 x3 = _mm512_load_ps(dst + 48); + __m512 x4 = _mm512_load_ps(dst + 64); + __m512 x5 = _mm512_load_ps(dst + 80); + __m512 x6 = _mm512_load_ps(dst + 96); + __m512 x7 = _mm512_load_ps(dst + 112); + + __m512 y0 = _mm512_mul_ps(zgain, x0); + __m512 y1 = _mm512_mul_ps(zgain, x1); + __m512 y2 = _mm512_mul_ps(zgain, x2); + __m512 y3 = _mm512_mul_ps(zgain, x3); + __m512 y4 = _mm512_mul_ps(zgain, x4); + __m512 y5 = _mm512_mul_ps(zgain, x5); + __m512 y6 = _mm512_mul_ps(zgain, x6); + __m512 y7 = _mm512_mul_ps(zgain, x7); + + _mm512_store_ps(dst + 0, y0); + _mm512_store_ps(dst + 16, y1); + _mm512_store_ps(dst + 32, y2); + _mm512_store_ps(dst + 48, y3); + _mm512_store_ps(dst + 64, y4); + _mm512_store_ps(dst + 80, y5); + _mm512_store_ps(dst + 96, y6); + _mm512_store_ps(dst + 112, y7); + + dst += 128; + frames -= 128; + } // Process the remaining samples 16 at a time - while (frames >= 16) - { -#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) - _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); -#else - __builtin_prefetch(reinterpret_cast(dst + 16), 0, 0); -#endif - __m256 d0, d1; - d0 = _mm256_load_ps(dst + 0); - d1 = _mm256_load_ps(dst + 8); - - d0 = _mm256_mul_ps(vgain, d0); - d1 = _mm256_mul_ps(vgain, d1); - - _mm256_store_ps(dst + 0, d0); - _mm256_store_ps(dst + 8, d1); + while (frames >= 16) { + __m512 x = _mm512_load_ps(dst); + __m512 y = _mm512_mul_ps(zgain, x); + _mm512_store_ps(dst, y); dst += 16; frames -= 16; } - // Process the remaining samples 8 at a time + // Process remaining samples x8 while (frames >= 8) { - _mm256_store_ps(dst, _mm256_mul_ps(vgain, _mm256_load_ps(dst))); + __m256 x = _mm256_load_ps(dst); + __m256 y = _mm256_mul_ps(ygain, x); + _mm256_store_ps(dst, y); + dst += 8; frames -= 8; } - // Process the remaining samples - do { - __m128 g0 = _mm256_castps256_ps128(vgain); - while (frames > 0) { - _mm_store_ss(dst, _mm_mul_ps(g0, _mm_load_ss(dst))); - ++dst; - --frames; - } - } while (0); + // Process remaining samples x4 + while (frames >= 4) { + __m128 x = _mm_load_ps(dst); + __m128 y = _mm_mul_ps(xgain, x); + _mm_store_ps(dst, y); + + dst += 4; + frames -= 4; + } + + // Process remaining samples + while (frames > 0) { + __m128 x = _mm_load_ss(dst); + __m128 y = _mm_mul_ss(xgain, x); + _mm_store_ss(dst, y); + ++dst; + --frames; + } + + // There's a penalty going from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + // + _mm256_zeroupper(); // zeros the upper portion of YMM register } /** - * @brief x86-64 AVX optimized routine for mixing buffer with gain. - * - * This function may choose SSE over AVX if the pointers are aligned - * to 16 byte boundary instead of 32 byte boundary to reduce time to - * process. - * + * @brief x86-64 AVX-512F optimized routine for mixing buffer with gain. * @param[in,out] dst Pointer to destination buffer, which gets updated * @param[in] src Pointer to source buffer (not updated) * @param nframes Number of samples to process @@ -331,25 +621,168 @@ x86_avx512f_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain) void x86_avx512f_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes, float gain) { - if (IS_ALIGNED_TO(dst, 32) && IS_ALIGNED_TO(src, 32)) { - // Pointers are both aligned to 32 bit boundaries, this can be processed with AVX - x86_avx512f_mix_buffers_with_gain_aligned(dst, src, nframes, gain); - } else if (IS_ALIGNED_TO(dst, 16) && IS_ALIGNED_TO(src, 16)) { - // This can still be processed with SSE - x86_sse_mix_buffers_with_gain(dst, src, nframes, gain); - } else { - // Pointers are unaligned, so process them with unaligned load/store AVX - x86_avx512f_mix_buffers_with_gain_unaligned(dst, src, nframes, gain); + // Convert to signed integer to prevent any arithmetic overflow errors + int32_t frames = static_cast(nframes); + + // Load gain vector to all elements of XMM, YMM, and ZMM register + // It's the same register, but used for SSE, AVX, and AVX512 calculation + __m512 zgain = _mm512_set1_ps(gain); + __m256 ygain = _mm512_castps512_ps256(zgain); + __m128 xgain = _mm512_castps512_ps128(zgain); + + while (frames > 0) + { + if (IS_ALIGNED_TO(src, sizeof(__m512)) && + IS_ALIGNED_TO(dst, sizeof(__m512))) { + break; + } + + if (IS_ALIGNED_TO(src, sizeof(__m256)) && + IS_ALIGNED_TO(dst, sizeof(__m256))) { + __m256 x = _mm256_load_ps(src); + __m256 y = _mm256_load_ps(dst); + + y = _mm256_fmadd_ps(ygain, x, y); + _mm256_store_ps(dst, y); + + src += 8; + dst += 8; + frames -= 8; + continue; + } + + if (IS_ALIGNED_TO(src, sizeof(__m128)) && + IS_ALIGNED_TO(dst, sizeof(__m128))) { + __m128 x = _mm_load_ps(src); + __m128 y = _mm_load_ps(dst); + + y = _mm_fmadd_ps(xgain, x, y); + _mm_store_ps(dst, y); + + src += 4; + dst += 4; + frames -= 4; + continue; + } + + // Pointers are aligned to float boundaries (4 bytes) + __m128 x = _mm_load_ss(src); + __m128 y = _mm_load_ss(dst); + + y = _mm_fmadd_ss(xgain, x, y); + _mm_store_ss(dst, y); + + ++src; + ++dst; + --frames; } + + // Process the remaining samples 128 at a time + while (frames >= 128) { + _mm_prefetch(reinterpret_cast(src + 128), _mm_hint(0)); + _mm_prefetch(reinterpret_cast(dst + 128), _mm_hint(0)); + + __m512 x0 = _mm512_load_ps(src + 0); + __m512 x1 = _mm512_load_ps(src + 16); + __m512 x2 = _mm512_load_ps(src + 32); + __m512 x3 = _mm512_load_ps(src + 48); + __m512 x4 = _mm512_load_ps(src + 64); + __m512 x5 = _mm512_load_ps(src + 80); + __m512 x6 = _mm512_load_ps(src + 96); + __m512 x7 = _mm512_load_ps(src + 112); + + __m512 y0 = _mm512_load_ps(dst + 0); + __m512 y1 = _mm512_load_ps(dst + 16); + __m512 y2 = _mm512_load_ps(dst + 32); + __m512 y3 = _mm512_load_ps(dst + 48); + __m512 y4 = _mm512_load_ps(dst + 64); + __m512 y5 = _mm512_load_ps(dst + 80); + __m512 y6 = _mm512_load_ps(dst + 96); + __m512 y7 = _mm512_load_ps(dst + 112); + + y0 = _mm512_fmadd_ps(zgain, x0, y0); + y1 = _mm512_fmadd_ps(zgain, x1, y1); + y2 = _mm512_fmadd_ps(zgain, x2, y2); + y3 = _mm512_fmadd_ps(zgain, x3, y3); + y4 = _mm512_fmadd_ps(zgain, x4, y4); + y5 = _mm512_fmadd_ps(zgain, x5, y5); + y6 = _mm512_fmadd_ps(zgain, x6, y6); + y7 = _mm512_fmadd_ps(zgain, x7, y7); + + _mm512_store_ps(dst + 0, y0); + _mm512_store_ps(dst + 16, y1); + _mm512_store_ps(dst + 32, y2); + _mm512_store_ps(dst + 48, y3); + _mm512_store_ps(dst + 64, y4); + _mm512_store_ps(dst + 80, y5); + _mm512_store_ps(dst + 96, y6); + _mm512_store_ps(dst + 112, y7); + + src += 128; + dst += 128; + frames -= 128; + } + + // Process the remaining samples 16 at a time + while (frames >= 16) { + __m512 x = _mm512_load_ps(src); + __m512 y = _mm512_load_ps(dst); + y = _mm512_fmadd_ps(zgain, x, y); + _mm512_store_ps(dst, y); + + src += 16; + dst += 16; + frames -= 16; + } + + // Process remaining samples x8 + while (frames >= 8) { + __m256 x = _mm256_load_ps(src); + __m256 y = _mm256_load_ps(dst); + + y = _mm256_fmadd_ps(ygain, x, y); + _mm256_store_ps(dst, y); + + src += 8; + dst += 8; + frames -= 8; + } + + // Process remaining samples x4 + while (frames >= 4) { + __m128 x = _mm_load_ps(src); + __m128 y = _mm_load_ps(dst); + + y = _mm_fmadd_ps(xgain, x, y); + _mm_store_ps(dst, y); + + src += 4; + dst += 4; + frames -= 4; + } + + // Process remaining samples + while (frames > 0) { + __m128 x = _mm_load_ss(src); + __m128 y = _mm_load_ss(dst); + + y = _mm_fmadd_ss(xgain, x, y); + _mm_store_ss(dst, y); + + ++src; + ++dst; + --frames; + } + + // There's a penalty going from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + // + _mm256_zeroupper(); // zeros the upper portion of YMM register } /** - * @brief x86-64 AVX optimized routine for mixing buffer with no gain. - * - * This function may choose SSE over AVX if the pointers are aligned - * to 16 byte boundary instead of 32 byte boundary to reduce time to - * process. - * + * @brief x86-64 AVX-512F optimized routine for mixing buffer with no gain. * @param[in,out] dst Pointer to destination buffer, which gets updated * @param[in] src Pointer to source buffer (not updated) * @param nframes Number of samples to process @@ -357,25 +790,157 @@ x86_avx512f_mix_buffers_with_gain(float *dst, const float *src, uint32_t nframes void x86_avx512f_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes) { - if (IS_ALIGNED_TO(dst, 32) && IS_ALIGNED_TO(src, 32)) { - // Pointers are both aligned to 32 bit boundaries, this can be processed with AVX - x86_avx512f_mix_buffers_no_gain_aligned(dst, src, nframes); - } else if (IS_ALIGNED_TO(dst, 16) && IS_ALIGNED_TO(src, 16)) { - // This can still be processed with SSE - x86_sse_mix_buffers_no_gain(dst, src, nframes); - } else { - // Pointers are unaligned, so process them with unaligned load/store AVX - x86_avx512f_mix_buffers_no_gain_unaligned(dst, src, nframes); + // Convert to signed integer to prevent any arithmetic overflow errors + int32_t frames = static_cast(nframes); + + while (frames > 0) + { + if (IS_ALIGNED_TO(src, sizeof(__m512)) && + IS_ALIGNED_TO(dst, sizeof(__m512))) { + break; + } + + if (IS_ALIGNED_TO(src, sizeof(__m256)) && + IS_ALIGNED_TO(dst, sizeof(__m256))) { + __m256 x = _mm256_load_ps(src); + __m256 y = _mm256_load_ps(dst); + y = _mm256_add_ps(x, y); + _mm256_store_ps(dst, y); + src += 8; + dst += 8; + frames -= 8; + continue; + } + + if (IS_ALIGNED_TO(src, sizeof(__m128)) && + IS_ALIGNED_TO(dst, sizeof(__m128))) { + __m128 x = _mm_load_ps(src); + __m128 y = _mm_load_ps(dst); + y = _mm_add_ps(x, y); + _mm_store_ps(dst, y); + src += 4; + dst += 4; + frames -= 4; + continue; + } + + // Pointers are aligned to float boundaries (4 bytes) + __m128 x = _mm_load_ss(src); + __m128 y = _mm_load_ss(dst); + y = _mm_add_ss(x, y); + _mm_store_ss(dst, y); + ++src; + ++dst; + --frames; } + + // Process the remaining samples 128 at a time + while (frames >= 128) { + _mm_prefetch(reinterpret_cast(src + 128), _mm_hint(0)); + _mm_prefetch(reinterpret_cast(dst + 128), _mm_hint(0)); + + __m512 x0 = _mm512_load_ps(src + 0); + __m512 x1 = _mm512_load_ps(src + 16); + __m512 x2 = _mm512_load_ps(src + 32); + __m512 x3 = _mm512_load_ps(src + 48); + __m512 x4 = _mm512_load_ps(src + 64); + __m512 x5 = _mm512_load_ps(src + 80); + __m512 x6 = _mm512_load_ps(src + 96); + __m512 x7 = _mm512_load_ps(src + 112); + + __m512 y0 = _mm512_load_ps(dst + 0); + __m512 y1 = _mm512_load_ps(dst + 16); + __m512 y2 = _mm512_load_ps(dst + 32); + __m512 y3 = _mm512_load_ps(dst + 48); + __m512 y4 = _mm512_load_ps(dst + 64); + __m512 y5 = _mm512_load_ps(dst + 80); + __m512 y6 = _mm512_load_ps(dst + 96); + __m512 y7 = _mm512_load_ps(dst + 112); + + y0 = _mm512_add_ps(x0, y0); + y1 = _mm512_add_ps(x1, y1); + y2 = _mm512_add_ps(x2, y2); + y3 = _mm512_add_ps(x3, y3); + y4 = _mm512_add_ps(x4, y4); + y5 = _mm512_add_ps(x5, y5); + y6 = _mm512_add_ps(x6, y6); + y7 = _mm512_add_ps(x7, y7); + + _mm512_store_ps(dst + 0, y0); + _mm512_store_ps(dst + 16, y1); + _mm512_store_ps(dst + 32, y2); + _mm512_store_ps(dst + 48, y3); + _mm512_store_ps(dst + 64, y4); + _mm512_store_ps(dst + 80, y5); + _mm512_store_ps(dst + 96, y6); + _mm512_store_ps(dst + 112, y7); + + src += 128; + dst += 128; + frames -= 128; + } + + // Process the remaining samples 16 at a time + while (frames >= 16) { + __m512 x = _mm512_load_ps(src); + __m512 y = _mm512_load_ps(dst); + + y = _mm512_add_ps(x, y); + _mm512_store_ps(dst, y); + + src += 16; + dst += 16; + frames -= 16; + } + + // Process remaining samples x8 + while (frames >= 8) { + __m256 x = _mm256_load_ps(src); + __m256 y = _mm256_load_ps(dst); + + y = _mm256_add_ps(x, y); + _mm256_store_ps(dst, y); + + src += 8; + dst += 8; + frames -= 8; + } + + // Process remaining samples x4 + while (frames >= 4) { + __m128 x = _mm_load_ps(src); + __m128 y = _mm_load_ps(dst); + + y = _mm_add_ps(x, y); + _mm_store_ps(dst, y); + + src += 4; + dst += 4; + frames -= 4; + } + + // Process remaining samples + while (frames > 0) { + __m128 x = _mm_load_ss(src); + __m128 y = _mm_load_ss(dst); + + y = _mm_add_ss(x, y); + _mm_store_ss(dst, y); + + ++src; + ++dst; + --frames; + } + + // There's a penalty going from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + // + _mm256_zeroupper(); // zeros the upper portion of YMM register } /** * @brief Copy vector from one location to another - * - * This has not been hand optimized for AVX with the rationale that standard - * C library implementation will provide faster memory copy operation. It will - * be redundant to implement memcpy for floats. - * * @param[out] dst Pointer to destination buffer * @param[in] src Pointer to source buffer * @param nframes Number of samples to copy @@ -383,429 +948,154 @@ x86_avx512f_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes) void x86_avx512f_copy_vector(float *dst, const float *src, uint32_t nframes) { - (void) memcpy(dst, src, nframes * sizeof(float)); -} + // Convert to signed integer to prevent any arithmetic overflow errors + int32_t frames = static_cast(nframes); -/** - * Local helper functions - */ - -/** - * @brief Helper routine for mixing buffers with gain for unaligned buffers - * - * @details This routine executes the following expression below per element: - * - * dst = dst + (gain * src) - * - * @param[in,out] dst Pointer to destination buffer, which gets updated - * @param[in] src Pointer to source buffer (not updated) - * @param nframes Number of samples to process - * @param gain Gain to apply - */ -static void -x86_avx512f_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32_t nframes, float gain) -{ - // Load gain vector to all elements of YMM register - __m256 vgain = _mm256_set1_ps(gain); - - // Process the remaining samples 16 at a time - while (nframes >= 16) - { -#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) - _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); - _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0)); -#else - __builtin_prefetch(reinterpret_cast(src + 16), 0, 0); - __builtin_prefetch(reinterpret_cast(dst + 16), 0, 0); -#endif - __m256 s0, s1; - __m256 d0, d1; - - // Load sources - s0 = _mm256_loadu_ps(src + 0); - s1 = _mm256_loadu_ps(src + 8); - - // Load destinations - d0 = _mm256_loadu_ps(dst + 0); - d1 = _mm256_loadu_ps(dst + 8); - - // src = src * gain - s0 = _mm256_mul_ps(vgain, s0); - s1 = _mm256_mul_ps(vgain, s1); - - // dst = dst + src - d0 = _mm256_add_ps(d0, s0); - d1 = _mm256_add_ps(d1, s1); - - // Store result - _mm256_storeu_ps(dst + 0, d0); - _mm256_storeu_ps(dst + 8, d1); - - // Update pointers and counters - src += 16; - dst += 16; - nframes -= 16; - } - - // Process the remaining samples 8 at a time - while (nframes >= 8) { - __m256 s0, d0; - // Load sources - s0 = _mm256_loadu_ps(src); - // Load destinations - d0 = _mm256_loadu_ps(dst); - // src = src * gain - s0 = _mm256_mul_ps(vgain, s0); - // dst = dst + src - d0 = _mm256_add_ps(d0, s0); - // Store result - _mm256_storeu_ps(dst, d0); - // Update pointers and counters - src+= 8; - dst += 8; - nframes -= 8; - } - - // Process the remaining samples - do { - __m128 g0 = _mm_set_ss(gain); - while (nframes > 0) { - __m128 s0, d0; - s0 = _mm_load_ss(src); - d0 = _mm_load_ss(dst); - s0 = _mm_mul_ss(g0, s0); - d0 = _mm_add_ss(d0, s0); - _mm_store_ss(dst, d0); - ++src; - ++dst; - --nframes; + while (frames > 0) { + if (IS_ALIGNED_TO(src, sizeof(__m512)) && + IS_ALIGNED_TO(dst, sizeof(__m512))) { + break; } - } while (0); -} -/** - * @brief Helper routine for mixing buffers with gain for aligned buffers - * - * @details This routine executes the following expression below per element: - * - * dst = dst + (gain * src) - * - * @param[in,out] dst Pointer to destination buffer, which gets updated - * @param[in] src Pointer to source buffer (not updated) - * @param nframes Number of samples to process - * @param gain Gain to apply - */ -static void -x86_avx512f_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t nframes, float gain) -{ - // Load gain vector to all elements of YMM register - __m256 vgain = _mm256_set1_ps(gain); + if (IS_ALIGNED_TO(src, sizeof(__m256)) && + IS_ALIGNED_TO(dst, sizeof(__m256))) { + __m256 x = _mm256_load_ps(src); + _mm256_store_ps(dst, x); + src += 8; + dst += 8; + frames -= 8; + continue; + } - // Process the remaining samples 16 at a time - while (nframes >= 16) - { -#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) - _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); - _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0)); -#else - __builtin_prefetch(reinterpret_cast(src + 16), 0, 0); - __builtin_prefetch(reinterpret_cast(dst + 16), 0, 0); -#endif - __m256 s0, s1; - __m256 d0, d1; + if (IS_ALIGNED_TO(src, sizeof(__m128)) && + IS_ALIGNED_TO(dst, sizeof(__m128))) { + __m128 x = _mm_load_ps(src); + _mm_store_ps(dst, x); + src += 4; + dst += 4; + frames -= 4; + continue; + } - // Load sources - s0 = _mm256_load_ps(src + 0); - s1 = _mm256_load_ps(src + 8); - - // Load destinations - d0 = _mm256_load_ps(dst + 0); - d1 = _mm256_load_ps(dst + 8); - - // src = src * gain - s0 = _mm256_mul_ps(vgain, s0); - s1 = _mm256_mul_ps(vgain, s1); - - // dst = dst + src - d0 = _mm256_add_ps(d0, s0); - d1 = _mm256_add_ps(d1, s1); - - // Store result - _mm256_store_ps(dst + 0, d0); - _mm256_store_ps(dst + 8, d1); - - // Update pointers and counters - src += 16; - dst += 16; - nframes -= 16; + // Pointers are aligned to float boundaries (4 bytes) + __m128 x = _mm_load_ss(src); + _mm_store_ss(dst, x); + ++src; + ++dst; + --frames; } - // Process the remaining samples 8 at a time - while (nframes >= 8) { - __m256 s0, d0; - // Load sources - s0 = _mm256_load_ps(src + 0 ); - // Load destinations - d0 = _mm256_load_ps(dst + 0 ); - // src = src * gain - s0 = _mm256_mul_ps(vgain, s0); - // dst = dst + src - d0 = _mm256_add_ps(d0, s0); - // Store result - _mm256_store_ps(dst, d0); - // Update pointers and counters + // Process 256 samples at a time + while (frames >= 256) { + _mm_prefetch(reinterpret_cast(src + 256), _mm_hint(0)); + _mm_prefetch(reinterpret_cast(dst + 256), _mm_hint(0)); + + __m512 x0 = _mm512_load_ps(src + 0); + __m512 x1 = _mm512_load_ps(src + 16); + __m512 x2 = _mm512_load_ps(src + 32); + __m512 x3 = _mm512_load_ps(src + 48); + __m512 x4 = _mm512_load_ps(src + 64); + __m512 x5 = _mm512_load_ps(src + 80); + __m512 x6 = _mm512_load_ps(src + 96); + __m512 x7 = _mm512_load_ps(src + 112); + + __m512 x8 = _mm512_load_ps(src + 128); + __m512 x9 = _mm512_load_ps(src + 144); + __m512 x10 = _mm512_load_ps(src + 160); + __m512 x11 = _mm512_load_ps(src + 176); + __m512 x12 = _mm512_load_ps(src + 192); + __m512 x13 = _mm512_load_ps(src + 208); + __m512 x14 = _mm512_load_ps(src + 224); + __m512 x15 = _mm512_load_ps(src + 240); + + _mm512_store_ps(dst + 0, x0); + _mm512_store_ps(dst + 16, x1); + _mm512_store_ps(dst + 32, x2); + _mm512_store_ps(dst + 48, x3); + _mm512_store_ps(dst + 64, x4); + _mm512_store_ps(dst + 80, x5); + _mm512_store_ps(dst + 96, x6); + _mm512_store_ps(dst + 112, x7); + + _mm512_store_ps(dst + 128, x8); + _mm512_store_ps(dst + 144, x9); + _mm512_store_ps(dst + 160, x10); + _mm512_store_ps(dst + 176, x11); + _mm512_store_ps(dst + 192, x12); + _mm512_store_ps(dst + 208, x13); + _mm512_store_ps(dst + 224, x14); + _mm512_store_ps(dst + 240, x15); + + src += 256; + dst += 256; + frames -= 256; + } + + // Process remaining samples 64 at a time + while (frames >= 64) { + _mm_prefetch(reinterpret_cast(src + 64), _mm_hint(0)); + _mm_prefetch(reinterpret_cast(dst + 64), _mm_hint(0)); + + __m512 x0 = _mm512_load_ps(src + 0); + __m512 x1 = _mm512_load_ps(src + 16); + __m512 x2 = _mm512_load_ps(src + 32); + __m512 x3 = _mm512_load_ps(src + 48); + + _mm512_store_ps(dst + 0, x0); + _mm512_store_ps(dst + 16, x1); + _mm512_store_ps(dst + 32, x2); + _mm512_store_ps(dst + 48, x3); + + src += 64; + dst += 64; + frames -= 64; + } + + // Process remaining samples 16 at a time + while (frames >= 16) { + __m512 x = _mm512_load_ps(src); + _mm512_store_ps(dst, x); + + src += 16; + dst += 16; + frames -= 16; + } + + // Process remaining samples x8 + while (frames >= 8) { + __m256 x = _mm256_load_ps(src); + _mm256_store_ps(dst, x); + src += 8; dst += 8; - nframes -= 8; + frames -= 8; } - // Process the remaining samples, one sample at a time. - do { - __m128 g0 = _mm256_castps256_ps128(vgain); // use the same register - while (nframes > 0) { - __m128 s0, d0; - s0 = _mm_load_ss(src); - d0 = _mm_load_ss(dst); - s0 = _mm_mul_ss(g0, s0); - d0 = _mm_add_ss(d0, s0); - _mm_store_ss(dst, d0); - ++src; - ++dst; - --nframes; - } - } while (0); + // Process remaining samples x4 + while (frames >= 4) { + __m128 x = _mm_load_ps(src); + _mm_store_ps(dst, x); + + src += 4; + dst += 4; + frames -= 4; + } + + // Process remaining samples + while (frames > 0) { + __m128 x = _mm_load_ss(src); + _mm_store_ss(dst, x); + + ++src; + ++dst; + --frames; + } + + // There's a penalty going from AVX mode to SSE mode. This can + // be avoided by ensuring the CPU that rest of the routine is no + // longer interested in the upper portion of the YMM register. + + _mm256_zeroupper(); // zeros the upper portion of YMM register } -/** - * @brief Helper routine for mixing buffers with no gain for aligned buffers - * - * @details This routine executes the following expression below per element: - * - * dst = dst + src - * - * @param[in,out] dst Pointer to destination buffer, which gets updated - * @param[in] src Pointer to source buffer (not updated) - * @param nframes Number of samples to process - */ -static void -x86_avx512f_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t nframes) -{ - // Process the remaining samples 16 at a time - while (nframes >= 16) - { -#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) - _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); - _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0)); -#else - __builtin_prefetch(reinterpret_cast(src + 16), 0, 0); - __builtin_prefetch(reinterpret_cast(dst + 16), 0, 0); -#endif - __m256 s0, s1; - __m256 d0, d1; - - // Load sources - s0 = _mm256_loadu_ps(src + 0); - s1 = _mm256_loadu_ps(src + 8); - - // Load destinations - d0 = _mm256_loadu_ps(dst + 0); - d1 = _mm256_loadu_ps(dst + 8); - - // dst = dst + src - d0 = _mm256_add_ps(d0, s0); - d1 = _mm256_add_ps(d1, s1); - - // Store result - _mm256_storeu_ps(dst + 0, d0); - _mm256_storeu_ps(dst + 8, d1); - - // Update pointers and counters - src += 16; - dst += 16; - nframes -= 16; - } - - // Process the remaining samples 8 at a time - while (nframes >= 8) { - __m256 s0, d0; - // Load sources - s0 = _mm256_loadu_ps(src); - // Load destinations - d0 = _mm256_loadu_ps(dst); - // dst = dst + src - d0 = _mm256_add_ps(d0, s0); - // Store result - _mm256_storeu_ps(dst, d0); - // Update pointers and counters - src+= 8; - dst += 8; - nframes -= 8; - } - - // Process the remaining samples - do { - while (nframes > 0) { - __m128 s0, d0; - s0 = _mm_load_ss(src); - d0 = _mm_load_ss(dst); - d0 = _mm_add_ss(d0, s0); - _mm_store_ss(dst, d0); - ++src; - ++dst; - --nframes; - } - } while (0); - -} - -/** - * @brief Helper routine for mixing buffers with no gain for unaligned buffers - * - * @details This routine executes the following expression below per element: - * - * dst = dst + src - * - * @param[in,out] dst Pointer to destination buffer, which gets updated - * @param[in] src Pointer to source buffer (not updated) - * @param nframes Number of samples to process - */ -static void -x86_avx512f_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t nframes) -{ - // Process the aligned portion 32 samples at a time - while (nframes >= 32) - { -#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) - _mm_prefetch(((char *)dst + (32 * sizeof(float))), _mm_hint(0)); - _mm_prefetch(((char *)src + (32 * sizeof(float))), _mm_hint(0)); -#else - __builtin_prefetch(reinterpret_cast(src + 32), 0, 0); - __builtin_prefetch(reinterpret_cast(dst + 32), 0, 0); -#endif - __m256 s0, s1, s2, s3; - __m256 d0, d1, d2, d3; - - // Load sources - s0 = _mm256_load_ps(src + 0 ); - s1 = _mm256_load_ps(src + 8 ); - s2 = _mm256_load_ps(src + 16); - s3 = _mm256_load_ps(src + 24); - - // Load destinations - d0 = _mm256_load_ps(dst + 0 ); - d1 = _mm256_load_ps(dst + 8 ); - d2 = _mm256_load_ps(dst + 16); - d3 = _mm256_load_ps(dst + 24); - - // dst = dst + src - d0 = _mm256_add_ps(d0, s0); - d1 = _mm256_add_ps(d1, s1); - d2 = _mm256_add_ps(d2, s2); - d3 = _mm256_add_ps(d3, s3); - - // Store result - _mm256_store_ps(dst + 0 , d0); - _mm256_store_ps(dst + 8 , d1); - _mm256_store_ps(dst + 16, d2); - _mm256_store_ps(dst + 24, d3); - - // Update pointers and counters - src += 32; - dst += 32; - nframes -= 32; - } - - // Process the remaining samples 16 at a time - while (nframes >= 16) - { -#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) - _mm_prefetch(((char *)dst + (16 * sizeof(float))), _mm_hint(0)); - _mm_prefetch(((char *)src + (16 * sizeof(float))), _mm_hint(0)); -#else - __builtin_prefetch(reinterpret_cast(src + 16), 0, 0); - __builtin_prefetch(reinterpret_cast(dst + 16), 0, 0); -#endif - __m256 s0, s1; - __m256 d0, d1; - - // Load sources - s0 = _mm256_load_ps(src + 0); - s1 = _mm256_load_ps(src + 8); - - // Load destinations - d0 = _mm256_load_ps(dst + 0); - d1 = _mm256_load_ps(dst + 8); - - // dst = dst + src - d0 = _mm256_add_ps(d0, s0); - d1 = _mm256_add_ps(d1, s1); - - // Store result - _mm256_store_ps(dst + 0, d0); - _mm256_store_ps(dst + 8, d1); - - // Update pointers and counters - src += 16; - dst += 16; - nframes -= 16; - } - - // Process the remaining samples 8 at a time - while (nframes >= 8) { - __m256 s0, d0; - // Load sources - s0 = _mm256_load_ps(src + 0 ); - // Load destinations - d0 = _mm256_load_ps(dst + 0 ); - // dst = dst + src - d0 = _mm256_add_ps(d0, s0); - // Store result - _mm256_store_ps(dst, d0); - // Update pointers and counters - src += 8; - dst += 8; - nframes -= 8; - } - - // Process the remaining samples - do { - while (nframes > 0) { - __m128 s0, d0; - s0 = _mm_load_ss(src); - d0 = _mm_load_ss(dst); - d0 = _mm_add_ss(d0, s0); - _mm_store_ss(dst, d0); - ++src; - ++dst; - --nframes; - } - } while (0); -} - -/** - * @brief Get the maximum value of packed float register - * @param vmax Packed float 8x register - * @return __m256 Maximum value in p[0] - */ -static inline __m256 avx_getmax_ps(__m256 vmax) -{ - vmax = _mm256_max_ps(vmax, _mm256_permute2f128_ps(vmax, vmax, 1)); - vmax = _mm256_max_ps(vmax, _mm256_permute_ps(vmax, _MM_SHUFFLE(0, 0, 3, 2))); - vmax = _mm256_max_ps(vmax, _mm256_permute_ps(vmax, _MM_SHUFFLE(0, 0, 0, 1))); - return vmax; -} - -/** - * @brief Get the minimum value of packed float register - * @param vmax Packed float 8x register - * @return __m256 Minimum value in p[0] - */ -static inline __m256 avx_getmin_ps(__m256 vmin) -{ - vmin = _mm256_min_ps(vmin, _mm256_permute2f128_ps(vmin, vmin, 1)); - vmin = _mm256_min_ps(vmin, _mm256_permute_ps(vmin, _MM_SHUFFLE(0, 0, 3, 2))); - vmin = _mm256_min_ps(vmin, _mm256_permute_ps(vmin, _MM_SHUFFLE(0, 0, 0, 1))); - return vmin; -} - -#endif // FPU_AVX512_SUPPORT \ No newline at end of file +#endif // FPU_AVX512F_SUPPORT