aarch64: fix unaligned count and peak computation

Clamp misaligned prefix loops to the remaining frame count to avoid nframes underflow and potential out-of-bounds access for small buffers. Apply vabsq_f32() to all SIMD loads before peak reduction to ensure correct absolute peak calculation in unrolled NEON paths.
2025-12-29 20:12:43 -05:00
parent b673989763
commit 98eab68044
1 changed files with 26 additions and 6 deletions
--- a/libs/ardour/aarch64_neon_functions.cc
+++ b/libs/ardour/aarch64_neon_functions.cc
@@ -24,6 +24,7 @@
 #include <arm_neon.h>

 #include <cstddef>
+#include <algorithm>

 /**
 * @brief Aligns a pointer to the next 16-byte boundary
@@ -78,12 +79,18 @@ arm_neon_compute_peak(const float* src, uint32_t nframes, float current)
 	if (UNLIKELY(src_aligned != src))
 	{
 		size_t unaligned_count = src_aligned - src;
-		for (size_t i = 0; i < unaligned_count; i++)
+
+		// Clamp the scalar prefix to nframes so a buffer shorter than the unaligned span is not overrun
+		size_t count = std::min<size_t>(unaligned_count, nframes);
+
+		for (size_t i = 0; i < count; i++)
 		{
 			float32x4_t x0 = vld1q_dup_f32(src + i);
+			x0 = vabsq_f32(x0);
 			vmax = vmaxq_f32(vmax, x0);
 		}
-		nframes -= unaligned_count;
+
+		nframes -= count;
 	}

 	// Compute the number of SIMD frames
@@ -106,6 +113,11 @@ arm_neon_compute_peak(const float* src, uint32_t nframes, float current)
 			x2 = vld1q_f32(src_aligned + offset + (2 * 4));
 			x3 = vld1q_f32(src_aligned + offset + (3 * 4));

+			x0 = vabsq_f32(x0);
+			x1 = vabsq_f32(x1);
+			x2 = vabsq_f32(x2);
+			x3 = vabsq_f32(x3);
+
 			max0 = vmaxq_f32(x0, x1);
 			max1 = vmaxq_f32(x2, x3);
 			max2 = vmaxq_f32(max0, max1);
@@ -182,13 +194,18 @@ arm_neon_find_peaks(const float* src, uint32_t nframes, float* minf, float* maxf
 	if (UNLIKELY(src_aligned != src))
 	{
 		size_t unaligned_count = src_aligned - src;
-		for (size_t i = 0; i < unaligned_count; i++)
+
+		// Clamp the scalar prefix to nframes so a buffer shorter than the unaligned span is not overrun
+		size_t count = std::min<size_t>(unaligned_count, nframes);
+
+		for (size_t i = 0; i < count; i++)
 		{
 			float32x4_t x0 = vld1q_dup_f32(src + i);
 			vmax = vmaxq_f32(vmax, x0);
 			vmin = vminq_f32(vmin, x0);
 		}
-		nframes -= unaligned_count;
+
+		nframes -= count;
 	}

 	// Compute the number of SIMD frames
@@ -299,7 +316,9 @@ arm_neon_apply_gain_to_buffer(float* dst, uint32_t nframes, float gain)
 	if (UNLIKELY(dst_aligned != dst))
 	{
 		size_t unaligned_count = dst_aligned - dst;
-		for (size_t i = 0; i < unaligned_count; i++)
+		size_t count = std::min<size_t>(unaligned_count, nframes);
+
+		for (size_t i = 0; i < count; i++)
 		{
 			float32_t x0, y0;

@@ -307,7 +326,8 @@ arm_neon_apply_gain_to_buffer(float* dst, uint32_t nframes, float gain)
 			y0 = x0 * gain;
 			dst[i] = y0;
 		}
-		nframes -= unaligned_count;
+
+		nframes -= count;
 	}

 	// Compute the number of SIMD frames