From c8c57f14bf5a41b64adf249acb9d1ff64393a9e1 Mon Sep 17 00:00:00 2001
From: Ayan Shafqat
Date: Thu, 20 Aug 2020 20:15:39 -0400
Subject: [PATCH] Adding ARM NEON optimized routines

This commit adds ARM NEON optimized routines for the following
procedures:

*_compute_peak
*_find_peaks
*_apply_gain_to_buffer
*_mix_buffers_with_gain
*_mix_buffers_no_gain
*_copy_vector

NEON optimized routines have a prefix of: arm_neon_
---
 libs/ardour/ardour/mix.h          |  12 +
 libs/ardour/arm_neon_functions.cc | 518 ++++++++++++++++++++++++++++++
 libs/ardour/globals.cc            |  13 +
 libs/ardour/wscript               |   1 +
 4 files changed, 544 insertions(+)
 create mode 100644 libs/ardour/arm_neon_functions.cc

diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h
index a394a64ce2..4132e8e371 100644
--- a/libs/ardour/ardour/mix.h
+++ b/libs/ardour/ardour/mix.h
@@ -70,6 +70,18 @@ LIBARDOUR_API void veclib_mix_buffers_no_gain (ARDOUR::Sample * dst, cons
 
 #endif
 
+/* Optimized NEON functions */
+#if defined(__arm__) && !defined(__APPLE__)
+extern "C" {
+	LIBARDOUR_API float arm_neon_compute_peak (const float * buf, uint32_t nsamples, float current);
+	LIBARDOUR_API void arm_neon_apply_gain_to_buffer (float * buf, uint32_t nframes, float gain);
+	LIBARDOUR_API void arm_neon_copy_vector (float * dst, const float * src, uint32_t nframes);
+	LIBARDOUR_API void arm_neon_find_peaks (const float *src, uint32_t nframes, float *minf, float *maxf);
+	LIBARDOUR_API void arm_neon_mix_buffers_no_gain (float * dst, const float * src, uint32_t nframes);
+	LIBARDOUR_API void arm_neon_mix_buffers_with_gain (float * dst, const float * src, uint32_t nframes, float gain);
+}
+#endif
+
 /* non-optimized functions */
 
 LIBARDOUR_API float default_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
diff --git a/libs/ardour/arm_neon_functions.cc b/libs/ardour/arm_neon_functions.cc
new file mode 100644
index 0000000000..347a8823eb
--- /dev/null
+++ b/libs/ardour/arm_neon_functions.cc
@@ -0,0 +1,518 @@
+/*
+ * Copyright (C) 2020 Ayan Shafqat
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "ardour/mix.h"
+
+#if defined(__arm__) && defined(__ARM_NEON)
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
+
+#ifdef __cplusplus
+#define C_FUNC extern "C"
+#else
+#define C_FUNC
+#endif
+
+C_FUNC float
+arm_neon_compute_peak(const float *src, uint32_t nframes, float current)
+{
+	float32x4_t vc0;
+
+	// Broadcast single value to all elements of the register
+	vc0 = vdupq_n_f32(current);
+
+	// While pointer is not aligned, process one sample at a time
+	while (!IS_ALIGNED_TO(src, sizeof(float32x4_t)) && (nframes > 0)) {
+		float32x4_t x0;
+
+		x0 = vld1q_dup_f32(src);
+		x0 = vabsq_f32(x0);
+		vc0 = vmaxq_f32(vc0, x0);
+
+		++src;
+		--nframes;
+	}
+
+	// SIMD portion with aligned buffers
+	do {
+		while (nframes >= 8) {
+			float32x4_t x0, x1;
+
+			x0 = vld1q_f32(src + 0);
+			x1 = vld1q_f32(src + 4);
+
+			x0 = vabsq_f32(x0);
+			x1 = vabsq_f32(x1);
+
+			vc0 = vmaxq_f32(vc0, x0);
+			vc0 = vmaxq_f32(vc0, x1);
+
+			src += 8;
+			nframes -= 8;
+		}
+
+		while (nframes >= 4) {
+			float32x4_t x0;
+
+			x0 = vld1q_f32(src);
+
+			x0 = vabsq_f32(x0);
+			vc0 = vmaxq_f32(vc0, x0);
+
+			src += 4;
+			nframes -= 4;
+		}
+
+		while (nframes >= 2) {
+			float32x2_t x0;
+			float32x4_t y0;
+
+
+			x0 = vld1_f32(src);        // Load two elements
+			x0 = vabs_f32(x0);         // Compute ABS value
+			y0 = vcombine_f32(x0, x0); // Combine two vectors
+
+			vc0 = vmaxq_f32(vc0, y0);
+
+			src += 2;
+			nframes -= 2;
+		}
+	} while (0);
+
+
+	// Do remaining samples one frame at a time
+	while (nframes > 0) {
+		float32x4_t x0;
+
+		x0 = vld1q_dup_f32(src);
+		x0 = vabsq_f32(x0);
+		vc0 = vmaxq_f32(vc0, x0);
+
+		++src;
+		--nframes;
+	}
+
+	// Compute the max in register
+	do {
+		float32x2_t vlo = vget_low_f32(vc0);
+		float32x2_t vhi = vget_high_f32(vc0);
+		float32x2_t max0 = vpmax_f32(vlo, vhi);
+		float32x2_t max1 = vpmax_f32(max0, max0); // Max is now at max1[0]
+		current = vget_lane_f32(max1, 0);
+	} while (0);
+
+	return current;
+}
+
+C_FUNC void
+arm_neon_find_peaks(const float *src, uint32_t nframes, float *minf, float *maxf)
+{
+	float32x4_t vmin, vmax;
+
+	// Broadcast single value to all elements of the register
+	vmin = vld1q_dup_f32(minf);
+	vmax = vld1q_dup_f32(maxf);
+
+	// While pointer is not aligned, process one sample at a time
+	while (!IS_ALIGNED_TO(src, sizeof(float32x4_t)) && (nframes > 0)) {
+		float32x4_t x0;
+
+		x0 = vld1q_dup_f32(src);
+		vmax = vmaxq_f32(vmax, x0);
+		vmin = vminq_f32(vmin, x0);
+
+		++src;
+		--nframes;
+	}
+
+	// SIMD portion with aligned buffers
+	do {
+		while (nframes >= 8) {
+			float32x4_t x0, x1;
+
+			x0 = vld1q_f32(src + 0);
+			x1 = vld1q_f32(src + 4);
+
+			vmax = vmaxq_f32(vmax, x0);
+			vmax = vmaxq_f32(vmax, x1);
+
+			vmin = vminq_f32(vmin, x0);
+			vmin = vminq_f32(vmin, x1);
+
+			src += 8;
+			nframes -= 8;
+		}
+
+		while (nframes >= 4) {
+			float32x4_t x0;
+
+			x0 = vld1q_f32(src);
+
+			vmax = vmaxq_f32(vmax, x0);
+			vmin = vminq_f32(vmin, x0);
+
+			src += 4;
+			nframes -= 4;
+		}
+
+		while (nframes >= 2) {
+			float32x2_t x0;
+			float32x4_t y0;
+
+
+			x0 = vld1_f32(src);        // Load two elements
+			y0 = vcombine_f32(x0, x0); // Combine two vectors
+
+			vmax = vmaxq_f32(vmax, y0);
+			vmin = vminq_f32(vmin, y0);
+
+			src += 2;
+			nframes -= 2;
+		}
+	} while (0);
+
+
+	// Do remaining samples one frame at a time
+	while (nframes > 0) {
+		float32x4_t x0;
+
+		x0 = vld1q_dup_f32(src);
+		vmax = vmaxq_f32(vmax, x0);
+		vmin = vminq_f32(vmin, x0);
+
+		++src;
+		--nframes;
+	}
+
+	// Compute the max in register
+	do {
+		float32x2_t vlo = vget_low_f32(vmax);
+		float32x2_t vhi = vget_high_f32(vmax);
+		float32x2_t max0 = vpmax_f32(vlo, vhi);
+		float32x2_t max1 = vpmax_f32(max0, max0); // Max is now at max1[0]
+		vst1_lane_f32(maxf, max1, 0);
+	} while (0);
+
+	// Compute the min in register
+	do {
+		float32x2_t vlo = vget_low_f32(vmin);
+		float32x2_t vhi = vget_high_f32(vmin);
+		float32x2_t min0 = vpmin_f32(vlo, vhi);
+		float32x2_t min1 = vpmin_f32(min0, min0); // min is now at min1[0]
+		vst1_lane_f32(minf, min1, 0);
+	} while (0);
+}
+
+C_FUNC void
+arm_neon_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain)
+{
+	while (!IS_ALIGNED_TO(dst, sizeof(float32x4_t)) && nframes > 0) {
+		float32_t x0, y0;
+
+		x0 = *dst;
+		y0 = x0 * gain;
+		*dst = y0;
+
+		++dst;
+		--nframes;
+	}
+
+	// SIMD portion with aligned buffers
+	do {
+		float32x4_t g0 = vdupq_n_f32(gain);
+
+		while (nframes >= 8) {
+			float32x4_t x0, x1, y0, y1;
+			x0 = vld1q_f32(dst + 0);
+			x1 = vld1q_f32(dst + 4);
+
+			y0 = vmulq_f32(x0, g0);
+			y1 = vmulq_f32(x1, g0);
+
+			vst1q_f32(dst + 0, y0);
+			vst1q_f32(dst + 4, y1);
+
+			dst += 8;
+			nframes -= 8;
+		}
+
+		while (nframes >= 4) {
+			float32x4_t x0, y0;
+
+			x0 = vld1q_f32(dst);
+			y0 = vmulq_f32(x0, g0);
+			vst1q_f32(dst, y0);
+
+			dst += 4;
+			nframes -= 4;
+		}
+
+		while (nframes >= 2) {
+			float32x2_t x0, y0;
+
+			x0 = vld1_f32(dst);
+			y0 = vmul_n_f32(x0, gain);
+			vst1_f32(dst, y0);
+
+			dst += 2;
+			nframes -= 2;
+		}
+	} while (0);
+
+	// Do the remaining portion one sample at a time
+	while (nframes > 0) {
+		float32_t x0, y0;
+
+		x0 = *dst;
+		y0 = x0 * gain;
+		*dst = y0;
+
+		++dst;
+		--nframes;
+	}
+}
+
+C_FUNC void
+arm_neon_mix_buffers_with_gain(
+	float *__restrict dst, const float *__restrict src,
+	uint32_t nframes, float gain)
+{
+	// While buffers aren't aligned, then process one sample at a time
+	while (!(IS_ALIGNED_TO(src, sizeof(float32x4_t)) &&
+	         IS_ALIGNED_TO(dst, sizeof(float32x4_t))) &&
+	       (nframes > 0)) {
+		float32_t x0, y0;
+
+		x0 = *src;
+		y0 = *dst;
+		y0 = y0 + (x0 * gain);
+		*dst = y0;
+
+		++dst;
+		++src;
+		--nframes;
+	}
+
+	// Use NEON when buffers are aligned
+	do {
+		float32x4_t g0 = vdupq_n_f32(gain);
+
+		while (nframes >= 8) {
+			float32x4_t x0, x1, y0, y1;
+			x0 = vld1q_f32(src + 0);
+			x1 = vld1q_f32(src + 4);
+			y0 = vld1q_f32(dst + 0);
+			y1 = vld1q_f32(dst + 4);
+
+			y0 = vmlaq_f32(y0, x0, g0);
+			y1 = vmlaq_f32(y1, x1, g0);
+
+			vst1q_f32(dst + 0, y0);
+			vst1q_f32(dst + 4, y1);
+
+			src += 8;
+			dst += 8;
+			nframes -= 8;
+		}
+
+		while (nframes >= 4) {
+			float32x4_t x0, y0;
+			x0 = vld1q_f32(src);
+			y0 = vld1q_f32(dst);
+
+			y0 = vmlaq_f32(y0, x0, g0);
+
+			vst1q_f32(dst, y0);
+
+			src += 4;
+			dst += 4;
+			nframes -= 4;
+		}
+
+		while (nframes >= 2) {
+			float32x2_t x0, y0;
+			x0 = vld1_f32(src);
+			y0 = vld1_f32(dst);
+
+			y0 = vmla_n_f32(y0, x0, gain);
+
+			vst1_f32(dst, y0);
+
+			src += 2;
+			dst += 2;
+			nframes -= 2;
+		}
+	} while (0);
+
+	// Do the remaining samples
+	while (nframes > 0) {
+
+		float32_t x0, y0;
+
+		x0 = *src;
+		y0 = *dst;
+		y0 = y0 + (x0 * gain);
+		*dst = y0;
+
+		++dst;
+		++src;
+		--nframes;
+	}
+}
+
+C_FUNC void
+arm_neon_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes)
+{
+	// While buffers aren't aligned, then process one sample at a time
+	while (!(IS_ALIGNED_TO(src, sizeof(float32x4_t)) &&
+	         IS_ALIGNED_TO(dst, sizeof(float32x4_t))) &&
+	       (nframes > 0)) {
+		float32_t x0, y0;
+
+		x0 = *src;
+		y0 = *dst;
+		y0 = y0 + x0;
+		*dst = y0;
+
+		++src;
+		++dst;
+		--nframes;
+	}
+
+	// Use NEON when buffers are aligned
+	do {
+		while (nframes >= 8) {
+			float32x4_t x0, x1, y0, y1;
+			x0 = vld1q_f32(src + 0);
+			x1 = vld1q_f32(src + 4);
+			y0 = vld1q_f32(dst + 0);
+			y1 = vld1q_f32(dst + 4);
+
+			y0 = vaddq_f32(y0, x0);
+			y1 = vaddq_f32(y1, x1);
+
+			vst1q_f32(dst + 0, y0);
+			vst1q_f32(dst + 4, y1);
+
+			src += 8;
+			dst += 8;
+			nframes -= 8;
+		}
+
+		while (nframes >= 4) {
+			float32x4_t x0, y0;
+
+			x0 = vld1q_f32(src);
+			y0 = vld1q_f32(dst);
+
+			y0 = vaddq_f32(y0, x0);
+
+			vst1q_f32(dst, y0);
+
+			src += 4;
+			dst += 4;
+			nframes -= 4;
+		}
+	} while (0);
+
+	// Do the remaining samples
+	while (nframes > 0) {
+		float32_t x0, y0;
+
+		x0 = *src;
+		y0 = *dst;
+		y0 = y0 + x0;
+		*dst = y0;
+
+		++src;
+		++dst;
+		--nframes;
+	}
+}
+
+C_FUNC void
+arm_neon_copy_vector(
+	float *__restrict dst, const float *__restrict src,
+	uint32_t nframes)
+{
+	// While buffers aren't aligned, then process one sample at a time
+	while (!(IS_ALIGNED_TO(src, sizeof(float32x4_t)) &&
+	         IS_ALIGNED_TO(dst, sizeof(float32x4_t))) &&
+	       (nframes > 0)) {
+		*dst++ = *src++;
+		--nframes;
+	}
+
+	// Use NEON when buffers are aligned
+	do {
+		while (nframes >= 16) {
+			float32x4_t x0, x1, x2, x3;
+
+			x0 = vld1q_f32(src + 0 );
+			x1 = vld1q_f32(src + 4 );
+			x2 = vld1q_f32(src + 8 );
+			x3 = vld1q_f32(src + 12);
+
+			vst1q_f32(dst + 0 , x0);
+			vst1q_f32(dst + 4 , x1);
+			vst1q_f32(dst + 8 , x2);
+			vst1q_f32(dst + 12, x3);
+
+			src += 16;
+			dst += 16;
+			nframes -= 16;
+		}
+
+		while (nframes >= 8) {
+			float32x4_t x0, x1;
+
+			x0 = vld1q_f32(src + 0);
+			x1 = vld1q_f32(src + 4);
+
+			vst1q_f32(dst + 0, x0);
+			vst1q_f32(dst + 4, x1);
+
+			src += 8;
+			dst += 8;
+			nframes -= 8;
+		}
+
+		while (nframes >= 4) {
+			float32x4_t x0;
+
+			x0 = vld1q_f32(src);
+			vst1q_f32(dst, x0);
+
+			src += 4;
+			dst += 4;
+			nframes -= 4;
+		}
+
+	} while (0);
+
+	// Do the remaining samples
+	while (nframes > 0) {
+		*dst++ = *src++;
+		--nframes;
+	}
+}
+
+#endif /* defined (__arm__) && defined (__ARM_NEON) */
diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc
index b70e51fc4c..f586104bb5 100644
--- a/libs/ardour/globals.cc
+++ b/libs/ardour/globals.cc
@@ -217,6 +217,19 @@ setup_hardware_optimization (bool try_optimization)
 		generic_mix_functions = false;
 	}
 
+#elif defined(__arm__) && !defined(__APPLE__)
+	/* Use NEON routines */
+	do {
+		compute_peak          = arm_neon_compute_peak;
+		find_peaks            = arm_neon_find_peaks;
+		apply_gain_to_buffer  = arm_neon_apply_gain_to_buffer;
+		mix_buffers_with_gain = arm_neon_mix_buffers_with_gain;
+		mix_buffers_no_gain   = arm_neon_mix_buffers_no_gain;
+		copy_vector           = arm_neon_copy_vector;
+
+		generic_mix_functions = false;
+	} while (0);
+
 #elif defined(__APPLE__) && defined(BUILD_VECLIB_OPTIMIZATIONS)
 
 	if (floor (kCFCoreFoundationVersionNumber) > kCFCoreFoundationVersionNumber10_4) { /* at least Tiger */
diff --git a/libs/ardour/wscript b/libs/ardour/wscript
index 49cea6abbd..4a813660cd 100644
--- a/libs/ardour/wscript
+++ b/libs/ardour/wscript
@@ -467,6 +467,7 @@ def build(bld):
     avx_sources = []
 
     if Options.options.fpu_optimization:
+        obj.source += ['arm_neon_functions.cc']
         if (bld.env['build_target'] == 'i386' or bld.env['build_target'] == 'i686'):
            obj.source += [ 'sse_functions_xmm.cc', 'sse_functions.s', ]
            avx_sources = [ 'sse_functions_avx_linux.cc' ]
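
Note (not part of the patch itself): the subtle parts of these routines are the unaligned prologues and the scalar tails, so a quick way to gain confidence on an ARM/NEON target is to compare a couple of the arm_neon_* entry points against plain scalar loops. The sketch below is illustrative only: the arm_neon_* prototypes come from the patch, while the harness, buffer length, test signal, tolerances, and build line are assumptions (it must be linked against an object that provides the arm_neon_* symbols; it is not Ardour code).

// check_arm_neon.cc -- minimal sanity-check sketch (illustrative)
// Build example (assumption): g++ -O2 -mfpu=neon check_arm_neon.cc <object with arm_neon_* symbols>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

extern "C" {
float arm_neon_compute_peak (const float *buf, uint32_t nsamples, float current);
void  arm_neon_mix_buffers_with_gain (float *dst, const float *src, uint32_t nframes, float gain);
}

int main ()
{
	const uint32_t n = 1027; // deliberately not a multiple of 4, so the scalar tail runs
	std::vector<float> src (n), ref (n), neon (n);

	for (uint32_t i = 0; i < n; ++i) {
		src[i]  = std::sin (0.01f * (float)i) * 2.0f;
		ref[i]  = 0.25f * (float)i;
		neon[i] = ref[i];
	}

	// compute_peak: start at src + 1 so the unaligned prologue is exercised
	float peak_ref = 0.0f;
	for (uint32_t i = 1; i < n; ++i) {
		peak_ref = std::max (peak_ref, std::fabs (src[i]));
	}
	const float peak_neon = arm_neon_compute_peak (&src[1], n - 1, 0.0f);
	assert (std::fabs (peak_neon - peak_ref) < 1e-6f);

	// mix_buffers_with_gain: compare against a scalar multiply-accumulate
	const float gain = 0.5f;
	for (uint32_t i = 0; i < n; ++i) {
		ref[i] += src[i] * gain;
	}
	arm_neon_mix_buffers_with_gain (&neon[0], &src[0], n, gain);
	for (uint32_t i = 0; i < n; ++i) {
		assert (std::fabs (neon[i] - ref[i]) < 1e-4f);
	}

	std::printf ("arm_neon sanity checks passed\n");
	return 0;
}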