From c8c57f14bf5a41b64adf249acb9d1ff64393a9e1 Mon Sep 17 00:00:00 2001
From: Ayan Shafqat
Date: Thu, 20 Aug 2020 20:15:39 -0400
Subject: [PATCH] Adding ARM NEON optimized routines

This commit adds ARM NEON optimized routines for the following
procedures:

*_compute_peak
*_find_peaks
*_apply_gain_to_buffer
*_mix_buffers_with_gain
*_mix_buffers_no_gain
*_copy_vector

NEON optimized routines have a prefix of: arm_neon_
---
 libs/ardour/ardour/mix.h          |  12 +
 libs/ardour/arm_neon_functions.cc | 518 ++++++++++++++++++++++++++++++
 libs/ardour/globals.cc            |  13 +
 libs/ardour/wscript               |   1 +
 4 files changed, 544 insertions(+)
 create mode 100644 libs/ardour/arm_neon_functions.cc

diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h
index a394a64ce2..4132e8e371 100644
--- a/libs/ardour/ardour/mix.h
+++ b/libs/ardour/ardour/mix.h
@@ -70,6 +70,18 @@ LIBARDOUR_API void veclib_mix_buffers_no_gain (ARDOUR::Sample * dst, cons
 
 #endif
 
+/* Optimized NEON functions */
+#if defined(__arm__) && !defined(__APPLE__)
+extern "C" {
+	LIBARDOUR_API float arm_neon_compute_peak (const float * buf, uint32_t nsamples, float current);
+	LIBARDOUR_API void arm_neon_apply_gain_to_buffer (float * buf, uint32_t nframes, float gain);
+	LIBARDOUR_API void arm_neon_copy_vector (float * dst, const float * src, uint32_t nframes);
+	LIBARDOUR_API void arm_neon_find_peaks (const float *src, uint32_t nframes, float *minf, float *maxf);
+	LIBARDOUR_API void arm_neon_mix_buffers_no_gain (float * dst, const float * src, uint32_t nframes);
+	LIBARDOUR_API void arm_neon_mix_buffers_with_gain (float * dst, const float * src, uint32_t nframes, float gain);
+}
+#endif
+
 /* non-optimized functions */
 
 LIBARDOUR_API float default_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
diff --git a/libs/ardour/arm_neon_functions.cc b/libs/ardour/arm_neon_functions.cc
new file mode 100644
index 0000000000..347a8823eb
--- /dev/null
+++ b/libs/ardour/arm_neon_functions.cc
@@ -0,0 +1,518 @@
+/*
+ * Copyright (C) 2020 Ayan Shafqat
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "ardour/mix.h"
+
+#if defined(__arm__) && defined(__ARM_NEON)
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#define IS_ALIGNED_TO(ptr, bytes) (((uintptr_t)ptr) % (bytes) == 0)
+
+#ifdef __cplusplus
+#define C_FUNC extern "C"
+#else
+#define C_FUNC
+#endif
+
+C_FUNC float
+arm_neon_compute_peak(const float *src, uint32_t nframes, float current)
+{
+	float32x4_t vc0;
+
+	// Broadcast single value to all elements of the register
+	vc0 = vdupq_n_f32(current);
+
+	// While pointer is not aligned, process one sample at a time
+	while (!IS_ALIGNED_TO(src, sizeof(float32x4_t)) && (nframes > 0)) {
+		float32x4_t x0;
+
+		x0 = vld1q_dup_f32(src);
+		x0 = vabsq_f32(x0);
+		vc0 = vmaxq_f32(vc0, x0);
+
+		++src;
+		--nframes;
+	}
+
+	// SIMD portion with aligned buffers
+	do {
+		while (nframes >= 8) {
+			float32x4_t x0, x1;
+
+			x0 = vld1q_f32(src + 0);
+			x1 = vld1q_f32(src + 4);
+
+			x0 = vabsq_f32(x0);
+			x1 = vabsq_f32(x1);
+
+			vc0 = vmaxq_f32(vc0, x0);
+			vc0 = vmaxq_f32(vc0, x1);
+
+			src += 8;
+			nframes -= 8;
+		}
+
+		while (nframes >= 4) {
+			float32x4_t x0;
+
+			x0 = vld1q_f32(src);
+
+			x0 = vabsq_f32(x0);
+			vc0 = vmaxq_f32(vc0, x0);
+
+			src += 4;
+			nframes -= 4;
+		}
+
+		while (nframes >= 2) {
+			float32x2_t x0;
+			float32x4_t y0;
+
+
+			x0 = vld1_f32(src);        // Load two elements
+			x0 = vabs_f32(x0);         // Compute ABS value
+			y0 = vcombine_f32(x0, x0); // Combine two vectors
+
+			vc0 = vmaxq_f32(vc0, y0);
+
+			src += 2;
+			nframes -= 2;
+		}
+	} while (0);
+
+
+	// Do remaining samples one frame at a time
+	while (nframes > 0) {
+		float32x4_t x0;
+
+		x0 = vld1q_dup_f32(src);
+		x0 = vabsq_f32(x0);
+		vc0 = vmaxq_f32(vc0, x0);
+
+		++src;
+		--nframes;
+	}
+
+	// Compute the max in register
+	do {
+		float32x2_t vlo = vget_low_f32(vc0);
+		float32x2_t vhi = vget_high_f32(vc0);
+		float32x2_t max0 = vpmax_f32(vlo, vhi);
+		float32x2_t max1 = vpmax_f32(max0, max0); // Max is now at max1[0]
+		current = vget_lane_f32(max1, 0);
+	} while (0);
+
+	return current;
+}
+
+C_FUNC void
+arm_neon_find_peaks(const float *src, uint32_t nframes, float *minf, float *maxf)
+{
+	float32x4_t vmin, vmax;
+
+	// Broadcast single value to all elements of the register
+	vmin = vld1q_dup_f32(minf);
+	vmax = vld1q_dup_f32(maxf);
+
+	// While pointer is not aligned, process one sample at a time
+	while (!IS_ALIGNED_TO(src, sizeof(float32x4_t)) && (nframes > 0)) {
+		float32x4_t x0;
+
+		x0 = vld1q_dup_f32(src);
+		vmax = vmaxq_f32(vmax, x0);
+		vmin = vminq_f32(vmin, x0);
+
+		++src;
+		--nframes;
+	}
+
+	// SIMD portion with aligned buffers
+	do {
+		while (nframes >= 8) {
+			float32x4_t x0, x1;
+
+			x0 = vld1q_f32(src + 0);
+			x1 = vld1q_f32(src + 4);
+
+			vmax = vmaxq_f32(vmax, x0);
+			vmax = vmaxq_f32(vmax, x1);
+
+			vmin = vminq_f32(vmin, x0);
+			vmin = vminq_f32(vmin, x1);
+
+			src += 8;
+			nframes -= 8;
+		}
+
+		while (nframes >= 4) {
+			float32x4_t x0;
+
+			x0 = vld1q_f32(src);
+
+			vmax = vmaxq_f32(vmax, x0);
+			vmin = vminq_f32(vmin, x0);
+
+			src += 4;
+			nframes -= 4;
+		}
+
+		while (nframes >= 2) {
+			float32x2_t x0;
+			float32x4_t y0;
+
+
+			x0 = vld1_f32(src);        // Load two elements
+			y0 = vcombine_f32(x0, x0); // Combine two vectors
+
+			vmax = vmaxq_f32(vmax, y0);
+			vmin = vminq_f32(vmin, y0);
+
+			src += 2;
+			nframes -= 2;
+		}
+	} while (0);
+
+
+	// Do remaining samples one frame at a time
+	while (nframes > 0) {
+		float32x4_t x0;
+
+		x0 = vld1q_dup_f32(src);
+		vmax = vmaxq_f32(vmax, x0);
+		vmin = vminq_f32(vmin, x0);
+
+		++src;
+		--nframes;
+	}
+
+	// Compute the max in register
+	do {
+		float32x2_t vlo = vget_low_f32(vmax);
+		float32x2_t vhi = vget_high_f32(vmax);
+		float32x2_t max0 = vpmax_f32(vlo, vhi);
+		float32x2_t max1 = vpmax_f32(max0, max0); // Max is now at max1[0]
+		vst1_lane_f32(maxf, max1, 0);
+	} while (0);
+
+	// Compute the min in register
+	do {
+		float32x2_t vlo = vget_low_f32(vmin);
+		float32x2_t vhi = vget_high_f32(vmin);
+		float32x2_t min0 = vpmin_f32(vlo, vhi);
+		float32x2_t min1 = vpmin_f32(min0, min0); // min is now at min1[0]
+		vst1_lane_f32(minf, min1, 0);
+	} while (0);
+}
+
+C_FUNC void
+arm_neon_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain)
+{
+	while (!IS_ALIGNED_TO(dst, sizeof(float32x4_t)) && nframes > 0) {
+		float32_t x0, y0;
+
+		x0 = *dst;
+		y0 = x0 * gain;
+		*dst = y0;
+
+		++dst;
+		--nframes;
+	}
+
+	// SIMD portion with aligned buffers
+	do {
+		float32x4_t g0 = vdupq_n_f32(gain);
+
+		while (nframes >= 8) {
+			float32x4_t x0, x1, y0, y1;
+			x0 = vld1q_f32(dst + 0);
+			x1 = vld1q_f32(dst + 4);
+
+			y0 = vmulq_f32(x0, g0);
+			y1 = vmulq_f32(x1, g0);
+
+			vst1q_f32(dst + 0, y0);
+			vst1q_f32(dst + 4, y1);
+
+			dst += 8;
+			nframes -= 8;
+		}
+
+		while (nframes >= 4) {
+			float32x4_t x0, y0;
+
+			x0 = vld1q_f32(dst);
+			y0 = vmulq_f32(x0, g0);
+			vst1q_f32(dst, y0);
+
+			dst += 4;
+			nframes -= 4;
+		}
+
+		while (nframes >= 2) {
+			float32x2_t x0, y0;
+
+			x0 = vld1_f32(dst);
+			y0 = vmul_n_f32(x0, gain);
+			vst1_f32(dst, y0);
+
+			dst += 2;
+			nframes -= 2;
+		}
+	} while (0);
+
+	// Do the remaining portion one sample at a time
+	while (nframes > 0) {
+		float32_t x0, y0;
+
+		x0 = *dst;
+		y0 = x0 * gain;
+		*dst = y0;
+
+		++dst;
+		--nframes;
+	}
+}
+
+C_FUNC void
+arm_neon_mix_buffers_with_gain(
+	float *__restrict dst, const float *__restrict src,
+	uint32_t nframes, float gain)
+{
+	// While buffers aren't aligned, then process one sample at a time
+	while (!(IS_ALIGNED_TO(src, sizeof(float32x4_t)) &&
+	         IS_ALIGNED_TO(dst, sizeof(float32x4_t))) &&
+	       (nframes > 0)) {
+		float32_t x0, y0;
+
+		x0 = *src;
+		y0 = *dst;
+		y0 = y0 + (x0 * gain);
+		*dst = y0;
+
+		++dst;
+		++src;
+		--nframes;
+	}
+
+	// Use NEON when buffers are aligned
+	do {
+		float32x4_t g0 = vdupq_n_f32(gain);
+
+		while (nframes >= 8) {
+			float32x4_t x0, x1, y0, y1;
+			x0 = vld1q_f32(src + 0);
+			x1 = vld1q_f32(src + 4);
+			y0 = vld1q_f32(dst + 0);
+			y1 = vld1q_f32(dst + 4);
+
+			y0 = vmlaq_f32(y0, x0, g0);
+			y1 = vmlaq_f32(y1, x1, g0);
+
+			vst1q_f32(dst + 0, y0);
+			vst1q_f32(dst + 4, y1);
+
+			src += 8;
+			dst += 8;
+			nframes -= 8;
+		}
+
+		while (nframes >= 4) {
+			float32x4_t x0, y0;
+			x0 = vld1q_f32(src);
+			y0 = vld1q_f32(dst);
+
+			y0 = vmlaq_f32(y0, x0, g0);
+
+			vst1q_f32(dst, y0);
+
+			src += 4;
+			dst += 4;
+			nframes -= 4;
+		}
+
+		while (nframes >= 2) {
+			float32x2_t x0, y0;
+			x0 = vld1_f32(src);
+			y0 = vld1_f32(dst);
+
+			y0 = vmla_n_f32(y0, x0, gain);
+
+			vst1_f32(dst, y0);
+
+			src += 2;
+			dst += 2;
+			nframes -= 2;
+		}
+	} while (0);
+
+	// Do the remaining samples
+	while (nframes > 0) {
+
+		float32_t x0, y0;
+
+		x0 = *src;
+		y0 = *dst;
+		y0 = y0 + (x0 * gain);
+		*dst = y0;
+
+		++dst;
+		++src;
+		--nframes;
+	}
+}
+
+C_FUNC void
+arm_neon_mix_buffers_no_gain(float *dst, const float *src, uint32_t nframes)
+{
+	// While buffers aren't aligned, then process one sample at a time
+	while (!(IS_ALIGNED_TO(src, sizeof(float32x4_t)) &&
+	         IS_ALIGNED_TO(dst, sizeof(float32x4_t))) &&
+	       (nframes > 0)) {
+		float32_t x0, y0;
+
+		x0 = *src;
+		y0 = *dst;
+		y0 = y0 + x0;
+		*dst = y0;
+
+		++src;
+		++dst;
+		--nframes;
+	}
+
+	// Use NEON when buffers are aligned
+	do {
+		while (nframes >= 8) {
+			float32x4_t x0, x1, y0, y1;
+			x0 = vld1q_f32(src + 0);
+			x1 = vld1q_f32(src + 4);
+			y0 = vld1q_f32(dst + 0);
+			y1 = vld1q_f32(dst + 4);
+
+			y0 = vaddq_f32(y0, x0);
+			y1 = vaddq_f32(y1, x1);
+
+			vst1q_f32(dst + 0, y0);
+			vst1q_f32(dst + 4, y1);
+
+			src += 8;
+			dst += 8;
+			nframes -= 8;
+		}
+
+		while (nframes >= 4) {
+			float32x4_t x0, y0;
+
+			x0 = vld1q_f32(src);
+			y0 = vld1q_f32(dst);
+
+			y0 = vaddq_f32(y0, x0);
+
+			vst1q_f32(dst, y0);
+
+			src += 4;
+			dst += 4;
+			nframes -= 4;
+		}
+	} while (0);
+
+	// Do the remaining samples
+	while (nframes > 0) {
+		float32_t x0, y0;
+
+		x0 = *src;
+		y0 = *dst;
+		y0 = y0 + x0;
+		*dst = y0;
+
+		++src;
+		++dst;
+		--nframes;
+	}
+}
+
+C_FUNC void
+arm_neon_copy_vector(
+	float *__restrict dst, const float *__restrict src,
+	uint32_t nframes)
+{
+	// While buffers aren't aligned, then process one sample at a time
+	while (!(IS_ALIGNED_TO(src, sizeof(float32x4_t)) &&
+	         IS_ALIGNED_TO(dst, sizeof(float32x4_t))) &&
+	       (nframes > 0)) {
+		*dst++ = *src++;
+		--nframes;
+	}
+
+	// Use NEON when buffers are aligned
+	do {
+		while (nframes >= 16) {
+			float32x4_t x0, x1, x2, x3;
+
+			x0 = vld1q_f32(src + 0 );
+			x1 = vld1q_f32(src + 4 );
+			x2 = vld1q_f32(src + 8 );
+			x3 = vld1q_f32(src + 12);
+
+			vst1q_f32(dst + 0 , x0);
+			vst1q_f32(dst + 4 , x1);
+			vst1q_f32(dst + 8 , x2);
+			vst1q_f32(dst + 12, x3);
+
+			src += 16;
+			dst += 16;
+			nframes -= 16;
+		}
+
+		while (nframes >= 8) {
+			float32x4_t x0, x1;
+
+			x0 = vld1q_f32(src + 0);
+			x1 = vld1q_f32(src + 4);
+
+			vst1q_f32(dst + 0, x0);
+			vst1q_f32(dst + 4, x1);
+
+			src += 8;
+			dst += 8;
+			nframes -= 8;
+		}
+
+		while (nframes >= 4) {
+			float32x4_t x0;
+
+			x0 = vld1q_f32(src);
+			vst1q_f32(dst, x0);
+
+			src += 4;
+			dst += 4;
+			nframes -= 4;
+		}
+
+	} while (0);
+
+	// Do the remaining samples
+	while (nframes > 0) {
+		*dst++ = *src++;
+		--nframes;
+	}
+}
+
+#endif /* defined (__arm__) && defined (__ARM_NEON) */
diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc
index b70e51fc4c..f586104bb5 100644
--- a/libs/ardour/globals.cc
+++ b/libs/ardour/globals.cc
@@ -217,6 +217,19 @@ setup_hardware_optimization (bool try_optimization)
 		generic_mix_functions = false;
 	}
 
+#elif defined(__arm__) && !defined(__APPLE__)
+	/* Use NEON routines */
+	do {
+		compute_peak          = arm_neon_compute_peak;
+		find_peaks            = arm_neon_find_peaks;
+		apply_gain_to_buffer  = arm_neon_apply_gain_to_buffer;
+		mix_buffers_with_gain = arm_neon_mix_buffers_with_gain;
+		mix_buffers_no_gain   = arm_neon_mix_buffers_no_gain;
+		copy_vector           = arm_neon_copy_vector;
+
+		generic_mix_functions = false;
+	} while (0);
+
 #elif defined(__APPLE__) && defined(BUILD_VECLIB_OPTIMIZATIONS)
 
 	if (floor (kCFCoreFoundationVersionNumber) > kCFCoreFoundationVersionNumber10_4) { /* at least Tiger */
diff --git a/libs/ardour/wscript b/libs/ardour/wscript
index 49cea6abbd..4a813660cd 100644
--- a/libs/ardour/wscript
+++ b/libs/ardour/wscript
@@ -467,6 +467,7 @@ def build(bld):
     avx_sources = []
 
     if Options.options.fpu_optimization:
+        obj.source += ['arm_neon_functions.cc']
         if (bld.env['build_target'] == 'i386' or bld.env['build_target'] == 'i686'):
            obj.source += [ 'sse_functions_xmm.cc', 'sse_functions.s', ]
            avx_sources = [ 'sse_functions_avx_linux.cc' ]
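
Note (not part of the patch itself): the subtle parts of these routines are the unaligned prologues and the scalar tails, so a quick way to gain confidence on an ARM/NEON target is to compare a couple of the arm_neon_* entry points against plain scalar loops. The sketch below is illustrative only: the arm_neon_* prototypes come from the patch, while the harness, buffer length, test signal, tolerances, and build line are assumptions (it must be linked against an object that provides the arm_neon_* symbols; it is not Ardour code).

// check_arm_neon.cc -- minimal sanity-check sketch (illustrative)
// Build example (assumption): g++ -O2 -mfpu=neon check_arm_neon.cc <object with arm_neon_* symbols>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

extern "C" {
float arm_neon_compute_peak (const float *buf, uint32_t nsamples, float current);
void  arm_neon_mix_buffers_with_gain (float *dst, const float *src, uint32_t nframes, float gain);
}

int main ()
{
	const uint32_t n = 1027; // deliberately not a multiple of 4, so the scalar tail runs
	std::vector<float> src (n), ref (n), neon (n);

	for (uint32_t i = 0; i < n; ++i) {
		src[i]  = std::sin (0.01f * (float)i) * 2.0f;
		ref[i]  = 0.25f * (float)i;
		neon[i] = ref[i];
	}

	// compute_peak: start at src + 1 so the unaligned prologue is exercised
	float peak_ref = 0.0f;
	for (uint32_t i = 1; i < n; ++i) {
		peak_ref = std::max (peak_ref, std::fabs (src[i]));
	}
	const float peak_neon = arm_neon_compute_peak (&src[1], n - 1, 0.0f);
	assert (std::fabs (peak_neon - peak_ref) < 1e-6f);

	// mix_buffers_with_gain: compare against a scalar multiply-accumulate
	const float gain = 0.5f;
	for (uint32_t i = 0; i < n; ++i) {
		ref[i] += src[i] * gain;
	}
	arm_neon_mix_buffers_with_gain (&neon[0], &src[0], n, gain);
	for (uint32_t i = 0; i < n; ++i) {
		assert (std::fabs (neon[i] - ref[i]) < 1e-4f);
	}

	std::printf ("arm_neon sanity checks passed\n");
	return 0;
}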