Add optimized NEON implementations for AArch64
Introduce optimized NEON implementations for audio peak detection, buffer mixing, scaling, and copy functions on AArch64. For benchmark results, see ardour-neon-functions [1]. A dedicated `aarch64_neon_functions.cc` is provided for the AArch64 target, while `arm_neon_functions.cc` preserves backwards compatibility with armhf targets. [1] https://github.com/ashafq/ardour-neon-functions
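As a quick orientation, a hypothetical call sequence (the buffer and frame count are invented here for illustration; the signatures are the ones added by this commit):

    float buf[1024] = { /* ... */ };
    float peak = arm_neon_compute_peak(buf, 1024, 0.0f); // running absolute peak
    arm_neon_apply_gain_to_buffer(buf, 1024, 0.5f);      // in-place scale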
This commit is contained in:
parent 8f4c10f855
commit 19e00c575a

2 changed files with 639 additions and 1 deletion
638 libs/ardour/aarch64_neon_functions.cc Normal file
@@ -0,0 +1,638 @@
/*
 * Copyright (C) 2025 Ayan Shafqat <ayan.x.shafqat@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include "ardour/mix.h"

#ifdef ARM_NEON_SUPPORT

#include <arm_acle.h>
#include <arm_neon.h>

#include <cstddef>
#include <cstdint> // for uintptr_t, used by ALIGN_PTR_NEXT_16 below
/**
|
||||
* @brief Aligns a pointer to the next 16-byte boundary
|
||||
*
|
||||
* @param ptr Pointer to be aligned
|
||||
* @return void* Aligned pointer
|
||||
*/
|
||||
#define ALIGN_PTR_NEXT_16(ptr) ((void*) ((((uintptr_t) ptr) + 0xF) & ~0xf))
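// For example, ptr == 0x1004 maps to (0x1004 + 0xF) & ~0xF == 0x1010, while an
// already aligned ptr == 0x1010 maps to itself.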

/**
 * @brief Hints the compiler (GCC/Clang builtin) that the expression is
 * unlikely to be true
 */
#if defined(__GNUC__) || defined(__clang__)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)
#else
#define UNLIKELY(x) (x)
#endif

#ifdef __cplusplus
#define C_FUNC extern "C"
#else
#define C_FUNC
#endif

/**
 * @brief Compute the absolute peak value in a buffer of floats
 *
 * This function computes the absolute peak value in a buffer of floats. It
 * uses NEON SIMD instructions, with some loop unrolling, and supports both
 * AArch64 and AArch32 platforms.
 *
 * The correctness of this function does not depend on the alignment of the
 * input buffer, as misaligned samples before the first aligned address are
 * handled separately. It is fastest, however, when the buffer is aligned to
 * a 16-byte boundary and the number of frames is a multiple of 16.
 *
 * @param[in] src Pointer to the buffer of floats
 * @param[in] nframes Number of frames in the buffer
 * @param[in] current Current peak value
 * @return float The updated peak value
 */
C_FUNC float
arm_neon_compute_peak(const float* src, uint32_t nframes, float current)
{
	// Broadcast single value to all elements of the register
	float32x4_t vmax = vdupq_n_f32(current);

	// Compute the next aligned pointer
	const float* src_aligned = (const float*) ALIGN_PTR_NEXT_16(src);

	// Process misaligned samples before the first aligned address
	if (UNLIKELY(src_aligned != src))
	{
		size_t unaligned_count = src_aligned - src;
		if (UNLIKELY(unaligned_count > nframes)) {
			unaligned_count = nframes; // very short buffer: don't read past the end
		}
		for (size_t i = 0; i < unaligned_count; i++)
		{
			// Rectify before taking the max: peak is an absolute measure
			float32x4_t x0 = vabsq_f32(vld1q_dup_f32(src + i));
			vmax = vmaxq_f32(vmax, x0);
		}
		nframes -= unaligned_count;
	}

	// Compute the number of SIMD frames
	size_t simd_count = nframes / 4;
	size_t nframes_simd = 4 * simd_count;
	size_t start = 0;

	// Some loop unrolling
	if (simd_count >= 4)
	{
		size_t unrolled_count = simd_count / 4;
		for (size_t i = start; i < unrolled_count; ++i)
		{
			size_t offset = 4 * 4 * i;
			float32x4_t x0, x1, x2, x3;
			float32x4_t max0, max1, max2;

			x0 = vabsq_f32(vld1q_f32(src_aligned + offset + (0 * 4)));
			x1 = vabsq_f32(vld1q_f32(src_aligned + offset + (1 * 4)));
			x2 = vabsq_f32(vld1q_f32(src_aligned + offset + (2 * 4)));
			x3 = vabsq_f32(vld1q_f32(src_aligned + offset + (3 * 4)));

			max0 = vmaxq_f32(x0, x1);
			max1 = vmaxq_f32(x2, x3);
			max2 = vmaxq_f32(max0, max1);
			vmax = vmaxq_f32(vmax, max2);
		}

		start = unrolled_count * 4;
	}

	for (size_t i = start; i < simd_count; ++i)
	{
		size_t offset = 4 * i;
		float32x4_t x0;
		x0 = vabsq_f32(vld1q_f32(src_aligned + offset));
		vmax = vmaxq_f32(vmax, x0);
	}

	for (size_t frame = nframes_simd; frame < nframes; ++frame)
	{
		float32x4_t x0;
		x0 = vabsq_f32(vld1q_dup_f32(src_aligned + frame));
		vmax = vmaxq_f32(vmax, x0);
	}

	// Compute the max in register
#if (__aarch64__ == 1)
	{
		current = vmaxvq_f32(vmax);
	}
#else
	{
		float32x2_t vlo = vget_low_f32(vmax);
		float32x2_t vhi = vget_high_f32(vmax);
		float32x2_t max0 = vpmax_f32(vlo, vhi);
		float32x2_t max1 = vpmax_f32(max0, max0); // Max is now at max1[0]
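		// Illustrative lane arithmetic (values invented): with
		// vmax == [a, b, c, d], vpmax_f32([a, b], [c, d]) yields
		// [max(a, b), max(c, d)], and a second vpmax_f32 of that result
		// with itself leaves max(a, b, c, d) in lane 0.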
		current = vget_lane_f32(max1, 0);
	}
#endif

	return current;
}

/**
 * @brief Find the minimum and maximum values in a buffer of floats
 *
 * This function computes the minimum and maximum values in a buffer of
 * floats. It uses NEON SIMD instructions, with some loop unrolling, and
 * supports both AArch64 and AArch32 platforms.
 *
 * As with @p arm_neon_compute_peak, the correctness of this function does
 * not depend on the alignment of the input buffer, as misaligned samples
 * before the first aligned address are handled separately. It is fastest,
 * however, when the buffer is aligned to a 16-byte boundary and the number
 * of frames is a multiple of 16.
 *
 * @param[in] src Pointer to the buffer of floats
 * @param[in] nframes Number of frames in the buffer
 * @param[in,out] minf Pointer to the minimum value (gets updated)
 * @param[in,out] maxf Pointer to the maximum value (gets updated)
 */
C_FUNC void
arm_neon_find_peaks(const float* src, uint32_t nframes, float* minf, float* maxf)
{
	float32x4_t vmin, vmax;

	// Broadcast single value to all elements of the register
	vmin = vld1q_dup_f32(minf);
	vmax = vld1q_dup_f32(maxf);

	const float* src_aligned = (const float*) ALIGN_PTR_NEXT_16(src);

	// Process misaligned samples before the first aligned address
	if (UNLIKELY(src_aligned != src))
	{
		size_t unaligned_count = src_aligned - src;
		if (UNLIKELY(unaligned_count > nframes)) {
			unaligned_count = nframes; // very short buffer: don't read past the end
		}
		for (size_t i = 0; i < unaligned_count; i++)
		{
			float32x4_t x0 = vld1q_dup_f32(src + i);
			vmax = vmaxq_f32(vmax, x0);
			vmin = vminq_f32(vmin, x0);
		}
		nframes -= unaligned_count;
	}

	// Compute the number of SIMD frames
	size_t simd_count = nframes / 4;
	size_t nframes_simd = 4 * simd_count;
	size_t start = 0;

	// Some loop unrolling
	if (simd_count >= 4)
	{
		size_t unrolled_count = simd_count / 4;
		for (size_t i = start; i < unrolled_count; ++i)
		{
			size_t offset = 4 * 4 * i;
			float32x4_t x0, x1, x2, x3;
			float32x4_t max0, max1, max2;
			float32x4_t min0, min1, min2;

			x0 = vld1q_f32(src_aligned + offset + (0 * 4));
			x1 = vld1q_f32(src_aligned + offset + (1 * 4));
			x2 = vld1q_f32(src_aligned + offset + (2 * 4));
			x3 = vld1q_f32(src_aligned + offset + (3 * 4));

			max0 = vmaxq_f32(x0, x1);
			max1 = vmaxq_f32(x2, x3);
			max2 = vmaxq_f32(max0, max1);
			vmax = vmaxq_f32(vmax, max2);

			min0 = vminq_f32(x0, x1);
			min1 = vminq_f32(x2, x3);
			min2 = vminq_f32(min0, min1);
			vmin = vminq_f32(vmin, min2);
		}

		start = unrolled_count * 4;
	}

	for (size_t i = start; i < simd_count; ++i)
	{
		size_t offset = 4 * i;
		float32x4_t x0;
		x0 = vld1q_f32(src_aligned + offset);
		vmax = vmaxq_f32(vmax, x0);
		vmin = vminq_f32(vmin, x0);
	}

	for (size_t frame = nframes_simd; frame < nframes; ++frame)
	{
		float32x4_t x0;
		x0 = vld1q_dup_f32(src_aligned + frame);
		vmax = vmaxq_f32(vmax, x0);
		vmin = vminq_f32(vmin, x0);
	}

	// Compute the max in register
#if (__aarch64__ == 1)
	{
		float max_val = vmaxvq_f32(vmax);
		*maxf = max_val;
	}
#else
	{
		float32x2_t vlo = vget_low_f32(vmax);
		float32x2_t vhi = vget_high_f32(vmax);
		float32x2_t max0 = vpmax_f32(vlo, vhi);
		float32x2_t max1 = vpmax_f32(max0, max0); // Max is now at max1[0]
		vst1_lane_f32(maxf, max1, 0);
	}
#endif

	// Compute the min in register
#if (__aarch64__ == 1)
	{
		float min_val = vminvq_f32(vmin);
		*minf = min_val;
	}
#else
	{
		float32x2_t vlo = vget_low_f32(vmin);
		float32x2_t vhi = vget_high_f32(vmin);
		float32x2_t min0 = vpmin_f32(vlo, vhi);
		float32x2_t min1 = vpmin_f32(min0, min0); // Min is now at min1[0]
		vst1_lane_f32(minf, min1, 0);
	}
#endif
}
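
/*
 * A hypothetical usage sketch (not from the commit): minf/maxf are
 * read-modify-write, so the caller seeds them before the first block,
 * for example:
 *
 *   float lo = FLT_MAX, hi = -FLT_MAX; // FLT_MAX needs <cfloat>
 *   arm_neon_find_peaks(buf, nframes, &lo, &hi);
 */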

/**
 * @brief Applies a scalar gain to a buffer of floats
 *
 * The mathematically equivalent operation is:
 *
 *   dst[i] = dst[i] * gain, for 0 <= i < nframes
 *
 * The function uses NEON SIMD instructions to apply the gain in place. It
 * supports both AArch64 and AArch32 platforms and handles unaligned buffers.
 *
 * @param[in,out] dst Pointer to the buffer of floats
 * @param[in] nframes Number of frames in the buffer
 * @param[in] gain Gain to apply to the buffer
 */
C_FUNC void
arm_neon_apply_gain_to_buffer(float* dst, uint32_t nframes, float gain)
{
	float* dst_aligned = (float*) ALIGN_PTR_NEXT_16(dst);

	// Process misaligned samples before the first aligned address
	if (UNLIKELY(dst_aligned != dst))
	{
		size_t unaligned_count = dst_aligned - dst;
		if (UNLIKELY(unaligned_count > nframes)) {
			unaligned_count = nframes; // very short buffer: don't run past the end
		}
		for (size_t i = 0; i < unaligned_count; i++)
		{
			float32_t x0, y0;

			x0 = dst[i];
			y0 = x0 * gain;
			dst[i] = y0;
		}
		nframes -= unaligned_count;
	}

	// Compute the number of SIMD frames
	size_t simd_count = nframes / 4;
	size_t nframes_simd = 4 * simd_count;
	size_t start = 0;

	float32x4_t g0 = vdupq_n_f32(gain);

	// Do some loop unrolling
	if (simd_count >= 4)
	{
		size_t unrolled_count = simd_count / 4;
		for (size_t i = start; i < unrolled_count; ++i)
		{
			size_t offset = 4 * 4 * i;
			float32x4_t x0, x1, x2, x3;
			float32x4_t y0, y1, y2, y3;

			float* ptr0 = dst_aligned + offset + (0 * 4);
			float* ptr1 = dst_aligned + offset + (1 * 4);
			float* ptr2 = dst_aligned + offset + (2 * 4);
			float* ptr3 = dst_aligned + offset + (3 * 4);

			x0 = vld1q_f32(ptr0);
			x1 = vld1q_f32(ptr1);
			x2 = vld1q_f32(ptr2);
			x3 = vld1q_f32(ptr3);

			y0 = vmulq_f32(x0, g0);
			y1 = vmulq_f32(x1, g0);
			y2 = vmulq_f32(x2, g0);
			y3 = vmulq_f32(x3, g0);

			vst1q_f32(ptr0, y0);
			vst1q_f32(ptr1, y1);
			vst1q_f32(ptr2, y2);
			vst1q_f32(ptr3, y3);
		}

		start = unrolled_count * 4;
	}

	// Do the remaining 4 samples at a time
	for (size_t i = start; i < simd_count; ++i)
	{
		size_t offset = 4 * i;
		float32x4_t x0, y0;

		x0 = vld1q_f32(dst_aligned + offset);
		y0 = vmulq_f32(x0, g0);
		vst1q_f32(dst_aligned + offset, y0);
	}

	// Do the remaining portion one sample at a time
	for (size_t frame = nframes_simd; frame < nframes; ++frame)
	{
		float32_t x0, y0;
		x0 = dst_aligned[frame];
		y0 = x0 * gain;
		dst_aligned[frame] = y0;
	}
}
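
/*
 * A hypothetical usage sketch (not from the commit): the gain is applied in
 * place, so every frame is scaled, for example:
 *
 *   arm_neon_apply_gain_to_buffer(buf, nframes, 0.5f); // roughly -6 dB
 */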

/**
 * @brief Mixes the source buffer into the destination buffer with a gain
 * factor.
 *
 * This function is mathematically equivalent to:
 *
 *   dst[i] = dst[i] + src[i] * gain, for 0 <= i < nframes, a.k.a. @p saxpy
 *
 * It uses ARM NEON intrinsics for vectorized processing. Although pointer
 * alignment is strictly not necessary on AArch64, performance improves when
 * the buffers are aligned to 16-byte boundaries.
 *
 * @warning This function *WILL CRASH* on AArch32 if the pointers are not
 * aligned to 16-byte boundaries.
 *
 * @param[in,out] dst Pointer to the destination buffer
 * @param[in] src Pointer to the source buffer
 * @param[in] nframes Number of frames (samples) to process
 * @param[in] gain Gain factor to apply to each element of the source buffer
 */
C_FUNC void
arm_neon_mix_buffers_with_gain(float* __restrict dst, const float* __restrict src, uint32_t nframes, float gain)
{
	size_t frame = 0;
	size_t num_frames = nframes;
	size_t n_frame16 = num_frames - (num_frames % 16);
	size_t n_frame4 = num_frames - (num_frames % 4);
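
	// Illustrative numbers (not from the original code): nframes == 37 gives
	// n_frame16 == 32 and n_frame4 == 36, so the loops below handle 32
	// frames, then 4, then 1 in the scalar tail.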

	// Broadcast the same value over all lanes of SIMD
	float32x4_t vgain = vdupq_n_f32(gain);

	// Process blocks of 16 frames to utilize a reasonable amount of the
	// register file.
	while (frame < n_frame16)
	{
		const float* src_ptr = src + frame;
		float* dst_ptr = dst + frame;

		float32x4_t x0, x1, x2, x3;
		float32x4_t y0, y1, y2, y3;

		x0 = vld1q_f32(src_ptr + 0 * 4);
		x1 = vld1q_f32(src_ptr + 1 * 4);
		x2 = vld1q_f32(src_ptr + 2 * 4);
		x3 = vld1q_f32(src_ptr + 3 * 4);

		y0 = vld1q_f32(dst_ptr + 0 * 4);
		y1 = vld1q_f32(dst_ptr + 1 * 4);
		y2 = vld1q_f32(dst_ptr + 2 * 4);
		y3 = vld1q_f32(dst_ptr + 3 * 4);

		y0 = vmlaq_f32(y0, x0, vgain);
		y1 = vmlaq_f32(y1, x1, vgain);
		y2 = vmlaq_f32(y2, x2, vgain);
		y3 = vmlaq_f32(y3, x3, vgain);

		vst1q_f32(dst_ptr + 0 * 4, y0);
		vst1q_f32(dst_ptr + 1 * 4, y1);
		vst1q_f32(dst_ptr + 2 * 4, y2);
		vst1q_f32(dst_ptr + 3 * 4, y3);

		frame += 16;
	}

	// Process the remaining blocks 4 at a time if possible
	while (frame < n_frame4)
	{
		float32x4_t x0, y0;

		x0 = vld1q_f32(src + frame);
		y0 = vld1q_f32(dst + frame);
		y0 = vmlaq_f32(y0, x0, vgain);
		vst1q_f32(dst + frame, y0);

		frame += 4;
	}

	// Process the remaining samples 1 at a time
	while (frame < num_frames)
	{
		float32_t x0, y0;

		x0 = src[frame];
		y0 = dst[frame];
		y0 = __builtin_fmaf(x0, gain, y0); // y0 = y0 + (x0 * gain)
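		// Note: __builtin_fmaf fuses the multiply and add with a single
		// rounding, while vmlaq_f32 above may or may not fuse depending on
		// the target, so this scalar tail can differ from the SIMD path by
		// a ULP or so.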
		dst[frame] = y0;

		++frame;
	}
}

/**
 * @brief Mixes the source buffer into the destination buffer without any
 * gain factor.
 *
 * Similar to @p arm_neon_mix_buffers_with_gain, this function is
 * mathematically equivalent to:
 *
 *   dst[i] = dst[i] + src[i], for 0 <= i < nframes
 *
 * @warning This function *WILL CRASH* on AArch32 if the pointers are not
 * aligned to 16-byte boundaries.
 *
 * @param[in,out] dst Pointer to the destination buffer
 * @param[in] src Pointer to the source buffer
 * @param[in] nframes Number of frames (samples) to process
 */
C_FUNC void
arm_neon_mix_buffers_no_gain(float* dst, const float* src, uint32_t nframes)
{
	// On AArch64, alignment doesn't matter for correctness, but this code
	// performs faster with pointers aligned to 16-byte boundaries.

	size_t frame = 0;
	size_t num_frames = nframes;
	size_t n_frame16 = num_frames - (num_frames % 16);
	size_t n_frame4 = num_frames - (num_frames % 4);

	// Process blocks of 16 frames to utilize a reasonable amount of the
	// register file.
	while (frame < n_frame16)
	{
		const float* src_ptr = src + frame;
		float* dst_ptr = dst + frame;

		float32x4_t x0, x1, x2, x3;
		float32x4_t y0, y1, y2, y3;

		x0 = vld1q_f32(src_ptr + 0 * 4);
		x1 = vld1q_f32(src_ptr + 1 * 4);
		x2 = vld1q_f32(src_ptr + 2 * 4);
		x3 = vld1q_f32(src_ptr + 3 * 4);

		y0 = vld1q_f32(dst_ptr + 0 * 4);
		y1 = vld1q_f32(dst_ptr + 1 * 4);
		y2 = vld1q_f32(dst_ptr + 2 * 4);
		y3 = vld1q_f32(dst_ptr + 3 * 4);

		y0 = vaddq_f32(y0, x0);
		y1 = vaddq_f32(y1, x1);
		y2 = vaddq_f32(y2, x2);
		y3 = vaddq_f32(y3, x3);

		vst1q_f32(dst_ptr + 0 * 4, y0);
		vst1q_f32(dst_ptr + 1 * 4, y1);
		vst1q_f32(dst_ptr + 2 * 4, y2);
		vst1q_f32(dst_ptr + 3 * 4, y3);

		frame += 16;
	}

	// Process the remaining blocks 4 at a time if possible
	while (frame < n_frame4)
	{
		float32x4_t x0, y0;

		x0 = vld1q_f32(src + frame);
		y0 = vld1q_f32(dst + frame);
		y0 = vaddq_f32(y0, x0);
		vst1q_f32(dst + frame, y0);

		frame += 4;
	}

	// Process the remaining samples 1 at a time
	while (frame < num_frames)
	{
		float32_t x0, y0;

		x0 = src[frame];
		y0 = dst[frame];
		y0 += x0;
		dst[frame] = y0;

		++frame;
	}
}

/**
 * @brief Copies a buffer of floats from source to destination
 *
 * Equivalent to:
 *
 *   memcpy(dst, src, nframes * sizeof(float));
 *
 * @param[in,out] dst Pointer to the destination buffer
 * @param[in] src Pointer to the source buffer
 * @param[in] nframes Number of frames (samples) to process
 */
C_FUNC void
arm_neon_copy_vector(float* __restrict dst, const float* __restrict src, uint32_t nframes)
{
	// Copy with NEON in blocks of 16, 8, and 4 frames
	while (nframes >= 16)
	{
		float32x4_t x0, x1, x2, x3;

		x0 = vld1q_f32(src + 0);
		x1 = vld1q_f32(src + 4);
		x2 = vld1q_f32(src + 8);
		x3 = vld1q_f32(src + 12);

		vst1q_f32(dst + 0, x0);
		vst1q_f32(dst + 4, x1);
		vst1q_f32(dst + 8, x2);
		vst1q_f32(dst + 12, x3);

		src += 16;
		dst += 16;
		nframes -= 16;
	}

	while (nframes >= 8)
	{
		float32x4_t x0, x1;

		x0 = vld1q_f32(src + 0);
		x1 = vld1q_f32(src + 4);

		vst1q_f32(dst + 0, x0);
		vst1q_f32(dst + 4, x1);

		src += 8;
		dst += 8;
		nframes -= 8;
	}

	while (nframes >= 4)
	{
		float32x4_t x0;

		x0 = vld1q_f32(src);
		vst1q_f32(dst, x0);

		src += 4;
		dst += 4;
		nframes -= 4;
	}

	// Do the remaining samples one at a time
	while (nframes > 0)
	{
		*dst++ = *src++;
		--nframes;
	}
}

#endif /* ARM_NEON_SUPPORT */
libs/ardour/wscript

@@ -513,7 +513,7 @@ def build(bld):
			fma_sources = [ 'x86_functions_fma.cc' ]
			avx512f_sources = [ 'x86_functions_avx512f.cc' ]
		elif bld.env['build_target'] == 'aarch64':
			obj.source += ['arm_neon_functions.cc']
			obj.source += ['aarch64_neon_functions.cc']
			obj.defines += [ 'ARM_NEON_SUPPORT' ]

		elif bld.env['build_target'] == 'armhf':