ardour/libs/staffpad/SimdTypes.h
2025-10-09 00:01:26 +02:00

128 lines
3.3 KiB
C++

/*
SIMD types for parallel DSP processing.
Aligned memory allocation for SIMD vectors.
*/
#pragma once
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <new>
// Compiler-specific function decoration macros.
//   __finl : requests forced inlining of the decorated function
//            (__forceinline on MSVC, inline + always_inline attribute otherwise).
//   __vecc : calling-convention specifier (__vectorcall on MSVC); expands to
//            nothing on every other compiler.
#if _MSC_VER
#define __finl __forceinline
#define __vecc __vectorcall
#else
#define __finl inline __attribute__((always_inline))
#define __vecc
#endif
#if defined(__SSE2__) || (defined(_M_AMD64) || defined(_M_X64)) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#include "SimdTypes_sse2.h"
#elif defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64)
#include "SimdTypes_neon.h"
#else
#include "SimdTypes_scalar.h"
#endif
namespace staffpad::audio::simd {
/// Reserve `required_bytes` of heap memory whose start address is a multiple of `alignment`.
/// The result needs to be freed with aligned_free(): the returned pointer is offset from the
/// one obtained from malloc, so it must not be handed to free() directly.
///
/// @param required_bytes  number of usable bytes requested
/// @param alignment       requested alignment in bytes; must be a non-zero power of two
/// @return the aligned pointer, or nullptr if `alignment` is invalid, if the padded size
///         would overflow size_t, or if the underlying malloc fails
inline void* aligned_malloc(size_t required_bytes, size_t alignment)
{
    // The mask arithmetic below only produces an aligned address for powers of two.
    // alignment == 0 would even yield a null "aligned" pointer that is then written through.
    if (alignment == 0 || (alignment & (alignment - 1)) != 0) {
        return nullptr;
    }

    // Worst-case padding: up to alignment - 1 bytes to reach an aligned address, plus room
    // for the bookkeeping pointer stored directly in front of the aligned block.
    const size_t offset = alignment - 1 + sizeof(void*);

    // Refuse requests whose padded size would wrap around and silently under-allocate.
    if (required_bytes > static_cast<size_t>(-1) - offset) {
        return nullptr;
    }

    void* const p1 = std::malloc(required_bytes + offset);
    if (p1 == nullptr) {
        return nullptr;
    }

    // figure out aligned position
    const auto raw_address = reinterpret_cast<std::uintptr_t>(p1);
    const auto alignment_mask = ~(static_cast<std::uintptr_t>(alignment) - 1);
    void* const p2 = reinterpret_cast<void*>((raw_address + offset) & alignment_mask);

    // write malloced pointer in front of aligned data so aligned_free() can recover it
    static_cast<void**>(p2)[-1] = p1;
    return p2;
}
/// Release a block previously obtained from aligned_malloc().
/// A null pointer is accepted and ignored.
inline void aligned_free(void* p)
{
    if (p == nullptr) {
        return;
    }
    // the pointer originally returned by malloc lives in the slot just in front of p
    void** const stored_pointer_slot = static_cast<void**>(p) - 1;
    free(*stored_pointer_slot);
}
/// Create a value-initialised C++ object of type `cls` in storage obtained from
/// aligned_malloc(). The object needs to be destroyed using aligned_delete().
///
/// @tparam cls        type to construct; must be constructible without arguments
/// @param  alignment  requested alignment in bytes, forwarded to aligned_malloc()
/// @return pointer to the new object, or nullptr if the aligned allocation failed
///
/// NOTE: if the constructor of `cls` throws, the storage is not released here.
template<typename cls>
inline cls* aligned_new(int alignment)
{
    void* mem = aligned_malloc(sizeof(cls), alignment);
    if (mem == nullptr) {
        // Constructing through placement new at a null address is undefined behaviour,
        // so report the allocation failure to the caller instead.
        return nullptr;
    }
    return new (mem) cls();
}
/** Destroy an object created using aligned_new and release its storage.
    A null pointer is accepted and ignored. */
template<typename cls>
inline void aligned_delete(cls* obj)
{
    if (obj == nullptr) {
        return;
    }
    // run the destructor explicitly first, then hand the raw storage back
    obj->~cls();
    void* storage = (void*)obj;
    aligned_free(storage);
}
/// Check whether the address held by `obj` has none of the bits of `alignment - 1` set,
/// i.e. whether it is a multiple of `alignment` when `alignment` is a power of two.
template<typename T>
inline bool is_aligned(T* obj, int alignment)
{
    const size_t address = reinterpret_cast<size_t>(obj);
    const size_t low_bits_mask = static_cast<size_t>(alignment - 1);
    return (address & low_bits_mask) == 0;
}
/** Apply `f` to two float buffers in place; both buffers are read and written.

This template allows writing float SIMD code with the supported operators the following way:

    float *a_vec;
    float *b_vec;
    perform_parallel_simd_aligned(a_vec, b_vec, 512, [](auto &a, auto &b) {
        a = 3.f * a + 12.f * b;
        b = 0.25f * a + 3.f * b;
    });

@param a  first buffer; must be 16-byte aligned (checked with assert)
@param b  second buffer; must be 16-byte aligned (checked with assert)
@param n  number of floats to process in each buffer; values <= 0 process nothing
@param f  callable of the form [](auto &a, auto &b){}. It is invoked with 4-wide
          vectors for every complete group of four floats and with plain float
          references for the remaining 0..3 elements (the autos will be float_x4/float).
*/
// two buffers read/write
template<typename fnc>
__finl void perform_parallel_simd_aligned(float* a, float* b, int n, const fnc& f)
{
    constexpr int N = 4;
    constexpr int byte_size = sizeof(float);
    assert(is_aligned(a, N * byte_size) && is_aligned(b, N * byte_size));

    // full packets of N floats
    int i = 0;
    for (; i <= n - N; i += N) {
        auto x = float_x4_load_aligned(a + i);
        auto y = float_x4_load_aligned(b + i);
        f(x, y);
        store_aligned(x, a + i);
        store_aligned(y, b + i);
    }

    // deal with last partial packet; continuing from i (instead of recomputing the
    // start from n) keeps the index non-negative even when n is negative
    for (; i < n; ++i) {
        f(a[i], b[i]);
    }
}
/// Apply `f` to one float buffer in place.
///
/// @param a  buffer that is read and written; must be 16-byte aligned (checked with assert)
/// @param n  number of floats to process; values <= 0 process nothing
/// @param f  callable of the form [](auto &a){}. It is invoked with a 4-wide vector for
///           every complete group of four floats and with a plain float reference for
///           the remaining 0..3 elements.
template<typename fnc>
__finl void perform_parallel_simd_aligned(float* a, int n, const fnc& f)
{
    constexpr int N = 4;
    constexpr int byte_size = sizeof(float);
    assert(is_aligned(a, N * byte_size));

    // full packets of N floats
    int i = 0;
    for (; i <= n - N; i += N) {
        auto x = float_x4_load_aligned(a + i);
        f(x);
        store_aligned(x, a + i);
    }

    // deal with last partial packet; continuing from i (instead of recomputing the
    // start from n) keeps the index non-negative even when n is negative
    for (; i < n; ++i) {
        f(a[i]);
    }
}
} // namespace staffpad::audio::simd