Add StaffPad library, thanks to Audacity

Robin Gareus 2025-10-08 05:44:20 +02:00
parent 4ef4288b2a
commit 6507bcd93b
No known key found for this signature in database
GPG key ID: A090BCE02CF57F04
19 changed files with 4854 additions and 0 deletions

libs/staffpad/CircularSampleBuffer.h

@@ -0,0 +1,159 @@
#pragma once
#include <cassert>
#include <cstdlib>
#include "VectorOps.h"
/*
* Utility buffer class for delay-based effects
* Allocates a 2^n size buffer for branchless write and read using a bitmask
* It can read and write to past and future samples, but does not check for overlaps
*/
namespace staffpad::audio {
template<typename SampleT>
class CircularSampleBuffer
{
public:
CircularSampleBuffer()
{
}
~CircularSampleBuffer()
{
if (_buffer) {
std::free(_buffer);
}
}
void setSize(int n)
{
if (n > _allocatedSize) {
auto oldSize = _allocatedSize;
auto findLargerPowerOfTwo = [](int32_t number) {
int32_t powerOf2 = 1;
while (powerOf2 < number) {
powerOf2 *= 2;
}
return powerOf2;
};
_allocatedSize = findLargerPowerOfTwo(n);
_bufferSizeMask = _allocatedSize - 1;
_buffer = (SampleT*)std::realloc(_buffer, _allocatedSize * sizeof(SampleT));
// Reset the new memory region
assert(_buffer);
std::fill(_buffer + oldSize, _buffer + _allocatedSize, 0.f);
}
}
int getAllocatedSize() const
{
return _allocatedSize;
}
void reset()
{
if (_buffer && _allocatedSize > 0) {
memset(_buffer, 0, sizeof(SampleT) * _allocatedSize);
}
_position0 = 0;
}
void write(int offset, const SampleT& sample)
{
_buffer[(_position0 + offset) & _bufferSizeMask] = sample;
}
void writeOffset0(const SampleT& sample)
{
_buffer[_position0] = sample;
}
const SampleT& read(int offset) const
{
return _buffer[(_position0 + offset) & _bufferSizeMask];
}
/// change the 0 position by n
void advance(int n)
{
_position0 += n;
_position0 &= _bufferSizeMask;
}
private:
template<typename fnc>
void _splitBlockOffsetFunction(int startOffset, int n, fnc f) const
{
assert(n <= _allocatedSize);
int firstIndex = (_position0 + startOffset) & _bufferSizeMask;
int n_to_end = _allocatedSize - firstIndex;
if (n_to_end > n) {
f(firstIndex, 0, n);
} else {
f(firstIndex, 0, n_to_end);
f(0, n_to_end, n - n_to_end);
}
}
public:
void writeBlock(int startOffset, int n, const SampleT* sourceBlock)
{
_splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) {
vo::copy(&sourceBlock[sampleOff], &_buffer[bufferOff], n);
});
}
void readBlockWithGain(int startOffset, int n, SampleT* targetBlock, float gainFactor) const
{
_splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) {
vo::constantMultiply(&_buffer[bufferOff], gainFactor, &targetBlock[sampleOff], n);
});
}
void readAddBlockWithGain(int startOffset, int n, SampleT* targetBlock, float gainFactor) const
{
_splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) {
vo::constantMultiplyAndAdd(&_buffer[bufferOff], gainFactor, &targetBlock[sampleOff], n);
});
}
void writeAddBlockWithGain(int startOffset, int n, const SampleT* sourceBlock, float gainFactor)
{
_splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) {
vo::constantMultiplyAndAdd(&sourceBlock[sampleOff], gainFactor, &_buffer[bufferOff], n);
});
}
void readBlock(int startOffset, int n, SampleT* targetBlock) const
{
_splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) {
vo::copy(&_buffer[bufferOff], &targetBlock[sampleOff], n);
});
}
void readAndClearBlock(int startOffset, int n, SampleT* targetBlock)
{
_splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) {
vo::copy(&_buffer[bufferOff], &targetBlock[sampleOff], n);
vo::setToZero(&_buffer[bufferOff], n);
});
}
void clearBlock(int startOffset, int n)
{
_splitBlockOffsetFunction(startOffset, n,
[=](int bufferOff, int sampleOff, int n) { vo::setToZero(&_buffer[bufferOff], n); });
}
private:
SampleT* _buffer = nullptr;
int _position0 = 0; // position of sample index 0 inside _buffer. This is where the next sample will be written to
int _allocatedSize = 0; // 2^n buffer size
int _bufferSizeMask = 0; // 2^n-1 buffer mask
};
} // namespace staffpad::audio
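
Usage sketch (illustrative, not part of the commit): the power-of-two rounding in
setSize() is what makes the bitmask wrap in write()/read() branchless, replacing an
explicit modulo. A hypothetical fixed 100-sample delay line:

  #include "CircularSampleBuffer.h"

  void delay100(const float* in, float* out, int n)
  {
    staffpad::audio::CircularSampleBuffer<float> buf;
    buf.setSize(128); // rounded up to a power of two internally; mask becomes 127
    for (int i = 0; i < n; ++i) {
      buf.writeOffset0(in[i]);
      out[i] = buf.read(-100); // the sample written 100 frames ago (0 at start-up)
      buf.advance(1);          // move the 0 position forward by one sample
    }
  }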

libs/staffpad/FourierTransform_pffft.cpp

@@ -0,0 +1,56 @@
#include "FourierTransform_pffft.h"
#include "pffft.h"
namespace staffpad::audio {
FourierTransform::FourierTransform(int32_t newBlockSize)
: _blockSize{newBlockSize}
{
_pffft_scratch = (float*)pffft_aligned_malloc(_blockSize * sizeof(float));
realFftSpec = pffft_new_setup(_blockSize, PFFFT_REAL);
}
FourierTransform::~FourierTransform()
{
if (_pffft_scratch) {
pffft_aligned_free(_pffft_scratch);
_pffft_scratch = nullptr;
}
if (realFftSpec) {
pffft_destroy_setup(realFftSpec);
realFftSpec = nullptr;
}
}
void FourierTransform::forwardReal(const SamplesReal& t, SamplesComplex& c)
{
assert(t.getNumSamples() == _blockSize);
for (auto ch = 0; ch < t.getNumChannels(); ++ch) {
auto* spec = c.getPtr(ch); // interleaved complex numbers, size _blockSize + 2
auto* cpx_flt = (float*)spec;
pffft_transform_ordered(realFftSpec, t.getPtr(ch), cpx_flt, _pffft_scratch, PFFFT_FORWARD);
// pffft combines dc and nyq values into the first complex value,
// adjust to CCS format.
auto dc = cpx_flt[0];
auto nyq = cpx_flt[1];
spec[0] = { dc, 0.f };
spec[c.getNumSamples() - 1] = { nyq, 0.f };
}
}
void FourierTransform::inverseReal(const SamplesComplex& c, SamplesReal& t)
{
assert(c.getNumSamples() == _blockSize / 2 + 1);
for (auto ch = 0; ch < c.getNumChannels(); ++ch) {
auto* spec = c.getPtr(ch);
// Use t to convert in-place from CCS to pffft format
t.assignSamples(ch, (float*)spec);
auto* ts = t.getPtr(ch);
ts[0] = spec[0].real();
ts[1] = spec[c.getNumSamples() - 1].real();
pffft_transform_ordered(realFftSpec, ts, ts, _pffft_scratch, PFFFT_BACKWARD);
}
}
} // namespace staffpad::audio

libs/staffpad/FourierTransform_pffft.h

@@ -0,0 +1,32 @@
// FFT wrapper for the PFFFT library. It is possible to use different wrappers for
// other FFT libraries or platforms, as long as the CCS complex data format is used.
#pragma once
#include <stdint.h>
#include "SamplesFloat.h"
struct PFFFT_Setup;
namespace staffpad::audio {
class FourierTransform
{
public:
FourierTransform(int32_t newBlockSize);
~FourierTransform();
int getSize() const { return static_cast<int>(_blockSize); }
void forwardReal(const SamplesReal& t, SamplesComplex& c);
void inverseReal(const SamplesComplex& c, SamplesReal& t);
private:
PFFFT_Setup* realFftSpec = nullptr;
PFFFT_Setup* complexFftSpec = nullptr;
float* _pffft_scratch = nullptr;
const int32_t _blockSize;
int32_t _order = 0;
};
} // namespace staffpad::audio
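
Round-trip sketch (illustrative, not part of the commit). The spectrum uses the CCS
layout (blockSize/2 + 1 complex bins) and pffft's inverse transform is unnormalized,
so the caller rescales by 1/blockSize, exactly as TimeAndPitch.cpp does below:

  staffpad::audio::FourierTransform fft(1024);
  staffpad::SamplesReal t;
  staffpad::SamplesComplex c;
  t.setSize(1, 1024);         // one channel, blockSize samples
  c.setSize(1, 1024 / 2 + 1); // CCS spectrum bins
  // ... fill t with a signal ...
  fft.forwardReal(t, c);
  fft.inverseReal(c, t);
  staffpad::vo::constantMultiply(t.getPtr(0), 1.f / 1024, t.getPtr(0), 1024); // recover the input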

libs/staffpad/README

@@ -0,0 +1,10 @@
StaffPad - Time and Pitch
=========================
This library was copied from Audacity [1] in October 2025.
Licensed under the terms of the GNU GENERAL PUBLIC LICENSE V2 or later, unless otherwise noted.
(https://github.com/audacity/audacity?tab=readme-ov-file#license)
[1] https://github.com/audacity/audacity/tree/master/au3/libraries/lib-time-and-pitch/StaffPad
git rev 4cdd78b4326456cb77c7102a4f9a6435e0d474a2

libs/staffpad/SamplesFloat.h

@@ -0,0 +1,114 @@
#pragma once
#include <complex>
#include <vector>
#include "SimdTypes.h"
#include "VectorOps.h"
namespace staffpad {
template<typename T = float>
class SamplesFloat
{
public:
~SamplesFloat()
{
for (int ch = 0; ch < num_channels; ch++) {
dealloc(ch);
}
}
void setSize(int32_t numChannels, int32_t samples)
{
for (int ch = 0; ch < num_channels; ch++) {
dealloc(ch);
}
num_channels = numChannels;
num_samples = samples;
data.resize(num_channels);
for (int ch = 0; ch < num_channels; ch++) {
alloc(ch, num_samples);
}
}
int32_t getNumChannels() const
{
return num_channels;
}
int32_t getNumSamples() const
{
return num_samples;
}
T** getPtrs()
{
return data.data();
}
T* getPtr(int32_t channel)
{
assert(channel < num_channels);
assert(data[channel]);
return data[channel];
}
const T* getPtr(int32_t channel) const
{
assert(channel < num_channels);
assert(data[channel]);
return data[channel];
}
void assignSamples(int32_t channel, const T* input)
{
assert(channel < num_channels);
assert(data[channel]);
vo::copy(input, data[channel], num_samples);
}
void assignSamples(const SamplesFloat& rhs)
{
assert(num_channels == rhs.num_channels);
assert(num_samples == rhs.num_samples);
for (int ch = 0; ch < num_channels; ch++) {
assert(data[ch]);
vo::copy(rhs.getPtr(ch), getPtr(ch), num_samples);
}
}
void zeroOut()
{
for (int ch = 0; ch < num_channels; ch++) {
vo::setToZero(data[ch], num_samples);
}
}
private:
int32_t num_channels{ 0 };
int32_t num_samples{ 0 };
std::vector<T*> data;
void alloc(int32_t channel, int32_t samples)
{
assert(channel < num_channels);
if (data[channel]) {
dealloc(channel);
}
data[channel] = (T*)audio::simd::aligned_malloc(samples * sizeof(T), 64);
}
void dealloc(int32_t channel)
{
assert(channel < num_channels);
if (data[channel]) {
audio::simd::aligned_free(data[channel]);
data[channel] = nullptr;
}
}
};
typedef SamplesFloat<float> SamplesReal;
typedef SamplesFloat<std::complex<float> > SamplesComplex;
} // namespace staffpad
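
Note (illustrative): each channel buffer comes from aligned_malloc with 64-byte
alignment, which is what lets VectorOps.h and the Simd* headers use aligned loads
and stores on channel pointers:

  staffpad::SamplesReal s;
  s.setSize(2, 512); // 2 channels x 512 samples, each 64-byte aligned
  s.zeroOut();
  assert(staffpad::audio::simd::is_aligned(s.getPtr(0), 64)); // holds by construction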

libs/staffpad/SimdComplexConversions_sse2.h

@@ -0,0 +1,361 @@
/* SPDX-License-Identifier: zlib */
/*
* This code is a cleaned-up and modernized version of https://github.com/to-miz/sse_mathfun_extension
*/
#pragma once
#include <emmintrin.h>
#include <xmmintrin.h>
#include <array>
#include <complex>
#include <type_traits>
#include <cstring>
#include <memory>
#include <utility>
namespace simd_complex_conversions {
// Before C++20 there is no standard and correct way to implement bit_cast.
// It was verified that MSVC generates the correct code for the current use
// cases
namespace details {
template<class To, class From>
std::enable_if_t<
sizeof(To) == sizeof(From) && std::is_trivially_copyable_v<From>
&& std::is_trivially_copyable_v<To>,
To>
// constexpr support needs compiler magic
bit_cast(const From& src) noexcept
{
static_assert(
std::is_trivially_constructible_v<To>,
"This implementation additionally requires "
"destination type to be trivially constructible");
To dst;
std::memcpy(&dst, &src, sizeof(To));
return dst;
}
constexpr float PIF = 3.141592653589793238f;
constexpr float PIO2F = 1.5707963267948966192f;
constexpr float cephes_PIF = 3.141592653589793238f;
constexpr float cephes_PIO2F = 1.5707963267948966192f;
constexpr float cephes_PIO4F = 0.7853981633974483096f;
constexpr float cephes_FOPI = 1.27323954473516f; // 4 / M_PI
constexpr float minus_cephes_DP1 = -0.78515625f;
constexpr float minus_cephes_DP2 = -2.4187564849853515625e-4f;
constexpr float minus_cephes_DP3 = -3.77489497744594108e-8f;
constexpr float sincof_p0 = -1.9515295891e-4f;
constexpr float sincof_p1 = 8.3321608736e-3f;
constexpr float sincof_p2 = -1.6666654611e-1f;
constexpr float coscof_p0 = 2.443315711809948e-005f;
constexpr float coscof_p1 = -1.388731625493765e-003f;
constexpr float coscof_p2 = 4.166664568298827e-002f;
constexpr float atancof_p0 = 8.05374449538e-2f;
constexpr float atancof_p1 = 1.38776856032e-1f;
constexpr float atancof_p2 = 1.99777106478e-1f;
constexpr float atancof_p3 = 3.33329491539e-1f;
static const float sign_mask = bit_cast<float>(0x80000000);
static const float inv_sign_mask = bit_cast<float>(~0x80000000);
} // namespace details
inline __m128 atan_ps(__m128 x)
{
using namespace details;
__m128 sign_bit, y;
sign_bit = x;
/* take the absolute value */
x = _mm_and_ps(x, _mm_set1_ps(inv_sign_mask));
/* extract the sign bit (upper one) */
sign_bit = _mm_and_ps(sign_bit, _mm_set1_ps(sign_mask));
/* range reduction, init x and y depending on range */
/* x > 2.414213562373095 */
__m128 cmp0 = _mm_cmpgt_ps(x, _mm_set1_ps(2.414213562373095f));
/* x > 0.4142135623730950 */
__m128 cmp1 = _mm_cmpgt_ps(x, _mm_set1_ps(0.4142135623730950f));
/* x > 0.4142135623730950 && !( x > 2.414213562373095 ) */
__m128 cmp2 = _mm_andnot_ps(cmp0, cmp1);
/* -( 1.0/x ) */
__m128 y0 = _mm_and_ps(cmp0, _mm_set1_ps(cephes_PIO2F));
__m128 x0 = _mm_div_ps(_mm_set1_ps(1.0f), x);
x0 = _mm_xor_ps(x0, _mm_set1_ps(sign_mask));
__m128 y1 = _mm_and_ps(cmp2, _mm_set1_ps(cephes_PIO4F));
/* (x-1.0)/(x+1.0) */
__m128 x1_o = _mm_sub_ps(x, _mm_set1_ps(1.0f));
__m128 x1_u = _mm_add_ps(x, _mm_set1_ps(1.0f));
__m128 x1 = _mm_div_ps(x1_o, x1_u);
__m128 x2 = _mm_and_ps(cmp2, x1);
x0 = _mm_and_ps(cmp0, x0);
x2 = _mm_or_ps(x2, x0);
cmp1 = _mm_or_ps(cmp0, cmp2);
x2 = _mm_and_ps(cmp1, x2);
x = _mm_andnot_ps(cmp1, x);
x = _mm_or_ps(x2, x);
y = _mm_or_ps(y0, y1);
__m128 zz = _mm_mul_ps(x, x);
__m128 acc = _mm_set1_ps(atancof_p0);
acc = _mm_mul_ps(acc, zz);
acc = _mm_sub_ps(acc, _mm_set1_ps(atancof_p1));
acc = _mm_mul_ps(acc, zz);
acc = _mm_add_ps(acc, _mm_set1_ps(atancof_p2));
acc = _mm_mul_ps(acc, zz);
acc = _mm_sub_ps(acc, _mm_set1_ps(atancof_p3));
acc = _mm_mul_ps(acc, zz);
acc = _mm_mul_ps(acc, x);
acc = _mm_add_ps(acc, x);
y = _mm_add_ps(y, acc);
/* update the sign */
y = _mm_xor_ps(y, sign_bit);
return y;
}
inline __m128 atan2_ps(__m128 y, __m128 x)
{
using namespace details;
__m128 zero = _mm_setzero_ps();
__m128 x_eq_0 = _mm_cmpeq_ps(x, zero);
__m128 x_gt_0 = _mm_cmpgt_ps(x, zero);
__m128 x_le_0 = _mm_cmple_ps(x, zero);
__m128 y_eq_0 = _mm_cmpeq_ps(y, zero);
__m128 x_lt_0 = _mm_cmplt_ps(x, zero);
__m128 y_lt_0 = _mm_cmplt_ps(y, zero);
__m128 zero_mask = _mm_and_ps(x_eq_0, y_eq_0);
__m128 zero_mask_other_case = _mm_and_ps(y_eq_0, x_gt_0);
zero_mask = _mm_or_ps(zero_mask, zero_mask_other_case);
__m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0);
__m128 pio2_mask_sign = _mm_and_ps(y_lt_0, _mm_set1_ps(sign_mask));
__m128 pio2_result = _mm_set1_ps(cephes_PIO2F);
pio2_result = _mm_xor_ps(pio2_result, pio2_mask_sign);
pio2_result = _mm_and_ps(pio2_mask, pio2_result);
__m128 pi_mask = _mm_and_ps(y_eq_0, x_lt_0);
__m128 pi = _mm_set1_ps(cephes_PIF);
__m128 pi_result = _mm_and_ps(pi_mask, pi);
__m128 swap_sign_mask_offset = _mm_and_ps(x_lt_0, y_lt_0);
swap_sign_mask_offset = _mm_and_ps(swap_sign_mask_offset, _mm_set1_ps(sign_mask));
__m128 offset0 = _mm_setzero_ps();
__m128 offset1 = _mm_set1_ps(cephes_PIF);
offset1 = _mm_xor_ps(offset1, swap_sign_mask_offset);
__m128 offset = _mm_andnot_ps(x_lt_0, offset0);
offset = _mm_and_ps(x_lt_0, offset1);
__m128 arg = _mm_div_ps(y, x);
__m128 atan_result = atan_ps(arg);
atan_result = _mm_add_ps(atan_result, offset);
/* select between zero_result, pio2_result and atan_result */
__m128 result = _mm_andnot_ps(zero_mask, pio2_result);
atan_result = _mm_andnot_ps(zero_mask, atan_result);
atan_result = _mm_andnot_ps(pio2_mask, atan_result);
result = _mm_or_ps(result, atan_result);
result = _mm_or_ps(result, pi_result);
return result;
}
inline std::pair<__m128, __m128> sincos_ps(__m128 x)
{
using namespace details;
__m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
__m128i emm0, emm2, emm4;
sign_bit_sin = x;
/* take the absolute value */
x = _mm_and_ps(x, _mm_set1_ps(inv_sign_mask));
/* extract the sign bit (upper one) */
sign_bit_sin = _mm_and_ps(sign_bit_sin, _mm_set1_ps(sign_mask));
/* scale by 4/Pi */
y = _mm_mul_ps(x, _mm_set1_ps(cephes_FOPI));
/* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
y = _mm_cvtepi32_ps(emm2);
emm4 = emm2;
/* get the swap sign flag for the sine */
emm0 = _mm_and_si128(emm2, _mm_set1_epi32(4));
emm0 = _mm_slli_epi32(emm0, 29);
__m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
/* get the polynomial selection mask for the sine */
emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
__m128 poly_mask = _mm_castsi128_ps(emm2);
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = _mm_set1_ps(minus_cephes_DP1);
xmm2 = _mm_set1_ps(minus_cephes_DP2);
xmm3 = _mm_set1_ps(minus_cephes_DP3);
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
emm4 = _mm_sub_epi32(emm4, _mm_set1_epi32(2));
emm4 = _mm_andnot_si128(emm4, _mm_set1_epi32(4));
emm4 = _mm_slli_epi32(emm4, 29);
__m128 sign_bit_cos = _mm_castsi128_ps(emm4);
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
__m128 z = _mm_mul_ps(x, x);
y = _mm_set1_ps(coscof_p0);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, _mm_set1_ps(coscof_p1));
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, _mm_set1_ps(coscof_p2));
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
__m128 tmp = _mm_mul_ps(z, _mm_set1_ps(0.5f));
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, _mm_set1_ps(1));
/* Evaluate the second polynomial (0 <= x <= Pi/4) */
__m128 y2 = _mm_set1_ps(sincof_p0);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, _mm_set1_ps(sincof_p1));
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, _mm_set1_ps(sincof_p2));
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynomials */
xmm3 = poly_mask;
__m128 ysin2 = _mm_and_ps(xmm3, y2);
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2, ysin2);
y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1, ysin2);
xmm2 = _mm_add_ps(y, y2);
/* update the sign */
return std::make_pair(
_mm_xor_ps(xmm1, sign_bit_sin), _mm_xor_ps(xmm2, sign_bit_cos));
}
inline float atan2_ss(float y, float x)
{
return _mm_cvtss_f32(atan2_ps(_mm_set_ss(y), _mm_set_ss(x)));
}
inline std::pair<float, float> sincos_ss(float angle)
{
auto res = sincos_ps(_mm_set_ss(angle));
return std::make_pair(_mm_cvtss_f32(res.first), _mm_cvtss_f32(res.second));
}
inline __m128 norm(__m128 x, __m128 y)
{
return _mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
}
inline float sqrt_ss(float x)
{
__m128 sse_value = _mm_set_ss(x);
sse_value = _mm_sqrt_ss(sse_value);
return _mm_cvtss_f32(sse_value);
}
template<typename fnc>
void perform_parallel_simd_aligned(
const std::complex<float>* input, float* output, int n, const fnc& f)
{
for (int i = 0; i <= n - 4; i += 4) {
// Safe according to C++ standard
auto p1 = _mm_load_ps(reinterpret_cast<const float*>(input + i));
auto p2 = _mm_load_ps(reinterpret_cast<const float*>(input + i + 2));
// p1 = {real(c1), imag(c1), real(c2), imag(c2)}
// p2 = {real(c3), imag(c3), real(c4), imag(c4)}
auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0));
auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1));
__m128 out;
f(rp, ip, out);
_mm_store_ps(output + i, out);
}
// deal with last partial packet
for (int i = n & (~3); i < n; ++i) {
__m128 out;
f(_mm_set_ss(real(input[i])), _mm_set_ss(imag(input[i])), out);
output[i] = _mm_cvtss_f32(out);
}
}
inline void rotate_parallel_simd_aligned(
const float* oldPhase, const float* newPhase, std::complex<float>* output,
int n)
{
for (int i = 0; i <= n - 4; i += 4) {
auto [sin, cos] = sincos_ps(
oldPhase
? _mm_sub_ps(_mm_load_ps(newPhase + i), _mm_load_ps(oldPhase + i))
: _mm_load_ps(newPhase + i));
// Safe according to C++ standard
auto p1 = _mm_load_ps(reinterpret_cast<float*>(output + i));
auto p2 = _mm_load_ps(reinterpret_cast<float*>(output + i + 2));
// p1 = {real(c1), imag(c1), real(c2), imag(c2)}
// p2 = {real(c3), imag(c3), real(c4), imag(c4)}
auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0));
auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1));
// We need to calculate (rp, ip) * (cos, sin) -> (rp*cos - ip*sin, rp*sin + ip*cos)
auto out_rp = _mm_sub_ps(_mm_mul_ps(rp, cos), _mm_mul_ps(ip, sin));
auto out_ip = _mm_add_ps(_mm_mul_ps(rp, sin), _mm_mul_ps(ip, cos));
p1 = _mm_unpacklo_ps(out_rp, out_ip);
p2 = _mm_unpackhi_ps(out_rp, out_ip);
_mm_store_ps(reinterpret_cast<float*>(output + i), p1);
_mm_store_ps(reinterpret_cast<float*>(output + i + 2), p2);
}
// deal with last partial packet
for (int i = n & (~3); i < n; ++i) {
const auto theta = oldPhase ? newPhase[i] - oldPhase[i] : newPhase[i];
output[i] *= std::complex<float>(cosf(theta), sinf(theta));
}
}
} // namespace simd_complex_conversions
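
Sanity-check sketch (illustrative, not part of the commit): the scalar wrappers
should agree with libm to within the tolerance of the cephes polynomials:

  float angle = 0.3f;
  auto [s, c] = simd_complex_conversions::sincos_ss(angle);
  // s ~= std::sin(angle), c ~= std::cos(angle)
  float phi = simd_complex_conversions::atan2_ss(s, c); // ~= angle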

libs/staffpad/SimdTypes.h

@@ -0,0 +1,128 @@
/*
Simd-types for parallel dsp processing.
Aligned memory allocation for simd vectors.
*/
#pragma once
#include <cassert>
#include <cstdlib>
#if _MSC_VER
#define __finl __forceinline
#define __vecc __vectorcall
#else
#define __finl inline __attribute__((always_inline))
#define __vecc
#endif
#if defined(__SSE2__) || (defined(_M_AMD64) || defined(_M_X64)) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#include "SimdTypes_sse2.h"
#elif defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64)
#include "SimdTypes_neon.h"
#else
#include "SimdTypes_scalar.h"
#endif
namespace staffpad::audio::simd {
/// reserve aligned memory. Needs to be freed with aligned_free()
inline void* aligned_malloc(size_t required_bytes, size_t alignment)
{
auto offset = alignment - 1 + sizeof(void*);
auto p1 = std::malloc(required_bytes + offset);
if (p1 == nullptr) {
return nullptr;
}
// figure out aligned position
void* p2 = (void*)(((size_t)(p1) + offset) & ~(alignment - 1));
// write malloced pointer in front of aligned data
((void**)p2)[-1] = p1;
return p2;
}
/// free memory allocated with aligned_malloc
inline void aligned_free(void* p)
{
if (p) {
free(((void**)p)[-1]);
}
}
/// create a C++ class at a memory-aligned spot; it must be deleted using aligned_delete
template<typename cls>
inline cls* aligned_new(int alignment)
{
void* mem = aligned_malloc(sizeof(cls), alignment);
return new (mem) cls();
}
/** delete objects created using aligned_new */
template<typename cls>
inline void aligned_delete(cls* obj)
{
if (obj != nullptr) {
obj->~cls();
aligned_free((void*)obj);
}
}
template<typename T>
inline bool is_aligned(T* obj, int alignment)
{
return (((size_t)obj) & (alignment - 1)) == 0;
}
/** this template allows writing float SIMD code with the supported operators in the following way:
float *a_vec;
const float *b_vec;
perform_parallel_simd_aligned(a_vec, b_vec, 512, [](auto &a, auto &b) {
auto t = a;
a = 3.f * a + 12.f * b;
b = 0.25f * a + 3.f * b;
});
*/
// two buffers read/write
template<typename fnc>
__finl void perform_parallel_simd_aligned(float* a, float* b, int n, const fnc& f)
{
// fnc& f needs to be a lambda of type [](auto &a, auto &b){}.
// the autos will be float_x4/float
constexpr int N = 4;
constexpr int byte_size = sizeof(float);
assert(is_aligned(a, N * byte_size) && is_aligned(b, N * byte_size));
for (int i = 0; i <= n - N; i += N) {
auto x = float_x4_load_aligned(a + i);
auto y = float_x4_load_aligned(b + i);
f(x, y);
store_aligned(x, a + i);
store_aligned(y, b + i);
}
// deal with last partial packet
for (int i = n & (~(N - 1)); i < n; ++i) {
f(a[i], b[i]);
}
}
/// template for applying math to one data buffer
template<typename fnc>
__finl void perform_parallel_simd_aligned(float* a, int n, const fnc& f)
{
// fnc& f needs to be a lambda of type [](auto &a){}.
constexpr int N = 4;
constexpr int byte_size = sizeof(float);
assert(is_aligned(a, N * byte_size));
for (int i = 0; i <= n - N; i += N) {
auto x = float_x4_load_aligned(a + i);
f(x);
store_aligned(x, a + i);
}
// deal with last partial packet
for (int i = n & (~(N - 1)); i < n; ++i) {
f(a[i]);
}
}
} // namespace staffpad::audio::simd
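
Usage sketch (illustrative): the lambda runs on float_x4 packets for the aligned
body of the loop and on plain floats for the remainder, so it must be generic:

  using namespace staffpad::audio::simd;
  float* buf = (float*)aligned_malloc(512 * sizeof(float), 16);
  // ... fill buf ...
  perform_parallel_simd_aligned(buf, 512, [](auto& a) { a = a * 0.5f; }); // halve in place
  aligned_free(buf);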

libs/staffpad/SimdTypes_neon.h

@@ -0,0 +1,161 @@
/*
Neon version of SIMD types.
*/
#pragma once
#if _MSC_VER
#include <arm64_neon.h>
#define __finl __forceinline
#define __vecc __vectorcall
#else
#include <arm_neon.h>
#define __finl inline __attribute__((always_inline))
#define __vecc
#endif
#include <cmath>
namespace staffpad::audio::simd {
struct float_x4
{
float32x4_t s;
__finl float_x4()
{
}
/// enables math like: float_x4 a = 0.5f * float_x4{1.f, 2.f, 3.f, 4.f};
__finl float_x4(float val)
{
s = vdupq_n_f32(val);
}
__finl float_x4(const float32x4_t& val)
: s(val)
{
}
/// enables assignments like: float_x4 a = {1.f, 2.f, 3.f, 4.f};
__finl float_x4(float v0, float v1, float v2, float v3)
{
#if _MSC_VER // aggregate initializer won't work unless we have {.n128_f32 = ..} in c++20
s.n128_f32[0] = v0;
s.n128_f32[1] = v1;
s.n128_f32[2] = v2;
s.n128_f32[3] = v3;
#elif __clang__
s = { v0, v1, v2, v3 };
#else
float f[4] = { v0, v1, v2, v3 };
s = vld1q_f32(f);
#endif
}
#if __clang__
private:
// this helper class allows writing to the single registers for clang
// float32x4_t is a built-in type -> we can't return a float& reference.
// this is just syntax sugar and clang will remove it during builds.
//
// it allows to write
// float_x4 a;
// a[1] = 2.f;
struct RegisterAccessWrapper
{
float32x4_t& val;
int i;
void operator=(float x)
{
val[i] = x;
}
operator float() noexcept
{
return val[i];
}
};
public:
__finl RegisterAccessWrapper operator[](int n)
{
RegisterAccessWrapper raw = { s, n };
return raw;
}
__finl const float operator[](int n) const
{
return s[n];
}
#elif _MSC_VER
// on msvc returning a ref to a sub-register is possible
__finl float& operator[](int n)
{
return s.n128_f32[n];
}
__finl const float operator[](int n) const
{
return s.n128_f32[n];
}
#endif
};
__finl float_x4 __vecc float_x4_from_float(float x)
{
return vdupq_n_f32(x);
}
__finl float_x4 __vecc float_x4_load_aligned(const float* x)
{
return vld1q_f32(x);
}
__finl void __vecc store_aligned(const float_x4& a, float* x)
{
vst1q_f32(x, a.s);
}
__finl float_x4 __vecc unzip1(const float_x4& a, const float_x4& b)
{
return vuzp1q_f32(a.s, b.s);
}
__finl float_x4 __vecc unzip2(const float_x4& a, const float_x4& b)
{
return vuzp2q_f32(a.s, b.s);
}
__finl float_x4 __vecc operator+(float_x4 a, float_x4 b)
{
return vaddq_f32(a.s, b.s);
}
__finl float_x4 __vecc operator-(float_x4 a, float_x4 b)
{
return vsubq_f32(a.s, b.s);
}
__finl float_x4 __vecc operator*(float_x4 a, float_x4 b)
{
return vmulq_f32(a.s, b.s);
}
__finl float_x4 __vecc sqrt(const float_x4& a)
{
return vsqrtq_f32(a.s);
}
__finl float __vecc rint(float a)
{
return std::rint(a);
}
__finl float_x4 __vecc rint(const float_x4& a)
{
return vrndnq_f32(a.s);
}
} // namespace staffpad::audio::simd

libs/staffpad/SimdTypes_scalar.h

@@ -0,0 +1,108 @@
/*
scalar emulation of simd types.
*/
#pragma once
#include <algorithm>
#include <cmath>
#if _MSC_VER
#define __finl __forceinline
#define __vecc __vectorcall
#else
#define __finl inline __attribute__((always_inline))
#define __vecc
#endif
namespace staffpad::audio::simd {
struct float_x4
{
float v[4];
__finl float_x4()
{
}
/// enables math like: float_x4 a = 0.5f * float_x4{1.f, 2.f, 3.f, 4.f};
__finl float_x4(float val)
{
v[0] = v[1] = v[2] = v[3] = val;
}
/// enables assignments like: float_x4 a = {1.f, 2.f, 3.f, 4.f};
__finl float_x4(float v0, float v1, float v2, float v3)
{
v[0] = v0;
v[1] = v1;
v[2] = v2;
v[3] = v3;
}
__finl float& operator[](int n)
{
return v[n];
}
__finl const float& operator[](int n) const
{
return v[n];
}
};
__finl float_x4 __vecc float_x4_from_float(float x)
{
return { x, x, x, x };
}
__finl float_x4 __vecc float_x4_load_aligned(const float* x)
{
return { x[0], x[1], x[2], x[3] };
}
__finl void __vecc store_aligned(const float_x4& a, float* x)
{
for (int i = 0; i < 4; ++i) {
x[i] = a[i];
}
}
__finl float_x4 __vecc unzip1(const float_x4& a, const float_x4& b)
{
return { a[0], a[2], b[0], b[2] };
}
__finl float_x4 __vecc unzip2(const float_x4& a, const float_x4& b)
{
return { a[1], a[3], b[1], b[3] };
}
__finl float_x4 __vecc operator+(float_x4 a, float_x4 b)
{
return { a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3] };
}
__finl float_x4 __vecc operator-(float_x4 a, float_x4 b)
{
return { a[0] - b[0], a[1] - b[1], a[2] - b[2], a[3] - b[3] };
}
__finl float_x4 __vecc operator*(float_x4 a, float_x4 b)
{
return { a[0] * b[0], a[1] * b[1], a[2] * b[2], a[3] * b[3] };
}
__finl float_x4 __vecc sqrt(const float_x4& a)
{
return { std::sqrt(a[0]), std::sqrt(a[1]), std::sqrt(a[2]), std::sqrt(a[3]) };
}
__finl float __vecc rint(float a)
{
return std::rint(a);
}
__finl float_x4 __vecc rint(const float_x4& a)
{
return { std::rint(a[0]), std::rint(a[1]), std::rint(a[2]), std::rint(a[3]) };
}
} // namespace staffpad::audio::simd

libs/staffpad/SimdTypes_sse2.h

@@ -0,0 +1,150 @@
/*
SSE2 simd types
*/
#pragma once
#if _MSC_VER
#define __finl __forceinline
#define __vecc __vectorcall
#else
#define __finl inline __attribute__((always_inline))
#define __vecc
#endif
#include <emmintrin.h>
namespace staffpad::audio::simd {
// this is jumping through some hoops to get the same level of support
// for clang and msvc. With clang, the sse2 types are built-in and have
// some arithmetic operators defined.
// On msvc the sse2 types are structs with no operators.
// to get to the same level and to be able to write algorithms "naturally",
// everything needs to be wrapped in a struct.
struct float_x4
{
__m128 s;
__finl float_x4()
{
}
/// enables math like: float_x4 a = 0.5f * float_x4{1.f, 2.f, 3.f, 4.f};
__finl float_x4(float val)
{
s = _mm_set1_ps(val);
}
__finl float_x4(const __m128& val)
: s(val)
{
}
#if __clang__
private:
// this helper class allows writing to the single registers for clang
// __m128 is a built-in type -> we can't return a float& reference.
// this is just syntax sugar and clang will remove it during builds.
//
// it allows to write
// float_x4 a;
// a[1] = 2.f;
struct RegisterAccessWrapper
{
__m128& val;
int i;
void operator=(float x)
{
val[i] = x;
}
operator float() noexcept
{
return val[i];
}
};
public:
__finl RegisterAccessWrapper operator[](int n)
{
RegisterAccessWrapper raw = { s, n };
return raw;
}
__finl const float operator[](int n) const
{
return s[n];
}
#elif _MSC_VER
// on msvc returning a ref to a sub-register is possible
__finl float& operator[](int n)
{
return s.m128_f32[n];
}
__finl const float operator[](int n) const
{
return s.m128_f32[n];
}
#endif
};
__finl float_x4 __vecc float_x4_from_float(float x)
{
return _mm_set1_ps(x);
}
__finl float_x4 __vecc float_x4_load_aligned(const float* x)
{
return _mm_load_ps(x);
}
__finl void __vecc store_aligned(const float_x4& a, float* x)
{
_mm_store_ps(x, a.s);
}
__finl float_x4 __vecc unzip1(const float_x4& a, const float_x4& b)
{
return _mm_shuffle_ps(a.s, b.s, _MM_SHUFFLE(2, 0, 2, 0));
}
__finl float_x4 __vecc unzip2(const float_x4& a, const float_x4& b)
{
return _mm_shuffle_ps(a.s, b.s, _MM_SHUFFLE(3, 1, 3, 1));
}
__finl float_x4 __vecc operator+(float_x4 a, float_x4 b)
{
return _mm_add_ps(a.s, b.s);
}
__finl float_x4 __vecc operator-(float_x4 a, float_x4 b)
{
return _mm_sub_ps(a.s, b.s);
}
__finl float_x4 __vecc operator*(float_x4 a, float_x4 b)
{
return _mm_mul_ps(a.s, b.s);
}
__finl float_x4 __vecc sqrt(const float_x4& a)
{
return _mm_sqrt_ps(a.s);
}
__finl float __vecc rint(float x)
{
__m128i A = _mm_cvtps_epi32(_mm_set_ss(x));
return _mm_cvtss_f32(_mm_cvtepi32_ps(A));
}
__finl float_x4 __vecc rint(const float_x4& a)
{
__m128i A = _mm_cvtps_epi32(a.s);
return _mm_cvtepi32_ps(A);
}
} // namespace staffpad::audio::simd
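
Illustrative note: unzip1/unzip2 are the deinterleaving primitives that split packets
of interleaved (real, imag) pairs into separate real and imaginary packets:

  using namespace staffpad::audio::simd;
  float_x4 p1 = _mm_setr_ps(1.f, 10.f, 2.f, 20.f); // {re0, im0, re1, im1}
  float_x4 p2 = _mm_setr_ps(3.f, 30.f, 4.f, 40.f); // {re2, im2, re3, im3}
  float_x4 re = unzip1(p1, p2); // {1, 2, 3, 4}
  float_x4 im = unzip2(p1, p2); // {10, 20, 30, 40}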

libs/staffpad/TimeAndPitch.cpp

@@ -0,0 +1,559 @@
#include "staffpad/TimeAndPitch.h"
#include <algorithm>
#include <array>
#include <cassert>
#include <random>
#include <stdlib.h>
#include <utility>
#include "CircularSampleBuffer.h"
#include "FourierTransform_pffft.h"
#include "SamplesFloat.h"
#include "SimdTypes.h"
using namespace staffpad::audio;
namespace staffpad {
namespace {
constexpr double twoPi = 6.28318530717958647692;
/**
* 6-point Lagrange interpolator, SNR ~ -35.2 dB
* interpolates between samples 2 and 3 using t.
*
* smp pointer to a memory region containing 6 samples
* t fractional sample to interpolate t >= 0 && t < 1
* return value the interpolated sample
*/
inline float lagrange6(const float (& smp)[6], float t)
{
auto* y = &smp[2];
auto ym1_y1 = y[-1] + y[1];
auto twentyfourth_ym2_y2 = 1.f / 24.f * (y[-2] + y[2]);
auto c0 = y[0];
auto c1 = 0.05f * y[-2] - 0.5f * y[-1] - 1.f / 3.f * y[0] + y[1] - 0.25f * y[2] + 1.f / 30.f * y[3];
auto c2 = 2.f / 3.f * ym1_y1 - 1.25f * y[0] - twentyfourth_ym2_y2;
auto c3 = 5.f / 12.f * y[0] - 7.f / 12.f * y[1] + 7.f / 24.f * y[2] - 1.f / 24.f * (y[-2] + y[-1] + y[3]);
auto c4 = 0.25f * y[0] - 1.f / 6.f * ym1_y1 + twentyfourth_ym2_y2;
auto c5 = 1.f / 120.f * (y[3] - y[-2]) + 1.f / 24.f * (y[-1] - y[2]) + 1.f / 12.f * (y[1] - y[0]);
// Estrin's scheme
auto t2 = t * t;
return (c0 + c1 * t) + (c2 + c3 * t) * t2 + (c4 + c5 * t) * t2 * t2;
}
void generateRandomPhaseVector(float* dst, size_t size, std::mt19937& gen)
{
std::uniform_real_distribution<float> dis(
-3.14159265358979323846f, 3.14159265358979323846f);
std::for_each(dst, dst + size, [&dis, &gen](auto& a) { a = dis(gen); });
}
} // namespace
struct TimeAndPitch::impl
{
impl(int fft_size)
: fft(fft_size)
{
randomGenerator.seed(0);
}
FourierTransform fft;
std::mt19937 randomGenerator;
CircularSampleBuffer<float> inResampleInputBuffer[2];
CircularSampleBuffer<float> inCircularBuffer[2];
CircularSampleBuffer<float> outCircularBuffer[2];
CircularSampleBuffer<float> normalizationBuffer;
SamplesReal fft_timeseries;
SamplesComplex spectrum;
SamplesReal norm;
SamplesReal phase;
SamplesReal last_phase;
SamplesReal phase_accum;
SamplesReal cosWindow;
SamplesReal sqWindow;
SamplesReal last_norm;
SamplesReal random_phases;
double exact_hop_a = 512.0, hop_a_err = 0.0;
double exact_hop_s = 0.0;
double next_exact_hop_s = 512.0;
double hop_s_err = 0.0;
std::vector<int> peak_index, trough_index;
};
TimeAndPitch::TimeAndPitch(
int fftSize, bool reduceImaging, ShiftTimbreCb shiftTimbreCb)
: fftSize(fftSize)
, _reduceImaging(reduceImaging)
, _shiftTimbreCb(std::move(shiftTimbreCb))
{
}
TimeAndPitch::~TimeAndPitch()
{
// Here for ~std::shared_ptr<impl>() to know ~impl()
}
void TimeAndPitch::setup(int numChannels, int maxBlockSize)
{
assert(numChannels == 1 || numChannels == 2);
_numChannels = numChannels;
d = std::make_unique<impl>(fftSize);
_maxBlockSize = maxBlockSize;
_numBins = fftSize / 2 + 1;
// audio sample buffers
d->fft_timeseries.setSize(_numChannels, fftSize);
int outBufferSize = fftSize + 2 * _maxBlockSize; // 1 block extra for safety
for (int ch = 0; ch < _numChannels; ++ch) {
d->inResampleInputBuffer[ch].setSize(_maxBlockSize + 6); // needs additional history for resampling; may need more in the future
d->inCircularBuffer[ch].setSize(fftSize);
d->outCircularBuffer[ch].setSize(outBufferSize);
}
d->normalizationBuffer.setSize(outBufferSize);
// fft coefficient buffers
d->spectrum.setSize(_numChannels, _numBins);
d->norm.setSize(1, _numBins);
d->last_norm.setSize(1, _numBins);
d->phase.setSize(_numChannels, _numBins);
d->last_phase.setSize(_numChannels, _numBins);
d->phase_accum.setSize(_numChannels, _numBins);
d->random_phases.setSize(1, _numBins);
generateRandomPhaseVector(
d->random_phases.getPtr(0), _numBins, d->randomGenerator);
_expectedPhaseChangePerBinPerSample = twoPi / double(fftSize);
// Cos window
// For perfect int-factor overlaps the first sample needs to be 0 and the last non-zero.
d->cosWindow.setSize(1, fftSize);
d->sqWindow.setSize(1, fftSize);
auto* w = d->cosWindow.getPtr(0);
auto* sqw = d->sqWindow.getPtr(0);
for (int i = 0; i < fftSize; ++i) {
w[i] = -0.5f * std::cos(float(twoPi * (float)i / (float)fftSize)) + 0.5f;
sqw[i] = w[i] * w[i];
}
d->peak_index.reserve(_numBins);
d->trough_index.reserve(_numBins);
// force fft to allocate all buffers now. Currently there is no other way in the fft class
d->fft.forwardReal(d->fft_timeseries, d->spectrum);
reset();
}
int TimeAndPitch::getSamplesToNextHop() const
{
return std::max(0, int(std::ceil(d->exact_hop_a)) - _analysis_hop_counter
+ 1); // 1 extra to counter pitch factor error accumulation
}
int TimeAndPitch::getNumAvailableOutputSamples() const
{
return _availableOutputSamples;
}
void TimeAndPitch::reset()
{
_analysis_hop_counter = 0;
_availableOutputSamples = 0;
for (int ch = 0; ch < _numChannels; ++ch) {
d->inResampleInputBuffer[ch].reset();
d->inCircularBuffer[ch].reset();
d->outCircularBuffer[ch].reset();
}
d->normalizationBuffer.reset();
d->last_norm.zeroOut();
d->last_phase.zeroOut();
d->phase_accum.zeroOut();
_outBufferWriteOffset = 0;
d->hop_a_err = 0.0;
d->hop_s_err = 0.0;
d->exact_hop_s = 0.0;
_resampleReadPos = 0.0;
}
namespace {
// wrap a phase value into -PI..PI
inline float _unwrapPhase(float arg)
{
using namespace audio::simd;
return arg - rint(arg * 0.15915494309f) * 6.283185307f;
}
void _unwrapPhaseVec(float* v, int n)
{
audio::simd::perform_parallel_simd_aligned(v, n, [](auto& a) { a = a - rint(a * 0.15915494309f) * 6.283185307f; });
}
/// rotate even-sized array by half its size to align fft phase at the center
void _fft_shift(float* v, int n)
{
assert((n & 1) == 0);
int n2 = n >> 1;
audio::simd::perform_parallel_simd_aligned(v, v + n2, n2, [](auto& a, auto& b) {
auto tmp = a;
a = b;
b = tmp;
});
}
void _lr_to_ms(float* ch1, float* ch2, int n)
{
audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) {
auto l = a, r = b;
a = 0.5f * (l + r);
b = 0.5f * (l - r);
});
}
void _ms_to_lr(float* ch1, float* ch2, int n)
{
audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) {
auto m = a, s = b;
a = m + s;
b = m - s;
});
}
} // namespace
// ----------------------------------------------------------------------------
template<int num_channels>
void TimeAndPitch::_time_stretch(float a_a, float a_s)
{
auto alpha = a_s / a_a; // this is the real stretch factor based on integer hop sizes
// Create a norm array
const auto* norms = d->norm.getPtr(0); // for stereo, just use the mid-channel
const auto* norms_last = d->last_norm.getPtr(0);
d->peak_index.clear();
d->trough_index.clear();
float lowest = norms[0];
int trough = 0;
if (norms_last[0] >= norms[1]) {
d->peak_index.emplace_back(0);
d->trough_index.emplace_back(0);
}
for (int i = 1; i < _numBins - 1; ++i) {
if (norms_last[i] >= norms[i - 1] && norms_last[i] >= norms[i + 1]) {
d->peak_index.emplace_back(i);
d->trough_index.emplace_back(trough);
trough = i;
lowest = norms[i];
} else if (norms[i] < lowest) {
lowest = norms[i];
trough = i;
}
}
if (norms_last[_numBins - 1] > norms[_numBins - 2]) {
d->peak_index.emplace_back(_numBins - 1);
d->trough_index.emplace_back(trough);
}
if (d->peak_index.size() == 0) {
// use max norm of last frame
int max_idx = 0;
float max_norm = 0.f;
vo::findMaxElement(norms_last, _numBins, max_idx, max_norm);
d->peak_index.emplace_back(max_idx);
}
const float** p = const_cast<const float**>(d->phase.getPtrs());
const float** p_l = const_cast<const float**>(d->last_phase.getPtrs());
float** acc = d->phase_accum.getPtrs();
float expChange_a = a_a * float(_expectedPhaseChangePerBinPerSample);
float expChange_s = a_s * float(_expectedPhaseChangePerBinPerSample);
int num_peaks = (int)d->peak_index.size();
// time integration for all peaks
for (int i = 0; i < num_peaks; ++i) {
const int n = d->peak_index[i];
auto fn = float(n);
// determine phase from last frame
float fn_expChange_a = fn * expChange_a;
float fn_expChange_s = fn * expChange_s;
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n] - p_l[ch][n] - fn_expChange_a) + fn_expChange_s;
}
}
// go from first peak to 0
for (int n = d->peak_index[0]; n > 0; --n) {
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]);
}
}
// 'grow' from pairs of peaks to the lowest norm in between
for (int i = 0; i < num_peaks - 1; ++i) {
const int mid = d->trough_index[i + 1];
for (int n = d->peak_index[i]; n < mid; ++n) {
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]);
}
}
for (int n = d->peak_index[i + 1]; n > mid + 1; --n) {
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]);
}
}
}
// last peak to the end
for (int n = d->peak_index[num_peaks - 1]; n < _numBins - 1; ++n) {
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]);
}
}
d->last_norm.assignSamples(d->norm);
d->last_phase.assignSamples(d->phase);
}
void TimeAndPitch::_applyImagingReduction()
{
// Upsampling brings spectral components down, including those that were
// beyond the Nyquist. From `imagingBeginBin`, we have a mirroring of the
// spectrum, which, if not lowpass-filtered by the resampler, may yield
// an aliasing-like quality if what is mirrored is a harmonic series. A
// simple thing to do against that is to scramble the phase values there.
// This way we preserve natural energy fluctuations, but the artifact
// isn't as noticeable anymore, because it is much more noise-like, which
// is what one typically gets at those frequencies.
constexpr auto alignment = 64 / sizeof(float);
const int imagingBeginBin = std::ceil((fftSize / 2 * _pitchFactor + 1) / alignment) * alignment;
if (imagingBeginBin >= _numBins) {
return;
}
const auto n = _numBins - imagingBeginBin;
auto pSpec = d->spectrum.getPtr(0);
auto pRand = d->random_phases.getPtr(0);
vo::rotate(nullptr, pRand, pSpec + imagingBeginBin, n);
// Just rotating the random phase vector to produce pseudo-random
// sequences of phases.
std::uniform_int_distribution<size_t> dis(0, n - 1);
const auto middle = dis(d->randomGenerator);
std::rotate(pRand, pRand + middle, pRand + n);
}
/// process one hop/chunk in _fft_timeSeries and add the result to output circular buffer
void TimeAndPitch::_process_hop(int hop_a, int hop_s)
{
if (d->exact_hop_a != d->exact_hop_s) {
if (_numChannels == 2) {
_lr_to_ms(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize);
}
for (int ch = 0; ch < _numChannels; ++ch) {
vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
_fft_shift(d->fft_timeseries.getPtr(ch), fftSize);
}
// determine norm/phase
d->fft.forwardReal(d->fft_timeseries, d->spectrum);
// norms of the mid channel only (or sole channel) are needed in
// _time_stretch
vo::calcNorms(d->spectrum.getPtr(0), d->norm.getPtr(0), d->spectrum.getNumSamples());
for (int ch = 0; ch < _numChannels; ++ch) {
vo::calcPhases(d->spectrum.getPtr(ch), d->phase.getPtr(ch), d->spectrum.getNumSamples());
}
if (_shiftTimbreCb) {
_shiftTimbreCb(
1 / _pitchFactor, d->spectrum.getPtr(0), d->norm.getPtr(0));
}
if (_reduceImaging && _pitchFactor < 1.) {
_applyImagingReduction();
}
if (_numChannels == 1) {
_time_stretch<1>((float)hop_a, (float)hop_s);
} else if (_numChannels == 2) {
_time_stretch<2>((float)hop_a, (float)hop_s);
}
for (int ch = 0; ch < _numChannels; ++ch) {
_unwrapPhaseVec(d->phase_accum.getPtr(ch), _numBins);
}
for (int ch = 0; ch < _numChannels; ++ch) {
vo::rotate(d->phase.getPtr(ch), d->phase_accum.getPtr(ch), d->spectrum.getPtr(ch),
d->spectrum.getNumSamples());
}
d->fft.inverseReal(d->spectrum, d->fft_timeseries);
for (int ch = 0; ch < _numChannels; ++ch) {
vo::constantMultiply(d->fft_timeseries.getPtr(ch), 1.f / fftSize, d->fft_timeseries.getPtr(ch),
d->fft_timeseries.getNumSamples());
}
if (_numChannels == 2) {
_ms_to_lr(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize);
}
for (int ch = 0; ch < _numChannels; ++ch) {
_fft_shift(d->fft_timeseries.getPtr(ch), fftSize);
vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
}
} else { // stretch factor == 1.0 => just apply window
for (int ch = 0; ch < _numChannels; ++ch) {
vo::multiply(d->fft_timeseries.getPtr(ch), d->sqWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
}
}
{
float gainFact = float(_timeStretch * ((8.f / 3.f) / _overlap_a)); // overlap add normalization factor
for (int ch = 0; ch < _numChannels; ++ch) {
d->outCircularBuffer[ch].writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->fft_timeseries.getPtr(ch),
gainFact);
}
if (normalize_window) {
d->normalizationBuffer.writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->sqWindow.getPtr(0), gainFact);
}
}
_outBufferWriteOffset += hop_s;
_availableOutputSamples += hop_s;
}
void TimeAndPitch::setTimeStretchAndPitchFactor(double timeScale, double pitchFactor)
{
assert(timeScale > 0.0);
assert(pitchFactor > 0.0);
_pitchFactor = pitchFactor;
_timeStretch = timeScale * pitchFactor;
_overlap_a = overlap;
double overlap_s = overlap;
if (_timeStretch > 1.0 || !modulate_synthesis_hop) {
_overlap_a *= _timeStretch;
} else {
overlap_s /= _timeStretch;
}
d->exact_hop_a = double(fftSize) / _overlap_a;
if (!modulate_synthesis_hop) {
d->exact_hop_s = double(fftSize) / overlap_s;
} else {
// switch after processing next block, unless it's the first one
d->next_exact_hop_s = double(fftSize) / overlap_s;
if (d->exact_hop_s == 0.0) { // on first chunk set it immediately
d->exact_hop_s = d->next_exact_hop_s;
}
}
}
void TimeAndPitch::feedAudio(const float* const* input_smp, int numSamples)
{
assert(numSamples <= _maxBlockSize);
for (int ch = 0; ch < _numChannels; ++ch) {
d->inResampleInputBuffer[ch].writeBlock(0, numSamples, input_smp[ch]);
d->inResampleInputBuffer[ch].advance(numSamples);
}
_resampleReadPos -= numSamples;
// determine integer hop size for the next hop.
// The error accumulators will make the value fluctuate by one to reach arbitrary scale
// ratios
if (d->exact_hop_s == 0.0) { // this happens if feedAudio is called without setting up stretch factors
d->exact_hop_s = d->next_exact_hop_s;
}
const int hop_s = int(d->exact_hop_s + d->hop_s_err);
const int hop_a = int(d->exact_hop_a + d->hop_a_err);
double step = 0.0;
double read_pos = _resampleReadPos;
while (read_pos < 0.0) {
auto int_pos = int(std::floor(read_pos));
float frac_pos = float(read_pos - int_pos);
for (int ch = 0; ch < _numChannels; ++ch) {
float smp[6];
d->inResampleInputBuffer[ch].readBlock(int_pos - 6, 6, smp);
float s = (frac_pos == 0) ? smp[2] : lagrange6(smp, frac_pos);
d->inCircularBuffer[ch].writeOffset0(s);
d->inCircularBuffer[ch].advance(1);
}
_analysis_hop_counter++;
step++;
if (_analysis_hop_counter >= hop_a) {
_analysis_hop_counter -= hop_a;
d->hop_s_err += d->exact_hop_s - hop_s;
d->hop_a_err += d->exact_hop_a - hop_a;
for (int ch = 0; ch < _numChannels; ++ch) {
d->inCircularBuffer[ch].readBlock(-fftSize, fftSize, d->fft_timeseries.getPtr(ch));
}
_process_hop(hop_a, hop_s);
}
read_pos = _resampleReadPos + step * _pitchFactor;
}
_resampleReadPos = read_pos;
}
void TimeAndPitch::retrieveAudio(float* const* out_smp, int numSamples)
{
assert(numSamples <= _maxBlockSize);
for (int ch = 0; ch < _numChannels; ++ch) {
d->outCircularBuffer[ch].readAndClearBlock(0, numSamples, out_smp[ch]);
if (normalize_window) {
constexpr float curve = 4.f * 4.f; // approximates 1/x for x > 1 but fades to 0 near 0 to avoid fade-in clicks
for (int i = 0; i < numSamples; ++i) {
float x = d->normalizationBuffer.read(i);
out_smp[ch][i] *= x / (x * x + 1 / curve);
}
}
d->outCircularBuffer[ch].advance(numSamples);
}
d->normalizationBuffer.clearBlock(0, numSamples);
d->normalizationBuffer.advance(numSamples);
_outBufferWriteOffset -= numSamples;
_availableOutputSamples -= numSamples;
if (modulate_synthesis_hop) {
d->exact_hop_s = d->next_exact_hop_s;
}
}
void TimeAndPitch::processPitchShift(float* const* smp, int numSamples, double pitchFactor)
{
setTimeStretchAndPitchFactor(1.0, pitchFactor);
feedAudio(smp, numSamples);
retrieveAudio(smp, numSamples);
}
int TimeAndPitch::getLatencySamples() const
{
return fftSize - fftSize / overlap + 3; // 3 for resampling
}
int TimeAndPitch::getLatencySamplesForStretchRatio(float timeStretch) const
{
const float coeff = (timeStretch < 1.f) ? (1.f / 3.f) : (2.f / 3.f);
return int(getLatencySamples() * (timeStretch * coeff + (1 - coeff)));
}
} // namespace staffpad
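
Driving-loop sketch (illustrative; TimeAndPitch.h, which declares the class, is part
of this commit but not shown here, so the constructor arguments are assumptions):

  staffpad::TimeAndPitch tp(4096, false, {}); // fftSize; no imaging reduction or timbre callback
  tp.setup(2, 512);                           // stereo, blocks of up to 512 samples
  tp.setTimeStretchAndPitchFactor(1.25, 1.0); // 25% longer, pitch unchanged
  float inL[512] = {}, inR[512] = {}, outL[512], outR[512];
  const float* in[2] = { inL, inR };
  float* out[2] = { outL, outR };
  while (tp.getNumAvailableOutputSamples() < 512) {
    int n = std::min(512, tp.getSamplesToNextHop());
    // ... fill inL/inR with the next n input samples ...
    tp.feedAudio(in, n);
  }
  tp.retrieveAudio(out, 512); // 512 time-stretched samples per channel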

libs/staffpad/VectorOps.h

@@ -0,0 +1,155 @@
//
// This header is provided for convenience, to easily wrap vector operations around
// their platform-specific optimised libraries (e.g. IPP, vDSP), if desired.
// This can be done by adding #if defined(...) compile-time branches to each function
// and calling the corresponding library function, e.g. ippsAdd_32f or vDSP_vadd
// inside add().
//
#pragma once
#include <stdlib.h>
#include <algorithm>
#include <cmath>
#include <complex>
#include <cstdint>
#include <cstring>
#include <limits>
#if defined(_MSC_VER) && (defined(_M_AMD64) || defined(_M_X64) || \
(defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#define USE_SSE2_COMPLEX 1
#endif
#if USE_SSE2_COMPLEX
# include "SimdComplexConversions_sse2.h"
#endif
namespace staffpad {
namespace vo {
inline void* allocate(int32_t bytes)
{
return ::malloc(bytes);
}
inline void free(void* ptr)
{
return ::free(ptr);
}
template<class T>
inline void copy(const T* src, T* dst, int32_t n)
{
memcpy(dst, src, n * sizeof(T));
}
template<class T>
inline void add(const T* src1, const T* src2, T* dst, int32_t n)
{
for (int32_t i = 0; i < n; i++) {
dst[i] = src1[i] + src2[i];
}
}
template<class T>
inline void subtract(const T* src1, const T* src2, T* dst, int32_t n)
{
for (int32_t i = 0; i < n; i++) {
dst[i] = src2[i] - src1[i];
}
}
template<class T>
inline void constantMultiply(const T* src, T constant, T* dst, int32_t n)
{
for (int32_t i = 0; i < n; i++) {
dst[i] = src[i] * constant;
}
}
template<class T>
inline void constantMultiplyAndAdd(const T* src, T constant, T* dst, int32_t n)
{
for (int32_t i = 0; i < n; i++) {
dst[i] += src[i] * constant;
}
}
template<class T>
inline void multiply(const T* src1, const T* src2, T* dst, int32_t n)
{
for (int32_t i = 0; i < n; i++) {
dst[i] = src1[i] * src2[i];
}
}
template<class T>
inline void setToZero(T* dst, int32_t n)
{
std::fill(dst, dst + n, 0.f);
}
template<class T>
inline void findMaxElement(const T* src, int32_t n, int32_t& maxIndex, T& maxValue)
{
maxIndex = 0;
maxValue = n > 0 ? src[0] : std::numeric_limits<T>::min();
for (int32_t i = 1; i < n; i++) {
if (src[i] > maxValue) {
maxValue = src[i];
maxIndex = i;
}
}
}
#if USE_SSE2_COMPLEX
inline void calcPhases(const std::complex<float>* src, float* dst, int32_t n)
{
simd_complex_conversions::perform_parallel_simd_aligned(
src, dst, n,
[](const __m128 rp, const __m128 ip, __m128& out)
{ out = simd_complex_conversions::atan2_ps(ip, rp); });
}
inline void calcNorms(const std::complex<float>* src, float* dst, int32_t n)
{
simd_complex_conversions::perform_parallel_simd_aligned(
src, dst, n,
[](const __m128 rp, const __m128 ip, __m128& out)
{ out = simd_complex_conversions::norm(rp, ip); });
}
inline void rotate(
const float* oldPhase, const float* newPhase, std::complex<float>* dst,
int32_t n)
{
simd_complex_conversions::rotate_parallel_simd_aligned(
oldPhase, newPhase, dst, n);
}
#else
inline void calcPhases(const std::complex<float>* src, float* dst, int32_t n)
{
for (int32_t i = 0; i < n; i++) {
dst[i] = std::arg(src[i]);
}
}
inline void calcNorms(const std::complex<float>* src, float* dst, int32_t n)
{
for (int32_t i = 0; i < n; i++) {
dst[i] = std::norm(src[i]);
}
}
inline void rotate(const float* oldPhase, const float* newPhase, std::complex<float>* dst, int32_t n)
{
for (int32_t i = 0; i < n; i++) {
const auto theta = oldPhase ? newPhase[i] - oldPhase[i] : newPhase[i];
dst[i] *= std::complex<float>(cosf(theta), sinf(theta));
}
}
#endif
} // namespace vo
} // namespace staffpad
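
Note (illustrative): calcNorms computes the squared magnitude (std::norm), not the
magnitude; calcPhases computes the argument. Both the SSE2 and the scalar paths
agree, e.g. for a single bin:

  std::complex<float> bin{ 3.f, 4.f };
  float n, p;
  staffpad::vo::calcNorms(&bin, &n, 1);  // n == 25.f, i.e. |bin|^2
  staffpad::vo::calcPhases(&bin, &p, 1); // p == std::arg(bin) ~= 0.9273f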


@@ -0,0 +1,379 @@
PFFFT: a pretty fast FFT.
TL;DR
--
PFFFT does 1D Fast Fourier Transforms, of single precision real and
complex vectors. It tries to do it fast, it tries to be correct, and it
tries to be small. Computations do take advantage of SSE1 instructions
on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
license is BSD-like.
Why does it exist:
--
I was in search of a good performing FFT library, preferably very
small and with a very liberal license.
When one says "fft library", FFTW ("Fastest Fourier Transform in the
West") is probably the first name that comes to mind -- I guess that
99% of open-source projects that need a FFT do use FFTW, and are happy
with it. However, it is quite a large library, which does everything
fft related (2d transforms, 3d transforms, other transformations such
as discrete cosine, or fast Hartley). And it is licensed under the
GNU GPL, which means that it cannot be used in non open-source
products.
An alternative to FFTW that is really small, is the venerable FFTPACK
v4, which is available on NETLIB. A more recent version (v5) exists,
but it is larger as it deals with multi-dimensional transforms. This
is a library that is written in FORTRAN 77, a language that is now
considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
by Dr Paul Swarztrauber of NCAR, more than 25 years ago! And despite
its age, benchmarks show that it is still a very good performing FFT
library, see for example the 1d single precision benchmarks here:
http://www.fftw.org/speed/opteron-2.2GHz-32bit/ . It is however not
competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
Apple vDSP. The reason for that is that those libraries do take
advantage of the SSE SIMD instructions available on Intel CPUs,
available since the days of the Pentium III. These instructions deal
with small vectors of 4 floats at a time, instead of a single float
for a traditional FPU, so when using these instructions one may expect
a 4-fold performance improvement.
The idea was to take this fortran fftpack v4 code, translate to C,
modify it to deal with those SSE instructions, and check that the
final performance is not completely ridiculous when compared to other
SIMD FFT libraries. Translation to C was performed with f2c (
http://www.netlib.org/f2c/ ). The resulting file was a bit edited in
order to remove the thousands of gotos that were introduced by
f2c. You will find the fftpack.h and fftpack.c sources in the
repository; this is a complete translation of
http://www.netlib.org/fftpack/ , with the discrete cosine transform
and the test program. There is no license information in the netlib
repository, but it was confirmed to me by the fftpack v5 curators that
the same terms do apply to fftpack v4:
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html . This is a
"BSD-like" license, it is compatible with proprietary projects.
Adapting fftpack to deal with the SIMD 4-element vectors instead of
scalar single precision numbers was more complex than I originally
thought, especially with the real transforms, and I ended up writing
more code than I planned.
The code:
--
Only two files, in good old C, pffft.c and pffft.h. The API is very
very simple, just make sure that you read the comments in pffft.h.
Comparison with other FFTs:
--
The idea was not to break speed records, but to get a decently fast
fft that is at least 50% as fast as the fastest FFT -- especially on
the slowest computers. I'm more focused on getting the best performance
on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than
on getting top performance on today's fastest cpus.
It can be used in a real-time context as the fft functions do not
perform any memory allocation -- that is why they accept a 'work'
array in their arguments.
It is also a bit focused on performing 1D convolutions, that is why it
provides "unordered" FFTs, and a Fourier domain convolution
operation.
Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9)
--
The benchmark shows the performance of various fft implementations measured in
MFlops, with the number of floating point operations being defined as 5*N*log2(N)
for a length-N complex fft, and 2.5*N*log2(N) for a real fft.
See http://www.fftw.org/speed/method.html for an explanation of these formulas.
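For example, at N = 1024 a complex FFT counts as 5*1024*log2(1024) = 51200 floating
point operations, so an implementation measured at 15000 MFlops spends roughly 3.4
microseconds per transform.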
MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600
Built with:
gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f
| input len |real FFTPack| real vDSP | real FFTW | real PFFFT | |cplx FFTPack| cplx vDSP | cplx FFTW | cplx PFFFT |
|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
| 64 | 2816 | 8596 | 7329 | 8187 | | 2887 | 14898 | 14668 | 11108 |
| 96 | 3298 | n/a | 8378 | 7727 | | 3953 | n/a | 15680 | 10878 |
| 128 | 3507 | 11575 | 9266 | 10108 | | 4233 | 17598 | 16427 | 12000 |
| 160 | 3391 | n/a | 9838 | 10711 | | 4220 | n/a | 16653 | 11187 |
| 192 | 3919 | n/a | 9868 | 10956 | | 4297 | n/a | 15770 | 12540 |
| 256 | 4283 | 13179 | 10694 | 13128 | | 4545 | 19550 | 16350 | 13822 |
| 384 | 3136 | n/a | 10810 | 12061 | | 3600 | n/a | 16103 | 13240 |
| 480 | 3477 | n/a | 10632 | 12074 | | 3536 | n/a | 11630 | 12522 |
| 512 | 3783 | 15141 | 11267 | 13838 | | 3649 | 20002 | 16560 | 13580 |
| 640 | 3639 | n/a | 11164 | 13946 | | 3695 | n/a | 15416 | 13890 |
| 768 | 3800 | n/a | 11245 | 13495 | | 3590 | n/a | 15802 | 14552 |
| 800 | 3440 | n/a | 10499 | 13301 | | 3659 | n/a | 12056 | 13268 |
| 1024 | 3924 | 15605 | 11450 | 15339 | | 3769 | 20963 | 13941 | 15467 |
| 2048 | 4518 | 16195 | 11551 | 15532 | | 4258 | 20413 | 13723 | 15042 |
| 2400 | 4294 | n/a | 10685 | 13078 | | 4093 | n/a | 12777 | 13119 |
| 4096 | 4750 | 16596 | 11672 | 15817 | | 4157 | 19662 | 14316 | 14336 |
| 8192 | 3820 | 16227 | 11084 | 12555 | | 3691 | 18132 | 12102 | 13813 |
| 9216 | 3864 | n/a | 10254 | 12870 | | 3586 | n/a | 12119 | 13994 |
| 16384 | 3822 | 15123 | 10454 | 12822 | | 3613 | 16874 | 12370 | 13881 |
| 32768 | 4175 | 14512 | 10662 | 11095 | | 3881 | 14702 | 11619 | 11524 |
| 262144 | 3317 | 11429 | 6269 | 9517 | | 2810 | 11729 | 7757 | 10179 |
| 1048576 | 2913 | 10551 | 4730 | 5867 | | 2661 | 7881 | 3520 | 5350 |
|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600
Built with:
gcc -o test_pffft -DHAVE_FFTW -msse2 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L$HOME/local/lib -I$HOME/local/include/ -lfftw3f -lm
| N (input length) | real FFTPack | real FFTW | real PFFFT | | cplx FFTPack | cplx FFTW | cplx PFFFT |
|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
| 64 | 3840 | 7680 | 8777 | | 4389 | 20480 | 11171 |
| 96 | 4214 | 9633 | 8429 | | 4816 | 22477 | 11238 |
| 128 | 3584 | 10240 | 10240 | | 5120 | 23893 | 11947 |
| 192 | 4854 | 11095 | 12945 | | 4854 | 22191 | 14121 |
| 256 | 4096 | 11703 | 16384 | | 5120 | 23406 | 13653 |
| 384 | 4395 | 14651 | 12558 | | 4884 | 19535 | 14651 |
| 512 | 5760 | 13166 | 15360 | | 4608 | 23040 | 15360 |
| 768 | 4907 | 14020 | 16357 | | 4461 | 19628 | 14020 |
| 1024 | 5120 | 14629 | 14629 | | 5120 | 20480 | 15754 |
| 2048 | 5632 | 14080 | 18773 | | 4693 | 12516 | 16091 |
| 4096 | 5120 | 13653 | 17554 | | 4726 | 7680 | 14456 |
| 8192 | 4160 | 7396 | 13312 | | 4437 | 14791 | 13312 |
| 9216 | 4210 | 6124 | 13473 | | 4491 | 7282 | 14970 |
| 16384 | 3976 | 11010 | 14313 | | 4210 | 11450 | 13631 |
| 32768 | 4260 | 10224 | 10954 | | 4260 | 6816 | 11797 |
| 262144 | 3736 | 6896 | 9961 | | 2359 | 8965 | 9437 |
| 1048576 | 2796 | 4534 | 6453 | | 1864 | 3078 | 5592 |
|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
MacOS Snow Leopard, gcc 4.0, 32-bit, fftw 3.3 on a 1.83 GHz core 1 duo
Built with:
gcc -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework veclib
| input len |real FFTPack| real vDSP | real FFTW | real PFFFT | |cplx FFTPack| cplx vDSP | cplx FFTW | cplx PFFFT |
|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
| 64 | 745 | 2145 | 1706 | 2028 | | 961 | 3356 | 3313 | 2300 |
| 96 | 877 | n/a | 1976 | 1978 | | 1059 | n/a | 3333 | 2233 |
| 128 | 951 | 2808 | 2213 | 2279 | | 1202 | 3803 | 3739 | 2494 |
| 192 | 1002 | n/a | 2456 | 2429 | | 1186 | n/a | 3701 | 2508 |
| 256 | 1065 | 3205 | 2641 | 2793 | | 1302 | 4013 | 3912 | 2663 |
| 384 | 845 | n/a | 2759 | 2499 | | 948 | n/a | 3729 | 2504 |
| 512 | 900 | 3476 | 2956 | 2759 | | 974 | 4057 | 3954 | 2645 |
| 768 | 910 | n/a | 2912 | 2737 | | 975 | n/a | 3837 | 2614 |
| 1024 | 936 | 3583 | 3107 | 3009 | | 1006 | 4124 | 3821 | 2697 |
| 2048 | 1057 | 3585 | 3091 | 2837 | | 1089 | 3889 | 3701 | 2513 |
| 4096 | 1083 | 3524 | 3092 | 2733 | | 1039 | 3617 | 3462 | 2364 |
| 8192 | 874 | 3252 | 2967 | 2363 | | 911 | 3106 | 2789 | 2302 |
| 9216 | 898 | n/a | 2420 | 2290 | | 865 | n/a | 2676 | 2204 |
| 16384 | 903 | 2892 | 2506 | 2421 | | 899 | 3026 | 2797 | 2289 |
| 32768 | 965 | 2837 | 2550 | 2358 | | 920 | 2922 | 2763 | 2240 |
| 262144 | 738 | 2422 | 1589 | 1708 | | 610 | 2038 | 1436 | 1091 |
| 1048576 | 528 | 1207 | 845 | 880 | | 606 | 1020 | 669 | 1036 |
|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.2 on a 2.66 GHz core 2 quad
Built with:
gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT |
|-----------+------------+------------+------------| |------------+------------+------------|
| 64 | 1920 | 3614 | 5120 | | 2194 | 7680 | 6467 |
| 96 | 1873 | 3549 | 5187 | | 2107 | 8429 | 5863 |
| 128 | 2240 | 3773 | 5514 | | 2560 | 7964 | 6827 |
| 192 | 1765 | 4569 | 7767 | | 2284 | 9137 | 7061 |
| 256 | 2048 | 5461 | 7447 | | 2731 | 9638 | 7802 |
| 384 | 1998 | 5861 | 6762 | | 2313 | 9253 | 7644 |
| 512 | 2095 | 6144 | 7680 | | 2194 | 10240 | 7089 |
| 768 | 2230 | 5773 | 7549 | | 2045 | 10331 | 7010 |
| 1024 | 2133 | 6400 | 8533 | | 2133 | 10779 | 7877 |
| 2048 | 2011 | 7040 | 8665 | | 1942 | 10240 | 7768 |
| 4096 | 2194 | 6827 | 8777 | | 1755 | 9452 | 6827 |
| 8192 | 1849 | 6656 | 6656 | | 1752 | 7831 | 6827 |
| 9216 | 1871 | 5858 | 6416 | | 1643 | 6909 | 6266 |
| 16384 | 1883 | 6223 | 6506 | | 1664 | 7340 | 6982 |
| 32768 | 1826 | 6390 | 6667 | | 1631 | 7481 | 6971 |
| 262144 | 1546 | 4075 | 5977 | | 1299 | 3415 | 3551 |
| 1048576 | 1104 | 2071 | 1730 | | 1104 | 1149 | 1834 |
|-----------+------------+------------+------------| |------------+------------+------------|
Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.3 on a 1.6 GHz Atom N270
Built with:
gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
| N (input length) | real FFTPack | real FFTW | real PFFFT | | cplx FFTPack | cplx FFTW | cplx PFFFT |
|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
| 64 | 452 | 1041 | 1336 | | 549 | 2318 | 1781 |
| 96 | 444 | 1297 | 1297 | | 503 | 2408 | 1686 |
| 128 | 527 | 1525 | 1707 | | 543 | 2655 | 1886 |
| 192 | 498 | 1653 | 1849 | | 539 | 2678 | 1942 |
| 256 | 585 | 1862 | 2156 | | 594 | 2777 | 2244 |
| 384 | 499 | 1870 | 1998 | | 511 | 2586 | 1890 |
| 512 | 562 | 2095 | 2194 | | 542 | 2973 | 2194 |
| 768 | 545 | 2045 | 2133 | | 545 | 2365 | 2133 |
| 1024 | 595 | 2133 | 2438 | | 569 | 2695 | 2179 |
| 2048 | 587 | 2125 | 2347 | | 521 | 2230 | 1707 |
| 4096 | 495 | 1890 | 1834 | | 492 | 1876 | 1672 |
| 8192 | 469 | 1548 | 1729 | | 438 | 1740 | 1664 |
| 9216 | 468 | 1663 | 1663 | | 446 | 1585 | 1531 |
| 16384 | 453 | 1608 | 1767 | | 398 | 1476 | 1664 |
| 32768 | 456 | 1420 | 1503 | | 387 | 1388 | 1345 |
| 262144 | 309 | 385 | 726 | | 262 | 415 | 840 |
| 1048576 | 280 | 351 | 739 | | 261 | 313 | 797 |
|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
Windows 7, visual c++ 2010 on a 1.6 GHz Atom N270
Built with:
cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
(visual c++ is definitely not very good with SSE intrinsics...)
| N (input length) | real FFTPack | real PFFFT | | cplx FFTPack | cplx PFFFT |
|------------------+--------------+--------------| |--------------+--------------|
| 64 | 173 | 1009 | | 174 | 1159 |
| 96 | 169 | 1029 | | 188 | 1201 |
| 128 | 195 | 1242 | | 191 | 1275 |
| 192 | 178 | 1312 | | 184 | 1276 |
| 256 | 196 | 1591 | | 186 | 1281 |
| 384 | 172 | 1409 | | 181 | 1281 |
| 512 | 187 | 1640 | | 181 | 1313 |
| 768 | 171 | 1614 | | 176 | 1258 |
| 1024 | 186 | 1812 | | 178 | 1223 |
| 2048 | 190 | 1707 | | 186 | 1099 |
| 4096 | 182 | 1446 | | 177 | 975 |
| 8192 | 175 | 1345 | | 169 | 1034 |
| 9216 | 165 | 1271 | | 168 | 1023 |
| 16384 | 166 | 1396 | | 165 | 949 |
| 32768 | 172 | 1311 | | 161 | 881 |
| 262144 | 136 | 632 | | 134 | 629 |
| 1048576 | 134 | 698 | | 127 | 623 |
|------------------+--------------+--------------| |--------------+--------------|
Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3)
Built with:
gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT |
|-----------+------------+------------+------------| |------------+------------+------------|
| 64 | 549 | 452 | 731 | | 512 | 602 | 640 |
| 96 | 421 | 272 | 702 | | 496 | 571 | 602 |
| 128 | 498 | 512 | 815 | | 597 | 618 | 652 |
| 160 | 521 | 536 | 815 | | 586 | 669 | 625 |
| 192 | 539 | 571 | 883 | | 485 | 597 | 626 |
| 256 | 640 | 539 | 975 | | 569 | 611 | 671 |
| 384 | 499 | 610 | 879 | | 499 | 602 | 637 |
| 480 | 518 | 507 | 877 | | 496 | 661 | 616 |
| 512 | 524 | 591 | 1002 | | 549 | 678 | 668 |
| 640 | 542 | 612 | 955 | | 568 | 663 | 645 |
| 768 | 557 | 613 | 981 | | 491 | 663 | 598 |
| 800 | 514 | 353 | 882 | | 514 | 360 | 574 |
| 1024 | 640 | 640 | 1067 | | 492 | 683 | 602 |
| 2048 | 587 | 640 | 908 | | 486 | 640 | 552 |
| 2400 | 479 | 368 | 777 | | 422 | 376 | 518 |
| 4096 | 511 | 614 | 853 | | 426 | 640 | 534 |
| 8192 | 415 | 584 | 708 | | 386 | 622 | 516 |
| 9216 | 419 | 571 | 687 | | 364 | 586 | 506 |
| 16384 | 426 | 577 | 716 | | 398 | 606 | 530 |
| 32768 | 417 | 572 | 673 | | 399 | 572 | 468 |
| 262144 | 219 | 380 | 293 | | 255 | 431 | 343 |
| 1048576 | 202 | 274 | 237 | | 265 | 282 | 355 |
|-----------+------------+------------+------------| |------------+------------+------------|
Same platform as above, but this time pffft and fftpack are built with clang 3.2:
clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT |
|-----------+------------+------------+------------| |------------+------------+------------|
| 64 | 427 | 452 | 853 | | 427 | 602 | 1024 |
| 96 | 351 | 276 | 843 | | 337 | 571 | 963 |
| 128 | 373 | 512 | 996 | | 390 | 618 | 1054 |
| 160 | 426 | 536 | 987 | | 375 | 669 | 914 |
| 192 | 404 | 571 | 1079 | | 388 | 588 | 1079 |
| 256 | 465 | 539 | 1205 | | 445 | 602 | 1170 |
| 384 | 366 | 610 | 1099 | | 343 | 594 | 1099 |
| 480 | 356 | 507 | 1140 | | 335 | 651 | 931 |
| 512 | 411 | 591 | 1213 | | 384 | 649 | 1124 |
| 640 | 398 | 612 | 1193 | | 373 | 654 | 901 |
| 768 | 409 | 613 | 1227 | | 383 | 663 | 1044 |
| 800 | 411 | 348 | 1073 | | 353 | 358 | 809 |
| 1024 | 427 | 640 | 1280 | | 413 | 692 | 1004 |
| 2048 | 414 | 626 | 1126 | | 371 | 640 | 853 |
| 2400 | 399 | 373 | 898 | | 319 | 368 | 653 |
| 4096 | 404 | 602 | 1059 | | 357 | 633 | 778 |
| 8192 | 332 | 584 | 792 | | 308 | 616 | 716 |
| 9216 | 322 | 561 | 783 | | 299 | 586 | 687 |
| 16384 | 344 | 568 | 778 | | 314 | 617 | 745 |
| 32768 | 342 | 564 | 737 | | 314 | 552 | 629 |
| 262144 | 201 | 383 | 313 | | 227 | 435 | 413 |
| 1048576 | 187 | 262 | 251 | | 228 | 281 | 409 |
|-----------+------------+------------+------------| |------------+------------+------------|
So it looks like, on ARM, gcc 4.7 is the best at scalar floating point
(the fftpack performance numbers are better with gcc), while clang is
the best with NEON intrinsics (see how the pffft performance has
improved with clang 3.2).
NVIDIA Jetson TK1 board, gcc-4.8.2. The cpu is a 2.3GHz cortex A15 (Tegra K1).
Built with:
gcc -O3 -march=armv7-a -mtune=native -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm
| input len |real FFTPack| real PFFFT | |cplx FFTPack| cplx PFFFT |
|-----------+------------+------------| |------------+------------|
| 64 | 1735 | 3308 | | 1994 | 3744 |
| 96 | 1596 | 3448 | | 1987 | 3572 |
| 128 | 1807 | 4076 | | 2255 | 3960 |
| 160 | 1769 | 4083 | | 2071 | 3845 |
| 192 | 1990 | 4233 | | 2017 | 3939 |
| 256 | 2191 | 4882 | | 2254 | 4346 |
| 384 | 1878 | 4492 | | 2073 | 4012 |
| 480 | 1748 | 4398 | | 1923 | 3951 |
| 512 | 2030 | 5064 | | 2267 | 4195 |
| 640 | 1918 | 4756 | | 2094 | 4184 |
| 768 | 2099 | 4907 | | 2048 | 4297 |
| 800 | 1822 | 4555 | | 1880 | 4063 |
| 1024 | 2232 | 5355 | | 2187 | 4420 |
| 2048 | 2176 | 4983 | | 2027 | 3602 |
| 2400 | 1741 | 4256 | | 1710 | 3344 |
| 4096 | 1816 | 3914 | | 1851 | 3349 |
| 8192 | 1716 | 3481 | | 1700 | 3255 |
| 9216 | 1735 | 3589 | | 1653 | 3094 |
| 16384 | 1567 | 3483 | | 1637 | 3244 |
| 32768 | 1624 | 3240 | | 1655 | 3156 |
| 262144 | 1012 | 1898 | | 983 | 1503 |
| 1048576 | 876 | 1154 | | 868 | 1341 |
|-----------+------------+------------| |------------+------------|
The performance on the Tegra K1 is pretty impressive. I'm not
including the FFTW numbers as they are slightly below the scalar
fftpack numbers, so something must be wrong (even though it seems to
be correctly configured and is using NEON SIMD instructions).
When using clang 3.4 the pffft version is even a bit faster, reaching
5.7 GFlops for real ffts of size 1024.
1866
libs/staffpad/pffft/pffft.c Normal file
File diff suppressed because it is too large

215
libs/staffpad/pffft/pffft.h Normal file
@ -0,0 +1,215 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
authored by Dr Paul Swarztrauber of NCAR, in 1985.
As confirmed by the NCAR fftpack software curators, the following
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
released under the same terms.
FFTPACK license:
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
Copyright (c) 2004 the University Corporation for Atmospheric
Research ("UCAR"). All rights reserved. Developed by NCAR's
Computational and Information Systems Laboratory, UCAR,
www.cisl.ucar.edu.
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
/*
PFFFT : a Pretty Fast FFT.
This is basically an adaptation of the single precision fftpack
(v4) as found on netlib taking advantage of SIMD instruction found
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
For architectures where no SIMD instruction is available, the code
falls back to a scalar version.
Restrictions:
- 1D transforms only, with 32-bit single precision.
- supports only transforms for inputs of length N of the form
N=(2^a)*(3^b)*(5^c), a >= 5, b >= 0, c >= 0 (32, 48, 64, 96, 128,
144, 160, etc are all acceptable lengths). Performance is best for
128<=N<=8192.
- all (float*) pointers in the functions below are expected to
have a "simd-compatible" alignment, that is, 16 bytes on x86 and
powerpc CPUs.
You can allocate such buffers with the functions
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
posix_memalign..)
*/
#pragma once
#include <stddef.h> // for size_t
#ifndef PFFFT_FLOAT
#if 1
/* default: float */
#define PFFFT_FLOAT float
#else
#define PFFFT_FLOAT double
#ifndef PFFFT_SIMD_DISABLE
/* double only with PFFFT_SIMD_DISABLE */
#define PFFFT_SIMD_DISABLE 1
#endif
#endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* opaque struct holding internal stuff (precomputed twiddle factors)
this struct can be shared by many threads as it contains only
read-only data.
*/
typedef struct PFFFT_Setup PFFFT_Setup;
/* direction of the transform */
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
/* type of transform */
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
/*
prepare for performing transforms of size N -- the returned
PFFFT_Setup structure is read-only so it can safely be shared by
multiple concurrent threads.
*/
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
void pffft_destroy_setup(PFFFT_Setup *);
/*
Perform a Fourier transform. The z-domain data is stored in the
most efficient order for transforming it back, or using it for
convolution. If you need to have its content sorted in the
"usual" way, that is as an array of interleaved complex numbers,
either use pffft_transform_ordered, or call pffft_zreorder after
the forward fft, and before the backward fft.
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
Typically you will want to scale the backward transform by 1/N.
The 'work' pointer should point to an area of N (2*N for complex
fft) floats, properly aligned. If 'work' is NULL, then stack will
be used instead (this is probably the best strategy for small
FFTs, say for N < 16384). Threads usually have a small stack, which
may not provide a sufficient amount of memory, usually leading to a crash!
Use the heap with pffft_aligned_malloc() in this case.
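For example (illustrative), work = pffft_aligned_malloc(2*N*sizeof(PFFFT_FLOAT))
is large enough for either transform type.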
input and output may alias.
*/
void pffft_transform(PFFFT_Setup *setup, const PFFFT_FLOAT *input, PFFFT_FLOAT *output, PFFFT_FLOAT *work, pffft_direction_t direction);
/*
Similar to pffft_transform, but makes sure that the output is
ordered as expected (interleaved complex numbers). This is
similar to calling pffft_transform and then pffft_zreorder.
input and output may alias.
*/
void pffft_transform_ordered(PFFFT_Setup *setup, const PFFFT_FLOAT *input, PFFFT_FLOAT *output, PFFFT_FLOAT *work, pffft_direction_t direction);
/*
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
PFFFT_FORWARD) if you want to have the frequency components in
the correct "canonical" order, as interleaved complex numbers.
(for real transforms, both 0-frequency and half frequency
components, which are real, are assembled in the first entry as
F(0)+i*F(n/2+1). Note that the original fftpack did place
F(n/2+1) at the end of the arrays).
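For instance (illustrative), for a real transform of size N=8 the
ordered output is [F(0), F(4), Re F(1), Im F(1), Re F(2), Im F(2),
Re F(3), Im F(3)]: the two purely real terms share the first
complex slot.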
input and output should not alias.
*/
void pffft_zreorder(PFFFT_Setup *setup, const PFFFT_FLOAT *input, PFFFT_FLOAT *output, pffft_direction_t direction);
/*
Perform a multiplication of the frequency components of dft_a and
dft_b and accumulate them into dft_ab. The arrays should have
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
*not* have been reordered with pffft_zreorder (otherwise just
perform the operation yourself as the dft coefs are stored as
interleaved complex numbers).
the operation performed is: dft_ab += (dft_a * dft_b)*scaling
The dft_a, dft_b and dft_ab pointers may alias.
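(A full FFT-convolution sketch is given in the README above: forward-
transform both signals without reordering, clear dft_ab, accumulate
with the desired scaling, then transform backward.)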
*/
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const PFFFT_FLOAT *dft_a, const PFFFT_FLOAT *dft_b, PFFFT_FLOAT *dft_ab, PFFFT_FLOAT scaling);
/*
Perform a multiplication of the frequency components of dft_a and
dft_b and put result in dft_ab. The arrays should have
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
*not* have been reordered with pffft_zreorder (otherwise just
perform the operation yourself as the dft coefs are stored as
interleaved complex numbers).
the operation performed is: dft_ab = (dft_a * dft_b)*scaling
The dft_a, dft_b and dft_ab pointers may alias.
*/
void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
/* simple helper to get minimum possible fft size */
int pffft_min_fft_size(pffft_transform_t transform);
/* simple helper to determine next power of 2
- without inexact/rounding floating point operations
*/
int pffft_next_power_of_two(int N);
/* simple helper to determine if power of 2 - returns bool */
int pffft_is_power_of_two(int N);
/*
the float buffers must have the correct alignment (16-byte boundary
on intel and powerpc). This function may be used to obtain such
correctly aligned buffers.
*/
void *pffft_aligned_malloc(size_t nb_bytes);
void pffft_aligned_free(void *);
/* returns 4 or 1 depending on whether support for SSE/Altivec instructions was enabled when building pffft.c */
int pffft_simd_size(void);
#ifdef __cplusplus
}
#endif
@ -0,0 +1,246 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
#pragma once
#include <assert.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>  /* printf() used by Vvalidate_simd() */
#include <stdlib.h> /* malloc()/free() used by Valigned_malloc()/Valigned_free() */
/*
* SIMD reference material:
*
* general SIMD introduction:
* https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
*
* SSE 1:
* https://software.intel.com/sites/landingpage/IntrinsicsGuide/
*
* ARM NEON:
* https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
*
* Altivec:
* https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
* https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
* better one?
*
*/
/*
Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && ((defined(__ppc__) || defined(__ppc64__)) && defined(__ALTIVEC__))
#include <altivec.h>
typedef vector float v4sf;
# define SIMD_SZ 4
# define VREQUIRES_ALIGN 1 /* not sure, if really required */
# define VZERO() ((vector float) vec_splat_u8(0))
# define VMUL(a,b) vec_madd(a,b, VZERO())
# define VADD(a,b) vec_add(a,b)
# define VMADD(a,b,c) vec_madd(a,b,c)
# define VSUB(a,b) vec_sub(a,b)
inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
# define LD_PS1(p) ld_ps1(&p)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
}
# define VTRANSPOSE4(x0,x1,x2,x3) { \
v4sf y0 = vec_mergeh(x0, x2); \
v4sf y1 = vec_mergel(x0, x2); \
v4sf y2 = vec_mergeh(x1, x3); \
v4sf y3 = vec_mergel(x1, x3); \
x0 = vec_mergeh(y0, y2); \
x1 = vec_mergel(y0, y2); \
x2 = vec_mergeh(y1, y3); \
x3 = vec_mergel(y1, y3); \
}
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
/*
SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || defined(_M_IX86))
#include <xmmintrin.h>
typedef __m128 v4sf;
/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
 * anyway, so you will have some work to do if you want to enable AVX with its 256-bit vectors. */
# define SIMD_SZ 4
# define VREQUIRES_ALIGN 1
# define VZERO() _mm_setzero_ps()
# define VMUL(a,b) _mm_mul_ps(a,b)
# define VADD(a,b) _mm_add_ps(a,b)
# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
# define VSUB(a,b) _mm_sub_ps(a,b)
# define LD_PS1(p) _mm_set1_ps(p)
# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr)
# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
/* reverse/flip all floats */
# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
/* reverse/flip complex floats */
# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
/*
ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64))
# include <arm_neon.h>
typedef float32x4_t v4sf;
# define SIMD_SZ 4
# define VREQUIRES_ALIGN 0 /* usually no alignment required */
# define VZERO() vdupq_n_f32(0)
# define VMUL(a,b) vmulq_f32(a,b)
# define VADD(a,b) vaddq_f32(a,b)
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
# define VSUB(a,b) vsubq_f32(a,b)
# define LD_PS1(p) vld1q_dup_f32(&(p))
# define VLOAD_UNALIGNED(ptr) (*(ptr))
# define VLOAD_ALIGNED(ptr) (*(ptr))
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define VTRANSPOSE4(x0,x1,x2,x3) { \
float32x4x2_t t0_ = vzipq_f32(x0, x2); \
float32x4x2_t t1_ = vzipq_f32(x1, x3); \
float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
}
// marginally faster version
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#else
# if !defined(PFFFT_SIMD_DISABLE)
# warning "building with simd disabled !\n";
# define PFFFT_SIMD_DISABLE // fallback to scalar code
# endif
#endif
// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
#ifdef PFFFT_SIMD_DISABLE
typedef float v4sf;
# define SIMD_SZ 1
# define VREQUIRES_ALIGN 0
# define VZERO() 0.f
# define VMUL(a,b) ((a)*(b))
# define VADD(a,b) ((a)+(b))
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VLOAD_UNALIGNED(ptr) (*(ptr))
# define VLOAD_ALIGNED(ptr) (*(ptr))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#endif
// shortcuts for complex multiplications
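// As a reading aid (these identities follow from the macro bodies below):
// VCPLXMUL computes the in-place product (ar + i*ai) *= (br + i*bi), i.e.
//   ar' = ar*br - ai*bi,  ai' = ai*br + ar*bi;
// VCPLXMULCONJ multiplies by the conjugate (br - i*bi) instead.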
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
#ifndef SVMUL
// multiply a scalar with a vector
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif
typedef union v4sf_union {
v4sf v;
float f[SIMD_SZ];
} v4sf_union;
#if !defined(PFFFT_SIMD_DISABLE)
#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
/* detect bugs with the vector support macros */
static void Vvalidate_simd() {
float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
v4sf_union a0, a1, a2, a3, t, u;
memcpy(a0.f, f, 4*sizeof(float));
memcpy(a1.f, f+4, 4*sizeof(float));
memcpy(a2.f, f+8, 4*sizeof(float));
memcpy(a3.f, f+12, 4*sizeof(float));
t = a0; u = a1; t.v = VZERO();
printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
t.v = VADD(a1.v, a2.v);
printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
t.v = VMUL(a1.v, a2.v);
printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
t.v = VMADD(a1.v, a2.v,a0.v);
printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
INTERLEAVE2(a1.v,a2.v,t.v,u.v);
printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
t.v=LD_PS1(f[15]);
printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
assertv4(t, 15, 15, 15, 15);
t.v = VSWAPHL(a1.v, a2.v);
printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
assertv4(t, 8, 9, 6, 7);
VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
#endif //!PFFFT_SIMD_DISABLE
/* SSE and co like 16-bytes aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
static
void *Valigned_malloc(size_t nb_bytes) {
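  /* over-allocate by the alignment amount, round the pointer up to the next
     64-byte boundary, and stash the original malloc() result in the slot just
     below the returned address so that Valigned_free() can recover it */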
void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
if (!p0) return (void *) 0;
p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
*((void **) p - 1) = p0;
return p;
}
static
void Valigned_free(void *p) {
if (p) free(*((void **) p - 1));
}
@ -0,0 +1,117 @@
//
// Arbitrary time and pitch stretching, based on the Phase-locked Vocoder.
//
#pragma once
#include <complex>
#include <functional>
#include <memory>
#include <vector>
namespace staffpad {
class TimeAndPitch
{
public:
/**
* `magnitude` may be null, in which case the callback should re-compute it.
*/
using ShiftTimbreCb = std::function<void (
double factor, std::complex<float>* spectrum, const float* magnitude)>;
TimeAndPitch(
int fftSize, bool reduceImaging = true, ShiftTimbreCb shiftTimbreCb = {});
~TimeAndPitch();
/**
Setup at least once before processing.
\param numChannels Must be 1 or 2
\param maxBlockSize The caller's maximum block size, e.g. 1024 samples
*/
void setup(int numChannels, int maxBlockSize);
/**
Set independent time stretch and pitch factors (synchronously to processing thread).
The factor change resolution follows the processing block size, with some approximation.
As a general rule, smaller block sizes result in finer changes, although there's a lower
bound due to internal windowed segmentation and OLA resynthesis.
*/
void setTimeStretchAndPitchFactor(double timeStretch, double pitchFactor);
/**
Return the approximate number of samples until the next hop gets analysed
(might be 1 or 2 too high). For extremely small stretch ratios, several
analyses might be needed before output samples are produced, so you may
have to repeat the process until output becomes available.
*/
int getSamplesToNextHop() const;
/**
Returns the number of already processed output samples (after time-stretching) that
are ready for consumption.
*/
int getNumAvailableOutputSamples() const;
/**
Feed more input samples. \see `getSamplesToNextHop()` to know how much to feed to
produce more output samples.
*/
void feedAudio(const float* const* in_smp, int numSamples);
/**
Retrieve shifted/stretched samples into the output buffers. \see `getNumAvailableOutputSamples()`
to determine a valid amount for `numSamples`.
*/
void retrieveAudio(float* const* out_smp, int numSamples);
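  /*
   Typical driving loop -- a rough illustrative sketch only; `in`, `out`,
   `blockSize` and `haveInput()` are assumptions, not part of this API:

     TimeAndPitch tp (4096);
     tp.setup (2, blockSize);
     tp.setTimeStretchAndPitchFactor (1.25, 1.0);
     while (haveInput ()) {
       int n = std::min (tp.getSamplesToNextHop (), blockSize);
       tp.feedAudio (in, n);
       tp.retrieveAudio (out, tp.getNumAvailableOutputSamples ());
     }
  */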
/**
Pitch-shift only in-place processing.
*/
void processPitchShift(float* const* smp, int numSamples, double pitchFactor);
/**
Latency in input samples
*/
int getLatencySamples() const;
/**
Latency in output samples, calculated on the given timeStretch factor
*/
int getLatencySamplesForStretchRatio(float timeStretch) const;
/**
Resets the internal state, discards any buffered input
*/
void reset();
private:
const int fftSize;
static constexpr int overlap = 4;
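  // (With the usual hop = fftSize / overlap convention this would give a
  //  75% window overlap, e.g. a 1024-sample analysis hop for fftSize 4096;
  //  this is an assumption about the implementation in the .cpp.)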
static constexpr bool normalize_window = true; // compensate for ola window overlaps
static constexpr bool modulate_synthesis_hop = true;
void _process_hop(int hop_a, int hop_s);
template<int num_channels>
void _time_stretch(float hop_a, float hop_s);
void _applyImagingReduction();
struct impl;
std::shared_ptr<impl> d;
const bool _reduceImaging;
const ShiftTimbreCb _shiftTimbreCb;
int _numChannels = 1;
int _maxBlockSize = 1024;
double _resampleReadPos = 0.0;
int _availableOutputSamples = 0;
int _numBins = fftSize / 2 + 1;
double _overlap_a = overlap;
int _analysis_hop_counter = 0;
double _expectedPhaseChangePerBinPerSample = 0.01;
double _timeStretch = 1.0;
double _pitchFactor = 1.0;
int _outBufferWriteOffset = 0;
};
} // namespace staffpad
37
libs/staffpad/wscript Normal file
@ -0,0 +1,37 @@
#!/usr/bin/env python
# Version of this package (even if built as a child)
MAJOR = '0'
MINOR = '0'
MICRO = '0'
STAFFPAD_VERSION = "%s.%s.%s" % (MAJOR, MINOR, MICRO)
# Library version (UNIX style major, minor, micro)
# major increment <=> incompatible changes
# minor increment <=> compatible changes (additions)
# micro increment <=> no interface changes
STAFFPAD_LIB_VERSION = '0.0.0'
I18N_PACKAGE = 'staffpad'
staffpad_sources = [
'TimeAndPitch.cpp',
'FourierTransform_pffft.cpp',
'pffft/pffft.c'
]
def options(opt):
pass
def configure(conf):
pass
def build(bld):
obj = bld.stlib(features = 'cxx cxxstlib', source = staffpad_sources)
obj.cxxflags = [ bld.env['compiler_flags_dict']['pic'], '-O3', '-ffast-math' ]
obj.export_includes = ['.']
obj.includes = ['.', 'pffft']
obj.name = 'staffpad'
obj.target = 'staffpad'
obj.vnum = STAFFPAD_LIB_VERSION
obj.defines = [ 'PACKAGE="' + I18N_PACKAGE + '"' ]
@ -332,6 +332,7 @@ children = [
'libs/clearlooks-newer',
'libs/zita-resampler',
'libs/zita-convolver',
'libs/staffpad',
# optionally external libraries
'libs/fluidsynth',
'libs/hidapi',