diff --git a/libs/staffpad/CircularSampleBuffer.h b/libs/staffpad/CircularSampleBuffer.h new file mode 100644 index 0000000000..4b67c7319d --- /dev/null +++ b/libs/staffpad/CircularSampleBuffer.h @@ -0,0 +1,159 @@ +#pragma once + +#include + +#include "VectorOps.h" + +/* + * Utility buffer class for delay-based effects + * Allocates a 2^n size buffer for branchless write and read using a bitmask + * It can read and write to past and future samples, but does not check for overlaps + */ + +namespace staffpad::audio { +template +class CircularSampleBuffer +{ +public: + CircularSampleBuffer() + { + } + + ~CircularSampleBuffer() + { + if (_buffer) { + free(_buffer); + } + } + + void setSize(int n) + { + if (n > _allocatedSize) { + auto oldSize = _allocatedSize; + + auto findLargerPowerOfTwo = [](int32_t number) { + int32_t powerOf2 = 1; + while (powerOf2 < number) { + powerOf2 *= 2; + } + return powerOf2; + }; + + _allocatedSize = findLargerPowerOfTwo(n); + _bufferSizeMask = _allocatedSize - 1; + _buffer = (SampleT*)std::realloc(_buffer, _allocatedSize * sizeof(SampleT)); + + // Reset the new memory region + assert(_buffer); + std::fill(_buffer + oldSize, _buffer + _allocatedSize, 0.f); + } + } + + int getAllocatedSize() const + { + return _allocatedSize; + } + + void reset() + { + if (_buffer && _allocatedSize > 0) { + memset(_buffer, 0, sizeof(SampleT) * _allocatedSize); + } + _position0 = 0; + } + + void write(int offset, const SampleT& sample) + { + _buffer[(_position0 + offset) & _bufferSizeMask] = sample; + } + + void writeOffset0(const SampleT& sample) + { + _buffer[_position0] = sample; + } + + const SampleT& read(int offset) const + { + return _buffer[(_position0 + offset) & _bufferSizeMask]; + } + + /// change the 0 position by n + void advance(int n) + { + _position0 += n; + _position0 &= _bufferSizeMask; + } + +private: + template + void _splitBlockOffsetFunction(int startOffset, int n, fnc f) const + { + assert(n <= _allocatedSize); + int firstIndex = (_position0 + startOffset) & _bufferSizeMask; + int n_to_end = _allocatedSize - firstIndex; + if (n_to_end > n) { + f(firstIndex, 0, n); + } else { + f(firstIndex, 0, n_to_end); + f(0, n_to_end, n - n_to_end); + } + } + +public: + void writeBlock(int startOffset, int n, const SampleT* sourceBlock) + { + _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) { + vo::copy(&sourceBlock[sampleOff], &_buffer[bufferOff], n); + }); + } + + void readBlockWithGain(int startOffset, int n, SampleT* targetBlock, float gainFactor) const + { + _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) { + vo::constantMultiply(&_buffer[bufferOff], gainFactor, &targetBlock[sampleOff], n); + }); + } + + void readAddBlockWithGain(int startOffset, int n, SampleT* targetBlock, float gainFactor) const + { + _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) { + vo::constantMultiplyAndAdd(&_buffer[bufferOff], gainFactor, &targetBlock[sampleOff], n); + }); + } + + void writeAddBlockWithGain(int startOffset, int n, const SampleT* sourceBlock, float gainFactor) + { + _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) { + vo::constantMultiplyAndAdd(&sourceBlock[sampleOff], gainFactor, &_buffer[bufferOff], n); + }); + } + + void readBlock(int startOffset, int n, SampleT* targetBlock) const + { + _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) { + vo::copy(&_buffer[bufferOff], &targetBlock[sampleOff], n); + }); + 
} + + void readAndClearBlock(int startOffset, int n, SampleT* targetBlock) + { + _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) { + vo::copy(&_buffer[bufferOff], &targetBlock[sampleOff], n); + vo::setToZero(&_buffer[bufferOff], n); + }); + } + + void clearBlock(int startOffset, int n) + { + _splitBlockOffsetFunction(startOffset, n, + [=](int bufferOff, int sampleOff, int n) { vo::setToZero(&_buffer[bufferOff], n); }); + } + +private: + SampleT* _buffer = nullptr; + + int _position0 = 0; // position of sample index 0 inside _buffer. This is where the next sample will be written to + int _allocatedSize = 0; // 2^n buffer size + int _bufferSizeMask = 0; // 2^n-1 buffer mask +}; +} // namespace staffpad::audio diff --git a/libs/staffpad/FourierTransform_pffft.cpp b/libs/staffpad/FourierTransform_pffft.cpp new file mode 100644 index 0000000000..d19b2227c5 --- /dev/null +++ b/libs/staffpad/FourierTransform_pffft.cpp @@ -0,0 +1,56 @@ +#include "FourierTransform_pffft.h" + +#include "pffft.h" + +namespace staffpad::audio { +FourierTransform::FourierTransform(int32_t newBlockSize) + : _blockSize{newBlockSize} +{ + _pffft_scratch = (float*)pffft_aligned_malloc(_blockSize * sizeof(float)); + realFftSpec = pffft_new_setup(_blockSize, PFFFT_REAL); +} + +FourierTransform::~FourierTransform() +{ + if (_pffft_scratch) { + pffft_aligned_free(_pffft_scratch); + _pffft_scratch = nullptr; + } + if (realFftSpec) { + pffft_destroy_setup(realFftSpec); + realFftSpec = nullptr; + } +} + +void FourierTransform::forwardReal(const SamplesReal& t, SamplesComplex& c) +{ + assert(t.getNumSamples() == _blockSize); + + for (auto ch = 0; ch < t.getNumChannels(); ++ch) { + auto* spec = c.getPtr(ch); // interleaved complex numbers, size _blockSize + 2 + auto* cpx_flt = (float*)spec; + pffft_transform_ordered(realFftSpec, t.getPtr(ch), cpx_flt, _pffft_scratch, PFFFT_FORWARD); + // pffft combines dc and nyq values into the first complex value, + // adjust to CCS format. + auto dc = cpx_flt[0]; + auto nyq = cpx_flt[1]; + spec[0] = { dc, 0.f }; + spec[c.getNumSamples() - 1] = { nyq, 0.f }; + } +} + +void FourierTransform::inverseReal(const SamplesComplex& c, SamplesReal& t) +{ + assert(c.getNumSamples() == _blockSize / 2 + 1); + + for (auto ch = 0; ch < c.getNumChannels(); ++ch) { + auto* spec = c.getPtr(ch); + // Use t to convert in-place from CCS to pffft format + t.assignSamples(ch, (float*)spec); + auto* ts = t.getPtr(ch); + ts[0] = spec[0].real(); + ts[1] = spec[c.getNumSamples() - 1].real(); + pffft_transform_ordered(realFftSpec, ts, ts, _pffft_scratch, PFFFT_BACKWARD); + } +} +} // namespace staffpad::audio diff --git a/libs/staffpad/FourierTransform_pffft.h b/libs/staffpad/FourierTransform_pffft.h new file mode 100644 index 0000000000..8f301e270f --- /dev/null +++ b/libs/staffpad/FourierTransform_pffft.h @@ -0,0 +1,32 @@ +// FFT wrapper for the PFFFT library. It is possible to use different wrappers for +// other FFT libraries or platforms, as long as the CSS complex data format is used. 
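//
// (Illustrative note, not part of the upstream header.) "CCS" is the complex
// conjugate-symmetric packing for real FFTs: a real transform of size N yields
// N/2 + 1 interleaved complex bins, and bins 0 (DC) and N/2 (Nyquist) carry a zero
// imaginary part, which is what forwardReal() reconstructs from pffft's packed
// DC/Nyquist pair. A minimal call sketch, assuming a block size of 1024 and the
// SamplesFloat types declared in SamplesFloat.h:
//
//   staffpad::audio::FourierTransform fft(1024);            // N = 1024
//   staffpad::SamplesReal t;    t.setSize(1, 1024);          // 1 channel, N time samples
//   staffpad::SamplesComplex c; c.setSize(1, 1024 / 2 + 1);  // 513 CCS bins
//   fft.forwardReal(t, c);   // bins 0 and 512 end up purely real
//   fft.inverseReal(c, t);   // unscaled; TimeAndPitch multiplies by 1.f / fftSize afterwards
//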
+ +#pragma once + +#include + +#include "SamplesFloat.h" + +struct PFFFT_Setup; + +namespace staffpad::audio { +class FourierTransform +{ +public: + FourierTransform(int32_t newBlockSize); + ~FourierTransform(); + + int getSize() const { return static_cast<int>(_blockSize); } + + void forwardReal(const SamplesReal& t, SamplesComplex& c); + void inverseReal(const SamplesComplex& c, SamplesReal& t); + +private: + PFFFT_Setup* realFftSpec = nullptr; + PFFFT_Setup* complexFftSpec = nullptr; + float* _pffft_scratch = nullptr; + + const int32_t _blockSize; + int32_t _order = 0; +}; +} // namespace staffpad::audio diff --git a/libs/staffpad/README b/libs/staffpad/README new file mode 100644 index 0000000000..324f17f7a9 --- /dev/null +++ b/libs/staffpad/README @@ -0,0 +1,10 @@ +StaffPad - Time and Pitch +========================= + +This library was copied from Audacity [1] in October 2025. + +Licensed under the terms of the GNU GENERAL PUBLIC LICENSE V2 or later, unless otherwise noted. +(https://github.com/audacity/audacity?tab=readme-ov-file#license) + +[1] https://github.com/audacity/audacity/tree/master/au3/libraries/lib-time-and-pitch/StaffPad + git rev 4cdd78b4326456cb77c7102a4f9a6435e0d474a2 diff --git a/libs/staffpad/SamplesFloat.h b/libs/staffpad/SamplesFloat.h new file mode 100644 index 0000000000..3ed2c565f6 --- /dev/null +++ b/libs/staffpad/SamplesFloat.h @@ -0,0 +1,114 @@ +#pragma once + +#include +#include + +#include "SimdTypes.h" +#include "VectorOps.h" + +namespace staffpad { +template <typename T> +class SamplesFloat +{ +public: + ~SamplesFloat() + { + for (int ch = 0; ch < num_channels; ch++) { + dealloc(ch); + } + } + + void setSize(int32_t numChannels, int32_t samples) + { + for (int ch = 0; ch < num_channels; ch++) { + dealloc(ch); + } + + num_channels = numChannels; + num_samples = samples; + data.resize(num_channels); + for (int ch = 0; ch < num_channels; ch++) { + alloc(ch, num_samples); + } + } + + int32_t getNumChannels() const + { + return num_channels; + } + + int32_t getNumSamples() const + { + return num_samples; + } + + float** getPtrs() + { + return data.data(); + } + + T* getPtr(int32_t channel) + { + assert(channel < num_channels); + assert(data[channel]); + return data[channel]; + } + + const T* getPtr(int32_t channel) const + { + assert(channel < num_channels); + assert(data[channel]); + return data[channel]; + } + + void assignSamples(int32_t channel, const T* input) + { + assert(channel < num_channels); + assert(data[channel]); + vo::copy(input, data[channel], num_samples); + } + + void assignSamples(const SamplesFloat& rhs) + { + assert(num_channels == rhs.num_channels); + assert(num_samples == rhs.num_samples); + for (int ch = 0; ch < num_channels; ch++) { + assert(data[ch]); + vo::copy(rhs.getPtr(ch), getPtr(ch), num_samples); + } + } + + void zeroOut() + { + for (int ch = 0; ch < num_channels; ch++) { + vo::setToZero(data[ch], num_samples); + } + } + +private: + int32_t num_channels{ 0 }; + int32_t num_samples{ 0 }; + std::vector<T*> data; + + void alloc(int32_t channel, int32_t samples) + { + assert(channel < num_channels); + if (data[channel]) { + dealloc(channel); + } + data[channel] = (T*)audio::simd::aligned_malloc(samples * sizeof(T), 64); + } + + void dealloc(int32_t channel) + { + assert(channel < num_channels); + if (data[channel]) { + audio::simd::aligned_free(data[channel]); + data[channel] = nullptr; + } + } +}; + +typedef SamplesFloat<float> SamplesReal; +typedef SamplesFloat<std::complex<float> > SamplesComplex; +} // namespace staffpad diff --git
a/libs/staffpad/SimdComplexConversions_sse2.h b/libs/staffpad/SimdComplexConversions_sse2.h new file mode 100644 index 0000000000..c0eb1432f8 --- /dev/null +++ b/libs/staffpad/SimdComplexConversions_sse2.h @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: zlib */ +/* + * This code is cleaned up and modernized version of https://github.com/to-miz/sse_mathfun_extension + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace simd_complex_conversions { +// Before C++20 there is no standard and correct way to implement bit_cast. +// It was verified that MSVC generates the correct code for the current use +// cases +namespace details { +template +std::enable_if_t< + sizeof(To) == sizeof(From) && std::is_trivially_copyable_v + && std::is_trivially_copyable_v, + To> +// constexpr support needs compiler magic +bit_cast(const From& src) noexcept +{ + static_assert( + std::is_trivially_constructible_v, + "This implementation additionally requires " + "destination type to be trivially constructible"); + + To dst; + std::memcpy(&dst, &src, sizeof(To)); + return dst; +} + +constexpr float PIF = 3.141592653589793238f; +constexpr float PIO2F = 1.5707963267948966192f; + +constexpr float cephes_PIF = 3.141592653589793238f; +constexpr float cephes_PIO2F = 1.5707963267948966192f; +constexpr float cephes_PIO4F = 0.7853981633974483096f; +constexpr float cephes_FOPI = 1.27323954473516f; // 4 / M_PI +constexpr float minus_cephes_DP1 = -0.78515625f; +constexpr float minus_cephes_DP2 = -2.4187564849853515625e-4f; +constexpr float minus_cephes_DP3 = -3.77489497744594108e-8f; +constexpr float sincof_p0 = -1.9515295891e-4f; +constexpr float sincof_p1 = 8.3321608736e-3f; +constexpr float sincof_p2 = -1.6666654611e-1f; +constexpr float coscof_p0 = 2.443315711809948e-005f; +constexpr float coscof_p1 = -1.388731625493765e-003f; +constexpr float coscof_p2 = 4.166664568298827e-002f; + +constexpr float atancof_p0 = 8.05374449538e-2f; +constexpr float atancof_p1 = 1.38776856032e-1f; +constexpr float atancof_p2 = 1.99777106478e-1f; +constexpr float atancof_p3 = 3.33329491539e-1f; + +static const float sign_mask = bit_cast(0x80000000); +static const float inv_sign_mask = bit_cast(~0x80000000); +} // namespace details + +inline __m128 atan_ps(__m128 x) +{ + using namespace details; + + __m128 sign_bit, y; + + sign_bit = x; + /* take the absolute value */ + x = _mm_and_ps(x, _mm_set1_ps(inv_sign_mask)); + /* extract the sign bit (upper one) */ + sign_bit = _mm_and_ps(sign_bit, _mm_set1_ps(sign_mask)); + + /* range reduction, init x and y depending on range */ + /* x > 2.414213562373095 */ + __m128 cmp0 = _mm_cmpgt_ps(x, _mm_set1_ps(2.414213562373095f)); + /* x > 0.4142135623730950 */ + __m128 cmp1 = _mm_cmpgt_ps(x, _mm_set1_ps(0.4142135623730950f)); + + /* x > 0.4142135623730950 && !( x > 2.414213562373095 ) */ + __m128 cmp2 = _mm_andnot_ps(cmp0, cmp1); + + /* -( 1.0/x ) */ + __m128 y0 = _mm_and_ps(cmp0, _mm_set1_ps(cephes_PIO2F)); + __m128 x0 = _mm_div_ps(_mm_set1_ps(1.0f), x); + x0 = _mm_xor_ps(x0, _mm_set1_ps(sign_mask)); + + __m128 y1 = _mm_and_ps(cmp2, _mm_set1_ps(cephes_PIO4F)); + /* (x-1.0)/(x+1.0) */ + __m128 x1_o = _mm_sub_ps(x, _mm_set1_ps(1.0f)); + __m128 x1_u = _mm_add_ps(x, _mm_set1_ps(1.0f)); + __m128 x1 = _mm_div_ps(x1_o, x1_u); + + __m128 x2 = _mm_and_ps(cmp2, x1); + x0 = _mm_and_ps(cmp0, x0); + x2 = _mm_or_ps(x2, x0); + cmp1 = _mm_or_ps(cmp0, cmp2); + x2 = _mm_and_ps(cmp1, x2); + x = _mm_andnot_ps(cmp1, x); + x = _mm_or_ps(x2, x); + + y = _mm_or_ps(y0, y1); + + 
__m128 zz = _mm_mul_ps(x, x); + __m128 acc = _mm_set1_ps(atancof_p0); + acc = _mm_mul_ps(acc, zz); + acc = _mm_sub_ps(acc, _mm_set1_ps(atancof_p1)); + acc = _mm_mul_ps(acc, zz); + acc = _mm_add_ps(acc, _mm_set1_ps(atancof_p2)); + acc = _mm_mul_ps(acc, zz); + acc = _mm_sub_ps(acc, _mm_set1_ps(atancof_p3)); + acc = _mm_mul_ps(acc, zz); + acc = _mm_mul_ps(acc, x); + acc = _mm_add_ps(acc, x); + y = _mm_add_ps(y, acc); + + /* update the sign */ + y = _mm_xor_ps(y, sign_bit); + + return y; +} + +inline __m128 atan2_ps(__m128 y, __m128 x) +{ + using namespace details; + + __m128 zero = _mm_setzero_ps(); + __m128 x_eq_0 = _mm_cmpeq_ps(x, zero); + __m128 x_gt_0 = _mm_cmpgt_ps(x, zero); + __m128 x_le_0 = _mm_cmple_ps(x, zero); + __m128 y_eq_0 = _mm_cmpeq_ps(y, zero); + __m128 x_lt_0 = _mm_cmplt_ps(x, zero); + __m128 y_lt_0 = _mm_cmplt_ps(y, zero); + + __m128 zero_mask = _mm_and_ps(x_eq_0, y_eq_0); + __m128 zero_mask_other_case = _mm_and_ps(y_eq_0, x_gt_0); + zero_mask = _mm_or_ps(zero_mask, zero_mask_other_case); + + __m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0); + __m128 pio2_mask_sign = _mm_and_ps(y_lt_0, _mm_set1_ps(sign_mask)); + __m128 pio2_result = _mm_set1_ps(cephes_PIO2F); + pio2_result = _mm_xor_ps(pio2_result, pio2_mask_sign); + pio2_result = _mm_and_ps(pio2_mask, pio2_result); + + __m128 pi_mask = _mm_and_ps(y_eq_0, x_lt_0); + __m128 pi = _mm_set1_ps(cephes_PIF); + __m128 pi_result = _mm_and_ps(pi_mask, pi); + + __m128 swap_sign_mask_offset = _mm_and_ps(x_lt_0, y_lt_0); + swap_sign_mask_offset + =_mm_and_ps(swap_sign_mask_offset, _mm_set1_ps(sign_mask)); + + __m128 offset0 = _mm_setzero_ps(); + __m128 offset1 = _mm_set1_ps(cephes_PIF); + offset1 = _mm_xor_ps(offset1, swap_sign_mask_offset); + + __m128 offset = _mm_andnot_ps(x_lt_0, offset0); + offset = _mm_and_ps(x_lt_0, offset1); + + __m128 arg = _mm_div_ps(y, x); + __m128 atan_result = atan_ps(arg); + atan_result = _mm_add_ps(atan_result, offset); + + /* select between zero_result, pio2_result and atan_result */ + + __m128 result = _mm_andnot_ps(zero_mask, pio2_result); + atan_result = _mm_andnot_ps(zero_mask, atan_result); + atan_result = _mm_andnot_ps(pio2_mask, atan_result); + result = _mm_or_ps(result, atan_result); + result = _mm_or_ps(result, pi_result); + + return result; +} + +inline std::pair<__m128, __m128> sincos_ps(__m128 x) +{ + using namespace details; + __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; + __m128i emm0, emm2, emm4; + + sign_bit_sin = x; + /* take the absolute value */ + x = _mm_and_ps(x, _mm_set1_ps(inv_sign_mask)); + /* extract the sign bit (upper one) */ + sign_bit_sin = _mm_and_ps(sign_bit_sin, _mm_set1_ps(sign_mask)); + + /* scale by 4/Pi */ + y = _mm_mul_ps(x, _mm_set1_ps(cephes_FOPI)); + + /* store the integer part of y in emm2 */ + emm2 = _mm_cvttps_epi32(y); + + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1)); + emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1)); + y = _mm_cvtepi32_ps(emm2); + + emm4 = emm2; + + /* get the swap sign flag for the sine */ + emm0 = _mm_and_si128(emm2, _mm_set1_epi32(4)); + emm0 = _mm_slli_epi32(emm0, 29); + __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); + + /* get the polynom selection mask for the sine*/ + emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2)); + emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + __m128 poly_mask = _mm_castsi128_ps(emm2); + + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = _mm_set1_ps(minus_cephes_DP1); + xmm2 = 
_mm_set1_ps(minus_cephes_DP2); + xmm3 = _mm_set1_ps(minus_cephes_DP3); + xmm1 = _mm_mul_ps(y, xmm1); + xmm2 = _mm_mul_ps(y, xmm2); + xmm3 = _mm_mul_ps(y, xmm3); + x = _mm_add_ps(x, xmm1); + x = _mm_add_ps(x, xmm2); + x = _mm_add_ps(x, xmm3); + + emm4 = _mm_sub_epi32(emm4, _mm_set1_epi32(2)); + emm4 = _mm_andnot_si128(emm4, _mm_set1_epi32(4)); + emm4 = _mm_slli_epi32(emm4, 29); + __m128 sign_bit_cos = _mm_castsi128_ps(emm4); + + sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + __m128 z = _mm_mul_ps(x, x); + y = _mm_set1_ps(coscof_p0); + + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, _mm_set1_ps(coscof_p1)); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, _mm_set1_ps(coscof_p2)); + y = _mm_mul_ps(y, z); + y = _mm_mul_ps(y, z); + __m128 tmp = _mm_mul_ps(z, _mm_set1_ps(0.5f)); + y = _mm_sub_ps(y, tmp); + y = _mm_add_ps(y, _mm_set1_ps(1)); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + + __m128 y2 = _mm_set1_ps(sincof_p0); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, _mm_set1_ps(sincof_p1)); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, _mm_set1_ps(sincof_p2)); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_mul_ps(y2, x); + y2 = _mm_add_ps(y2, x); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + __m128 ysin2 = _mm_and_ps(xmm3, y2); + __m128 ysin1 = _mm_andnot_ps(xmm3, y); + y2 = _mm_sub_ps(y2, ysin2); + y = _mm_sub_ps(y, ysin1); + + xmm1 = _mm_add_ps(ysin1, ysin2); + xmm2 = _mm_add_ps(y, y2); + + /* update the sign */ + return std::make_pair( + _mm_xor_ps(xmm1, sign_bit_sin), _mm_xor_ps(xmm2, sign_bit_cos)); +} + +inline float atan2_ss(float y, float x) +{ + return _mm_cvtss_f32(atan2_ps(_mm_set_ss(y), _mm_set_ss(x))); +} + +inline std::pair sincos_ss(float angle) +{ + auto res = sincos_ps(_mm_set_ss(angle)); + return std::make_pair(_mm_cvtss_f32(res.first), _mm_cvtss_f32(res.second)); +} + +inline __m128 norm(__m128 x, __m128 y) +{ + return _mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)); +} + +inline float sqrt_ss(float x) +{ + __m128 sse_value = _mm_set_ss(x); + sse_value = _mm_sqrt_ss(sse_value); + return _mm_cvtss_f32(sse_value); +} + +template +void perform_parallel_simd_aligned( + const std::complex* input, float* output, int n, const fnc& f) +{ + for (int i = 0; i <= n - 4; i += 4) { + // Safe according to C++ standard + auto p1 = _mm_load_ps(reinterpret_cast(input + i)); + auto p2 = _mm_load_ps(reinterpret_cast(input + i + 2)); + + // p1 = {real(c1), imag(c1), real(c2), imag(c2)} + // p2 = {real(c3), imag(c3), real(c4), imag(c4)} + + auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0)); + auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1)); + + __m128 out; + f(rp, ip, out); + + _mm_store_ps(output + i, out); + } + // deal with last partial packet + for (int i = n & (~3); i < n; ++i) { + __m128 out; + f(_mm_set_ss(real(input[i])), _mm_set_ss(imag(input[i])), out); + output[i] = _mm_cvtss_f32(out); + } +} + +inline void rotate_parallel_simd_aligned( + const float* oldPhase, const float* newPhase, std::complex* output, + int n) +{ + for (int i = 0; i <= n - 4; i += 4) { + auto [sin, cos] = sincos_ps( + oldPhase + ? 
_mm_sub_ps(_mm_load_ps(newPhase + i), _mm_load_ps(oldPhase + i)) + : _mm_load_ps(newPhase + i)); + + // Safe according to C++ standard + auto p1 = _mm_load_ps(reinterpret_cast(output + i)); + auto p2 = _mm_load_ps(reinterpret_cast(output + i + 2)); + + // p1 = {real(c1), imag(c1), real(c2), imag(c2)} + // p2 = {real(c3), imag(c3), real(c4), imag(c4)} + + auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0)); + auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1)); + + // We need to calculate (rp, ip) * (cos, sin) -> (rp*cos - ip*sin, rp*sin + ip*cos) + + auto out_rp = _mm_sub_ps(_mm_mul_ps(rp, cos), _mm_mul_ps(ip, sin)); + auto out_ip = _mm_add_ps(_mm_mul_ps(rp, sin), _mm_mul_ps(ip, cos)); + + p1 = _mm_unpacklo_ps(out_rp, out_ip); + p2 = _mm_unpackhi_ps(out_rp, out_ip); + + _mm_store_ps(reinterpret_cast(output + i), p1); + _mm_store_ps(reinterpret_cast(output + i + 2), p2); + } + // deal with last partial packet + for (int i = n & (~3); i < n; ++i) { + const auto theta = oldPhase ? newPhase[i] - oldPhase[i] : newPhase[i]; + output[i] *= std::complex(cosf(theta), sinf(theta)); + } +} +} // namespace simd_complex_conversions diff --git a/libs/staffpad/SimdTypes.h b/libs/staffpad/SimdTypes.h new file mode 100644 index 0000000000..4c0433dd32 --- /dev/null +++ b/libs/staffpad/SimdTypes.h @@ -0,0 +1,128 @@ +/* + Simd-types for parallel dsp processing. + Aligned memory allocation for simd vectors. + */ + +#pragma once +#include +#include + +#if _MSC_VER +#define __finl __forceinline +#define __vecc __vectorcall +#else +#define __finl inline __attribute__((always_inline)) +#define __vecc +#endif + +#if defined(__SSE2__) || (defined(_M_AMD64) || defined(_M_X64)) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +#include "SimdTypes_sse2.h" +#elif defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64) +#include "SimdTypes_neon.h" +#else +#include "SimdTypes_scalar.h" +#endif + +namespace staffpad::audio::simd { +/// reserve aligned memory. Needs to be freed with aligned_free() +inline void* aligned_malloc(size_t required_bytes, size_t alignment) +{ + auto offset = alignment - 1 + sizeof(void*); + auto p1 = std::malloc(required_bytes + offset); + if (p1 == nullptr) { + return nullptr; + } + // figure out aligned position + void* p2 = (void*)(((size_t)(p1) + offset) & ~(alignment - 1)); + // write malloced pointer in front of aligned data + ((void**)p2)[-1] = p1; + return p2; +} + +/// free memory allocated with aligned_malloc +inline void aligned_free(void* p) +{ + if (p) { + free(((void**)p)[-1]); + } +} + +/// create a c++ class at an memory-aligned spot that needs to be deleted using aligned_delete +template +inline cls* aligned_new(int alignment) +{ + void* mem = aligned_malloc(sizeof(cls), alignment); + return new (mem) cls(); +} + +/** delete objects created using aligned_new */ +template +inline void aligned_delete(cls* obj) +{ + if (obj != nullptr) { + obj->~cls(); + aligned_free((void*)obj); + } +} + +template +inline bool is_aligned(T* obj, int alignment) +{ + return (((size_t)obj) & (alignment - 1)) == 0; +} + +/** this template allows to write float SIMD code with the supported operators the following way: + +float *a_vec; +const float *b_vec; +perform_parallel_simd_aligned(a_vec, b_vec, 512, [](auto &a, auto &b) { +auto t = a; +a = 3.f * a + 12.f * b; +b = 0.25f * a + 3.f * b; +}); +*/ + +// two buffers read/write +template +__finl void perform_parallel_simd_aligned(float* a, float* b, int n, const fnc& f) +{ + // fnc& f needs to be a lambda of type [](auto &a, auto &b){}. 
+ // the autos will be float_x4/float + constexpr int N = 4; + constexpr int byte_size = sizeof(float); + + assert(is_aligned(a, N * byte_size) && is_aligned(b, N * byte_size)); + + for (int i = 0; i <= n - N; i += N) { + auto x = float_x4_load_aligned(a + i); + auto y = float_x4_load_aligned(b + i); + f(x, y); + store_aligned(x, a + i); + store_aligned(y, b + i); + } + // deal with last partial packet + for (int i = n & (~(N - 1)); i < n; ++i) { + f(a[i], b[i]); + } +} + +/// template for applying math to one data buffer +template +__finl void perform_parallel_simd_aligned(float* a, int n, const fnc& f) +{ + // fnc& f needs to be a lambda of type [](auto &a){}. + constexpr int N = 4; + constexpr int byte_size = sizeof(float); + assert(is_aligned(a, N * byte_size)); + + for (int i = 0; i <= n - N; i += N) { + auto x = float_x4_load_aligned(a + i); + f(x); + store_aligned(x, a + i); + } + // deal with last partial packet + for (int i = n & (~(N - 1)); i < n; ++i) { + f(a[i]); + } +} +} // namespace staffpad::audio::simd diff --git a/libs/staffpad/SimdTypes_neon.h b/libs/staffpad/SimdTypes_neon.h new file mode 100644 index 0000000000..9d9e2c58e6 --- /dev/null +++ b/libs/staffpad/SimdTypes_neon.h @@ -0,0 +1,161 @@ +/* + Neon version of SIMD types. + */ + +#pragma once + +#if _MSC_VER +#include +#define __finl __forceinline +#define __vecc __vectorcall +#else +#include +#define __finl inline __attribute__((always_inline)) +#define __vecc +#endif + +#include + +namespace staffpad::audio::simd { +struct float_x4 +{ + float32x4_t s; + + __finl float_x4() + { + } + + /// enables math like: float_x4 a = 0.5f * float_x4{1.f, 2.f, 3.f, 4.f}; + __finl float_x4(float val) + { + s = vdupq_n_f32(val); + } + + __finl float_x4(const float32x4_t& val) + : s(val) + { + } + + /// enables assignments like: float_x4 a = {1.f, 2.f, 3.f, 4.f}; + __finl float_x4(float v0, float v1, float v2, float v3) + { +#if _MSC_VER // aggregate initializer won't work unless we have {.n128_f32 = ..} in c++20 + s.n128_f32[0] = v0; + s.n128_f32[1] = v1; + s.n128_f32[2] = v2; + s.n128_f32[3] = v3; +#elif __clang__ + s = { v0, v1, v2, v3 }; +#else + float f[4] = { v0, v1, v2, v3 }; + s = vld1q_f32(f); +#endif + } + +#if __clang__ +private: + // this helper class allows writing to the single registers for clang + // __mm128 is a built-in type -> we can't return a float& reference. + // this is just syntax sugar and clang will remove it during builds. 
+ // + // it allows to write + // float_x4 a; + // a[1] = 2.f; + struct RegisterAccessWrapper + { + float32x4_t& val; + int i; + + void operator=(float x) + { + val[i] = x; + } + + operator float() noexcept + { + return val[i]; + } + }; + +public: + __finl RegisterAccessWrapper operator[](int n) + { + RegisterAccessWrapper raw = { s, n }; + return raw; + } + + __finl const float operator[](int n) const + { + return s[n]; + } + +#elif _MSC_VER + // on msvc returning a ref to a sub-register is possible + __finl float& operator[](int n) + { + return s.n128_f32[n]; + } + + __finl const float operator[](int n) const + { + return s.n128_f32[n]; + } + +#endif +}; + +__finl float_x4 __vecc float_x4_from_float(float x) +{ + return vdupq_n_f32(x); +} + +__finl float_x4 __vecc float_x4_load_aligned(const float* x) +{ + return vld1q_f32(x); +} + +__finl void __vecc store_aligned(const float_x4& a, float* x) +{ + vst1q_f32(x, a.s); +} + +__finl float_x4 __vecc unzip1(const float_x4& a, const float_x4& b) +{ + return vuzp1q_f32(a.s, b.s); +} + +__finl float_x4 __vecc unzip2(const float_x4& a, const float_x4& b) +{ + return vuzp2q_f32(a.s, b.s); +} + +__finl float_x4 __vecc operator+(float_x4 a, float_x4 b) +{ + return vaddq_f32(a.s, b.s); +} + +__finl float_x4 __vecc operator-(float_x4 a, float_x4 b) +{ + return vsubq_f32(a.s, b.s); +} + +__finl float_x4 __vecc operator*(float_x4 a, float_x4 b) +{ + return vmulq_f32(a.s, b.s); +} + +__finl float_x4 __vecc sqrt(const float_x4& a) +{ + return vsqrtq_f32(a.s); +} + +__finl float __vecc rint(float a) +{ + return std::rint(a); +} + +__finl float_x4 __vecc rint(const float_x4& a) +{ + return vrndnq_f32(a.s); +} +} // namespace staffpad::audio::simd diff --git a/libs/staffpad/SimdTypes_scalar.h b/libs/staffpad/SimdTypes_scalar.h new file mode 100644 index 0000000000..1b3dfae146 --- /dev/null +++ b/libs/staffpad/SimdTypes_scalar.h @@ -0,0 +1,108 @@ +/* + scalar emulation of simd types. 
+ */ + +#pragma once +#include +#include + +#if _MSC_VER +#define __finl __forceinline +#define __vecc __vectorcall +#else +#define __finl inline __attribute__((always_inline)) +#define __vecc +#endif + +namespace staffpad::audio::simd { +struct float_x4 +{ + float v[4]; + + __finl float_x4() + { + } + + /// enables math like: float_x4 a = 0.5f * float_x4{1.f, 2.f, 3.f, 4.f}; + __finl float_x4(float val) + { + v[0] = v[1] = v[2] = v[3] = val; + } + + /// enables assignments like: float_x4 a = {1.f, 2.f, 3.f, 4.f}; + __finl float_x4(float v0, float v1, float v2, float v3) + { + v[0] = v0; + v[1] = v1; + v[2] = v2; + v[3] = v3; + } + + __finl float& operator[](int n) + { + return v[n]; + } + + __finl const float& operator[](int n) const + { + return v[n]; + } +}; + +__finl float_x4 __vecc float_x4_from_float(float x) +{ + return { x, x, x, x }; +} + +__finl float_x4 __vecc float_x4_load_aligned(const float* x) +{ + return { x[0], x[1], x[2], x[3] }; +} + +__finl void __vecc store_aligned(const float_x4& a, float* x) +{ + for (int i = 0; i < 4; ++i) { + x[i] = a[i]; + } +} + +__finl float_x4 __vecc unzip1(const float_x4& a, const float_x4& b) +{ + return { a[0], a[2], b[0], b[2] }; +} + +__finl float_x4 __vecc unzip2(const float_x4& a, const float_x4& b) +{ + return { a[1], a[1], b[3], b[3] }; +} + +__finl float_x4 __vecc operator+(float_x4 a, float_x4 b) +{ + return { a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3] }; +} + +__finl float_x4 __vecc operator-(float_x4 a, float_x4 b) +{ + return { a[0] - b[0], a[1] - b[1], a[2] - b[2], a[3] - b[3] }; +} + +__finl float_x4 __vecc operator*(float_x4 a, float_x4 b) +{ + return { a[0] * b[0], a[1] * b[1], a[2] * b[2], a[3] * b[3] }; +} + +__finl float_x4 __vecc sqrt(const float_x4& a) +{ + return { std::sqrt(a[0]), std::sqrt(a[1]), std::sqrt(a[2]), std::sqrt(a[3]) }; +} + +__finl float __vecc rint(float a) +{ + return std::rint(a); +} + +__finl float_x4 __vecc rint(const float_x4& a) +{ + return { std::rint(a[0]), std::rint(a[1]), std::rint(a[2]), std::rint(a[3]) }; +} +} // namespace staffpad::audio::simd diff --git a/libs/staffpad/SimdTypes_sse2.h b/libs/staffpad/SimdTypes_sse2.h new file mode 100644 index 0000000000..4dfad7c6d5 --- /dev/null +++ b/libs/staffpad/SimdTypes_sse2.h @@ -0,0 +1,150 @@ +/* +SSE2 simd types +*/ + +#pragma once + +#if _MSC_VER +#define __finl __forceinline +#define __vecc __vectorcall +#else +#define __finl inline __attribute__((always_inline)) +#define __vecc +#endif + +#include + +namespace staffpad::audio::simd { +// this is jumping through some hoops to get the same level of support +// for clang and msvc. With clang, the sse2 types are built-in and have +// some arithmetic operators defined. +// On msvc the sse2 types are structs with no operators. +// to get to the same level and to be able to write algorithms "naturally", +// everything needs to be wrapped in a struct. +struct float_x4 +{ + __m128 s; + __finl float_x4() + { + } + + /// enables math like: float_x4 a = 0.5f * float_x4{1.f, 2.f, 3.f, 4.f}; + __finl float_x4(float val) + { + s = _mm_set1_ps(val); + } + + __finl float_x4(const __m128& val) + : s(val) + { + } + +#if __clang__ +private: + // this helper class allows writing to the single registers for clang + // __mm128 is a built-in type -> we can't return a float& reference. + // this is just syntax sugar and clang will remove it during builds. 
+ // + // it allows to write + // float_x4 a; + // a[1] = 2.f; + struct RegisterAccessWrapper + { + __m128& val; + int i; + + void operator=(float x) + { + val[i] = x; + } + + operator float() noexcept + { + return val[i]; + } + }; + +public: + __finl RegisterAccessWrapper operator[](int n) + { + RegisterAccessWrapper raw = { s, n }; + return raw; + } + + __finl const float operator[](int n) const + { + return s[n]; + } + +#elif _MSC_VER + // on msvc returning a ref to a sub-register is possible + __finl float& operator[](int n) + { + return s.m128_f32[n]; + } + + __finl const float operator[](int n) const + { + return s.m128_f32[n]; + } + +#endif +}; + +__finl float_x4 __vecc float_x4_from_float(float x) +{ + return _mm_set1_ps(x); +} + +__finl float_x4 __vecc float_x4_load_aligned(const float* x) +{ + return _mm_load_ps(x); +} + +__finl void __vecc store_aligned(const float_x4& a, float* x) +{ + _mm_store_ps(x, a.s); +} + +__finl float_x4 __vecc unzip1(const float_x4& a, const float_x4& b) +{ + return _mm_shuffle_ps(a.s, b.s, _MM_SHUFFLE(2, 0, 2, 0)); +} + +__finl float_x4 __vecc unzip2(const float_x4& a, const float_x4& b) +{ + return _mm_shuffle_ps(a.s, b.s, _MM_SHUFFLE(3, 1, 3, 1)); +} + +__finl float_x4 __vecc operator+(float_x4 a, float_x4 b) +{ + return _mm_add_ps(a.s, b.s); +} + +__finl float_x4 __vecc operator-(float_x4 a, float_x4 b) +{ + return _mm_sub_ps(a.s, b.s); +} + +__finl float_x4 __vecc operator*(float_x4 a, float_x4 b) +{ + return _mm_mul_ps(a.s, b.s); +} + +__finl float_x4 __vecc sqrt(const float_x4& a) +{ + return _mm_sqrt_ps(a.s); +} + +__finl float __vecc rint(float x) +{ + __m128i A = _mm_cvtps_epi32(_mm_set_ss(x)); + return _mm_cvtss_f32(_mm_cvtepi32_ps(A)); +} + +__finl float_x4 __vecc rint(const float_x4& a) +{ + __m128i A = _mm_cvtps_epi32(a.s); + return _mm_cvtepi32_ps(A); +} +} // namespace staffpad::audio::simd diff --git a/libs/staffpad/TimeAndPitch.cpp b/libs/staffpad/TimeAndPitch.cpp new file mode 100644 index 0000000000..bf93b469fe --- /dev/null +++ b/libs/staffpad/TimeAndPitch.cpp @@ -0,0 +1,559 @@ +#include "staffpad/TimeAndPitch.h" + +#include +#include +#include +#include +#include +#include + +#include "CircularSampleBuffer.h" +#include "FourierTransform_pffft.h" +#include "SamplesFloat.h" +#include "SimdTypes.h" + +using namespace staffpad::audio; + +namespace staffpad { +namespace { +constexpr double twoPi = 6.28318530717958647692f; + +/** + * 6-point lagrange interpolator SNR ~ -35.2db + * interpolates between samples 2 and 3 using t. 
+ * + * smp pointer to a memory region containing 6 samples + * t fractional sample to interpolate t >= 0 && t < 1 + * return value the interpolated sample + */ +inline float lagrange6(const float (& smp)[6], float t) +{ + auto* y = &smp[2]; + + auto ym1_y1 = y[-1] + y[1]; + auto twentyfourth_ym2_y2 = 1.f / 24.f * (y[-2] + y[2]); + auto c0 = y[0]; + auto c1 = 0.05f * y[-2] - 0.5f * y[-1] - 1.f / 3.f * y[0] + y[1] - 0.25f * y[2] + 1.f / 30.f * y[3]; + auto c2 = 2.f / 3.f * ym1_y1 - 1.25f * y[0] - twentyfourth_ym2_y2; + auto c3 = 5.f / 12.f * y[0] - 7.f / 12.f * y[1] + 7.f / 24.f * y[2] - 1.f / 24.f * (y[-2] + y[-1] + y[3]); + auto c4 = 0.25f * y[0] - 1.f / 6.f * ym1_y1 + twentyfourth_ym2_y2; + auto c5 = 1.f / 120.f * (y[3] - y[-2]) + 1.f / 24.f * (y[-1] - y[2]) + 1.f / 12.f * (y[1] - y[0]); + + // Estrin's scheme + auto t2 = t * t; + return (c0 + c1 * t) + (c2 + c3 * t) * t2 + (c4 + c5 * t) * t2 * t2; +} + +void generateRandomPhaseVector(float* dst, size_t size, std::mt19937& gen) +{ + std::vector > v(size); + std::uniform_real_distribution dis( + -3.14159265358979323846f, 3.14159265358979323846f); + std::for_each(dst, dst + size, [&dis, &gen](auto& a) { a = dis(gen); }); +} +} // namespace + +struct TimeAndPitch::impl +{ + impl(int fft_size) + : fft(fft_size) + { + randomGenerator.seed(0); + } + + FourierTransform fft; + std::mt19937 randomGenerator; + CircularSampleBuffer inResampleInputBuffer[2]; + CircularSampleBuffer inCircularBuffer[2]; + CircularSampleBuffer outCircularBuffer[2]; + CircularSampleBuffer normalizationBuffer; + + SamplesReal fft_timeseries; + SamplesComplex spectrum; + SamplesReal norm; + SamplesReal phase; + SamplesReal last_phase; + SamplesReal phase_accum; + SamplesReal cosWindow; + SamplesReal sqWindow; + SamplesReal last_norm; + SamplesReal random_phases; + + double exact_hop_a = 512.0, hop_a_err = 0.0; + double exact_hop_s = 0.0; + double next_exact_hop_s = 512.0; + double hop_s_err = 0.0; + + std::vector peak_index, trough_index; +}; + +TimeAndPitch::TimeAndPitch( + int fftSize, bool reduceImaging, ShiftTimbreCb shiftTimbreCb) + : fftSize(fftSize) + , _reduceImaging(reduceImaging) + , _shiftTimbreCb(std::move(shiftTimbreCb)) +{ +} + +TimeAndPitch::~TimeAndPitch() +{ + // Here for ~std::shared_ptr() to know ~impl() +} + +void TimeAndPitch::setup(int numChannels, int maxBlockSize) +{ + assert(numChannels == 1 || numChannels == 2); + _numChannels = numChannels; + + d = std::make_unique(fftSize); + _maxBlockSize = maxBlockSize; + _numBins = fftSize / 2 + 1; + + // audio sample buffers + d->fft_timeseries.setSize(_numChannels, fftSize); + int outBufferSize = fftSize + 2 * _maxBlockSize; // 1 block extra for safety + for (int ch = 0; ch < _numChannels; ++ch) { + d->inResampleInputBuffer[ch].setSize(_maxBlockSize + + 6); // needs additional history for resampling. 
more in the future + d->inCircularBuffer[ch].setSize(fftSize); + d->outCircularBuffer[ch].setSize(outBufferSize); + } + d->normalizationBuffer.setSize(outBufferSize); + + // fft coefficient buffers + d->spectrum.setSize(_numChannels, _numBins); + d->norm.setSize(1, _numBins); + d->last_norm.setSize(1, _numBins); + d->phase.setSize(_numChannels, _numBins); + d->last_phase.setSize(_numChannels, _numBins); + d->phase_accum.setSize(_numChannels, _numBins); + d->random_phases.setSize(1, _numBins); + generateRandomPhaseVector( + d->random_phases.getPtr(0), _numBins, d->randomGenerator); + + _expectedPhaseChangePerBinPerSample = twoPi / double(fftSize); + + // Cos window + // For perfect int-factor overlaps the first sample needs to be 0 and the last non-zero. + d->cosWindow.setSize(1, fftSize); + d->sqWindow.setSize(1, fftSize); + auto* w = d->cosWindow.getPtr(0); + auto* sqw = d->sqWindow.getPtr(0); + for (int i = 0; i < fftSize; ++i) { + w[i] = -0.5f * std::cos(float(twoPi * (float)i / (float)fftSize)) + 0.5f; + sqw[i] = w[i] * w[i]; + } + + d->peak_index.reserve(_numBins); + d->trough_index.reserve(_numBins); + + // force fft to allocate all buffers now. Currently there is no other way in the fft class + d->fft.forwardReal(d->fft_timeseries, d->spectrum); + + reset(); +} + +int TimeAndPitch::getSamplesToNextHop() const +{ + return std::max(0, int(std::ceil(d->exact_hop_a)) - _analysis_hop_counter + + 1); // 1 extra to counter pitch factor error accumulation +} + +int TimeAndPitch::getNumAvailableOutputSamples() const +{ + return _availableOutputSamples; +} + +void TimeAndPitch::reset() +{ + _analysis_hop_counter = 0; + _availableOutputSamples = 0; + for (int ch = 0; ch < _numChannels; ++ch) { + d->inResampleInputBuffer[ch].reset(); + d->inCircularBuffer[ch].reset(); + d->outCircularBuffer[ch].reset(); + } + d->normalizationBuffer.reset(); + d->last_norm.zeroOut(); + d->last_phase.zeroOut(); + d->phase_accum.zeroOut(); + _outBufferWriteOffset = 0; + d->hop_a_err = 0.0; + d->hop_s_err = 0.0; + d->exact_hop_s = 0.0; + _resampleReadPos = 0.0; +} + +namespace { +// wrap a phase value into -PI..PI +inline float _unwrapPhase(float arg) +{ + using namespace audio::simd; + return arg - rint(arg * 0.15915494309f) * 6.283185307f; +} + +void _unwrapPhaseVec(float* v, int n) +{ + audio::simd::perform_parallel_simd_aligned(v, n, [](auto& a) { a = a - rint(a * 0.15915494309f) * 6.283185307f; }); +} + +/// rotate even-sized array by half its size to align fft phase at the center +void _fft_shift(float* v, int n) +{ + assert((n & 1) == 0); + int n2 = n >> 1; + audio::simd::perform_parallel_simd_aligned(v, v + n2, n2, [](auto& a, auto& b) { + auto tmp = a; + a = b; + b = tmp; + }); +} + +void _lr_to_ms(float* ch1, float* ch2, int n) +{ + audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) { + auto l = a, r = b; + a = 0.5f * (l + r); + b = 0.5f * (l - r); + }); +} + +void _ms_to_lr(float* ch1, float* ch2, int n) +{ + audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) { + auto m = a, s = b; + a = m + s; + b = m - s; + }); +} +} // namespace + +// ---------------------------------------------------------------------------- + +template +void TimeAndPitch::_time_stretch(float a_a, float a_s) +{ + auto alpha = a_s / a_a; // this is the real stretch factor based on integer hop sizes + + // Create a norm array + const auto* norms = d->norm.getPtr(0); // for stereo, just use the mid-channel + const auto* norms_last = d->last_norm.getPtr(0); + + d->peak_index.clear(); + 
d->trough_index.clear(); + float lowest = norms[0]; + int trough = 0; + if (norms_last[0] >= norms[1]) { + d->peak_index.emplace_back(0); + d->trough_index.emplace_back(0); + } + for (int i = 1; i < _numBins - 1; ++i) { + if (norms_last[i] >= norms[i - 1] && norms_last[i] >= norms[i + 1]) { + d->peak_index.emplace_back(i); + d->trough_index.emplace_back(trough); + trough = i; + lowest = norms[i]; + } else if (norms[i] < lowest) { + lowest = norms[i]; + trough = i; + } + } + if (norms_last[_numBins - 1] > norms[_numBins - 2]) { + d->peak_index.emplace_back(_numBins - 1); + d->trough_index.emplace_back(trough); + } + + if (d->peak_index.size() == 0) { + // use max norm of last frame + int max_idx = 0; + float max_norm = 0.f; + vo::findMaxElement(norms_last, _numBins, max_idx, max_norm); + d->peak_index.emplace_back(max_idx); + } + + const float** p = const_cast(d->phase.getPtrs()); + const float** p_l = const_cast(d->last_phase.getPtrs()); + float** acc = d->phase_accum.getPtrs(); + + float expChange_a = a_a * float(_expectedPhaseChangePerBinPerSample); + float expChange_s = a_s * float(_expectedPhaseChangePerBinPerSample); + + int num_peaks = (int)d->peak_index.size(); + // time integration for all peaks + for (int i = 0; i < num_peaks; ++i) { + const int n = d->peak_index[i]; + auto fn = float(n); + + // determine phase from last frame + float fn_expChange_a = fn * expChange_a; + float fn_expChange_s = fn * expChange_s; + + for (int ch = 0; ch < num_channels; ++ch) { + acc[ch][n] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n] - p_l[ch][n] - fn_expChange_a) + fn_expChange_s; + } + } + + // go from first peak to 0 + for (int n = d->peak_index[0]; n > 0; --n) { + for (int ch = 0; ch < num_channels; ++ch) { + acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]); + } + } + + // 'grow' from pairs of peaks to the lowest norm in between + for (int i = 0; i < num_peaks - 1; ++i) { + const int mid = d->trough_index[i + 1]; + for (int n = d->peak_index[i]; n < mid; ++n) { + for (int ch = 0; ch < num_channels; ++ch) { + acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]); + } + } + for (int n = d->peak_index[i + 1]; n > mid + 1; --n) { + for (int ch = 0; ch < num_channels; ++ch) { + acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]); + } + } + } + + // last peak to the end + for (int n = d->peak_index[num_peaks - 1]; n < _numBins - 1; ++n) { + for (int ch = 0; ch < num_channels; ++ch) { + acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]); + } + } + + d->last_norm.assignSamples(d->norm); + d->last_phase.assignSamples(d->phase); +} + +void TimeAndPitch::_applyImagingReduction() +{ + // Upsampling brings spectral components down, including those that were + // beyond the Nyquist. From `imagingBeginBin`, we have a mirroring of the + // spectrum, which, if not lowpass-filtered by the resampler, may yield + // an aliasing-like quality if what is mirrored is a harmonic series. A + // simple thing to do against that is to scramble the phase values there. + // This way we preserve natural energy fluctuations, but the artifact + // isn't noticeable as much anymore, because much more noise-like, which + // is what one typically gets at those frequencies. 
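// To put numbers on this (example values only, not taken from the original comment):
// with fftSize = 4096 and _pitchFactor = 0.5, everything above bin 4096 / 2 * 0.5 = 1024
// is mirrored content; rounded up to the 16-float alignment computed below,
// imagingBeginBin becomes ceil(1025 / 16) * 16 = 1040, so with _numBins = 2049 only the
// topmost ~1000 bins get their phases scrambled.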
+ constexpr auto alignment = 64 / sizeof(float); + const int imagingBeginBin + =std::ceil((fftSize / 2 * _pitchFactor + 1) / alignment) * alignment; + + if (imagingBeginBin >= _numBins) { + return; + } + + const auto n = _numBins - imagingBeginBin; + auto pSpec = d->spectrum.getPtr(0); + auto pRand = d->random_phases.getPtr(0); + vo::rotate(nullptr, pRand, pSpec + imagingBeginBin, n); + // Just rotating the random phase vector to produce pseudo-random + // sequences of phases. + std::uniform_int_distribution dis(0, n - 1); + const auto middle = dis(d->randomGenerator); + std::rotate(pRand, pRand + middle, pRand + n); +} + +/// process one hop/chunk in _fft_timeSeries and add the result to output circular buffer +void TimeAndPitch::_process_hop(int hop_a, int hop_s) +{ + if (d->exact_hop_a != d->exact_hop_s) { + if (_numChannels == 2) { + _lr_to_ms(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize); + } + + for (int ch = 0; ch < _numChannels; ++ch) { + vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize); + _fft_shift(d->fft_timeseries.getPtr(ch), fftSize); + } + + // determine norm/phase + d->fft.forwardReal(d->fft_timeseries, d->spectrum); + // norms of the mid channel only (or sole channel) are needed in + // _time_stretch + vo::calcNorms(d->spectrum.getPtr(0), d->norm.getPtr(0), d->spectrum.getNumSamples()); + for (int ch = 0; ch < _numChannels; ++ch) { + vo::calcPhases(d->spectrum.getPtr(ch), d->phase.getPtr(ch), d->spectrum.getNumSamples()); + } + + if (_shiftTimbreCb) { + _shiftTimbreCb( + 1 / _pitchFactor, d->spectrum.getPtr(0), d->norm.getPtr(0)); + } + + if (_reduceImaging && _pitchFactor < 1.) { + _applyImagingReduction(); + } + + if (_numChannels == 1) { + _time_stretch<1>((float)hop_a, (float)hop_s); + } else if (_numChannels == 2) { + _time_stretch<2>((float)hop_a, (float)hop_s); + } + + for (int ch = 0; ch < _numChannels; ++ch) { + _unwrapPhaseVec(d->phase_accum.getPtr(ch), _numBins); + } + + for (int ch = 0; ch < _numChannels; ++ch) { + vo::rotate(d->phase.getPtr(ch), d->phase_accum.getPtr(ch), d->spectrum.getPtr(ch), + d->spectrum.getNumSamples()); + } + d->fft.inverseReal(d->spectrum, d->fft_timeseries); + + for (int ch = 0; ch < _numChannels; ++ch) { + vo::constantMultiply(d->fft_timeseries.getPtr(ch), 1.f / fftSize, d->fft_timeseries.getPtr(ch), + d->fft_timeseries.getNumSamples()); + } + + if (_numChannels == 2) { + _ms_to_lr(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize); + } + + for (int ch = 0; ch < _numChannels; ++ch) { + _fft_shift(d->fft_timeseries.getPtr(ch), fftSize); + vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize); + } + } else { // stretch factor == 1.0 => just apply window + for (int ch = 0; ch < _numChannels; ++ch) { + vo::multiply(d->fft_timeseries.getPtr(ch), d->sqWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize); + } + } + + { + float gainFact = float(_timeStretch * ((8.f / 3.f) / _overlap_a)); // overlap add normalization factor + for (int ch = 0; ch < _numChannels; ++ch) { + d->outCircularBuffer[ch].writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->fft_timeseries.getPtr(ch), + gainFact); + } + + if (normalize_window) { + d->normalizationBuffer.writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->sqWindow.getPtr(0), gainFact); + } + } + _outBufferWriteOffset += hop_s; + _availableOutputSamples += hop_s; +} + +void TimeAndPitch::setTimeStretchAndPitchFactor(double timeScale, double 
pitchFactor) +{ + assert(timeScale > 0.0); + assert(pitchFactor > 0.0); + _pitchFactor = pitchFactor; + _timeStretch = timeScale * pitchFactor; + + _overlap_a = overlap; + double overlap_s = overlap; + if (_timeStretch > 1.0 || !modulate_synthesis_hop) { + _overlap_a *= _timeStretch; + } else { + overlap_s /= _timeStretch; + } + + d->exact_hop_a = double(fftSize) / _overlap_a; + if (!modulate_synthesis_hop) { + d->exact_hop_s = double(fftSize) / overlap_s; + } else { + // switch after processing next block, unless it's the first one + d->next_exact_hop_s = double(fftSize) / overlap_s; + if (d->exact_hop_s == 0.0) { // on first chunk set it immediately + d->exact_hop_s = d->next_exact_hop_s; + } + } +} + +void TimeAndPitch::feedAudio(const float* const* input_smp, int numSamples) +{ + assert(numSamples <= _maxBlockSize); + for (int ch = 0; ch < _numChannels; ++ch) { + d->inResampleInputBuffer[ch].writeBlock(0, numSamples, input_smp[ch]); + d->inResampleInputBuffer[ch].advance(numSamples); + } + _resampleReadPos -= numSamples; + + // determine integer hop size for the next hop. + // The error accumulators will make the value fluctuate by one to reach arbitrary scale + // ratios + if (d->exact_hop_s == 0.0) { // this happens if feedAudio is called without setting up stretch factors + d->exact_hop_s = d->next_exact_hop_s; + } + + const int hop_s = int(d->exact_hop_s + d->hop_s_err); + const int hop_a = int(d->exact_hop_a + d->hop_a_err); + + double step = 0.0; + double read_pos = _resampleReadPos; + while (read_pos < 0.0) + { + auto int_pos = int(std::floor(read_pos)); + float frac_pos = float(read_pos - int_pos); + for (int ch = 0; ch < _numChannels; ++ch) { + float smp[6]; + d->inResampleInputBuffer[ch].readBlock(int_pos - 6, 6, smp); + float s = (frac_pos == 0) ? 
smp[2] : lagrange6(smp, frac_pos); + d->inCircularBuffer[ch].writeOffset0(s); + d->inCircularBuffer[ch].advance(1); + } + _analysis_hop_counter++; + step++; + + if (_analysis_hop_counter >= hop_a) { + _analysis_hop_counter -= hop_a; + d->hop_s_err += d->exact_hop_s - hop_s; + d->hop_a_err += d->exact_hop_a - hop_a; + for (int ch = 0; ch < _numChannels; ++ch) { + d->inCircularBuffer[ch].readBlock(-fftSize, fftSize, d->fft_timeseries.getPtr(ch)); + } + _process_hop(hop_a, hop_s); + } + read_pos = _resampleReadPos + step * _pitchFactor; + } + _resampleReadPos = read_pos; +} + +void TimeAndPitch::retrieveAudio(float* const* out_smp, int numSamples) +{ + assert(numSamples <= _maxBlockSize); + for (int ch = 0; ch < _numChannels; ++ch) { + d->outCircularBuffer[ch].readAndClearBlock(0, numSamples, out_smp[ch]); + if (normalize_window) { + constexpr float curve + =4.f * 4.f; // the curve approximates 1/x over 1 but fades to 0 near 0 to avoid fade-in clicks + for (int i = 0; i < numSamples; ++i) { + float x = d->normalizationBuffer.read(i); + out_smp[ch][i] *= x / (x * x + 1 / curve); + } + } + d->outCircularBuffer[ch].advance(numSamples); + } + + d->normalizationBuffer.clearBlock(0, numSamples); + d->normalizationBuffer.advance(numSamples); + + _outBufferWriteOffset -= numSamples; + _availableOutputSamples -= numSamples; + + if (modulate_synthesis_hop) { + d->exact_hop_s = d->next_exact_hop_s; + } +} + +void TimeAndPitch::processPitchShift(float* const* smp, int numSamples, double pitchFactor) +{ + setTimeStretchAndPitchFactor(1.0, pitchFactor); + feedAudio(smp, numSamples); + retrieveAudio(smp, numSamples); +} + +int TimeAndPitch::getLatencySamples() const +{ + return fftSize - fftSize / overlap + 3; // 3 for resampling +} + +int TimeAndPitch::getLatencySamplesForStretchRatio(float timeStretch) const +{ + const float coeff = (timeStretch < 1.f) ? (1.f / 3.f) : (2.f / 3.f); + return int(getLatencySamples() * (timeStretch * coeff + (1 - coeff))); +} +} // namespace staffpad diff --git a/libs/staffpad/VectorOps.h b/libs/staffpad/VectorOps.h new file mode 100644 index 0000000000..7179dbfbb8 --- /dev/null +++ b/libs/staffpad/VectorOps.h @@ -0,0 +1,155 @@ +// +// This header is provided for convenience, to easily wrap vector operations around +// their platform-specific optimised libraries (e.g. IPP, vDSP), if desired. +// This can be done by adding #if defined(...) compile-time branches to each function +// and calling the corresponding library function, e.g. ippsAdd_32f or vDSP_vadd +// inside add(). 
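// As a sketch of such a branch for the float case (the Accelerate call is an
// assumption about the target platform, not something this library provides):
//
//   #if defined(__APPLE__)
//     // inside add(): strided vector add from <Accelerate/Accelerate.h>, stride 1
//     vDSP_vadd(src1, 1, src2, 1, dst, 1, (vDSP_Length)n);
//   #else
//     for (int32_t i = 0; i < n; i++) {
//       dst[i] = src1[i] + src2[i];
//     }
//   #endif
//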
+// + +#pragma once + +#include + +#include +#include +#include + +#if defined(_MSC_VER) && (defined(_M_AMD64) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 2)) +#define USE_SSE2_COMPLEX 1 +#endif + +#if USE_SSE2_COMPLEX +# include "SimdComplexConversions_sse2.h" +#endif + +namespace staffpad { +namespace vo { +inline void* allocate(int32_t bytes) +{ + return ::malloc(bytes); +} + +inline void free(void* ptr) +{ + return ::free(ptr); +} + +template +inline void copy(const T* src, T* dst, int32_t n) +{ + memcpy(dst, src, n * sizeof(T)); +} + +template +inline void add(const T* src1, const T* src2, T* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] = src1[i] + src2[i]; + } +} + +template +inline void subtract(const T* src1, const T* src2, T* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] = src2[i] - src1[i]; + } +} + +template +inline void constantMultiply(const T* src, T constant, T* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] = src[i] * constant; + } +} + +template +inline void constantMultiplyAndAdd(const T* src, T constant, T* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] += src[i] * constant; + } +} + +template +inline void multiply(const T* src1, const T* src2, T* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] = src1[i] * src2[i]; + } +} + +template +inline void setToZero(T* dst, int32_t n) +{ + std::fill(dst, dst + n, 0.f); +} + +template +inline void findMaxElement(const T* src, int32_t n, int32_t& maxIndex, T& maxValue) +{ + maxIndex = 0; + maxValue = n > 0 ? src[0] : std::numeric_limits::min(); + + for (int32_t i = 1; i < n; i++) { + if (src[i] > maxValue) { + maxValue = src[i]; + maxIndex = i; + } + } +} + +#if USE_SSE2_COMPLEX + +inline void calcPhases(const std::complex* src, float* dst, int32_t n) +{ + simd_complex_conversions::perform_parallel_simd_aligned( + src, dst, n, + [](const __m128 rp, const __m128 ip, __m128& out) + { out = simd_complex_conversions::atan2_ps(ip, rp); }); +} + +inline void calcNorms(const std::complex* src, float* dst, int32_t n) +{ + simd_complex_conversions::perform_parallel_simd_aligned( + src, dst, n, + [](const __m128 rp, const __m128 ip, __m128& out) + { out = simd_complex_conversions::norm(rp, ip); }); +} + +inline void rotate( + const float* oldPhase, const float* newPhase, std::complex* dst, + int32_t n) +{ + simd_complex_conversions::rotate_parallel_simd_aligned( + oldPhase, newPhase, dst, n); +} + +#else +inline void calcPhases(const std::complex* src, float* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] = std::arg(src[i]); + } +} + +inline void calcNorms(const std::complex* src, float* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] = std::norm(src[i]); + } +} + +inline void rotate(const float* oldPhase, const float* newPhase, std::complex* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + const auto theta = oldPhase ? newPhase[i] - oldPhase[i] : newPhase[i]; + dst[i] *= std::complex(cosf(theta), sinf(theta)); + } +} + +#endif +} // namespace vo +} // namespace staffpad diff --git a/libs/staffpad/pffft/README.txt b/libs/staffpad/pffft/README.txt new file mode 100644 index 0000000000..878f4a67ee --- /dev/null +++ b/libs/staffpad/pffft/README.txt @@ -0,0 +1,379 @@ +PFFFT: a pretty fast FFT. + +TL;DR +-- + +PFFFT does 1D Fast Fourier Transforms, of single precision real and +complex vectors. It tries do it fast, it tries to be correct, and it +tries to be small. 
Computations do take advantage of SSE1 instructions +on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The +license is BSD-like. + + +Why does it exist: +-- + +I was in search of a good performing FFT library , preferably very +small and with a very liberal license. + +When one says "fft library", FFTW ("Fastest Fourier Transform in the +West") is probably the first name that comes to mind -- I guess that +99% of open-source projects that need a FFT do use FFTW, and are happy +with it. However, it is quite a large library , which does everything +fft related (2d transforms, 3d transforms, other transformations such +as discrete cosine , or fast hartley). And it is licensed under the +GNU GPL , which means that it cannot be used in non open-source +products. + +An alternative to FFTW that is really small, is the venerable FFTPACK +v4, which is available on NETLIB. A more recent version (v5) exists, +but it is larger as it deals with multi-dimensional transforms. This +is a library that is written in FORTRAN 77, a language that is now +considered as a bit antiquated by many. FFTPACKv4 was written in 1985, +by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite +its age, benchmarks show it that it still a very good performing FFT +library, see for example the 1d single precision benchmarks here: +http://www.fftw.org/speed/opteron-2.2GHz-32bit/ . It is however not +competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML, +Apple vDSP. The reason for that is that those libraries do take +advantage of the SSE SIMD instructions available on Intel CPUs, +available since the days of the Pentium III. These instructions deal +with small vectors of 4 floats at a time, instead of a single float +for a traditionnal FPU, so when using these instructions one may expect +a 4-fold performance improvement. + +The idea was to take this fortran fftpack v4 code, translate to C, +modify it to deal with those SSE instructions, and check that the +final performance is not completely ridiculous when compared to other +SIMD FFT libraries. Translation to C was performed with f2c ( +http://www.netlib.org/f2c/ ). The resulting file was a bit edited in +order to remove the thousands of gotos that were introduced by +f2c. You will find the fftpack.h and fftpack.c sources in the +repository, this a complete translation of +http://www.netlib.org/fftpack/ , with the discrete cosine transform +and the test program. There is no license information in the netlib +repository, but it was confirmed to me by the fftpack v5 curators that +the same terms do apply to fftpack v4: +http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html . This is a +"BSD-like" license, it is compatible with proprietary projects. + +Adapting fftpack to deal with the SIMD 4-element vectors instead of +scalar single precision numbers was more complex than I originally +thought, especially with the real transforms, and I ended up writing +more code than I planned.. + + +The code: +-- + +Only two files, in good old C, pffft.c and pffft.h . The API is very +very simple, just make sure that you read the comments in pffft.h. + + +Comparison with other FFTs: +-- + +The idea was not to break speed records, but to get a decently fast +fft that is at least 50% as fast as the fastest FFT -- especially on +slowest computers . I'm more focused on getting the best performance +on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than +on getting top performance on today fastest cpus. 
+ +It can be used in a real-time context as the fft functions do not +perform any memory allocation -- that is why they accept a 'work' +array in their arguments. + +It is also a bit focused on performing 1D convolutions, that is why it +provides "unordered" FFTs , and a fourier domain convolution +operation. + + +Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9) +-- + +The benchmark shows the performance of various fft implementations measured in +MFlops, with the number of floating point operations being defined as 5Nlog2(N) +for a length N complex fft, and 2.5*Nlog2(N) for a real fft. +See http://www.fftw.org/speed/method.html for an explanation of these formulas. + +MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600 + +Built with: + + gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f + +| input len |real FFTPack| real vDSP | real FFTW | real PFFFT | |cplx FFTPack| cplx vDSP | cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------+------------| |------------+------------+------------+------------| +| 64 | 2816 | 8596 | 7329 | 8187 | | 2887 | 14898 | 14668 | 11108 | +| 96 | 3298 | n/a | 8378 | 7727 | | 3953 | n/a | 15680 | 10878 | +| 128 | 3507 | 11575 | 9266 | 10108 | | 4233 | 17598 | 16427 | 12000 | +| 160 | 3391 | n/a | 9838 | 10711 | | 4220 | n/a | 16653 | 11187 | +| 192 | 3919 | n/a | 9868 | 10956 | | 4297 | n/a | 15770 | 12540 | +| 256 | 4283 | 13179 | 10694 | 13128 | | 4545 | 19550 | 16350 | 13822 | +| 384 | 3136 | n/a | 10810 | 12061 | | 3600 | n/a | 16103 | 13240 | +| 480 | 3477 | n/a | 10632 | 12074 | | 3536 | n/a | 11630 | 12522 | +| 512 | 3783 | 15141 | 11267 | 13838 | | 3649 | 20002 | 16560 | 13580 | +| 640 | 3639 | n/a | 11164 | 13946 | | 3695 | n/a | 15416 | 13890 | +| 768 | 3800 | n/a | 11245 | 13495 | | 3590 | n/a | 15802 | 14552 | +| 800 | 3440 | n/a | 10499 | 13301 | | 3659 | n/a | 12056 | 13268 | +| 1024 | 3924 | 15605 | 11450 | 15339 | | 3769 | 20963 | 13941 | 15467 | +| 2048 | 4518 | 16195 | 11551 | 15532 | | 4258 | 20413 | 13723 | 15042 | +| 2400 | 4294 | n/a | 10685 | 13078 | | 4093 | n/a | 12777 | 13119 | +| 4096 | 4750 | 16596 | 11672 | 15817 | | 4157 | 19662 | 14316 | 14336 | +| 8192 | 3820 | 16227 | 11084 | 12555 | | 3691 | 18132 | 12102 | 13813 | +| 9216 | 3864 | n/a | 10254 | 12870 | | 3586 | n/a | 12119 | 13994 | +| 16384 | 3822 | 15123 | 10454 | 12822 | | 3613 | 16874 | 12370 | 13881 | +| 32768 | 4175 | 14512 | 10662 | 11095 | | 3881 | 14702 | 11619 | 11524 | +| 262144 | 3317 | 11429 | 6269 | 9517 | | 2810 | 11729 | 7757 | 10179 | +| 1048576 | 2913 | 10551 | 4730 | 5867 | | 2661 | 7881 | 3520 | 5350 | +|-----------+------------+------------+------------+------------| |------------+------------+------------+------------| + + +Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600 + +Built with: +gcc -o test_pffft -DHAVE_FFTW -msse2 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L$HOME/local/lib -I$HOME/local/include/ -lfftw3f -lm + +| N (input length) | real FFTPack | real FFTW | real PFFFT | | cplx FFTPack | cplx FFTW | cplx PFFFT | +|------------------+--------------+--------------+--------------| |--------------+--------------+--------------| +| 64 | 3840 | 7680 | 8777 | | 4389 | 20480 | 11171 | +| 96 | 4214 | 9633 | 8429 | | 4816 | 22477 | 11238 | +| 128 | 3584 | 10240 | 10240 | | 5120 | 23893 | 11947 | +| 192 | 4854 | 11095 | 12945 | | 4854 | 22191 
| 14121 | +| 256 | 4096 | 11703 | 16384 | | 5120 | 23406 | 13653 | +| 384 | 4395 | 14651 | 12558 | | 4884 | 19535 | 14651 | +| 512 | 5760 | 13166 | 15360 | | 4608 | 23040 | 15360 | +| 768 | 4907 | 14020 | 16357 | | 4461 | 19628 | 14020 | +| 1024 | 5120 | 14629 | 14629 | | 5120 | 20480 | 15754 | +| 2048 | 5632 | 14080 | 18773 | | 4693 | 12516 | 16091 | +| 4096 | 5120 | 13653 | 17554 | | 4726 | 7680 | 14456 | +| 8192 | 4160 | 7396 | 13312 | | 4437 | 14791 | 13312 | +| 9216 | 4210 | 6124 | 13473 | | 4491 | 7282 | 14970 | +| 16384 | 3976 | 11010 | 14313 | | 4210 | 11450 | 13631 | +| 32768 | 4260 | 10224 | 10954 | | 4260 | 6816 | 11797 | +| 262144 | 3736 | 6896 | 9961 | | 2359 | 8965 | 9437 | +| 1048576 | 2796 | 4534 | 6453 | | 1864 | 3078 | 5592 | +|------------------+--------------+--------------+--------------| |--------------+--------------+--------------| + + + +MacOS Snow Leopard, gcc 4.0, 32-bit, fftw 3.3 on a 1.83 GHz core 1 duo + +Built with: + + gcc -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework veclib + +| input len |real FFTPack| real vDSP | real FFTW | real PFFFT | |cplx FFTPack| cplx vDSP | cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------+------------| |------------+------------+------------+------------| +| 64 | 745 | 2145 | 1706 | 2028 | | 961 | 3356 | 3313 | 2300 | +| 96 | 877 | n/a | 1976 | 1978 | | 1059 | n/a | 3333 | 2233 | +| 128 | 951 | 2808 | 2213 | 2279 | | 1202 | 3803 | 3739 | 2494 | +| 192 | 1002 | n/a | 2456 | 2429 | | 1186 | n/a | 3701 | 2508 | +| 256 | 1065 | 3205 | 2641 | 2793 | | 1302 | 4013 | 3912 | 2663 | +| 384 | 845 | n/a | 2759 | 2499 | | 948 | n/a | 3729 | 2504 | +| 512 | 900 | 3476 | 2956 | 2759 | | 974 | 4057 | 3954 | 2645 | +| 768 | 910 | n/a | 2912 | 2737 | | 975 | n/a | 3837 | 2614 | +| 1024 | 936 | 3583 | 3107 | 3009 | | 1006 | 4124 | 3821 | 2697 | +| 2048 | 1057 | 3585 | 3091 | 2837 | | 1089 | 3889 | 3701 | 2513 | +| 4096 | 1083 | 3524 | 3092 | 2733 | | 1039 | 3617 | 3462 | 2364 | +| 8192 | 874 | 3252 | 2967 | 2363 | | 911 | 3106 | 2789 | 2302 | +| 9216 | 898 | n/a | 2420 | 2290 | | 865 | n/a | 2676 | 2204 | +| 16384 | 903 | 2892 | 2506 | 2421 | | 899 | 3026 | 2797 | 2289 | +| 32768 | 965 | 2837 | 2550 | 2358 | | 920 | 2922 | 2763 | 2240 | +| 262144 | 738 | 2422 | 1589 | 1708 | | 610 | 2038 | 1436 | 1091 | +| 1048576 | 528 | 1207 | 845 | 880 | | 606 | 1020 | 669 | 1036 | +|-----------+------------+------------+------------+------------| |------------+------------+------------+------------| + + + +Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.2 on a 2.66 core 2 quad + +Built with: +gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm + +| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------| |------------+------------+------------| +| 64 | 1920 | 3614 | 5120 | | 2194 | 7680 | 6467 | +| 96 | 1873 | 3549 | 5187 | | 2107 | 8429 | 5863 | +| 128 | 2240 | 3773 | 5514 | | 2560 | 7964 | 6827 | +| 192 | 1765 | 4569 | 7767 | | 2284 | 9137 | 7061 | +| 256 | 2048 | 5461 | 7447 | | 2731 | 9638 | 7802 | +| 384 | 1998 | 5861 | 6762 | | 2313 | 9253 | 7644 | +| 512 | 2095 | 6144 | 7680 | | 2194 | 10240 | 7089 | +| 768 | 2230 | 5773 | 7549 | | 2045 | 10331 | 7010 | +| 1024 | 2133 | 6400 | 8533 | | 2133 | 10779 | 7877 | +| 2048 | 2011 | 7040 | 8665 | | 1942 | 10240 | 7768 | +| 
4096 | 2194 | 6827 | 8777 | | 1755 | 9452 | 6827 | +| 8192 | 1849 | 6656 | 6656 | | 1752 | 7831 | 6827 | +| 9216 | 1871 | 5858 | 6416 | | 1643 | 6909 | 6266 | +| 16384 | 1883 | 6223 | 6506 | | 1664 | 7340 | 6982 | +| 32768 | 1826 | 6390 | 6667 | | 1631 | 7481 | 6971 | +| 262144 | 1546 | 4075 | 5977 | | 1299 | 3415 | 3551 | +| 1048576 | 1104 | 2071 | 1730 | | 1104 | 1149 | 1834 | +|-----------+------------+------------+------------| |------------+------------+------------| + + + +Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.3 on a 1.6 GHz Atom N270 + +Built with: +gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm + +| N (input length) | real FFTPack | real FFTW | real PFFFT | | cplx FFTPack | cplx FFTW | cplx PFFFT | +|------------------+--------------+--------------+--------------| |--------------+--------------+--------------| +| 64 | 452 | 1041 | 1336 | | 549 | 2318 | 1781 | +| 96 | 444 | 1297 | 1297 | | 503 | 2408 | 1686 | +| 128 | 527 | 1525 | 1707 | | 543 | 2655 | 1886 | +| 192 | 498 | 1653 | 1849 | | 539 | 2678 | 1942 | +| 256 | 585 | 1862 | 2156 | | 594 | 2777 | 2244 | +| 384 | 499 | 1870 | 1998 | | 511 | 2586 | 1890 | +| 512 | 562 | 2095 | 2194 | | 542 | 2973 | 2194 | +| 768 | 545 | 2045 | 2133 | | 545 | 2365 | 2133 | +| 1024 | 595 | 2133 | 2438 | | 569 | 2695 | 2179 | +| 2048 | 587 | 2125 | 2347 | | 521 | 2230 | 1707 | +| 4096 | 495 | 1890 | 1834 | | 492 | 1876 | 1672 | +| 8192 | 469 | 1548 | 1729 | | 438 | 1740 | 1664 | +| 9216 | 468 | 1663 | 1663 | | 446 | 1585 | 1531 | +| 16384 | 453 | 1608 | 1767 | | 398 | 1476 | 1664 | +| 32768 | 456 | 1420 | 1503 | | 387 | 1388 | 1345 | +| 262144 | 309 | 385 | 726 | | 262 | 415 | 840 | +| 1048576 | 280 | 351 | 739 | | 261 | 313 | 797 | +|------------------+--------------+--------------+--------------| |--------------+--------------+--------------| + + + +Windows 7, visual c++ 2010 on a 1.6 GHz Atom N270 + +Built with: +cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c + +(visual c++ is definitively not very good with SSE intrinsics...) 
+ +| N (input length) | real FFTPack | real PFFFT | | cplx FFTPack | cplx PFFFT | +|------------------+--------------+--------------| |--------------+--------------| +| 64 | 173 | 1009 | | 174 | 1159 | +| 96 | 169 | 1029 | | 188 | 1201 | +| 128 | 195 | 1242 | | 191 | 1275 | +| 192 | 178 | 1312 | | 184 | 1276 | +| 256 | 196 | 1591 | | 186 | 1281 | +| 384 | 172 | 1409 | | 181 | 1281 | +| 512 | 187 | 1640 | | 181 | 1313 | +| 768 | 171 | 1614 | | 176 | 1258 | +| 1024 | 186 | 1812 | | 178 | 1223 | +| 2048 | 190 | 1707 | | 186 | 1099 | +| 4096 | 182 | 1446 | | 177 | 975 | +| 8192 | 175 | 1345 | | 169 | 1034 | +| 9216 | 165 | 1271 | | 168 | 1023 | +| 16384 | 166 | 1396 | | 165 | 949 | +| 32768 | 172 | 1311 | | 161 | 881 | +| 262144 | 136 | 632 | | 134 | 629 | +| 1048576 | 134 | 698 | | 127 | 623 | +|------------------+--------------+--------------| |--------------+--------------| + + + +Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3) + +Built with: +gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f + +| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------| |------------+------------+------------| +| 64 | 549 | 452 | 731 | | 512 | 602 | 640 | +| 96 | 421 | 272 | 702 | | 496 | 571 | 602 | +| 128 | 498 | 512 | 815 | | 597 | 618 | 652 | +| 160 | 521 | 536 | 815 | | 586 | 669 | 625 | +| 192 | 539 | 571 | 883 | | 485 | 597 | 626 | +| 256 | 640 | 539 | 975 | | 569 | 611 | 671 | +| 384 | 499 | 610 | 879 | | 499 | 602 | 637 | +| 480 | 518 | 507 | 877 | | 496 | 661 | 616 | +| 512 | 524 | 591 | 1002 | | 549 | 678 | 668 | +| 640 | 542 | 612 | 955 | | 568 | 663 | 645 | +| 768 | 557 | 613 | 981 | | 491 | 663 | 598 | +| 800 | 514 | 353 | 882 | | 514 | 360 | 574 | +| 1024 | 640 | 640 | 1067 | | 492 | 683 | 602 | +| 2048 | 587 | 640 | 908 | | 486 | 640 | 552 | +| 2400 | 479 | 368 | 777 | | 422 | 376 | 518 | +| 4096 | 511 | 614 | 853 | | 426 | 640 | 534 | +| 8192 | 415 | 584 | 708 | | 386 | 622 | 516 | +| 9216 | 419 | 571 | 687 | | 364 | 586 | 506 | +| 16384 | 426 | 577 | 716 | | 398 | 606 | 530 | +| 32768 | 417 | 572 | 673 | | 399 | 572 | 468 | +| 262144 | 219 | 380 | 293 | | 255 | 431 | 343 | +| 1048576 | 202 | 274 | 237 | | 265 | 282 | 355 | +|-----------+------------+------------+------------| |------------+------------+------------| + +Same platform as above, but this time pffft and fftpack are built with clang 3.2: + +clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f + +| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------| |------------+------------+------------| +| 64 | 427 | 452 | 853 | | 427 | 602 | 1024 | +| 96 | 351 | 276 | 843 | | 337 | 571 | 963 | +| 128 | 373 | 512 | 996 | | 390 | 618 | 1054 | +| 160 | 426 | 536 | 987 | | 375 | 669 | 914 | +| 192 | 404 | 571 | 1079 | | 388 | 588 | 1079 | +| 256 | 465 | 539 | 1205 | | 445 | 602 | 1170 | +| 384 | 366 | 610 | 1099 | | 343 | 594 | 1099 | +| 480 | 356 | 507 | 1140 | | 335 | 651 | 931 | +| 512 | 411 | 591 | 1213 | | 384 | 649 | 1124 | +| 640 | 398 | 612 | 1193 | | 373 | 654 | 901 | +| 768 | 409 | 613 | 1227 | | 383 | 663 | 
1044 | +| 800 | 411 | 348 | 1073 | | 353 | 358 | 809 | +| 1024 | 427 | 640 | 1280 | | 413 | 692 | 1004 | +| 2048 | 414 | 626 | 1126 | | 371 | 640 | 853 | +| 2400 | 399 | 373 | 898 | | 319 | 368 | 653 | +| 4096 | 404 | 602 | 1059 | | 357 | 633 | 778 | +| 8192 | 332 | 584 | 792 | | 308 | 616 | 716 | +| 9216 | 322 | 561 | 783 | | 299 | 586 | 687 | +| 16384 | 344 | 568 | 778 | | 314 | 617 | 745 | +| 32768 | 342 | 564 | 737 | | 314 | 552 | 629 | +| 262144 | 201 | 383 | 313 | | 227 | 435 | 413 | +| 1048576 | 187 | 262 | 251 | | 228 | 281 | 409 | +|-----------+------------+------------+------------| |------------+------------+------------| + +So it looks like, on ARM, gcc 4.7 is the best at scalar floating point +(the fftpack performance numbers are better with gcc), while clang is +the best with neon intrinsics (see how pffft perf has improved with +clang 3.2). + + +NVIDIA Jetson TK1 board, gcc-4.8.2. The cpu is a 2.3GHz cortex A15 (Tegra K1). + +Built with: +gcc -O3 -march=armv7-a -mtune=native -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm + +| input len |real FFTPack| real PFFFT | |cplx FFTPack| cplx PFFFT | +|-----------+------------+------------| |------------+------------| +| 64 | 1735 | 3308 | | 1994 | 3744 | +| 96 | 1596 | 3448 | | 1987 | 3572 | +| 128 | 1807 | 4076 | | 2255 | 3960 | +| 160 | 1769 | 4083 | | 2071 | 3845 | +| 192 | 1990 | 4233 | | 2017 | 3939 | +| 256 | 2191 | 4882 | | 2254 | 4346 | +| 384 | 1878 | 4492 | | 2073 | 4012 | +| 480 | 1748 | 4398 | | 1923 | 3951 | +| 512 | 2030 | 5064 | | 2267 | 4195 | +| 640 | 1918 | 4756 | | 2094 | 4184 | +| 768 | 2099 | 4907 | | 2048 | 4297 | +| 800 | 1822 | 4555 | | 1880 | 4063 | +| 1024 | 2232 | 5355 | | 2187 | 4420 | +| 2048 | 2176 | 4983 | | 2027 | 3602 | +| 2400 | 1741 | 4256 | | 1710 | 3344 | +| 4096 | 1816 | 3914 | | 1851 | 3349 | +| 8192 | 1716 | 3481 | | 1700 | 3255 | +| 9216 | 1735 | 3589 | | 1653 | 3094 | +| 16384 | 1567 | 3483 | | 1637 | 3244 | +| 32768 | 1624 | 3240 | | 1655 | 3156 | +| 262144 | 1012 | 1898 | | 983 | 1503 | +| 1048576 | 876 | 1154 | | 868 | 1341 | +|-----------+------------+------------| |------------+------------| + +The performance on the tegra K1 is pretty impressive. I'm not +including the FFTW numbers as they as slightly below the scalar +fftpack numbers, so something must be wrong (however it seems to be +correctly configured and is using neon simd instructions). + +When using clang 3.4 the pffft version is even a bit faster, reaching +5.7 GFlops for real ffts of size 1024. diff --git a/libs/staffpad/pffft/pffft.c b/libs/staffpad/pffft/pffft.c new file mode 100644 index 0000000000..779f266831 --- /dev/null +++ b/libs/staffpad/pffft/pffft.c @@ -0,0 +1,1866 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB + (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber + of NCAR, in 1985. + + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. 
+ + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. + + + PFFFT : a Pretty Fast FFT. + + This file is largerly based on the original FFTPACK implementation, modified in + order to take advantage of SIMD instructions of modern CPUs. +*/ + +/* + ChangeLog: + - 2011/10/02, version 1: This is the very first release of this file. +*/ + +#include "pffft.h" + +/* detect compiler flavour */ +#if defined(_MSC_VER) +# define COMPILER_MSVC +#elif defined(__GNUC__) +# define COMPILER_GCC +#endif + +#include +#include +#include +#include +#include + +#ifndef M_PI +#define M_PI (3.14159265358979323846) +#endif + +#ifndef M_SQRT2 +#define M_SQRT2 (1.41421356237309504880) +#endif + +#if defined(COMPILER_GCC) +# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) +# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) +# define RESTRICT __restrict +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; +#elif defined(COMPILER_MSVC) +# define ALWAYS_INLINE(return_type) __forceinline return_type +# define NEVER_INLINE(return_type) __declspec(noinline) return_type +# define RESTRICT __restrict +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) +#endif + + +#ifdef COMPILER_MSVC +#pragma warning( disable : 4244 4305 4204 4456 ) +#endif + +/* + vector support macros: the rest of the code is independant of + SSE/Altivec/NEON -- adding support for other platforms with 4-element + vectors should be limited to these macros +*/ + + +// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code +//#define PFFFT_SIMD_DISABLE + +#ifdef PFFFT_FLOAT +#define float PFFFT_FLOAT +#endif + + +#include "pfsimd_macros.h" + +/* detect bugs with the vector support macros */ +void validate_pffft_simd(void); // silence warning +void validate_pffft_simd(void) { +#ifndef PFFFT_SIMD_DISABLE + Vvalidate_simd(); +#endif +} + +/* SSE and co like 16-bytes aligned pointers */ +#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines... 
+void *pffft_aligned_malloc(size_t nb_bytes) { + return Valigned_malloc(nb_bytes); +} + +void pffft_aligned_free(void *p) { + Valigned_free(p); +} + +int pffft_simd_size(void) { return SIMD_SZ; } + +/* + passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 +*/ +static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) { + int k, i; + int l1ido = l1*ido; + if (ido <= 2) { + for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) { + ch[0] = VADD(cc[0], cc[ido+0]); + ch[l1ido] = VSUB(cc[0], cc[ido+0]); + ch[1] = VADD(cc[1], cc[ido+1]); + ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]); + } + } else { + for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) { + for (i=0; i 2); + for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) { + for (i=0; i 2); + for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) { + for (i = 0; i < ido-1; i += 2) { + ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5)); + ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5)); + ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4)); + ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4)); + tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5)); + tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5)); + tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4)); + tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4)); + ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); + ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); + cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3))); + ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3))); + cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3))); + ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3))); + cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); + ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); + cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); + ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); + dr3 = VSUB(cr3, ci4); + dr4 = VADD(cr3, ci4); + di3 = VADD(ci3, cr4); + di4 = VSUB(ci3, cr4); + dr5 = VADD(cr2, ci5); + dr2 = VSUB(cr2, ci5); + di5 = VSUB(ci2, cr5); + di2 = VADD(ci2, cr5); + wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1]; + wr3=wa3[i]; wi3=fsign*wa3[i+1]; wr4=wa4[i]; wi4=fsign*wa4[i+1]; + VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + ch_ref(i - 1, 2) = dr2; + ch_ref(i, 2) = di2; + VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + ch_ref(i - 1, 3) = dr3; + ch_ref(i, 3) = di3; + VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); + ch_ref(i - 1, 4) = dr4; + ch_ref(i, 4) = di4; + VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); + ch_ref(i - 1, 5) = dr5; + ch_ref(i, 5) = di5; + } + } +#undef ch_ref +#undef cc_ref +} + +static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) { + static const float minus_one = -1.f; + int i, k, l1ido = l1*ido; + for (k=0; k < l1ido; k += ido) { + v4sf a = cc[k], b = cc[k + l1ido]; + ch[2*k] = VADD(a, b); + ch[2*(k+ido)-1] = VSUB(a, b); + } + if (ido < 2) return; + if (ido != 2) { + for (k=0; k < l1ido; k += ido) { + for (i=2; i 5) { + wa[i1-1] = wa[i-1]; + wa[i1] = wa[i]; + } + } + l1 = l2; + } +} /* cffti1 */ + + +v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) { + v4sf *in = (v4sf*)input_readonly; + v4sf *out = (in == work2 ? 
work1 : work2); + int nf = ifac[1], k1; + int l1 = 1; + int iw = 0; + assert(in != out && work1 != work2); + for (k1=2; k1<=nf+1; k1++) { + int ip = ifac[k1]; + int l2 = ip*l1; + int ido = n / l2; + int idot = ido + ido; + switch (ip) { + case 5: { + int ix2 = iw + idot; + int ix3 = ix2 + idot; + int ix4 = ix3 + idot; + passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); + } break; + case 4: { + int ix2 = iw + idot; + int ix3 = ix2 + idot; + passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign); + } break; + case 2: { + passf2_ps(idot, l1, in, out, &wa[iw], isign); + } break; + case 3: { + int ix2 = iw + idot; + passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign); + } break; + default: + assert(0); + } + l1 = l2; + iw += (ip - 1)*idot; + if (out == work2) { + out = work1; in = work2; + } else { + out = work2; in = work1; + } + } + + return in; /* this is in fact the output .. */ +} + + +struct PFFFT_Setup { + int N; + int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) + int ifac[15]; + pffft_transform_t transform; + v4sf *data; // allocated room for twiddle coefs + float *e; // points into 'data' , N/4*3 elements + float *twiddle; // points into 'data', N/4 elements +}; + +PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) { + PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup)); + int k, m; + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + and 32 for real FFTs -- a lot of stuff would need to be rewritten to + handle other cases (or maybe just switch to a scalar fft, I don't know..) */ + if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); } + if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); } + //assert((N % 32) == 0); + s->N = N; + s->transform = transform; + /* nb of complex simd vectors */ + s->Ncvec = (transform == PFFFT_REAL ? 
N/2 : N)/SIMD_SZ; + s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf)); + s->e = (float*)s->data; + s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ); + + if (transform == PFFFT_REAL) { + for (k=0; k < s->Ncvec; ++k) { + int i = k/SIMD_SZ; + int j = k%SIMD_SZ; + for (m=0; m < SIMD_SZ-1; ++m) { + float A = -2*(float)M_PI*(m+1)*k / N; + s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cosf(A); + s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sinf(A); + } + } + rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + } else { + for (k=0; k < s->Ncvec; ++k) { + int i = k/SIMD_SZ; + int j = k%SIMD_SZ; + for (m=0; m < SIMD_SZ-1; ++m) { + float A = -2*(float)M_PI*(m+1)*k / N; + s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cosf(A); + s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sinf(A); + } + } + cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + } + + /* check that N is decomposable with allowed prime factors */ + for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; } + if (m != N/SIMD_SZ) { + pffft_destroy_setup(s); s = 0; + } + + return s; +} + + +void pffft_destroy_setup(PFFFT_Setup *s) { + pffft_aligned_free(s->data); + free(s); +} + +#if !defined(PFFFT_SIMD_DISABLE) + +/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ +static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) { + v4sf g0, g1; + int k; + INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride; + + *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] + for (k=1; k < N; ++k) { + v4sf h0, h1; + INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride; + *--out = VSWAPHL(g1, h0); + *--out = VSWAPHL(h0, h1); + g1 = h1; + } + *--out = VSWAPHL(g1, g0); +} + +static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) { + v4sf g0, g1, h0, h1; + int k; + g0 = g1 = in[0]; ++in; + for (k=1; k < N; ++k) { + h0 = *in++; h1 = *in++; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride; + g1 = h1; + } + h0 = *in++; h1 = g0; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + UNINTERLEAVE2(h0, g1, out[0], out[1]); +} + +void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { + int k, N = setup->N, Ncvec = setup->Ncvec; + const v4sf *vin = (const v4sf*)in; + v4sf *vout = (v4sf*)out; + assert(in != out); + if (setup->transform == PFFFT_REAL) { + int k, dk = N/32; + if (direction == PFFFT_FORWARD) { + for (k=0; k < dk; ++k) { + INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); + INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); + } + reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2)); + reversed_copy(dk, vin+6, 8, (v4sf*)(out + N)); + } else { + for (k=0; k < dk; ++k) { + UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); + UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); + } + unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8); + unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8); + } + } else { + if (direction == PFFFT_FORWARD) { + for (k=0; k < Ncvec; ++k) { + int kk = (k/4) + (k%4)*(Ncvec/4); + INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); + } + } else { + for (k=0; k < Ncvec; ++k) { + int kk = (k/4) + (k%4)*(Ncvec/4); + UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); + } + } + } +} + +void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; // 
number of 4x4 matrix blocks + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + assert(in != out); + for (k=0; k < dk; ++k) { + r0 = in[8*k+0]; i0 = in[8*k+1]; + r1 = in[8*k+2]; i1 = in[8*k+3]; + r2 = in[8*k+4]; i2 = in[8*k+5]; + r3 = in[8*k+6]; i3 = in[8*k+7]; + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]); + VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]); + VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]); + + sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); + si0 = VADD(i0,i2); di0 = VSUB(i0, i2); + si1 = VADD(i1,i3); di1 = VSUB(i1, i3); + + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 -1 0 0 -1 0 1] [r1] + [1 -1 1 -1 0 0 0 0] [r2] + [1 0 -1 0 0 1 0 -1] [r3] + [0 0 0 0 1 1 1 1] * [i0] + [0 1 0 -1 1 0 -1 0] [i1] + [0 0 0 0 1 -1 1 -1] [i2] + [0 -1 0 1 1 0 -1 0] [i3] + */ + + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); + r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1); + r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); + r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1); + + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; + *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; + } +} + +void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + assert(in != out); + for (k=0; k < dk; ++k) { + r0 = in[8*k+0]; i0 = in[8*k+1]; + r1 = in[8*k+2]; i1 = in[8*k+3]; + r2 = in[8*k+4]; i2 = in[8*k+5]; + r3 = in[8*k+6]; i3 = in[8*k+7]; + + sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); + si0 = VADD(i0,i2); di0 = VSUB(i0, i2); + si1 = VADD(i1,i3); di1 = VSUB(i1, i3); + + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); + r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1); + r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); + r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1); + + VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]); + VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]); + VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]); + + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; + *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; + } +} + + +static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, + const v4sf *e, v4sf *out) { + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + r0 = *in0; i0 = *in1; + r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++; + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 -1 0 0 -1 0 1] [r1] + [1 0 -1 0 0 1 0 -1] [r2] + [1 -1 1 -1 0 0 0 0] [r3] + [0 0 0 0 1 1 1 1] * [i0] + [0 -1 0 1 -1 0 1 0] [i1] + [0 -1 0 1 1 0 -1 0] [i2] + [0 0 0 0 -1 1 -1 1] [i3] + */ + + //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; + //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; + + VCPLXMUL(r1,i1,e[0],e[1]); + VCPLXMUL(r2,i2,e[2],e[3]); + VCPLXMUL(r3,i3,e[4],e[5]); + + //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; + //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; + + sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1); + si0 = 
VADD(i0,i2); di0 = VSUB(i0,i2); + si1 = VADD(i1,i3); di1 = VSUB(i3,i1); + + r0 = VADD(sr0, sr1); + r3 = VSUB(sr0, sr1); + i0 = VADD(si0, si1); + i3 = VSUB(si1, si0); + r1 = VADD(dr0, di1); + r2 = VSUB(dr0, di1); + i1 = VSUB(dr1, di0); + i2 = VADD(dr1, di0); + + *out++ = r0; + *out++ = i0; + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; + +} + +static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ + + v4sf_union cr, ci, *uout = (v4sf_union*)out; + v4sf save = in[7], zero=VZERO(); + float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3; + static const float s = (float)M_SQRT2/2; + + cr.v = in[0]; ci.v = in[Ncvec*2-1]; + assert(in != out); + pffft_real_finalize_4x4(&zero, &zero, in+1, e, out); + + /* + [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3] + + [Xr(1)] ] [1 1 1 1 0 0 0 0] + [Xr(N/4) ] [0 0 0 0 1 s 0 -s] + [Xr(N/2) ] [1 0 -1 0 0 0 0 0] + [Xr(3N/4)] [0 0 0 0 1 -s 0 s] + [Xi(1) ] [1 -1 1 -1 0 0 0 0] + [Xi(N/4) ] [0 0 0 0 0 -s -1 -s] + [Xi(N/2) ] [0 -1 0 1 0 0 0 0] + [Xi(3N/4)] [0 0 0 0 0 -s 1 -s] + */ + + xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0; + xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0; + xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2; + xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2; + xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1; + xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1; + xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3; + xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3; + + for (k=1; k < dk; ++k) { + v4sf save_next = in[8*k+7]; + pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1, + e + k*6, out + k*8); + save = save_next; + } + +} + +static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, + const v4sf *e, v4sf *out, int first) { + v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 0 -1 0 -1 -1 0] [r1] + [1 -1 -1 1 0 0 0 0] [r2] + [1 0 0 -1 0 1 1 0] [r3] + [0 0 0 0 1 -1 1 -1] * [i0] + [0 -1 1 0 1 0 0 1] [i1] + [0 0 0 0 1 1 -1 -1] [i2] + [0 1 -1 0 1 0 0 1] [i3] + */ + + v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); + v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2); + v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); + v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2); + + r0 = VADD(sr0, sr1); + r2 = VSUB(sr0, sr1); + r1 = VSUB(dr0, si1); + r3 = VADD(dr0, si1); + i0 = VSUB(di0, di1); + i2 = VADD(di0, di1); + i1 = VSUB(si0, dr1); + i3 = VADD(si0, dr1); + + VCPLXMULCONJ(r1,i1,e[0],e[1]); + VCPLXMULCONJ(r2,i2,e[2],e[3]); + VCPLXMULCONJ(r3,i3,e[4],e[5]); + + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + if (!first) { + *out++ = r0; + *out++ = i0; + } + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; +} + +static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... 
f(n-1)r f(n-1)i f(n)r */ + + v4sf_union Xr, Xi, *uout = (v4sf_union*)out; + float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3; + static const float s = (float)M_SQRT2; + assert(in != out); + for (k=0; k < 4; ++k) { + Xr.f[k] = ((float*)in)[8*k]; + Xi.f[k] = ((float*)in)[8*k+4]; + } + + pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values + + /* + [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] + + [cr0] [1 0 2 0 1 0 0 0] + [cr1] [1 0 0 0 -1 0 -2 0] + [cr2] [1 0 -2 0 1 0 0 0] + [cr3] [1 0 0 0 -1 0 2 0] + [ci0] [0 2 0 2 0 0 0 0] + [ci1] [0 s 0 -s 0 -s 0 -s] + [ci2] [0 0 0 0 0 -2 0 2] + [ci3] [0 -s 0 s 0 -s 0 -s] + */ + for (k=1; k < dk; ++k) { + pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0); + } + + cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0; + cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1; + cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2; + cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3; + ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0; + ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1; + ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2; + ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3; +} + + +void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch, + pffft_direction_t direction, int ordered) { + int k, Ncvec = setup->Ncvec; + int nf_odd = (setup->ifac[1] & 1); + + // temporary buffer is allocated on the stack if the scratch pointer is NULL + int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); + + const v4sf *vinput = (const v4sf*)finput; + v4sf *voutput = (v4sf*)foutput; + v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack }; + int ib = (nf_odd ^ ordered ? 1 : 0); + + assert(VALIGNED(finput) && VALIGNED(foutput)); + + //assert(finput != foutput); + if (direction == PFFFT_FORWARD) { + ib = !ib; + if (setup->transform == PFFFT_REAL) { + ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); + } else { + v4sf *tmp = buff[ib]; + for (k=0; k < Ncvec; ++k) { + UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); + } + ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], + setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); + pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); + } + if (ordered) { + pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD); + } else ib = !ib; + } else { + if (vinput == buff[ib]) { + ib = !ib; // may happen when finput == foutput + } + if (ordered) { + pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD); + vinput = buff[ib]; ib = !ib; + } + if (setup->transform == PFFFT_REAL) { + pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e); + ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e); + ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], + setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 
0 : 1); + for (k=0; k < Ncvec; ++k) { + INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); + } + } + } + + if (buff[ib] != voutput) { + /* extra copy required -- this situation should only happen when finput == foutput */ + assert(finput==foutput); + for (k=0; k < Ncvec; ++k) { + v4sf a = buff[ib][2*k], b = buff[ib][2*k+1]; + voutput[2*k] = a; voutput[2*k+1] = b; + } + ib = !ib; + } + assert(buff[ib] == voutput); +} + +void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { + int Ncvec = s->Ncvec; + const v4sf * RESTRICT va = (const v4sf*)a; + const v4sf * RESTRICT vb = (const v4sf*)b; + v4sf * RESTRICT vab = (v4sf*)ab; + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(vab+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(vab+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); + __builtin_prefetch(vab+6); +# ifndef __clang__ +# define ZCONVOLVE_USING_INLINE_NEON_ASM +# endif +#endif + + float ar, ai, br, bi, abr, abi; +#ifndef ZCONVOLVE_USING_INLINE_ASM + v4sf vscal = LD_PS1(scaling); + int i; +#endif + + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + ar = ((v4sf_union*)va)[0].f[0]; + ai = ((v4sf_union*)va)[1].f[0]; + br = ((v4sf_union*)vb)[0].f[0]; + bi = ((v4sf_union*)vb)[1].f[0]; + abr = ((v4sf_union*)vab)[0].f[0]; + abi = ((v4sf_union*)vab)[1].f[0]; + +#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc + const float *a_ = a, *b_ = b; float *ab_ = ab; + int N = Ncvec; + asm volatile("mov r8, %2 \n" + "vdup.f32 q15, %4 \n" + "1: \n" + "pld [%0,#64] \n" + "pld [%1,#64] \n" + "pld [%2,#64] \n" + "pld [%0,#96] \n" + "pld [%1,#96] \n" + "pld [%2,#96] \n" + "vld1.f32 {q0,q1}, [%0,:128]! \n" + "vld1.f32 {q4,q5}, [%1,:128]! \n" + "vld1.f32 {q2,q3}, [%0,:128]! \n" + "vld1.f32 {q6,q7}, [%1,:128]! \n" + "vld1.f32 {q8,q9}, [r8,:128]! \n" + + "vmul.f32 q10, q0, q4 \n" + "vmul.f32 q11, q0, q5 \n" + "vmul.f32 q12, q2, q6 \n" + "vmul.f32 q13, q2, q7 \n" + "vmls.f32 q10, q1, q5 \n" + "vmla.f32 q11, q1, q4 \n" + "vld1.f32 {q0,q1}, [r8,:128]! \n" + "vmls.f32 q12, q3, q7 \n" + "vmla.f32 q13, q3, q6 \n" + "vmla.f32 q8, q10, q15 \n" + "vmla.f32 q9, q11, q15 \n" + "vmla.f32 q0, q12, q15 \n" + "vmla.f32 q1, q13, q15 \n" + "vst1.f32 {q8,q9},[%2,:128]! \n" + "vst1.f32 {q0,q1},[%2,:128]! 
\n" + "subs %3, #2 \n" + "bne 1b \n" + : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory"); +#else // default routine, works fine for non-arm cpus with current compilers + for (i=0; i < Ncvec; i += 2) { + v4sf ar, ai, br, bi; + ar = va[2*i+0]; ai = va[2*i+1]; + br = vb[2*i+0]; bi = vb[2*i+1]; + VCPLXMUL(ar, ai, br, bi); + vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]); + vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]); + ar = va[2*i+2]; ai = va[2*i+3]; + br = vb[2*i+2]; bi = vb[2*i+3]; + VCPLXMUL(ar, ai, br, bi); + vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]); + vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]); + } +#endif + if (s->transform == PFFFT_REAL) { + ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling; + ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling; + } +} + + +void pffft_zconvolve_no_accu(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { + v4sf vscal = LD_PS1(scaling); + const v4sf * RESTRICT va = (const v4sf*)a; + const v4sf * RESTRICT vb = (const v4sf*)b; + v4sf * RESTRICT vab = (v4sf*)ab; + float sar, sai, sbr, sbi; + const int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */ + int k; /* was i -- but always used "2*i" - except at for() */ + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(vab+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(vab+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); + __builtin_prefetch(vab+6); +# ifndef __clang__ +# define ZCONVOLVE_USING_INLINE_NEON_ASM +# endif +#endif + + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + sar = ((v4sf_union*)va)[0].f[0]; + sai = ((v4sf_union*)va)[1].f[0]; + sbr = ((v4sf_union*)vb)[0].f[0]; + sbi = ((v4sf_union*)vb)[1].f[0]; + + /* default routine, works fine for non-arm cpus with current compilers */ + for (k=0; k < NcvecMulTwo; k += 4) { + v4sf var, vai, vbr, vbi; + var = va[k+0]; vai = va[k+1]; + vbr = vb[k+0]; vbi = vb[k+1]; + VCPLXMUL(var, vai, vbr, vbi); + vab[k+0] = VMUL(var, vscal); + vab[k+1] = VMUL(vai, vscal); + var = va[k+2]; vai = va[k+3]; + vbr = vb[k+2]; vbi = vb[k+3]; + VCPLXMUL(var, vai, vbr, vbi); + vab[k+2] = VMUL(var, vscal); + vab[k+3] = VMUL(vai, vscal); + } + + if (s->transform == PFFFT_REAL) { + ((v4sf_union*)vab)[0].f[0] = sar*sbr*scaling; + ((v4sf_union*)vab)[1].f[0] = sai*sbi*scaling; + } +} + + +#else // defined(PFFFT_SIMD_DISABLE) + +// standard routine using scalar floats, without SIMD stuff. 
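// For reference, a sketch of what the vector macros from pfsimd_macros.h reduce
// to in this mode: v4sf is a plain float and SIMD_SZ == 1, so e.g.
// VCPLXMUL(ar, ai, br, bi) is just the scalar complex product
//
//   tmp = ar*bi;  ar = ar*br - ai*bi;  ai = ai*br + tmp;
//
// i.e. (ar + i*ai) *= (br + i*bi), which is what the convolution routines
// further below rely on.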
+ +#define pffft_zreorder_nosimd pffft_zreorder +void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { + int k, N = setup->N; + if (setup->transform == PFFFT_COMPLEX) { + for (k=0; k < 2*N; ++k) out[k] = in[k]; + return; + } + else if (direction == PFFFT_FORWARD) { + float x_N = in[N-1]; + for (k=N-1; k > 1; --k) out[k] = in[k-1]; + out[0] = in[0]; + out[1] = x_N; + } else { + float x_N = in[1]; + for (k=1; k < N-1; ++k) out[k] = in[k+1]; + out[0] = in[0]; + out[N-1] = x_N; + } +} + +#define pffft_transform_internal_nosimd pffft_transform_internal +void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch, + pffft_direction_t direction, int ordered) { + int Ncvec = setup->Ncvec; + int nf_odd = (setup->ifac[1] & 1); + + // temporary buffer is allocated on the stack if the scratch pointer is NULL + int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); + float *buff[2]; + int ib; + if (scratch == 0) scratch = scratch_on_stack; + buff[0] = output; buff[1] = scratch; + + if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered. + ib = (nf_odd ^ ordered ? 1 : 0); + + if (direction == PFFFT_FORWARD) { + if (setup->transform == PFFFT_REAL) { + ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); + } + if (ordered) { + pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib; + } + } else { + if (input == buff[ib]) { + ib = !ib; // may happen when finput == foutput + } + if (ordered) { + pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); + input = buff[!ib]; + } + if (setup->transform == PFFFT_REAL) { + ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 
0 : 1); + } + } + if (buff[ib] != output) { + int k; + // extra copy required -- this situation should happens only when finput == foutput + assert(input==output); + for (k=0; k < Ncvec; ++k) { + float a = buff[ib][2*k], b = buff[ib][2*k+1]; + output[2*k] = a; output[2*k+1] = b; + } + ib = !ib; + } + assert(buff[ib] == output); +} + +#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate +void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, + float *ab, float scaling) { + int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */ + int k; /* was i -- but always used "2*i" - except at for() */ + + if (s->transform == PFFFT_REAL) { + // take care of the fftpack ordering + ab[0] += a[0]*b[0]*scaling; + ab[NcvecMulTwo-1] += a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling; + ++ab; ++a; ++b; NcvecMulTwo -= 2; + } + for (k=0; k < NcvecMulTwo; k += 2) { + float ar, ai, br, bi; + ar = a[k+0]; ai = a[k+1]; + br = b[k+0]; bi = b[k+1]; + VCPLXMUL(ar, ai, br, bi); + ab[k+0] += ar*scaling; + ab[k+1] += ai*scaling; + } +} + + +#define pffft_zconvolve_no_accu_nosimd pffft_zconvolve_no_accu +void pffft_zconvolve_no_accu_nosimd(PFFFT_Setup *s, const float *a, const float *b, + float *ab, float scaling) { + int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */ + int k; /* was i -- but always used "2*i" - except at for() */ + + if (s->transform == PFFFT_REAL) { + // take care of the fftpack ordering + ab[0] += a[0]*b[0]*scaling; + ab[NcvecMulTwo-1] += a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling; + ++ab; ++a; ++b; NcvecMulTwo -= 2; + } + for (k=0; k < NcvecMulTwo; k += 2) { + float ar, ai, br, bi; + ar = a[k+0]; ai = a[k+1]; + br = b[k+0]; bi = b[k+1]; + VCPLXMUL(ar, ai, br, bi); + ab[k+0] = ar*scaling; + ab[k+1] = ai*scaling; + } +} + + +#endif // defined(PFFFT_SIMD_DISABLE) + +void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { + pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0); +} + +void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { + pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1); +} + + +int pffft_next_power_of_two(int N) { + /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */ + /* compute the next highest power of 2 of 32-bit v */ + unsigned v = N; + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; +} + +int pffft_is_power_of_two(int N) { + /* https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 */ + int f = N && !(N & (N - 1)); + return f; +} + +int pffft_min_fft_size(pffft_transform_t transform) { + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + and 32 for real FFTs -- a lot of stuff would need to be rewritten to + handle other cases (or maybe just switch to a scalar fft, I don't know..) */ + if (transform == PFFFT_REAL) + return ( 2 * SIMD_SZ * SIMD_SZ ); + else if (transform == PFFFT_COMPLEX) + return ( SIMD_SZ * SIMD_SZ ); + else + return 1; +} + diff --git a/libs/staffpad/pffft/pffft.h b/libs/staffpad/pffft/pffft.h new file mode 100644 index 0000000000..66459ea8dd --- /dev/null +++ b/libs/staffpad/pffft/pffft.h @@ -0,0 +1,215 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB, + authored by Dr Paul Swarztrauber of NCAR, in 1985. 
+ + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +/* + PFFFT : a Pretty Fast FFT. + + This is basically an adaptation of the single precision fftpack + (v4) as found on netlib taking advantage of SIMD instruction found + on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON). + + For architectures where no SIMD instruction is available, the code + falls back to a scalar version. + + Restrictions: + + - 1D transforms only, with 32-bit single precision. + + - supports only transforms for inputs of length N of the form + N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, + 144, 160, etc are all acceptable lengths). Performance is best for + 128<=N<=8192. + + - all (float*) pointers in the functions below are expected to + have an "simd-compatible" alignment, that is 16 bytes on x86 and + powerpc CPUs. + + You can allocate such buffers with the functions + pffft_aligned_malloc / pffft_aligned_free (or with stuff like + posix_memalign..) + +*/ + +#pragma once + +#include // for size_t + +#ifndef PFFFT_FLOAT + #if 1 + /* default: float */ + #define PFFFT_FLOAT float + #else + #define PFFFT_FLOAT double + #ifndef PFFFT_SIMD_DISABLE + /* double only with PFFFT_SIMD_DISABLE */ + #define PFFFT_SIMD_DISABLE 1 + #endif + #endif +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + + /* opaque struct holding internal stuff (precomputed twiddle factors) + this struct can be shared by many threads as it contains only + read-only data. 
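+
+     As a sketch of one possible arrangement (not part of the API itself): a
+     single setup can serve several worker threads, provided every thread
+     supplies its own input/output/work buffers, e.g.
+
+         PFFFT_Setup *shared = pffft_new_setup(1024, PFFFT_REAL);
+         ... each worker thread, with its own aligned buffers, calls
+             pffft_transform_ordered(shared, in, out, work, PFFFT_FORWARD);
+         ... and the setup is destroyed once, after all workers have finished:
+         pffft_destroy_setup(shared);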
+ */ + typedef struct PFFFT_Setup PFFFT_Setup; + + /* direction of the transform */ + typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t; + + /* type of transform */ + typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t; + + /* + prepare for performing transforms of size N -- the returned + PFFFT_Setup structure is read-only so it can safely be shared by + multiple concurrent threads. + */ + PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); + void pffft_destroy_setup(PFFFT_Setup *); + /* + Perform a Fourier transform , The z-domain data is stored in the + most efficient order for transforming it back, or using it for + convolution. If you need to have its content sorted in the + "usual" way, that is as an array of interleaved complex numbers, + either use pffft_transform_ordered , or call pffft_zreorder after + the forward fft, and before the backward fft. + + Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. + Typically you will want to scale the backward transform by 1/N. + + The 'work' pointer should point to an area of N (2*N for complex + fft) floats, properly aligned. If 'work' is NULL, then stack will + be used instead (this is probably the best strategy for small + FFTs, say for N < 16384). Threads usually have a small stack, that + there's no sufficient amount of memory, usually leading to a creash! + Use the heap with pffft_aligned_malloc() in this case. + + input and output may alias. + */ + void pffft_transform(PFFFT_Setup *setup, const PFFFT_FLOAT *input, PFFFT_FLOAT *output, PFFFT_FLOAT *work, pffft_direction_t direction); + + /* + Similar to pffft_transform, but makes sure that the output is + ordered as expected (interleaved complex numbers). This is + similar to calling pffft_transform and then pffft_zreorder. + + input and output may alias. + */ + void pffft_transform_ordered(PFFFT_Setup *setup, const PFFFT_FLOAT *input, PFFFT_FLOAT *output, PFFFT_FLOAT *work, pffft_direction_t direction); + + /* + call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(..., + PFFFT_FORWARD) if you want to have the frequency components in + the correct "canonical" order, as interleaved complex numbers. + + (for real transforms, both 0-frequency and half frequency + components, which are real, are assembled in the first entry as + F(0)+i*F(n/2+1). Note that the original fftpack did place + F(n/2+1) at the end of the arrays). + + input and output should not alias. + */ + void pffft_zreorder(PFFFT_Setup *setup, const PFFFT_FLOAT *input, PFFFT_FLOAT *output, pffft_direction_t direction); + + /* + Perform a multiplication of the frequency components of dft_a and + dft_b and accumulate them into dft_ab. The arrays should have + been obtained with pffft_transform(.., PFFFT_FORWARD) and should + *not* have been reordered with pffft_zreorder (otherwise just + perform the operation yourself as the dft coefs are stored as + interleaved complex numbers). + + the operation performed is: dft_ab += (dft_a * fdt_b)*scaling + + The dft_a, dft_b and dft_ab pointers may alias. + */ + void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const PFFFT_FLOAT *dft_a, const PFFFT_FLOAT *dft_b, PFFFT_FLOAT *dft_ab, PFFFT_FLOAT scaling); + + /* + Perform a multiplication of the frequency components of dft_a and + dft_b and put result in dft_ab. 
The arrays should have + been obtained with pffft_transform(.., PFFFT_FORWARD) and should + *not* have been reordered with pffft_zreorder (otherwise just + perform the operation yourself as the dft coefs are stored as + interleaved complex numbers). + + the operation performed is: dft_ab = (dft_a * fdt_b)*scaling + + The dft_a, dft_b and dft_ab pointers may alias. + */ + void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); + + /* simple helper to get minimum possible fft size */ + int pffft_min_fft_size(pffft_transform_t transform); + + /* simple helper to determine next power of 2 + - without inexact/rounding floating point operations + */ + int pffft_next_power_of_two(int N); + + /* simple helper to determine if power of 2 - returns bool */ + int pffft_is_power_of_two(int N); + + /* + the float buffers must have the correct alignment (16-byte boundary + on intel and powerpc). This function may be used to obtain such + correctly aligned buffers. + */ + void *pffft_aligned_malloc(size_t nb_bytes); + void pffft_aligned_free(void *); + + /* return 4 or 1 wether support SSE/Altivec instructions was enabled when building pffft.c */ + int pffft_simd_size(void); + +#ifdef __cplusplus +} +#endif diff --git a/libs/staffpad/pffft/pfsimd_macros.h b/libs/staffpad/pffft/pfsimd_macros.h new file mode 100644 index 0000000000..ee9a0f80d2 --- /dev/null +++ b/libs/staffpad/pffft/pfsimd_macros.h @@ -0,0 +1,246 @@ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +#pragma once + +#include +#include +#include + + +/* + * SIMD reference material: + * + * general SIMD introduction: + * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing + * + * SSE 1: + * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ + * + * ARM NEON: + * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics + * + * Altivec: + * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf + * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html + * better one? 
+ * + */ + + +/* + Altivec support macros +*/ +#if !defined(PFFFT_SIMD_DISABLE) && ((defined(__ppc__) || defined(__ppc64__)) && defined(__ALTIVEC__)) +#include +typedef vector float v4sf; +# define SIMD_SZ 4 +# define VREQUIRES_ALIGN 1 /* not sure, if really required */ +# define VZERO() ((vector float) vec_splat_u8(0)) +# define VMUL(a,b) vec_madd(a,b, VZERO()) +# define VADD(a,b) vec_add(a,b) +# define VMADD(a,b,c) vec_madd(a,b,c) +# define VSUB(a,b) vec_sub(a,b) +inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); } +# define LD_PS1(p) ld_ps1(&p) +# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } +# define UNINTERLEAVE2(in1, in2, out1, out2) { \ + vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \ + vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \ + v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \ + } +# define VTRANSPOSE4(x0,x1,x2,x3) { \ + v4sf y0 = vec_mergeh(x0, x2); \ + v4sf y1 = vec_mergel(x0, x2); \ + v4sf y2 = vec_mergeh(x1, x3); \ + v4sf y3 = vec_mergel(x1, x3); \ + x0 = vec_mergeh(y0, y2); \ + x1 = vec_mergel(y0, y2); \ + x2 = vec_mergeh(y1, y3); \ + x3 = vec_mergel(y1, y3); \ + } +# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15)) +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0) + +/* + SSE1 support macros +*/ +#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || defined(_M_IX86)) + +#include +typedef __m128 v4sf; +/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions + * anyway so you will have to work if you want to enable AVX with its 256-bit vectors. 
*/ +# define SIMD_SZ 4 +# define VREQUIRES_ALIGN 1 +# define VZERO() _mm_setzero_ps() +# define VMUL(a,b) _mm_mul_ps(a,b) +# define VADD(a,b) _mm_add_ps(a,b) +# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) +# define VSUB(a,b) _mm_sub_ps(a,b) +# define LD_PS1(p) _mm_set1_ps(p) +# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr) +# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr) +# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } +# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } +# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) +# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) +/* reverse/flip all floats */ +# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3)) +/* reverse/flip complex floats */ +# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2)) +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0) + +/* + ARM NEON support macros +*/ +#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)) +# include +typedef float32x4_t v4sf; +# define SIMD_SZ 4 +# define VREQUIRES_ALIGN 0 /* usually no alignment required */ +# define VZERO() vdupq_n_f32(0) +# define VMUL(a,b) vmulq_f32(a,b) +# define VADD(a,b) vaddq_f32(a,b) +# define VMADD(a,b,c) vmlaq_f32(c,a,b) +# define VSUB(a,b) vsubq_f32(a,b) +# define LD_PS1(p) vld1q_dup_f32(&(p)) +# define VLOAD_UNALIGNED(ptr) (*(ptr)) +# define VLOAD_ALIGNED(ptr) (*(ptr)) +# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } +# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } +# define VTRANSPOSE4(x0,x1,x2,x3) { \ + float32x4x2_t t0_ = vzipq_f32(x0, x2); \ + float32x4x2_t t1_ = vzipq_f32(x1, x3); \ + float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \ + float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \ + x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ + } +// marginally faster version +//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } +# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0) +#else +# if !defined(PFFFT_SIMD_DISABLE) +# warning "building with simd disabled !\n"; +# define PFFFT_SIMD_DISABLE // fallback to scalar code +# endif +#endif + +// fallback mode for situations where SSE/Altivec are not available; use scalar mode instead +#ifdef PFFFT_SIMD_DISABLE +typedef float v4sf; +# define SIMD_SZ 1 +# define VREQUIRES_ALIGN 0 +# define VZERO() 0.f +# define VMUL(a,b) ((a)*(b)) +# define VADD(a,b) ((a)+(b)) +# define VMADD(a,b,c) ((a)*(b)+(c)) +# define VSUB(a,b) ((a)-(b)) +# define LD_PS1(p) (p) +# define VLOAD_UNALIGNED(ptr) (*(ptr)) +# define VLOAD_ALIGNED(ptr) (*(ptr)) +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0) +#endif + +// shortcuts for complex multiplications +#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } +#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } +#ifndef SVMUL +// multiply a scalar
with a vector +#define SVMUL(f,v) VMUL(LD_PS1(f),v) +#endif + +typedef union v4sf_union { + v4sf v; + float f[SIMD_SZ]; +} v4sf_union; + +#if !defined(PFFFT_SIMD_DISABLE) + +#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3)) + +/* detect bugs with the vector support macros */ +static void Vvalidate_simd() { + float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; + v4sf_union a0, a1, a2, a3, t, u; + memcpy(a0.f, f, 4*sizeof(float)); + memcpy(a1.f, f+4, 4*sizeof(float)); + memcpy(a2.f, f+8, 4*sizeof(float)); + memcpy(a3.f, f+12, 4*sizeof(float)); + + t = a0; u = a1; t.v = VZERO(); + printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0); + t.v = VADD(a1.v, a2.v); + printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18); + t.v = VMUL(a1.v, a2.v); + printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77); + t.v = VMADD(a1.v, a2.v,a0.v); + printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80); + + INTERLEAVE2(a1.v,a2.v,t.v,u.v); + printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11); + UNINTERLEAVE2(a1.v,a2.v,t.v,u.v); + printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11); + + t.v=LD_PS1(f[15]); + printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + assertv4(t, 15, 15, 15, 15); + t.v = VSWAPHL(a1.v, a2.v); + printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + assertv4(t, 8, 9, 6, 7); + VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v); + printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", + a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], + a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); + assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15); +} +#endif //!PFFFT_SIMD_DISABLE + +/* SSE and co like 16-bytes aligned pointers */ +#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines... + +static +void *Valigned_malloc(size_t nb_bytes) { + void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT); + if (!p0) return (void *) 0; + p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1)))); + *((void **) p - 1) = p0; + return p; +} + +static +void Valigned_free(void *p) { + if (p) free(*((void **) p - 1)); +} diff --git a/libs/staffpad/staffpad/TimeAndPitch.h b/libs/staffpad/staffpad/TimeAndPitch.h new file mode 100644 index 0000000000..e1277bfb12 --- /dev/null +++ b/libs/staffpad/staffpad/TimeAndPitch.h @@ -0,0 +1,117 @@ +// +// Arbitrary time and pitch stretching, based on the Phase-locked Vocoder. +// +#pragma once + +#include +#include +#include +#include + +namespace staffpad { +class TimeAndPitch +{ +public: + /** + * `magnitude` may be null, in which case the callback should re-compute it. 
+ */ + using ShiftTimbreCb = std::function* spectrum, const float* magnitude)>; + + TimeAndPitch( + int fftSize, bool reduceImaging = true, ShiftTimbreCb shiftTimbreCb = {}); + ~TimeAndPitch(); + /** + Set up at least once before processing. + \param numChannels Must be 1 or 2 + \param maxBlockSize The caller's maximum block size, e.g. 1024 samples + */ + void setup(int numChannels, int maxBlockSize); + + /** + Set independent time stretch and pitch factors (synchronously with the processing thread). + The factor change resolution follows the processing block size, with some approximation. + As a general rule, smaller block sizes result in finer changes, although there's a lower + bound due to internal windowed segmentation and OLA resynthesis. + */ + void setTimeStretchAndPitchFactor(double timeStretch, double pitchFactor); + + /** + Returns the approximate number of samples until the next hop gets analysed (it might be 1 + or 2 too high). For extremely small stretch ratios, several analyses might + be needed before output samples are produced, so you may have to repeat the process + until more output samples become available. + */ + int getSamplesToNextHop() const; + + /** + Returns the number of already processed output samples (after time-stretching) that + are ready for consumption. + */ + int getNumAvailableOutputSamples() const; + + /** + Feed more input samples. \see `getSamplesToNextHop()` to know how much to feed to + produce more output samples. + */ + void feedAudio(const float* const* in_smp, int numSamples); + + /** + Retrieve shifted/stretched samples into the output. \see `getNumAvailableOutputSamples()` + to determine a valid amount for `numSamples`. + */ + void retrieveAudio(float* const* out_smp, int numSamples); + + /** + Pitch-shift-only, in-place processing. + */ + void processPitchShift(float* const* smp, int numSamples, double pitchFactor); + + /** + Latency in input samples + */ + int getLatencySamples() const; + + /** + Latency in output samples, calculated from the given timeStretch factor + */ + int getLatencySamplesForStretchRatio(float timeStretch) const; + + /** + Resets the internal state and discards any buffered input + */ + void reset(); + +private: + const int fftSize; + static constexpr int overlap = 4; + static constexpr bool normalize_window = true; // compensate for OLA window overlaps + static constexpr bool modulate_synthesis_hop = true; + + void _process_hop(int hop_a, int hop_s); + template + void _time_stretch(float hop_a, float hop_s); + void _applyImagingReduction(); + + struct impl; + std::shared_ptr d; + + const bool _reduceImaging; + const ShiftTimbreCb _shiftTimbreCb; + + int _numChannels = 1; + int _maxBlockSize = 1024; + double _resampleReadPos = 0.0; + int _availableOutputSamples = 0; + int _numBins = fftSize / 2 + 1; + double _overlap_a = overlap; + + int _analysis_hop_counter = 0; + + double _expectedPhaseChangePerBinPerSample = 0.01; + double _timeStretch = 1.0; + double _pitchFactor = 1.0; + + int _outBufferWriteOffset = 0; +}; +} // namespace staffpad diff --git a/libs/staffpad/wscript b/libs/staffpad/wscript new file mode 100644 index 0000000000..0d2e4ddf17 --- /dev/null +++ b/libs/staffpad/wscript @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +# Version of this package (even if built as a child) +MAJOR = '0' +MINOR = '0' +MICRO = '0' +STAFFPAD_VERSION = "%s.%s.%s" % (MAJOR, MINOR, MICRO) + +# Library version (UNIX style major, minor, micro) +# major increment <=> incompatible changes +# minor increment <=> compatible changes (additions) +# micro increment <=> no interface changes
+STAFFPAD_LIB_VERSION = '0.0.0' + +I18N_PACKAGE = 'staffpad' + +staffpad_sources = [ + 'TimeAndPitch.cpp', + 'FourierTransform_pffft.cpp', + 'pffft/pffft.c' +] + +def options(opt): + pass + +def configure(conf): + pass + +def build(bld): + obj = bld.stlib(features = 'cxx cxxstlib', source = staffpad_sources) + obj.cxxflags = [ bld.env['compiler_flags_dict']['pic'], '-O3', '-ffast-math' ] + obj.export_includes = ['.'] + obj.includes = ['.', 'pffft'] + obj.name = 'staffpad' + obj.target = 'staffpad' + obj.vnum = STAFFPAD_LIB_VERSION + obj.defines = [ 'PACKAGE="' + I18N_PACKAGE + '"' ] diff --git a/wscript b/wscript index c1ce824c80..dd145a86f6 100644 --- a/wscript +++ b/wscript @@ -332,6 +332,7 @@ children = [ 'libs/clearlooks-newer', 'libs/zita-resampler', 'libs/zita-convolver', + 'libs/staffpad', # optionally external libraries 'libs/fluidsynth', 'libs/hidapi',
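A minimal usage sketch for the pffft API added above, illustrating the setup, work-buffer and scaling rules documented in pffft.h. It assumes PFFFT_FLOAT is plain float and that N is a transform size accepted by pffft_new_setup(); the function and buffer names are illustrative only, not part of the patch.

#include "pffft.h"

// Real forward FFT of size N followed by the inverse transform, rescaled by 1/N
// (transforms are unscaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x).
void pffft_roundtrip_example(const float* in, float* out, int N)
{
    PFFFT_Setup* setup = pffft_new_setup(N, PFFFT_REAL);

    // pffft_aligned_malloc() provides the 16-byte alignment the SIMD code paths expect.
    // A heap-allocated 'work' area avoids stack overflows for large N (>= 16384).
    float* freq = (float*)pffft_aligned_malloc(N * sizeof(float));
    float* work = (float*)pffft_aligned_malloc(N * sizeof(float));

    // Ordered variant: frequency components come out as interleaved complex numbers.
    pffft_transform_ordered(setup, in, freq, work, PFFFT_FORWARD);

    // ... operate on freq[] here ...

    pffft_transform_ordered(setup, freq, out, work, PFFFT_BACKWARD);
    for (int i = 0; i < N; ++i)
        out[i] *= 1.0f / N;   // undo the N*x scaling of the round trip

    pffft_aligned_free(work);
    pffft_aligned_free(freq);
    pffft_destroy_setup(setup);
}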
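Likewise, a rough sketch of the feed/retrieve cycle described in TimeAndPitch.h, under stated assumptions: stereo input, an arbitrarily chosen fftSize, and no handling of input exhaustion, maxBlockSize chunking or latency compensation. All names and factor values are illustrative only.

#include "TimeAndPitch.h"

// Produce numOut time-stretched output samples from a stereo input stream.
void time_stretch_example(const float* const* in, float* const* out, int numOut)
{
    staffpad::TimeAndPitch tp(4096);             // fftSize: an assumption for this sketch
    tp.setup(2, 1024);                           // 2 channels, caller block size 1024
    tp.setTimeStretchAndPitchFactor(1.25, 1.0);  // illustrative stretch, unchanged pitch

    int read = 0;
    while (tp.getNumAvailableOutputSamples() < numOut) {
        // Per the header, feeding getSamplesToNextHop() samples may have to be
        // repeated several times before enough output becomes available.
        int n = tp.getSamplesToNextHop();
        const float* chans[2] = { in[0] + read, in[1] + read };
        tp.feedAudio(chans, n);
        read += n;
    }
    tp.retrieveAudio(out, numOut);
}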