Mirror of https://github.com/Ardour/ardour.git (synced 2025-12-11 09:06:33 +01:00)

commit 6507bcd93b (parent 4ef4288b2a)

Add StaffPad library, thanks to Audacity

19 changed files with 4854 additions and 0 deletions

159 libs/staffpad/CircularSampleBuffer.h (Normal file)
@@ -0,0 +1,159 @@
#pragma once

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>

#include "VectorOps.h"

/*
 * Utility buffer class for delay-based effects
 * Allocates a 2^n size buffer for branchless write and read using a bitmask
 * It can read and write to past and future samples, but does not check for overlaps
 */

namespace staffpad::audio {
template<typename SampleT>
class CircularSampleBuffer
{
public:
  CircularSampleBuffer()
  {
  }

  ~CircularSampleBuffer()
  {
    if (_buffer) {
      free(_buffer);
    }
  }

  void setSize(int n)
  {
    if (n > _allocatedSize) {
      auto oldSize = _allocatedSize;

      auto findLargerPowerOfTwo = [](int32_t number) {
        int32_t powerOf2 = 1;
        while (powerOf2 < number) {
          powerOf2 *= 2;
        }
        return powerOf2;
      };

      _allocatedSize = findLargerPowerOfTwo(n);
      _bufferSizeMask = _allocatedSize - 1;
      _buffer = (SampleT*)std::realloc(_buffer, _allocatedSize * sizeof(SampleT));

      // Reset the new memory region
      assert(_buffer);
      std::fill(_buffer + oldSize, _buffer + _allocatedSize, 0.f);
    }
  }

  int getAllocatedSize() const
  {
    return _allocatedSize;
  }

  void reset()
  {
    if (_buffer && _allocatedSize > 0) {
      memset(_buffer, 0, sizeof(SampleT) * _allocatedSize);
    }
    _position0 = 0;
  }

  void write(int offset, const SampleT& sample)
  {
    _buffer[(_position0 + offset) & _bufferSizeMask] = sample;
  }

  void writeOffset0(const SampleT& sample)
  {
    _buffer[_position0] = sample;
  }

  const SampleT& read(int offset) const
  {
    return _buffer[(_position0 + offset) & _bufferSizeMask];
  }

  /// change the 0 position by n
  void advance(int n)
  {
    _position0 += n;
    _position0 &= _bufferSizeMask;
  }

private:
  template<typename fnc>
  void _splitBlockOffsetFunction(int startOffset, int n, fnc f) const
  {
    assert(n <= _allocatedSize);
    int firstIndex = (_position0 + startOffset) & _bufferSizeMask;
    int n_to_end = _allocatedSize - firstIndex;
    if (n_to_end > n) {
      f(firstIndex, 0, n);
    } else {
      f(firstIndex, 0, n_to_end);
      f(0, n_to_end, n - n_to_end);
    }
  }

public:
  void writeBlock(int startOffset, int n, const SampleT* sourceBlock)
  {
    _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) {
      vo::copy(&sourceBlock[sampleOff], &_buffer[bufferOff], n);
    });
  }

  void readBlockWithGain(int startOffset, int n, SampleT* targetBlock, float gainFactor) const
  {
    _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) {
      vo::constantMultiply(&_buffer[bufferOff], gainFactor, &targetBlock[sampleOff], n);
    });
  }

  void readAddBlockWithGain(int startOffset, int n, SampleT* targetBlock, float gainFactor) const
  {
    _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) {
      vo::constantMultiplyAndAdd(&_buffer[bufferOff], gainFactor, &targetBlock[sampleOff], n);
    });
  }

  void writeAddBlockWithGain(int startOffset, int n, const SampleT* sourceBlock, float gainFactor)
  {
    _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) {
      vo::constantMultiplyAndAdd(&sourceBlock[sampleOff], gainFactor, &_buffer[bufferOff], n);
    });
  }

  void readBlock(int startOffset, int n, SampleT* targetBlock) const
  {
    _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) {
      vo::copy(&_buffer[bufferOff], &targetBlock[sampleOff], n);
    });
  }

  void readAndClearBlock(int startOffset, int n, SampleT* targetBlock)
  {
    _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) {
      vo::copy(&_buffer[bufferOff], &targetBlock[sampleOff], n);
      vo::setToZero(&_buffer[bufferOff], n);
    });
  }

  void clearBlock(int startOffset, int n)
  {
    _splitBlockOffsetFunction(startOffset, n,
                              [=](int bufferOff, int sampleOff, int n) { vo::setToZero(&_buffer[bufferOff], n); });
  }

private:
  SampleT* _buffer = nullptr;

  int _position0 = 0;      // position of sample index 0 inside _buffer. This is where the next sample will be written to
  int _allocatedSize = 0;  // 2^n buffer size
  int _bufferSizeMask = 0; // 2^n-1 buffer mask
};
} // namespace staffpad::audio
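
A minimal usage sketch for the class above: a single-tap delay line (the Delay wrapper and its member names are illustrative, not part of the library). setSize() rounds the request up to the next power of two, and negative offsets read past samples relative to the current write position.

  #include "CircularSampleBuffer.h"

  // Illustrative one-tap delay; delaySamples must stay below the allocated size.
  struct Delay
  {
    staffpad::audio::CircularSampleBuffer<float> buf;
    int delaySamples = 4800; // 100 ms at 48 kHz

    Delay() { buf.setSize(48000); } // rounded up to 65536 samples internally

    float processSample(float in)
    {
      buf.writeOffset0(in);                // write at the current 0 position
      float out = buf.read(-delaySamples); // read a sample from the past
      buf.advance(1);                      // move the 0 position forward
      return out;
    }
  };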

56 libs/staffpad/FourierTransform_pffft.cpp (Normal file)
@@ -0,0 +1,56 @@
#include "FourierTransform_pffft.h"

#include "pffft.h"

namespace staffpad::audio {
FourierTransform::FourierTransform(int32_t newBlockSize)
  : _blockSize{newBlockSize}
{
  _pffft_scratch = (float*)pffft_aligned_malloc(_blockSize * sizeof(float));
  realFftSpec = pffft_new_setup(_blockSize, PFFFT_REAL);
}

FourierTransform::~FourierTransform()
{
  if (_pffft_scratch) {
    pffft_aligned_free(_pffft_scratch);
    _pffft_scratch = nullptr;
  }
  if (realFftSpec) {
    pffft_destroy_setup(realFftSpec);
    realFftSpec = nullptr;
  }
}

void FourierTransform::forwardReal(const SamplesReal& t, SamplesComplex& c)
{
  assert(t.getNumSamples() == _blockSize);

  for (auto ch = 0; ch < t.getNumChannels(); ++ch) {
    auto* spec = c.getPtr(ch); // interleaved complex numbers, size _blockSize + 2
    auto* cpx_flt = (float*)spec;
    pffft_transform_ordered(realFftSpec, t.getPtr(ch), cpx_flt, _pffft_scratch, PFFFT_FORWARD);
    // pffft combines dc and nyq values into the first complex value,
    // adjust to CCS format.
    auto dc = cpx_flt[0];
    auto nyq = cpx_flt[1];
    spec[0] = { dc, 0.f };
    spec[c.getNumSamples() - 1] = { nyq, 0.f };
  }
}

void FourierTransform::inverseReal(const SamplesComplex& c, SamplesReal& t)
{
  assert(c.getNumSamples() == _blockSize / 2 + 1);

  for (auto ch = 0; ch < c.getNumChannels(); ++ch) {
    auto* spec = c.getPtr(ch);
    // Use t to convert in-place from CCS to pffft format
    t.assignSamples(ch, (float*)spec);
    auto* ts = t.getPtr(ch);
    ts[0] = spec[0].real();
    ts[1] = spec[c.getNumSamples() - 1].real();
    pffft_transform_ordered(realFftSpec, ts, ts, _pffft_scratch, PFFFT_BACKWARD);
  }
}
} // namespace staffpad::audio
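
A round-trip sketch of the wrapper above, assuming a block size of 512. pffft transforms are unscaled, so forward followed by inverse returns the input multiplied by N; TimeAndPitch.cpp compensates the same way by multiplying with 1/fftSize after inverseReal.

  void roundTripExample() // illustrative helper name
  {
    staffpad::audio::FourierTransform fft(512);
    staffpad::SamplesReal time;
    staffpad::SamplesComplex spec;
    time.setSize(1, 512);         // 1 channel x 512 samples
    spec.setSize(1, 512 / 2 + 1); // 257 bins in CCS layout
    time.zeroOut();               // fill with real input in practice
    fft.forwardReal(time, spec);
    fft.inverseReal(spec, time);
    for (int i = 0; i < 512; ++i) {
      time.getPtr(0)[i] /= 512.f; // undo the N scaling of the round trip
    }
  }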

32 libs/staffpad/FourierTransform_pffft.h (Normal file)
@@ -0,0 +1,32 @@
// FFT wrapper for the PFFFT library. It is possible to use different wrappers for
// other FFT libraries or platforms, as long as the CCS complex data format is used.

#pragma once

#include <stdint.h>

#include "SamplesFloat.h"

struct PFFFT_Setup;

namespace staffpad::audio {
class FourierTransform
{
public:
  FourierTransform(int32_t newBlockSize);
  ~FourierTransform();

  int getSize() const { return static_cast<int>(_blockSize); }

  void forwardReal(const SamplesReal& t, SamplesComplex& c);
  void inverseReal(const SamplesComplex& c, SamplesReal& t);

private:
  PFFFT_Setup* realFftSpec = nullptr;
  PFFFT_Setup* complexFftSpec = nullptr;
  float* _pffft_scratch = nullptr;

  const int32_t _blockSize;
  int32_t _order = 0;
};
} // namespace staffpad::audio
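
For reference, the CCS (complex conjugate symmetric) layout produced by forwardReal stores N/2 + 1 complex bins for a real transform of size N, with zero imaginary parts at DC and Nyquist. For N = 8:

  spec[0] = { dc,  0.f }  // bin 0, DC
  spec[1] = { re1, im1 }
  spec[2] = { re2, im2 }
  spec[3] = { re3, im3 }
  spec[4] = { nyq, 0.f }  // bin N/2, Nyquist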

10 libs/staffpad/README (Normal file)
@@ -0,0 +1,10 @@
StaffPad - Time and Pitch
=========================

This library was copied from Audacity [1] in October 2025.

Licensed under the terms of the GNU GENERAL PUBLIC LICENSE V2 or later, unless otherwise noted.
(https://github.com/audacity/audacity?tab=readme-ov-file#license)

[1] https://github.com/audacity/audacity/tree/master/au3/libraries/lib-time-and-pitch/StaffPad
    git rev 4cdd78b4326456cb77c7102a4f9a6435e0d474a2

114 libs/staffpad/SamplesFloat.h (Normal file)
@@ -0,0 +1,114 @@
#pragma once
|
||||
|
||||
#include <complex>
|
||||
#include <vector>
|
||||
|
||||
#include "SimdTypes.h"
|
||||
#include "VectorOps.h"
|
||||
|
||||
namespace staffpad {
|
||||
template<typename T = float>
|
||||
class SamplesFloat
|
||||
{
|
||||
public:
|
||||
~SamplesFloat()
|
||||
{
|
||||
for (int ch = 0; ch < num_channels; ch++) {
|
||||
dealloc(ch);
|
||||
}
|
||||
}
|
||||
|
||||
void setSize(int32_t numChannels, int32_t samples)
|
||||
{
|
||||
for (int ch = 0; ch < num_channels; ch++) {
|
||||
dealloc(ch);
|
||||
}
|
||||
|
||||
num_channels = numChannels;
|
||||
num_samples = samples;
|
||||
data.resize(num_channels);
|
||||
for (int ch = 0; ch < num_channels; ch++) {
|
||||
alloc(ch, num_samples);
|
||||
}
|
||||
}
|
||||
|
||||
int32_t getNumChannels() const
|
||||
{
|
||||
return num_channels;
|
||||
}
|
||||
|
||||
int32_t getNumSamples() const
|
||||
{
|
||||
return num_samples;
|
||||
}
|
||||
|
||||
float** getPtrs()
|
||||
{
|
||||
return data.data();
|
||||
}
|
||||
|
||||
T* getPtr(int32_t channel)
|
||||
{
|
||||
assert(channel < num_channels);
|
||||
assert(data[channel]);
|
||||
return data[channel];
|
||||
}
|
||||
|
||||
const T* getPtr(int32_t channel) const
|
||||
{
|
||||
assert(channel < num_channels);
|
||||
assert(data[channel]);
|
||||
return data[channel];
|
||||
}
|
||||
|
||||
void assignSamples(int32_t channel, const T* input)
|
||||
{
|
||||
assert(channel < num_channels);
|
||||
assert(data[channel]);
|
||||
vo::copy(input, data[channel], num_samples);
|
||||
}
|
||||
|
||||
void assignSamples(const SamplesFloat& rhs)
|
||||
{
|
||||
assert(num_channels == rhs.num_channels);
|
||||
assert(num_samples == rhs.num_samples);
|
||||
for (int ch = 0; ch < num_channels; ch++) {
|
||||
assert(data[ch]);
|
||||
vo::copy(rhs.getPtr(ch), getPtr(ch), num_samples);
|
||||
}
|
||||
}
|
||||
|
||||
void zeroOut()
|
||||
{
|
||||
for (int ch = 0; ch < num_channels; ch++) {
|
||||
vo::setToZero(data[ch], num_samples);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int32_t num_channels{ 0 };
|
||||
int32_t num_samples{ 0 };
|
||||
std::vector<T*> data;
|
||||
|
||||
void alloc(int32_t channel, int32_t samples)
|
||||
{
|
||||
assert(channel < num_channels);
|
||||
if (data[channel]) {
|
||||
dealloc(channel);
|
||||
}
|
||||
data[channel] = (T*)audio::simd::aligned_malloc(samples * sizeof(T), 64);
|
||||
}
|
||||
|
||||
void dealloc(int32_t channel)
|
||||
{
|
||||
assert(channel < num_channels);
|
||||
if (data[channel]) {
|
||||
audio::simd::aligned_free(data[channel]);
|
||||
data[channel] = nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
typedef SamplesFloat<float> SamplesReal;
|
||||
typedef SamplesFloat<std::complex<float> > SamplesComplex;
|
||||
} // namespace staffpad
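
A short usage sketch of the container above (names are illustrative): each channel is a separate 64-byte-aligned allocation, so the pointers can be handed directly to the SIMD helpers.

  void scratchExample() // illustrative
  {
    staffpad::SamplesReal scratch; // SamplesFloat<float>
    scratch.setSize(2, 1024);      // 2 channels x 1024 samples, uninitialized
    scratch.zeroOut();
    float* left = scratch.getPtr(0);
    float* right = scratch.getPtr(1);
    left[0] = 1.f;
    right[0] = -1.f;
  }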

361 libs/staffpad/SimdComplexConversions_sse2.h (Normal file)
@@ -0,0 +1,361 @@
/* SPDX-License-Identifier: zlib */
|
||||
/*
|
||||
 * This code is a cleaned-up and modernized version of https://github.com/to-miz/sse_mathfun_extension
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <xmmintrin.h>
|
||||
|
||||
#include <array>
|
||||
#include <complex>
|
||||
#include <type_traits>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
namespace simd_complex_conversions {
|
||||
// Before C++20 there is no standard and correct way to implement bit_cast.
|
||||
// It was verified that MSVC generates the correct code for the current use
|
||||
// cases
|
||||
namespace details {
|
||||
template<class To, class From>
|
||||
std::enable_if_t<
|
||||
sizeof(To) == sizeof(From) && std::is_trivially_copyable_v<From>
|
||||
&& std::is_trivially_copyable_v<To>,
|
||||
To>
|
||||
// constexpr support needs compiler magic
|
||||
bit_cast(const From& src) noexcept
|
||||
{
|
||||
static_assert(
|
||||
std::is_trivially_constructible_v<To>,
|
||||
"This implementation additionally requires "
|
||||
"destination type to be trivially constructible");
|
||||
|
||||
To dst;
|
||||
std::memcpy(&dst, &src, sizeof(To));
|
||||
return dst;
|
||||
}
|
||||
|
||||
constexpr float PIF = 3.141592653589793238f;
|
||||
constexpr float PIO2F = 1.5707963267948966192f;
|
||||
|
||||
constexpr float cephes_PIF = 3.141592653589793238f;
|
||||
constexpr float cephes_PIO2F = 1.5707963267948966192f;
|
||||
constexpr float cephes_PIO4F = 0.7853981633974483096f;
|
||||
constexpr float cephes_FOPI = 1.27323954473516f; // 4 / M_PI
|
||||
constexpr float minus_cephes_DP1 = -0.78515625f;
|
||||
constexpr float minus_cephes_DP2 = -2.4187564849853515625e-4f;
|
||||
constexpr float minus_cephes_DP3 = -3.77489497744594108e-8f;
|
||||
constexpr float sincof_p0 = -1.9515295891e-4f;
|
||||
constexpr float sincof_p1 = 8.3321608736e-3f;
|
||||
constexpr float sincof_p2 = -1.6666654611e-1f;
|
||||
constexpr float coscof_p0 = 2.443315711809948e-005f;
|
||||
constexpr float coscof_p1 = -1.388731625493765e-003f;
|
||||
constexpr float coscof_p2 = 4.166664568298827e-002f;
|
||||
|
||||
constexpr float atancof_p0 = 8.05374449538e-2f;
|
||||
constexpr float atancof_p1 = 1.38776856032e-1f;
|
||||
constexpr float atancof_p2 = 1.99777106478e-1f;
|
||||
constexpr float atancof_p3 = 3.33329491539e-1f;
|
||||
|
||||
static const float sign_mask = bit_cast<float>(0x80000000);
|
||||
static const float inv_sign_mask = bit_cast<float>(~0x80000000);
|
||||
} // namespace details
|
||||
|
||||
inline __m128 atan_ps(__m128 x)
|
||||
{
|
||||
using namespace details;
|
||||
|
||||
__m128 sign_bit, y;
|
||||
|
||||
sign_bit = x;
|
||||
/* take the absolute value */
|
||||
x = _mm_and_ps(x, _mm_set1_ps(inv_sign_mask));
|
||||
/* extract the sign bit (upper one) */
|
||||
sign_bit = _mm_and_ps(sign_bit, _mm_set1_ps(sign_mask));
|
||||
|
||||
/* range reduction, init x and y depending on range */
|
||||
/* x > 2.414213562373095 */
|
||||
__m128 cmp0 = _mm_cmpgt_ps(x, _mm_set1_ps(2.414213562373095f));
|
||||
/* x > 0.4142135623730950 */
|
||||
__m128 cmp1 = _mm_cmpgt_ps(x, _mm_set1_ps(0.4142135623730950f));
|
||||
|
||||
/* x > 0.4142135623730950 && !( x > 2.414213562373095 ) */
|
||||
__m128 cmp2 = _mm_andnot_ps(cmp0, cmp1);
|
||||
|
||||
/* -( 1.0/x ) */
|
||||
__m128 y0 = _mm_and_ps(cmp0, _mm_set1_ps(cephes_PIO2F));
|
||||
__m128 x0 = _mm_div_ps(_mm_set1_ps(1.0f), x);
|
||||
x0 = _mm_xor_ps(x0, _mm_set1_ps(sign_mask));
|
||||
|
||||
__m128 y1 = _mm_and_ps(cmp2, _mm_set1_ps(cephes_PIO4F));
|
||||
/* (x-1.0)/(x+1.0) */
|
||||
__m128 x1_o = _mm_sub_ps(x, _mm_set1_ps(1.0f));
|
||||
__m128 x1_u = _mm_add_ps(x, _mm_set1_ps(1.0f));
|
||||
__m128 x1 = _mm_div_ps(x1_o, x1_u);
|
||||
|
||||
__m128 x2 = _mm_and_ps(cmp2, x1);
|
||||
x0 = _mm_and_ps(cmp0, x0);
|
||||
x2 = _mm_or_ps(x2, x0);
|
||||
cmp1 = _mm_or_ps(cmp0, cmp2);
|
||||
x2 = _mm_and_ps(cmp1, x2);
|
||||
x = _mm_andnot_ps(cmp1, x);
|
||||
x = _mm_or_ps(x2, x);
|
||||
|
||||
y = _mm_or_ps(y0, y1);
|
||||
|
||||
__m128 zz = _mm_mul_ps(x, x);
|
||||
__m128 acc = _mm_set1_ps(atancof_p0);
|
||||
acc = _mm_mul_ps(acc, zz);
|
||||
acc = _mm_sub_ps(acc, _mm_set1_ps(atancof_p1));
|
||||
acc = _mm_mul_ps(acc, zz);
|
||||
acc = _mm_add_ps(acc, _mm_set1_ps(atancof_p2));
|
||||
acc = _mm_mul_ps(acc, zz);
|
||||
acc = _mm_sub_ps(acc, _mm_set1_ps(atancof_p3));
|
||||
acc = _mm_mul_ps(acc, zz);
|
||||
acc = _mm_mul_ps(acc, x);
|
||||
acc = _mm_add_ps(acc, x);
|
||||
y = _mm_add_ps(y, acc);
|
||||
|
||||
/* update the sign */
|
||||
y = _mm_xor_ps(y, sign_bit);
|
||||
|
||||
return y;
|
||||
}
|
||||
|
||||
inline __m128 atan2_ps(__m128 y, __m128 x)
|
||||
{
|
||||
using namespace details;
|
||||
|
||||
__m128 zero = _mm_setzero_ps();
|
||||
__m128 x_eq_0 = _mm_cmpeq_ps(x, zero);
|
||||
__m128 x_gt_0 = _mm_cmpgt_ps(x, zero);
|
||||
__m128 x_le_0 = _mm_cmple_ps(x, zero);
|
||||
__m128 y_eq_0 = _mm_cmpeq_ps(y, zero);
|
||||
__m128 x_lt_0 = _mm_cmplt_ps(x, zero);
|
||||
__m128 y_lt_0 = _mm_cmplt_ps(y, zero);
|
||||
|
||||
__m128 zero_mask = _mm_and_ps(x_eq_0, y_eq_0);
|
||||
__m128 zero_mask_other_case = _mm_and_ps(y_eq_0, x_gt_0);
|
||||
zero_mask = _mm_or_ps(zero_mask, zero_mask_other_case);
|
||||
|
||||
__m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0);
|
||||
__m128 pio2_mask_sign = _mm_and_ps(y_lt_0, _mm_set1_ps(sign_mask));
|
||||
__m128 pio2_result = _mm_set1_ps(cephes_PIO2F);
|
||||
pio2_result = _mm_xor_ps(pio2_result, pio2_mask_sign);
|
||||
pio2_result = _mm_and_ps(pio2_mask, pio2_result);
|
||||
|
||||
__m128 pi_mask = _mm_and_ps(y_eq_0, x_lt_0);
|
||||
__m128 pi = _mm_set1_ps(cephes_PIF);
|
||||
__m128 pi_result = _mm_and_ps(pi_mask, pi);
|
||||
|
||||
__m128 swap_sign_mask_offset = _mm_and_ps(x_lt_0, y_lt_0);
|
||||
  swap_sign_mask_offset = _mm_and_ps(swap_sign_mask_offset, _mm_set1_ps(sign_mask));
|
||||
|
||||
__m128 offset0 = _mm_setzero_ps();
|
||||
__m128 offset1 = _mm_set1_ps(cephes_PIF);
|
||||
offset1 = _mm_xor_ps(offset1, swap_sign_mask_offset);
|
||||
|
||||
__m128 offset = _mm_andnot_ps(x_lt_0, offset0);
|
||||
offset = _mm_and_ps(x_lt_0, offset1);
|
||||
|
||||
__m128 arg = _mm_div_ps(y, x);
|
||||
__m128 atan_result = atan_ps(arg);
|
||||
atan_result = _mm_add_ps(atan_result, offset);
|
||||
|
||||
/* select between zero_result, pio2_result and atan_result */
|
||||
|
||||
__m128 result = _mm_andnot_ps(zero_mask, pio2_result);
|
||||
atan_result = _mm_andnot_ps(zero_mask, atan_result);
|
||||
atan_result = _mm_andnot_ps(pio2_mask, atan_result);
|
||||
result = _mm_or_ps(result, atan_result);
|
||||
result = _mm_or_ps(result, pi_result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
inline std::pair<__m128, __m128> sincos_ps(__m128 x)
|
||||
{
|
||||
using namespace details;
|
||||
__m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
|
||||
__m128i emm0, emm2, emm4;
|
||||
|
||||
sign_bit_sin = x;
|
||||
/* take the absolute value */
|
||||
x = _mm_and_ps(x, _mm_set1_ps(inv_sign_mask));
|
||||
/* extract the sign bit (upper one) */
|
||||
sign_bit_sin = _mm_and_ps(sign_bit_sin, _mm_set1_ps(sign_mask));
|
||||
|
||||
/* scale by 4/Pi */
|
||||
y = _mm_mul_ps(x, _mm_set1_ps(cephes_FOPI));
|
||||
|
||||
/* store the integer part of y in emm2 */
|
||||
emm2 = _mm_cvttps_epi32(y);
|
||||
|
||||
/* j=(j+1) & (~1) (see the cephes sources) */
|
||||
emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
|
||||
emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
|
||||
y = _mm_cvtepi32_ps(emm2);
|
||||
|
||||
emm4 = emm2;
|
||||
|
||||
/* get the swap sign flag for the sine */
|
||||
emm0 = _mm_and_si128(emm2, _mm_set1_epi32(4));
|
||||
emm0 = _mm_slli_epi32(emm0, 29);
|
||||
__m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
|
||||
|
||||
/* get the polynom selection mask for the sine*/
|
||||
emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
|
||||
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
|
||||
__m128 poly_mask = _mm_castsi128_ps(emm2);
|
||||
|
||||
/* The magic pass: "Extended precision modular arithmetic"
|
||||
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
||||
xmm1 = _mm_set1_ps(minus_cephes_DP1);
|
||||
xmm2 = _mm_set1_ps(minus_cephes_DP2);
|
||||
xmm3 = _mm_set1_ps(minus_cephes_DP3);
|
||||
xmm1 = _mm_mul_ps(y, xmm1);
|
||||
xmm2 = _mm_mul_ps(y, xmm2);
|
||||
xmm3 = _mm_mul_ps(y, xmm3);
|
||||
x = _mm_add_ps(x, xmm1);
|
||||
x = _mm_add_ps(x, xmm2);
|
||||
x = _mm_add_ps(x, xmm3);
|
||||
|
||||
emm4 = _mm_sub_epi32(emm4, _mm_set1_epi32(2));
|
||||
emm4 = _mm_andnot_si128(emm4, _mm_set1_epi32(4));
|
||||
emm4 = _mm_slli_epi32(emm4, 29);
|
||||
__m128 sign_bit_cos = _mm_castsi128_ps(emm4);
|
||||
|
||||
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
|
||||
|
||||
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
||||
__m128 z = _mm_mul_ps(x, x);
|
||||
y = _mm_set1_ps(coscof_p0);
|
||||
|
||||
y = _mm_mul_ps(y, z);
|
||||
y = _mm_add_ps(y, _mm_set1_ps(coscof_p1));
|
||||
y = _mm_mul_ps(y, z);
|
||||
y = _mm_add_ps(y, _mm_set1_ps(coscof_p2));
|
||||
y = _mm_mul_ps(y, z);
|
||||
y = _mm_mul_ps(y, z);
|
||||
__m128 tmp = _mm_mul_ps(z, _mm_set1_ps(0.5f));
|
||||
y = _mm_sub_ps(y, tmp);
|
||||
y = _mm_add_ps(y, _mm_set1_ps(1));
|
||||
|
||||
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
||||
|
||||
__m128 y2 = _mm_set1_ps(sincof_p0);
|
||||
y2 = _mm_mul_ps(y2, z);
|
||||
y2 = _mm_add_ps(y2, _mm_set1_ps(sincof_p1));
|
||||
y2 = _mm_mul_ps(y2, z);
|
||||
y2 = _mm_add_ps(y2, _mm_set1_ps(sincof_p2));
|
||||
y2 = _mm_mul_ps(y2, z);
|
||||
y2 = _mm_mul_ps(y2, x);
|
||||
y2 = _mm_add_ps(y2, x);
|
||||
|
||||
/* select the correct result from the two polynoms */
|
||||
xmm3 = poly_mask;
|
||||
__m128 ysin2 = _mm_and_ps(xmm3, y2);
|
||||
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
|
||||
y2 = _mm_sub_ps(y2, ysin2);
|
||||
y = _mm_sub_ps(y, ysin1);
|
||||
|
||||
xmm1 = _mm_add_ps(ysin1, ysin2);
|
||||
xmm2 = _mm_add_ps(y, y2);
|
||||
|
||||
/* update the sign */
|
||||
return std::make_pair(
|
||||
_mm_xor_ps(xmm1, sign_bit_sin), _mm_xor_ps(xmm2, sign_bit_cos));
|
||||
}
|
||||
|
||||
inline float atan2_ss(float y, float x)
|
||||
{
|
||||
return _mm_cvtss_f32(atan2_ps(_mm_set_ss(y), _mm_set_ss(x)));
|
||||
}
|
||||
|
||||
inline std::pair<float, float> sincos_ss(float angle)
|
||||
{
|
||||
auto res = sincos_ps(_mm_set_ss(angle));
|
||||
return std::make_pair(_mm_cvtss_f32(res.first), _mm_cvtss_f32(res.second));
|
||||
}
|
||||
|
||||
inline __m128 norm(__m128 x, __m128 y)
|
||||
{
|
||||
return _mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
|
||||
}
|
||||
|
||||
inline float sqrt_ss(float x)
|
||||
{
|
||||
__m128 sse_value = _mm_set_ss(x);
|
||||
sse_value = _mm_sqrt_ss(sse_value);
|
||||
return _mm_cvtss_f32(sse_value);
|
||||
}
|
||||
|
||||
template<typename fnc>
|
||||
void perform_parallel_simd_aligned(
|
||||
const std::complex<float>* input, float* output, int n, const fnc& f)
|
||||
{
|
||||
for (int i = 0; i <= n - 4; i += 4) {
|
||||
// Safe according to C++ standard
|
||||
auto p1 = _mm_load_ps(reinterpret_cast<const float*>(input + i));
|
||||
auto p2 = _mm_load_ps(reinterpret_cast<const float*>(input + i + 2));
|
||||
|
||||
// p1 = {real(c1), imag(c1), real(c2), imag(c2)}
|
||||
// p2 = {real(c3), imag(c3), real(c4), imag(c4)}
|
||||
|
||||
auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0));
|
||||
auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1));
|
||||
|
||||
__m128 out;
|
||||
f(rp, ip, out);
|
||||
|
||||
_mm_store_ps(output + i, out);
|
||||
}
|
||||
// deal with last partial packet
|
||||
for (int i = n & (~3); i < n; ++i) {
|
||||
__m128 out;
|
||||
f(_mm_set_ss(real(input[i])), _mm_set_ss(imag(input[i])), out);
|
||||
output[i] = _mm_cvtss_f32(out);
|
||||
}
|
||||
}
|
||||
|
||||
inline void rotate_parallel_simd_aligned(
|
||||
const float* oldPhase, const float* newPhase, std::complex<float>* output,
|
||||
int n)
|
||||
{
|
||||
for (int i = 0; i <= n - 4; i += 4) {
|
||||
auto [sin, cos] = sincos_ps(
|
||||
oldPhase
|
||||
? _mm_sub_ps(_mm_load_ps(newPhase + i), _mm_load_ps(oldPhase + i))
|
||||
: _mm_load_ps(newPhase + i));
|
||||
|
||||
// Safe according to C++ standard
|
||||
auto p1 = _mm_load_ps(reinterpret_cast<float*>(output + i));
|
||||
auto p2 = _mm_load_ps(reinterpret_cast<float*>(output + i + 2));
|
||||
|
||||
// p1 = {real(c1), imag(c1), real(c2), imag(c2)}
|
||||
// p2 = {real(c3), imag(c3), real(c4), imag(c4)}
|
||||
|
||||
auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0));
|
||||
auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1));
|
||||
|
||||
// We need to calculate (rp, ip) * (cos, sin) -> (rp*cos - ip*sin, rp*sin + ip*cos)
|
||||
|
||||
auto out_rp = _mm_sub_ps(_mm_mul_ps(rp, cos), _mm_mul_ps(ip, sin));
|
||||
auto out_ip = _mm_add_ps(_mm_mul_ps(rp, sin), _mm_mul_ps(ip, cos));
|
||||
|
||||
p1 = _mm_unpacklo_ps(out_rp, out_ip);
|
||||
p2 = _mm_unpackhi_ps(out_rp, out_ip);
|
||||
|
||||
_mm_store_ps(reinterpret_cast<float*>(output + i), p1);
|
||||
_mm_store_ps(reinterpret_cast<float*>(output + i + 2), p2);
|
||||
}
|
||||
// deal with last partial packet
|
||||
for (int i = n & (~3); i < n; ++i) {
|
||||
const auto theta = oldPhase ? newPhase[i] - oldPhase[i] : newPhase[i];
|
||||
output[i] *= std::complex<float>(cosf(theta), sinf(theta));
|
||||
}
|
||||
}
|
||||
} // namespace simd_complex_conversions
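
A sketch of how the packed helper above can be used, assuming both arrays are 16-byte aligned (the loads and stores inside are aligned): computing a magnitude spectrum from interleaved complex bins.

  // mag[i] = |spec[i]|; the scalar tail of the helper covers n not divisible by 4
  inline void magnitudes(const std::complex<float>* spec, float* mag, int n)
  {
    simd_complex_conversions::perform_parallel_simd_aligned(
      spec, mag, n, [](const __m128& re, const __m128& im, __m128& out) {
        out = _mm_sqrt_ps(simd_complex_conversions::norm(re, im));
      });
  }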

128 libs/staffpad/SimdTypes.h (Normal file)
@@ -0,0 +1,128 @@
/*
|
||||
Simd-types for parallel dsp processing.
|
||||
Aligned memory allocation for simd vectors.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
|
||||
#if _MSC_VER
|
||||
#define __finl __forceinline
|
||||
#define __vecc __vectorcall
|
||||
#else
|
||||
#define __finl inline __attribute__((always_inline))
|
||||
#define __vecc
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__) || (defined(_M_AMD64) || defined(_M_X64)) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
|
||||
#include "SimdTypes_sse2.h"
|
||||
#elif defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64)
|
||||
#include "SimdTypes_neon.h"
|
||||
#else
|
||||
#include "SimdTypes_scalar.h"
|
||||
#endif
|
||||
|
||||
namespace staffpad::audio::simd {
|
||||
/// reserve aligned memory. Needs to be freed with aligned_free()
|
||||
inline void* aligned_malloc(size_t required_bytes, size_t alignment)
|
||||
{
|
||||
auto offset = alignment - 1 + sizeof(void*);
|
||||
auto p1 = std::malloc(required_bytes + offset);
|
||||
if (p1 == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
// figure out aligned position
|
||||
void* p2 = (void*)(((size_t)(p1) + offset) & ~(alignment - 1));
|
||||
// write malloced pointer in front of aligned data
|
||||
((void**)p2)[-1] = p1;
|
||||
return p2;
|
||||
}
|
||||
|
||||
/// free memory allocated with aligned_malloc
|
||||
inline void aligned_free(void* p)
|
||||
{
|
||||
if (p) {
|
||||
free(((void**)p)[-1]);
|
||||
}
|
||||
}
|
||||
|
||||
/// create a C++ class at a memory-aligned spot; it needs to be deleted using aligned_delete
|
||||
template<typename cls>
|
||||
inline cls* aligned_new(int alignment)
|
||||
{
|
||||
void* mem = aligned_malloc(sizeof(cls), alignment);
|
||||
return new (mem) cls();
|
||||
}
|
||||
|
||||
/** delete objects created using aligned_new */
|
||||
template<typename cls>
|
||||
inline void aligned_delete(cls* obj)
|
||||
{
|
||||
if (obj != nullptr) {
|
||||
obj->~cls();
|
||||
aligned_free((void*)obj);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline bool is_aligned(T* obj, int alignment)
|
||||
{
|
||||
return (((size_t)obj) & (alignment - 1)) == 0;
|
||||
}
|
||||
|
||||
/** this template allows writing float SIMD code with the supported operators in the following way:
|
||||
|
||||
float *a_vec;
|
||||
const float *b_vec;
|
||||
perform_parallel_simd_aligned(a_vec, b_vec, 512, [](auto &a, auto &b) {
|
||||
auto t = a;
|
||||
a = 3.f * a + 12.f * b;
|
||||
b = 0.25f * a + 3.f * b;
|
||||
});
|
||||
*/
|
||||
|
||||
// two buffers read/write
|
||||
template<typename fnc>
|
||||
__finl void perform_parallel_simd_aligned(float* a, float* b, int n, const fnc& f)
|
||||
{
|
||||
// fnc& f needs to be a lambda of type [](auto &a, auto &b){}.
|
||||
// the autos will be float_x4/float
|
||||
constexpr int N = 4;
|
||||
constexpr int byte_size = sizeof(float);
|
||||
|
||||
assert(is_aligned(a, N * byte_size) && is_aligned(b, N * byte_size));
|
||||
|
||||
for (int i = 0; i <= n - N; i += N) {
|
||||
auto x = float_x4_load_aligned(a + i);
|
||||
auto y = float_x4_load_aligned(b + i);
|
||||
f(x, y);
|
||||
store_aligned(x, a + i);
|
||||
store_aligned(y, b + i);
|
||||
}
|
||||
// deal with last partial packet
|
||||
for (int i = n & (~(N - 1)); i < n; ++i) {
|
||||
f(a[i], b[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/// template for applying math to one data buffer
|
||||
template<typename fnc>
|
||||
__finl void perform_parallel_simd_aligned(float* a, int n, const fnc& f)
|
||||
{
|
||||
// fnc& f needs to be a lambda of type [](auto &a){}.
|
||||
constexpr int N = 4;
|
||||
constexpr int byte_size = sizeof(float);
|
||||
assert(is_aligned(a, N * byte_size));
|
||||
|
||||
for (int i = 0; i <= n - N; i += N) {
|
||||
auto x = float_x4_load_aligned(a + i);
|
||||
f(x);
|
||||
store_aligned(x, a + i);
|
||||
}
|
||||
// deal with last partial packet
|
||||
for (int i = n & (~(N - 1)); i < n; ++i) {
|
||||
f(a[i]);
|
||||
}
|
||||
}
|
||||
} // namespace staffpad::audio::simd
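
A small sketch of the aligned allocation helpers above; the alignment argument must be a power of two, since aligned_malloc rounds with the mask alignment - 1 (the Coeffs type and function name are illustrative).

  struct Coeffs { float v[16] = {}; };

  void coeffsExample()
  {
    Coeffs* c = staffpad::audio::simd::aligned_new<Coeffs>(64); // 64-byte aligned, default-constructed
    // ... use c ...
    staffpad::audio::simd::aligned_delete(c); // runs ~Coeffs() and frees via aligned_free
  }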

161 libs/staffpad/SimdTypes_neon.h (Normal file)
@@ -0,0 +1,161 @@
/*
|
||||
Neon version of SIMD types.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#if _MSC_VER
|
||||
#include <arm64_neon.h>
|
||||
#define __finl __forceinline
|
||||
#define __vecc __vectorcall
|
||||
#else
|
||||
#include <arm_neon.h>
|
||||
#define __finl inline __attribute__((always_inline))
|
||||
#define __vecc
|
||||
#endif
|
||||
|
||||
#include <cmath>
|
||||
|
||||
namespace staffpad::audio::simd {
|
||||
struct float_x4
|
||||
{
|
||||
float32x4_t s;
|
||||
|
||||
__finl float_x4()
|
||||
{
|
||||
}
|
||||
|
||||
/// enables math like: float_x4 a = 0.5f * float_x4{1.f, 2.f, 3.f, 4.f};
|
||||
__finl float_x4(float val)
|
||||
{
|
||||
s = vdupq_n_f32(val);
|
||||
}
|
||||
|
||||
__finl float_x4(const float32x4_t& val)
|
||||
: s(val)
|
||||
{
|
||||
}
|
||||
|
||||
/// enables assignments like: float_x4 a = {1.f, 2.f, 3.f, 4.f};
|
||||
__finl float_x4(float v0, float v1, float v2, float v3)
|
||||
{
|
||||
#if _MSC_VER // aggregate initializer won't work unless we have {.n128_f32 = ..} in c++20
|
||||
s.n128_f32[0] = v0;
|
||||
s.n128_f32[1] = v1;
|
||||
s.n128_f32[2] = v2;
|
||||
s.n128_f32[3] = v3;
|
||||
#elif __clang__
|
||||
s = { v0, v1, v2, v3 };
|
||||
#else
|
||||
float f[4] = { v0, v1, v2, v3 };
|
||||
s = vld1q_f32(f);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if __clang__
|
||||
private:
|
||||
// this helper class allows writing to the single registers for clang
|
||||
// __mm128 is a built-in type -> we can't return a float& reference.
|
||||
// this is just syntax sugar and clang will remove it during builds.
|
||||
//
|
||||
// it allows to write
|
||||
// float_x4 a;
|
||||
// a[1] = 2.f;
|
||||
struct RegisterAccessWrapper
|
||||
{
|
||||
float32x4_t& val;
|
||||
int i;
|
||||
|
||||
void operator=(float x)
|
||||
{
|
||||
val[i] = x;
|
||||
}
|
||||
|
||||
operator float() noexcept
|
||||
{
|
||||
return val[i];
|
||||
}
|
||||
};
|
||||
|
||||
public:
|
||||
__finl RegisterAccessWrapper operator[](int n)
|
||||
{
|
||||
RegisterAccessWrapper raw = { s, n };
|
||||
return raw;
|
||||
}
|
||||
|
||||
__finl const float operator[](int n) const
|
||||
{
|
||||
return s[n];
|
||||
}
|
||||
|
||||
#elif _MSC_VER
|
||||
// on msvc returning a ref to a sub-register is possible
|
||||
__finl float& operator[](int n)
|
||||
{
|
||||
return s.n128_f32[n];
|
||||
}
|
||||
|
||||
__finl const float operator[](int n) const
|
||||
{
|
||||
return s.n128_f32[n];
|
||||
}
|
||||
|
||||
#endif
|
||||
};
|
||||
|
||||
__finl float_x4 __vecc float_x4_from_float(float x)
|
||||
{
|
||||
return vdupq_n_f32(x);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc float_x4_load_aligned(const float* x)
|
||||
{
|
||||
return vld1q_f32(x);
|
||||
}
|
||||
|
||||
__finl void __vecc store_aligned(const float_x4& a, float* x)
|
||||
{
|
||||
vst1q_f32(x, a.s);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc unzip1(const float_x4& a, const float_x4& b)
|
||||
{
|
||||
return vuzp1q_f32(a.s, b.s);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc unzip2(const float_x4& a, const float_x4& b)
|
||||
{
|
||||
return vuzp2q_f32(a.s, b.s);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc operator+(float_x4 a, float_x4 b)
|
||||
{
|
||||
return vaddq_f32(a.s, b.s);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc operator-(float_x4 a, float_x4 b)
|
||||
{
|
||||
return vsubq_f32(a.s, b.s);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc operator*(float_x4 a, float_x4 b)
|
||||
{
|
||||
return vmulq_f32(a.s, b.s);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc sqrt(const float_x4& a)
|
||||
{
|
||||
return vsqrtq_f32(a.s);
|
||||
}
|
||||
|
||||
__finl float __vecc rint(float a)
|
||||
{
|
||||
return std::rint(a);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc rint(const float_x4& a)
|
||||
{
|
||||
return vrndnq_f32(a.s);
|
||||
}
|
||||
} // namespace staffpad::audio::simd
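
All three backends (NEON above, the scalar and SSE2 variants below) expose the same float_x4 surface, so DSP code can be written once against it. A minimal sketch, assuming 16-byte-aligned storage:

  void floatX4Example() // illustrative
  {
    alignas(16) float in[4] = { 1.f, 2.f, 3.f, 4.f };
    alignas(16) float out[4];
    auto a = staffpad::audio::simd::float_x4_load_aligned(in);
    auto b = 0.5f * a + 1.f; // {1.5, 2.0, 2.5, 3.0}
    staffpad::audio::simd::store_aligned(b, out); // out now holds b
  }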

108 libs/staffpad/SimdTypes_scalar.h (Normal file)
@@ -0,0 +1,108 @@
/*
|
||||
scalar emulation of simd types.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
#if _MSC_VER
|
||||
#define __finl __forceinline
|
||||
#define __vecc __vectorcall
|
||||
#else
|
||||
#define __finl inline __attribute__((always_inline))
|
||||
#define __vecc
|
||||
#endif
|
||||
|
||||
namespace staffpad::audio::simd {
|
||||
struct float_x4
|
||||
{
|
||||
float v[4];
|
||||
|
||||
__finl float_x4()
|
||||
{
|
||||
}
|
||||
|
||||
/// enables math like: float_x4 a = 0.5f * float_x4{1.f, 2.f, 3.f, 4.f};
|
||||
__finl float_x4(float val)
|
||||
{
|
||||
v[0] = v[1] = v[2] = v[3] = val;
|
||||
}
|
||||
|
||||
/// enables assignments like: float_x4 a = {1.f, 2.f, 3.f, 4.f};
|
||||
__finl float_x4(float v0, float v1, float v2, float v3)
|
||||
{
|
||||
v[0] = v0;
|
||||
v[1] = v1;
|
||||
v[2] = v2;
|
||||
v[3] = v3;
|
||||
}
|
||||
|
||||
__finl float& operator[](int n)
|
||||
{
|
||||
return v[n];
|
||||
}
|
||||
|
||||
__finl const float& operator[](int n) const
|
||||
{
|
||||
return v[n];
|
||||
}
|
||||
};
|
||||
|
||||
__finl float_x4 __vecc float_x4_from_float(float x)
|
||||
{
|
||||
return { x, x, x, x };
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc float_x4_load_aligned(const float* x)
|
||||
{
|
||||
return { x[0], x[1], x[2], x[3] };
|
||||
}
|
||||
|
||||
__finl void __vecc store_aligned(const float_x4& a, float* x)
|
||||
{
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
x[i] = a[i];
|
||||
}
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc unzip1(const float_x4& a, const float_x4& b)
|
||||
{
|
||||
return { a[0], a[2], b[0], b[2] };
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc unzip2(const float_x4& a, const float_x4& b)
|
||||
{
|
||||
return { a[1], a[3], b[1], b[3] }; // odd lanes, matching the SSE2/NEON unzip2
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc operator+(float_x4 a, float_x4 b)
|
||||
{
|
||||
return { a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3] };
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc operator-(float_x4 a, float_x4 b)
|
||||
{
|
||||
return { a[0] - b[0], a[1] - b[1], a[2] - b[2], a[3] - b[3] };
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc operator*(float_x4 a, float_x4 b)
|
||||
{
|
||||
return { a[0] * b[0], a[1] * b[1], a[2] * b[2], a[3] * b[3] };
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc sqrt(const float_x4& a)
|
||||
{
|
||||
return { std::sqrt(a[0]), std::sqrt(a[1]), std::sqrt(a[2]), std::sqrt(a[3]) };
|
||||
}
|
||||
|
||||
__finl float __vecc rint(float a)
|
||||
{
|
||||
return std::rint(a);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc rint(const float_x4& a)
|
||||
{
|
||||
return { std::rint(a[0]), std::rint(a[1]), std::rint(a[2]), std::rint(a[3]) };
|
||||
}
|
||||
} // namespace staffpad::audio::simd
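
For reference, the unzip helpers have the same semantics in all three backends; they de-interleave even and odd lanes, e.g. real and imaginary parts of packed complex data:

  a = {a0, a1, a2, a3},  b = {b0, b1, b2, b3}
  unzip1(a, b) -> {a0, a2, b0, b2}
  unzip2(a, b) -> {a1, a3, b1, b3}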

150 libs/staffpad/SimdTypes_sse2.h (Normal file)
@@ -0,0 +1,150 @@
/*
|
||||
SSE2 simd types
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#if _MSC_VER
|
||||
#define __finl __forceinline
|
||||
#define __vecc __vectorcall
|
||||
#else
|
||||
#define __finl inline __attribute__((always_inline))
|
||||
#define __vecc
|
||||
#endif
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
namespace staffpad::audio::simd {
|
||||
// this is jumping through some hoops to get the same level of support
|
||||
// for clang and msvc. With clang, the sse2 types are built-in and have
|
||||
// some arithmetic operators defined.
|
||||
// On msvc the sse2 types are structs with no operators.
|
||||
// to get to the same level and to be able to write algorithms "naturally",
|
||||
// everything needs to be wrapped in a struct.
|
||||
struct float_x4
|
||||
{
|
||||
__m128 s;
|
||||
__finl float_x4()
|
||||
{
|
||||
}
|
||||
|
||||
/// enables math like: float_x4 a = 0.5f * float_x4{1.f, 2.f, 3.f, 4.f};
|
||||
__finl float_x4(float val)
|
||||
{
|
||||
s = _mm_set1_ps(val);
|
||||
}
|
||||
|
||||
__finl float_x4(const __m128& val)
|
||||
: s(val)
|
||||
{
|
||||
}
|
||||
|
||||
#if __clang__
|
||||
private:
|
||||
// this helper class allows writing to the single registers for clang
|
||||
// __m128 is a built-in type -> we can't return a float& reference.
|
||||
// this is just syntax sugar and clang will remove it during builds.
|
||||
//
|
||||
// it allows to write
|
||||
// float_x4 a;
|
||||
// a[1] = 2.f;
|
||||
struct RegisterAccessWrapper
|
||||
{
|
||||
__m128& val;
|
||||
int i;
|
||||
|
||||
void operator=(float x)
|
||||
{
|
||||
val[i] = x;
|
||||
}
|
||||
|
||||
operator float() noexcept
|
||||
{
|
||||
return val[i];
|
||||
}
|
||||
};
|
||||
|
||||
public:
|
||||
__finl RegisterAccessWrapper operator[](int n)
|
||||
{
|
||||
RegisterAccessWrapper raw = { s, n };
|
||||
return raw;
|
||||
}
|
||||
|
||||
__finl const float operator[](int n) const
|
||||
{
|
||||
return s[n];
|
||||
}
|
||||
|
||||
#elif _MSC_VER
|
||||
// on msvc returning a ref to a sub-register is possible
|
||||
__finl float& operator[](int n)
|
||||
{
|
||||
return s.m128_f32[n];
|
||||
}
|
||||
|
||||
__finl const float operator[](int n) const
|
||||
{
|
||||
return s.m128_f32[n];
|
||||
}
|
||||
|
||||
#endif
|
||||
};
|
||||
|
||||
__finl float_x4 __vecc float_x4_from_float(float x)
|
||||
{
|
||||
return _mm_set1_ps(x);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc float_x4_load_aligned(const float* x)
|
||||
{
|
||||
return _mm_load_ps(x);
|
||||
}
|
||||
|
||||
__finl void __vecc store_aligned(const float_x4& a, float* x)
|
||||
{
|
||||
_mm_store_ps(x, a.s);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc unzip1(const float_x4& a, const float_x4& b)
|
||||
{
|
||||
return _mm_shuffle_ps(a.s, b.s, _MM_SHUFFLE(2, 0, 2, 0));
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc unzip2(const float_x4& a, const float_x4& b)
|
||||
{
|
||||
return _mm_shuffle_ps(a.s, b.s, _MM_SHUFFLE(3, 1, 3, 1));
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc operator+(float_x4 a, float_x4 b)
|
||||
{
|
||||
return _mm_add_ps(a.s, b.s);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc operator-(float_x4 a, float_x4 b)
|
||||
{
|
||||
return _mm_sub_ps(a.s, b.s);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc operator*(float_x4 a, float_x4 b)
|
||||
{
|
||||
return _mm_mul_ps(a.s, b.s);
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc sqrt(const float_x4& a)
|
||||
{
|
||||
return _mm_sqrt_ps(a.s);
|
||||
}
|
||||
|
||||
__finl float __vecc rint(float x)
|
||||
{
|
||||
__m128i A = _mm_cvtps_epi32(_mm_set_ss(x));
|
||||
return _mm_cvtss_f32(_mm_cvtepi32_ps(A));
|
||||
}
|
||||
|
||||
__finl float_x4 __vecc rint(const float_x4& a)
|
||||
{
|
||||
__m128i A = _mm_cvtps_epi32(a.s);
|
||||
return _mm_cvtepi32_ps(A);
|
||||
}
|
||||
} // namespace staffpad::audio::simd

559 libs/staffpad/TimeAndPitch.cpp (Normal file)
@@ -0,0 +1,559 @@
#include "staffpad/TimeAndPitch.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <random>
|
||||
#include <stdlib.h>
|
||||
#include <utility>
|
||||
|
||||
#include "CircularSampleBuffer.h"
|
||||
#include "FourierTransform_pffft.h"
|
||||
#include "SamplesFloat.h"
|
||||
#include "SimdTypes.h"
|
||||
|
||||
using namespace staffpad::audio;
|
||||
|
||||
namespace staffpad {
|
||||
namespace {
|
||||
constexpr double twoPi = 6.28318530717958647692f;
|
||||
|
||||
/**
|
||||
* 6-point lagrange interpolator SNR ~ -35.2db
|
||||
* interpolates between samples 2 and 3 using t.
|
||||
*
|
||||
* smp pointer to a memory region containing 6 samples
|
||||
* t fractional sample to interpolate t >= 0 && t < 1
|
||||
* return value the interpolated sample
|
||||
*/
|
||||
inline float lagrange6(const float (& smp)[6], float t)
|
||||
{
|
||||
auto* y = &smp[2];
|
||||
|
||||
auto ym1_y1 = y[-1] + y[1];
|
||||
auto twentyfourth_ym2_y2 = 1.f / 24.f * (y[-2] + y[2]);
|
||||
auto c0 = y[0];
|
||||
auto c1 = 0.05f * y[-2] - 0.5f * y[-1] - 1.f / 3.f * y[0] + y[1] - 0.25f * y[2] + 1.f / 30.f * y[3];
|
||||
auto c2 = 2.f / 3.f * ym1_y1 - 1.25f * y[0] - twentyfourth_ym2_y2;
|
||||
auto c3 = 5.f / 12.f * y[0] - 7.f / 12.f * y[1] + 7.f / 24.f * y[2] - 1.f / 24.f * (y[-2] + y[-1] + y[3]);
|
||||
auto c4 = 0.25f * y[0] - 1.f / 6.f * ym1_y1 + twentyfourth_ym2_y2;
|
||||
auto c5 = 1.f / 120.f * (y[3] - y[-2]) + 1.f / 24.f * (y[-1] - y[2]) + 1.f / 12.f * (y[1] - y[0]);
|
||||
|
||||
// Estrin's scheme
|
||||
auto t2 = t * t;
|
||||
return (c0 + c1 * t) + (c2 + c3 * t) * t2 + (c4 + c5 * t) * t2 * t2;
|
||||
}
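
// Usage sketch (illustrative): to read the fractional position k + t from a
// signal x, pass the six samples x[k-2] .. x[k+3] and the fraction t, e.g.
//   float smp[6] = { x[k - 2], x[k - 1], x[k], x[k + 1], x[k + 2], x[k + 3] };
//   float y = lagrange6(smp, 0.5f); // half-way between x[k] and x[k+1]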
|
||||
|
||||
void generateRandomPhaseVector(float* dst, size_t size, std::mt19937& gen)
|
||||
{
|
||||
std::vector<std::complex<float> > v(size);
|
||||
std::uniform_real_distribution<float> dis(
|
||||
-3.14159265358979323846f, 3.14159265358979323846f);
|
||||
std::for_each(dst, dst + size, [&dis, &gen](auto& a) { a = dis(gen); });
|
||||
}
|
||||
} // namespace
|
||||
|
||||
struct TimeAndPitch::impl
|
||||
{
|
||||
impl(int fft_size)
|
||||
: fft(fft_size)
|
||||
{
|
||||
randomGenerator.seed(0);
|
||||
}
|
||||
|
||||
FourierTransform fft;
|
||||
std::mt19937 randomGenerator;
|
||||
CircularSampleBuffer<float> inResampleInputBuffer[2];
|
||||
CircularSampleBuffer<float> inCircularBuffer[2];
|
||||
CircularSampleBuffer<float> outCircularBuffer[2];
|
||||
CircularSampleBuffer<float> normalizationBuffer;
|
||||
|
||||
SamplesReal fft_timeseries;
|
||||
SamplesComplex spectrum;
|
||||
SamplesReal norm;
|
||||
SamplesReal phase;
|
||||
SamplesReal last_phase;
|
||||
SamplesReal phase_accum;
|
||||
SamplesReal cosWindow;
|
||||
SamplesReal sqWindow;
|
||||
SamplesReal last_norm;
|
||||
SamplesReal random_phases;
|
||||
|
||||
double exact_hop_a = 512.0, hop_a_err = 0.0;
|
||||
double exact_hop_s = 0.0;
|
||||
double next_exact_hop_s = 512.0;
|
||||
double hop_s_err = 0.0;
|
||||
|
||||
std::vector<int> peak_index, trough_index;
|
||||
};
|
||||
|
||||
TimeAndPitch::TimeAndPitch(
|
||||
int fftSize, bool reduceImaging, ShiftTimbreCb shiftTimbreCb)
|
||||
: fftSize(fftSize)
|
||||
, _reduceImaging(reduceImaging)
|
||||
, _shiftTimbreCb(std::move(shiftTimbreCb))
|
||||
{
|
||||
}
|
||||
|
||||
TimeAndPitch::~TimeAndPitch()
|
||||
{
|
||||
// Here for ~std::shared_ptr<impl>() to know ~impl()
|
||||
}
|
||||
|
||||
void TimeAndPitch::setup(int numChannels, int maxBlockSize)
|
||||
{
|
||||
assert(numChannels == 1 || numChannels == 2);
|
||||
_numChannels = numChannels;
|
||||
|
||||
d = std::make_unique<impl>(fftSize);
|
||||
_maxBlockSize = maxBlockSize;
|
||||
_numBins = fftSize / 2 + 1;
|
||||
|
||||
// audio sample buffers
|
||||
d->fft_timeseries.setSize(_numChannels, fftSize);
|
||||
int outBufferSize = fftSize + 2 * _maxBlockSize; // 1 block extra for safety
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
    d->inResampleInputBuffer[ch].setSize(_maxBlockSize + 6); // needs additional history for resampling. more in the future
|
||||
d->inCircularBuffer[ch].setSize(fftSize);
|
||||
d->outCircularBuffer[ch].setSize(outBufferSize);
|
||||
}
|
||||
d->normalizationBuffer.setSize(outBufferSize);
|
||||
|
||||
// fft coefficient buffers
|
||||
d->spectrum.setSize(_numChannels, _numBins);
|
||||
d->norm.setSize(1, _numBins);
|
||||
d->last_norm.setSize(1, _numBins);
|
||||
d->phase.setSize(_numChannels, _numBins);
|
||||
d->last_phase.setSize(_numChannels, _numBins);
|
||||
d->phase_accum.setSize(_numChannels, _numBins);
|
||||
d->random_phases.setSize(1, _numBins);
|
||||
generateRandomPhaseVector(
|
||||
d->random_phases.getPtr(0), _numBins, d->randomGenerator);
|
||||
|
||||
_expectedPhaseChangePerBinPerSample = twoPi / double(fftSize);
|
||||
|
||||
// Cos window
|
||||
// For perfect int-factor overlaps the first sample needs to be 0 and the last non-zero.
|
||||
d->cosWindow.setSize(1, fftSize);
|
||||
d->sqWindow.setSize(1, fftSize);
|
||||
auto* w = d->cosWindow.getPtr(0);
|
||||
auto* sqw = d->sqWindow.getPtr(0);
|
||||
for (int i = 0; i < fftSize; ++i) {
|
||||
w[i] = -0.5f * std::cos(float(twoPi * (float)i / (float)fftSize)) + 0.5f;
|
||||
sqw[i] = w[i] * w[i];
|
||||
}
|
||||
|
||||
d->peak_index.reserve(_numBins);
|
||||
d->trough_index.reserve(_numBins);
|
||||
|
||||
// force fft to allocate all buffers now. Currently there is no other way in the fft class
|
||||
d->fft.forwardReal(d->fft_timeseries, d->spectrum);
|
||||
|
||||
reset();
|
||||
}
|
||||
|
||||
int TimeAndPitch::getSamplesToNextHop() const
|
||||
{
|
||||
return std::max(0, int(std::ceil(d->exact_hop_a)) - _analysis_hop_counter
|
||||
+ 1); // 1 extra to counter pitch factor error accumulation
|
||||
}
|
||||
|
||||
int TimeAndPitch::getNumAvailableOutputSamples() const
|
||||
{
|
||||
return _availableOutputSamples;
|
||||
}
|
||||
|
||||
void TimeAndPitch::reset()
|
||||
{
|
||||
_analysis_hop_counter = 0;
|
||||
_availableOutputSamples = 0;
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
d->inResampleInputBuffer[ch].reset();
|
||||
d->inCircularBuffer[ch].reset();
|
||||
d->outCircularBuffer[ch].reset();
|
||||
}
|
||||
d->normalizationBuffer.reset();
|
||||
d->last_norm.zeroOut();
|
||||
d->last_phase.zeroOut();
|
||||
d->phase_accum.zeroOut();
|
||||
_outBufferWriteOffset = 0;
|
||||
d->hop_a_err = 0.0;
|
||||
d->hop_s_err = 0.0;
|
||||
d->exact_hop_s = 0.0;
|
||||
_resampleReadPos = 0.0;
|
||||
}
|
||||
|
||||
namespace {
|
||||
// wrap a phase value into -PI..PI
|
||||
inline float _unwrapPhase(float arg)
|
||||
{
|
||||
using namespace audio::simd;
|
||||
return arg - rint(arg * 0.15915494309f) * 6.283185307f;
|
||||
}
|
||||
|
||||
void _unwrapPhaseVec(float* v, int n)
|
||||
{
|
||||
audio::simd::perform_parallel_simd_aligned(v, n, [](auto& a) { a = a - rint(a * 0.15915494309f) * 6.283185307f; });
|
||||
}
|
||||
|
||||
/// rotate even-sized array by half its size to align fft phase at the center
|
||||
void _fft_shift(float* v, int n)
|
||||
{
|
||||
assert((n & 1) == 0);
|
||||
int n2 = n >> 1;
|
||||
audio::simd::perform_parallel_simd_aligned(v, v + n2, n2, [](auto& a, auto& b) {
|
||||
auto tmp = a;
|
||||
a = b;
|
||||
b = tmp;
|
||||
});
|
||||
}
|
||||
|
||||
void _lr_to_ms(float* ch1, float* ch2, int n)
|
||||
{
|
||||
audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) {
|
||||
auto l = a, r = b;
|
||||
a = 0.5f * (l + r);
|
||||
b = 0.5f * (l - r);
|
||||
});
|
||||
}
|
||||
|
||||
void _ms_to_lr(float* ch1, float* ch2, int n)
|
||||
{
|
||||
audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) {
|
||||
auto m = a, s = b;
|
||||
a = m + s;
|
||||
b = m - s;
|
||||
});
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
template<int num_channels>
|
||||
void TimeAndPitch::_time_stretch(float a_a, float a_s)
|
||||
{
|
||||
auto alpha = a_s / a_a; // this is the real stretch factor based on integer hop sizes
|
||||
|
||||
// Create a norm array
|
||||
const auto* norms = d->norm.getPtr(0); // for stereo, just use the mid-channel
|
||||
const auto* norms_last = d->last_norm.getPtr(0);
|
||||
|
||||
d->peak_index.clear();
|
||||
d->trough_index.clear();
|
||||
float lowest = norms[0];
|
||||
int trough = 0;
|
||||
if (norms_last[0] >= norms[1]) {
|
||||
d->peak_index.emplace_back(0);
|
||||
d->trough_index.emplace_back(0);
|
||||
}
|
||||
for (int i = 1; i < _numBins - 1; ++i) {
|
||||
if (norms_last[i] >= norms[i - 1] && norms_last[i] >= norms[i + 1]) {
|
||||
d->peak_index.emplace_back(i);
|
||||
d->trough_index.emplace_back(trough);
|
||||
trough = i;
|
||||
lowest = norms[i];
|
||||
} else if (norms[i] < lowest) {
|
||||
lowest = norms[i];
|
||||
trough = i;
|
||||
}
|
||||
}
|
||||
if (norms_last[_numBins - 1] > norms[_numBins - 2]) {
|
||||
d->peak_index.emplace_back(_numBins - 1);
|
||||
d->trough_index.emplace_back(trough);
|
||||
}
|
||||
|
||||
if (d->peak_index.size() == 0) {
|
||||
// use max norm of last frame
|
||||
int max_idx = 0;
|
||||
float max_norm = 0.f;
|
||||
vo::findMaxElement(norms_last, _numBins, max_idx, max_norm);
|
||||
d->peak_index.emplace_back(max_idx);
|
||||
}
|
||||
|
||||
const float** p = const_cast<const float**>(d->phase.getPtrs());
|
||||
const float** p_l = const_cast<const float**>(d->last_phase.getPtrs());
|
||||
float** acc = d->phase_accum.getPtrs();
|
||||
|
||||
float expChange_a = a_a * float(_expectedPhaseChangePerBinPerSample);
|
||||
float expChange_s = a_s * float(_expectedPhaseChangePerBinPerSample);
|
||||
|
||||
int num_peaks = (int)d->peak_index.size();
|
||||
// time integration for all peaks
|
||||
for (int i = 0; i < num_peaks; ++i) {
|
||||
const int n = d->peak_index[i];
|
||||
auto fn = float(n);
|
||||
|
||||
// determine phase from last frame
|
||||
float fn_expChange_a = fn * expChange_a;
|
||||
float fn_expChange_s = fn * expChange_s;
|
||||
|
||||
for (int ch = 0; ch < num_channels; ++ch) {
|
||||
acc[ch][n] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n] - p_l[ch][n] - fn_expChange_a) + fn_expChange_s;
|
||||
}
|
||||
}
|
||||
|
||||
// go from first peak to 0
|
||||
for (int n = d->peak_index[0]; n > 0; --n) {
|
||||
for (int ch = 0; ch < num_channels; ++ch) {
|
||||
acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]);
|
||||
}
|
||||
}
|
||||
|
||||
// 'grow' from pairs of peaks to the lowest norm in between
|
||||
for (int i = 0; i < num_peaks - 1; ++i) {
|
||||
const int mid = d->trough_index[i + 1];
|
||||
for (int n = d->peak_index[i]; n < mid; ++n) {
|
||||
for (int ch = 0; ch < num_channels; ++ch) {
|
||||
acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]);
|
||||
}
|
||||
}
|
||||
for (int n = d->peak_index[i + 1]; n > mid + 1; --n) {
|
||||
for (int ch = 0; ch < num_channels; ++ch) {
|
||||
acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// last peak to the end
|
||||
for (int n = d->peak_index[num_peaks - 1]; n < _numBins - 1; ++n) {
|
||||
for (int ch = 0; ch < num_channels; ++ch) {
|
||||
acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]);
|
||||
}
|
||||
}
|
||||
|
||||
d->last_norm.assignSamples(d->norm);
|
||||
d->last_phase.assignSamples(d->phase);
|
||||
}
|
||||
|
||||
void TimeAndPitch::_applyImagingReduction()
|
||||
{
|
||||
// Upsampling brings spectral components down, including those that were
|
||||
// beyond the Nyquist. From `imagingBeginBin`, we have a mirroring of the
|
||||
// spectrum, which, if not lowpass-filtered by the resampler, may yield
|
||||
// an aliasing-like quality if what is mirrored is a harmonic series. A
|
||||
// simple thing to do against that is to scramble the phase values there.
|
||||
  // This way we preserve natural energy fluctuations, but the artifact is far
  // less noticeable because it becomes noise-like, which is what one typically
  // gets at those frequencies anyway.
|
||||
constexpr auto alignment = 64 / sizeof(float);
|
||||
  const int imagingBeginBin = std::ceil((fftSize / 2 * _pitchFactor + 1) / alignment) * alignment;
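  // (Worked example, assuming fftSize = 4096 and _pitchFactor = 0.5: the raw
  // value 4096 / 2 * 0.5 + 1 = 1025 is rounded up to the next multiple of the
  // 16-float alignment, giving imagingBeginBin = 1040 out of 2049 bins.)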
|
||||
|
||||
if (imagingBeginBin >= _numBins) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto n = _numBins - imagingBeginBin;
|
||||
auto pSpec = d->spectrum.getPtr(0);
|
||||
auto pRand = d->random_phases.getPtr(0);
|
||||
vo::rotate(nullptr, pRand, pSpec + imagingBeginBin, n);
|
||||
// Just rotating the random phase vector to produce pseudo-random
|
||||
// sequences of phases.
|
||||
std::uniform_int_distribution<size_t> dis(0, n - 1);
|
||||
const auto middle = dis(d->randomGenerator);
|
||||
std::rotate(pRand, pRand + middle, pRand + n);
|
||||
}
|
||||
|
||||
/// process one hop/chunk in _fft_timeSeries and add the result to output circular buffer
|
||||
void TimeAndPitch::_process_hop(int hop_a, int hop_s)
|
||||
{
|
||||
if (d->exact_hop_a != d->exact_hop_s) {
|
||||
if (_numChannels == 2) {
|
||||
_lr_to_ms(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize);
|
||||
}
|
||||
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
|
||||
_fft_shift(d->fft_timeseries.getPtr(ch), fftSize);
|
||||
}
|
||||
|
||||
// determine norm/phase
|
||||
d->fft.forwardReal(d->fft_timeseries, d->spectrum);
|
||||
// norms of the mid channel only (or sole channel) are needed in
|
||||
// _time_stretch
|
||||
vo::calcNorms(d->spectrum.getPtr(0), d->norm.getPtr(0), d->spectrum.getNumSamples());
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
vo::calcPhases(d->spectrum.getPtr(ch), d->phase.getPtr(ch), d->spectrum.getNumSamples());
|
||||
}
|
||||
|
||||
if (_shiftTimbreCb) {
|
||||
_shiftTimbreCb(
|
||||
1 / _pitchFactor, d->spectrum.getPtr(0), d->norm.getPtr(0));
|
||||
}
|
||||
|
||||
if (_reduceImaging && _pitchFactor < 1.) {
|
||||
_applyImagingReduction();
|
||||
}
|
||||
|
||||
if (_numChannels == 1) {
|
||||
_time_stretch<1>((float)hop_a, (float)hop_s);
|
||||
} else if (_numChannels == 2) {
|
||||
_time_stretch<2>((float)hop_a, (float)hop_s);
|
||||
}
|
||||
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
_unwrapPhaseVec(d->phase_accum.getPtr(ch), _numBins);
|
||||
}
|
||||
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
vo::rotate(d->phase.getPtr(ch), d->phase_accum.getPtr(ch), d->spectrum.getPtr(ch),
|
||||
d->spectrum.getNumSamples());
|
||||
}
|
||||
d->fft.inverseReal(d->spectrum, d->fft_timeseries);
|
||||
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
vo::constantMultiply(d->fft_timeseries.getPtr(ch), 1.f / fftSize, d->fft_timeseries.getPtr(ch),
|
||||
d->fft_timeseries.getNumSamples());
|
||||
}
|
||||
|
||||
if (_numChannels == 2) {
|
||||
_ms_to_lr(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize);
|
||||
}
|
||||
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
_fft_shift(d->fft_timeseries.getPtr(ch), fftSize);
|
||||
vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
|
||||
}
|
||||
} else { // stretch factor == 1.0 => just apply window
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
vo::multiply(d->fft_timeseries.getPtr(ch), d->sqWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
float gainFact = float(_timeStretch * ((8.f / 3.f) / _overlap_a)); // overlap add normalization factor
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
d->outCircularBuffer[ch].writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->fft_timeseries.getPtr(ch),
|
||||
gainFact);
|
||||
}
|
||||
|
||||
if (normalize_window) {
|
||||
d->normalizationBuffer.writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->sqWindow.getPtr(0), gainFact);
|
||||
}
|
||||
}
|
||||
_outBufferWriteOffset += hop_s;
|
||||
_availableOutputSamples += hop_s;
|
||||
}
|
||||
|
||||
void TimeAndPitch::setTimeStretchAndPitchFactor(double timeScale, double pitchFactor)
|
||||
{
|
||||
assert(timeScale > 0.0);
|
||||
assert(pitchFactor > 0.0);
|
||||
_pitchFactor = pitchFactor;
|
||||
_timeStretch = timeScale * pitchFactor;
|
||||
|
||||
_overlap_a = overlap;
|
||||
double overlap_s = overlap;
|
||||
if (_timeStretch > 1.0 || !modulate_synthesis_hop) {
|
||||
_overlap_a *= _timeStretch;
|
||||
} else {
|
||||
overlap_s /= _timeStretch;
|
||||
}
|
||||
|
||||
d->exact_hop_a = double(fftSize) / _overlap_a;
|
||||
if (!modulate_synthesis_hop) {
|
||||
d->exact_hop_s = double(fftSize) / overlap_s;
|
||||
} else {
|
||||
// switch after processing next block, unless it's the first one
|
||||
d->next_exact_hop_s = double(fftSize) / overlap_s;
|
||||
if (d->exact_hop_s == 0.0) { // on first chunk set it immediately
|
||||
d->exact_hop_s = d->next_exact_hop_s;
|
||||
}
|
||||
}
|
||||
}
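// Hedged sketch, not part of the original file: how the analysis/synthesis hop sizes follow
// from the requested factors when modulate_synthesis_hop is true and overlap == 4 (see
// TimeAndPitch.h). The fftSize argument is a placeholder; in both branches the ratio
// hop_s / hop_a equals the internal time stretch.
struct ExampleHops
{
  double hop_a, hop_s;
};

static ExampleHops example_hop_sizes(double timeScale, double pitchFactor, int fftSize)
{
  const double overlap = 4.0;
  const double timeStretch = timeScale * pitchFactor; // stretch applied internally after resampling
  double overlap_a = overlap;
  double overlap_s = overlap;
  if (timeStretch > 1.0) {
    overlap_a *= timeStretch; // denser analysis hops when stretching out further
  } else {
    overlap_s /= timeStretch; // denser synthesis hops when compressing
  }
  return { fftSize / overlap_a, fftSize / overlap_s }; // {exact_hop_a, exact_hop_s}
}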
|
||||
|
||||
void TimeAndPitch::feedAudio(const float* const* input_smp, int numSamples)
|
||||
{
|
||||
assert(numSamples <= _maxBlockSize);
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
d->inResampleInputBuffer[ch].writeBlock(0, numSamples, input_smp[ch]);
|
||||
d->inResampleInputBuffer[ch].advance(numSamples);
|
||||
}
|
||||
_resampleReadPos -= numSamples;
|
||||
|
||||
// determine integer hop size for the next hop.
|
||||
// The error accumulators will make the value fluctuate by one to reach arbitrary scale
|
||||
// ratios
|
||||
if (d->exact_hop_s == 0.0) { // this happens if feedAudio is called without setting up stretch factors
|
||||
d->exact_hop_s = d->next_exact_hop_s;
|
||||
}
|
||||
|
||||
const int hop_s = int(d->exact_hop_s + d->hop_s_err);
|
||||
const int hop_a = int(d->exact_hop_a + d->hop_a_err);
|
||||
|
||||
double step = 0.0;
|
||||
double read_pos = _resampleReadPos;
|
||||
while (read_pos < 0.0)
|
||||
{
|
||||
auto int_pos = int(std::floor(read_pos));
|
||||
float frac_pos = float(read_pos - int_pos);
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
float smp[6];
|
||||
d->inResampleInputBuffer[ch].readBlock(int_pos - 6, 6, smp);
|
||||
float s = (frac_pos == 0) ? smp[2] : lagrange6(smp, frac_pos);
|
||||
d->inCircularBuffer[ch].writeOffset0(s);
|
||||
d->inCircularBuffer[ch].advance(1);
|
||||
}
|
||||
_analysis_hop_counter++;
|
||||
step++;
|
||||
|
||||
if (_analysis_hop_counter >= hop_a) {
|
||||
_analysis_hop_counter -= hop_a;
|
||||
d->hop_s_err += d->exact_hop_s - hop_s;
|
||||
d->hop_a_err += d->exact_hop_a - hop_a;
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
d->inCircularBuffer[ch].readBlock(-fftSize, fftSize, d->fft_timeseries.getPtr(ch));
|
||||
}
|
||||
_process_hop(hop_a, hop_s);
|
||||
}
|
||||
read_pos = _resampleReadPos + step * _pitchFactor;
|
||||
}
|
||||
_resampleReadPos = read_pos;
|
||||
}
|
||||
|
||||
void TimeAndPitch::retrieveAudio(float* const* out_smp, int numSamples)
|
||||
{
|
||||
assert(numSamples <= _maxBlockSize);
|
||||
for (int ch = 0; ch < _numChannels; ++ch) {
|
||||
d->outCircularBuffer[ch].readAndClearBlock(0, numSamples, out_smp[ch]);
|
||||
if (normalize_window) {
|
||||
constexpr float curve = 4.f * 4.f; // the curve approximates 1/x above 1 but fades to 0 near 0 to avoid fade-in clicks
|
||||
for (int i = 0; i < numSamples; ++i) {
|
||||
float x = d->normalizationBuffer.read(i);
|
||||
out_smp[ch][i] *= x / (x * x + 1 / curve);
|
||||
}
|
||||
}
|
||||
d->outCircularBuffer[ch].advance(numSamples);
|
||||
}
|
||||
|
||||
d->normalizationBuffer.clearBlock(0, numSamples);
|
||||
d->normalizationBuffer.advance(numSamples);
|
||||
|
||||
_outBufferWriteOffset -= numSamples;
|
||||
_availableOutputSamples -= numSamples;
|
||||
|
||||
if (modulate_synthesis_hop) {
|
||||
d->exact_hop_s = d->next_exact_hop_s;
|
||||
}
|
||||
}
|
||||
|
||||
void TimeAndPitch::processPitchShift(float* const* smp, int numSamples, double pitchFactor)
|
||||
{
|
||||
setTimeStretchAndPitchFactor(1.0, pitchFactor);
|
||||
feedAudio(smp, numSamples);
|
||||
retrieveAudio(smp, numSamples);
|
||||
}
|
||||
|
||||
int TimeAndPitch::getLatencySamples() const
|
||||
{
|
||||
return fftSize - fftSize / overlap + 3; // 3 for resampling
|
||||
}
|
||||
|
||||
int TimeAndPitch::getLatencySamplesForStretchRatio(float timeStretch) const
|
||||
{
|
||||
const float coeff = (timeStretch < 1.f) ? (1.f / 3.f) : (2.f / 3.f);
|
||||
return int(getLatencySamples() * (timeStretch * coeff + (1 - coeff)));
|
||||
}
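// Worked example (values depend on the constructor's fftSize): with fftSize = 4096 and
// overlap = 4, getLatencySamples() = 4096 - 1024 + 3 = 3075 input samples, and for a
// stretch ratio of 2.0 the output-side estimate is int(3075 * (2 * 2/3 + 1/3)) = 5125.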
|
||||
} // namespace staffpad
|
||||
155
libs/staffpad/VectorOps.h
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
//
|
||||
// This header is provided for convenience, to easily wrap vector operations around
|
||||
// their platform-specific optimised libraries (e.g. IPP, vDSP), if desired.
|
||||
// This can be done by adding #if defined(...) compile-time branches to each function
|
||||
// and calling the corresponding library function, e.g. ippsAdd_32f or vDSP_vadd
|
||||
// inside add().
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdlib.h>

#include <algorithm> // std::fill
#include <complex>
#include <cstdint>
#include <cstring>
#include <limits> // std::numeric_limits
|
||||
|
||||
#if defined(_MSC_VER) && (defined(_M_AMD64) || defined(_M_X64) || \
|
||||
(defined(_M_IX86_FP) && _M_IX86_FP >= 2))
|
||||
#define USE_SSE2_COMPLEX 1
|
||||
#endif
|
||||
|
||||
#if USE_SSE2_COMPLEX
|
||||
# include "SimdComplexConversions_sse2.h"
|
||||
#endif
|
||||
|
||||
namespace staffpad {
|
||||
namespace vo {
|
||||
inline void* allocate(int32_t bytes)
|
||||
{
|
||||
return ::malloc(bytes);
|
||||
}
|
||||
|
||||
inline void free(void* ptr)
|
||||
{
|
||||
return ::free(ptr);
|
||||
}
|
||||
|
||||
template<class T>
|
||||
inline void copy(const T* src, T* dst, int32_t n)
|
||||
{
|
||||
memcpy(dst, src, n * sizeof(T));
|
||||
}
|
||||
|
||||
template<class T>
|
||||
inline void add(const T* src1, const T* src2, T* dst, int32_t n)
|
||||
{
|
||||
for (int32_t i = 0; i < n; i++) {
|
||||
dst[i] = src1[i] + src2[i];
|
||||
}
|
||||
}
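// Hedged sketch, not part of this library: the kind of platform branch the header comment at
// the top describes. HAVE_ACCELERATE is a hypothetical build flag, and <Accelerate/Accelerate.h>
// is assumed to be included at the top of the file when it is set; vDSP_vadd is Apple's
// Accelerate routine for elementwise float addition.
#if defined(HAVE_ACCELERATE)
template<>
inline void add<float>(const float* src1, const float* src2, float* dst, int32_t n)
{
  // strides of 1, n elements: dst[i] = src1[i] + src2[i]
  vDSP_vadd(src1, 1, src2, 1, dst, 1, static_cast<vDSP_Length>(n));
}
#endif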
|
||||
|
||||
template<class T>
|
||||
inline void subtract(const T* src1, const T* src2, T* dst, int32_t n)
|
||||
{
|
||||
for (int32_t i = 0; i < n; i++) {
|
||||
dst[i] = src2[i] - src1[i];
|
||||
}
|
||||
}
|
||||
|
||||
template<class T>
|
||||
inline void constantMultiply(const T* src, T constant, T* dst, int32_t n)
|
||||
{
|
||||
for (int32_t i = 0; i < n; i++) {
|
||||
dst[i] = src[i] * constant;
|
||||
}
|
||||
}
|
||||
|
||||
template<class T>
|
||||
inline void constantMultiplyAndAdd(const T* src, T constant, T* dst, int32_t n)
|
||||
{
|
||||
for (int32_t i = 0; i < n; i++) {
|
||||
dst[i] += src[i] * constant;
|
||||
}
|
||||
}
|
||||
|
||||
template<class T>
|
||||
inline void multiply(const T* src1, const T* src2, T* dst, int32_t n)
|
||||
{
|
||||
for (int32_t i = 0; i < n; i++) {
|
||||
dst[i] = src1[i] * src2[i];
|
||||
}
|
||||
}
|
||||
|
||||
template<class T>
|
||||
inline void setToZero(T* dst, int32_t n)
|
||||
{
|
||||
std::fill(dst, dst + n, 0.f);
|
||||
}
|
||||
|
||||
template<class T>
|
||||
inline void findMaxElement(const T* src, int32_t n, int32_t& maxIndex, T& maxValue)
|
||||
{
|
||||
maxIndex = 0;
|
||||
maxValue = n > 0 ? src[0] : std::numeric_limits<T>::min();
|
||||
|
||||
for (int32_t i = 1; i < n; i++) {
|
||||
if (src[i] > maxValue) {
|
||||
maxValue = src[i];
|
||||
maxIndex = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if USE_SSE2_COMPLEX
|
||||
|
||||
inline void calcPhases(const std::complex<float>* src, float* dst, int32_t n)
|
||||
{
|
||||
simd_complex_conversions::perform_parallel_simd_aligned(
|
||||
src, dst, n,
|
||||
[](const __m128 rp, const __m128 ip, __m128& out)
|
||||
{ out = simd_complex_conversions::atan2_ps(ip, rp); });
|
||||
}
|
||||
|
||||
inline void calcNorms(const std::complex<float>* src, float* dst, int32_t n)
|
||||
{
|
||||
simd_complex_conversions::perform_parallel_simd_aligned(
|
||||
src, dst, n,
|
||||
[](const __m128 rp, const __m128 ip, __m128& out)
|
||||
{ out = simd_complex_conversions::norm(rp, ip); });
|
||||
}
|
||||
|
||||
inline void rotate(
|
||||
const float* oldPhase, const float* newPhase, std::complex<float>* dst,
|
||||
int32_t n)
|
||||
{
|
||||
simd_complex_conversions::rotate_parallel_simd_aligned(
|
||||
oldPhase, newPhase, dst, n);
|
||||
}
|
||||
|
||||
#else
|
||||
inline void calcPhases(const std::complex<float>* src, float* dst, int32_t n)
|
||||
{
|
||||
for (int32_t i = 0; i < n; i++) {
|
||||
dst[i] = std::arg(src[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline void calcNorms(const std::complex<float>* src, float* dst, int32_t n)
|
||||
{
|
||||
for (int32_t i = 0; i < n; i++) {
|
||||
dst[i] = std::norm(src[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inline void rotate(const float* oldPhase, const float* newPhase, std::complex<float>* dst, int32_t n)
|
||||
{
|
||||
for (int32_t i = 0; i < n; i++) {
|
||||
const auto theta = oldPhase ? newPhase[i] - oldPhase[i] : newPhase[i];
|
||||
dst[i] *= std::complex<float>(cosf(theta), sinf(theta));
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
} // namespace vo
|
||||
} // namespace staffpad
|
||||
379
libs/staffpad/pffft/README.txt
Normal file
|
|
@ -0,0 +1,379 @@
|
|||
PFFFT: a pretty fast FFT.
|
||||
|
||||
TL;DR
|
||||
--
|
||||
|
||||
PFFFT does 1D Fast Fourier Transforms of single precision real and
complex vectors. It tries to do it fast, it tries to be correct, and it
tries to be small. Computations do take advantage of SSE1 instructions
|
||||
on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The
|
||||
license is BSD-like.
|
||||
|
||||
|
||||
Why does it exist:
|
||||
--
|
||||
|
||||
I was in search of a good performing FFT library, preferably very
small and with a very liberal license.
|
||||
|
||||
When one says "fft library", FFTW ("Fastest Fourier Transform in the
|
||||
West") is probably the first name that comes to mind -- I guess that
|
||||
99% of open-source projects that need a FFT do use FFTW, and are happy
|
||||
with it. However, it is quite a large library, which does everything
fft related (2d transforms, 3d transforms, other transformations such
as discrete cosine, or fast hartley). And it is licensed under the
GNU GPL, which means that it cannot be used in non open-source
|
||||
products.
|
||||
|
||||
An alternative to FFTW that is really small, is the venerable FFTPACK
|
||||
v4, which is available on NETLIB. A more recent version (v5) exists,
|
||||
but it is larger as it deals with multi-dimensional transforms. This
|
||||
is a library that is written in FORTRAN 77, a language that is now
|
||||
considered as a bit antiquated by many. FFTPACKv4 was written in 1985,
|
||||
by Dr Paul Swarztrauber of NCAR, more than 25 years ago! And despite
its age, benchmarks show that it is still a very good performing FFT
|
||||
library, see for example the 1d single precision benchmarks here:
|
||||
http://www.fftw.org/speed/opteron-2.2GHz-32bit/ . It is however not
|
||||
competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML,
|
||||
Apple vDSP. The reason for that is that those libraries do take
|
||||
advantage of the SSE SIMD instructions available on Intel CPUs,
|
||||
available since the days of the Pentium III. These instructions deal
|
||||
with small vectors of 4 floats at a time, instead of a single float
|
||||
for a traditional FPU, so when using these instructions one may expect
|
||||
a 4-fold performance improvement.
|
||||
|
||||
The idea was to take this fortran fftpack v4 code, translate to C,
|
||||
modify it to deal with those SSE instructions, and check that the
|
||||
final performance is not completely ridiculous when compared to other
|
||||
SIMD FFT libraries. Translation to C was performed with f2c (
|
||||
http://www.netlib.org/f2c/ ). The resulting file was a bit edited in
|
||||
order to remove the thousands of gotos that were introduced by
|
||||
f2c. You will find the fftpack.h and fftpack.c sources in the
|
||||
repository; this is a complete translation of
|
||||
http://www.netlib.org/fftpack/ , with the discrete cosine transform
|
||||
and the test program. There is no license information in the netlib
|
||||
repository, but it was confirmed to me by the fftpack v5 curators that
|
||||
the same terms do apply to fftpack v4:
|
||||
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html . This is a
|
||||
"BSD-like" license, it is compatible with proprietary projects.
|
||||
|
||||
Adapting fftpack to deal with the SIMD 4-element vectors instead of
|
||||
scalar single precision numbers was more complex than I originally
|
||||
thought, especially with the real transforms, and I ended up writing
|
||||
more code than I planned..
|
||||
|
||||
|
||||
The code:
|
||||
--
|
||||
|
||||
Only two files, in good old C, pffft.c and pffft.h. The API is very
very simple, just make sure that you read the comments in pffft.h.
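
As a hedged illustration (not taken from the library's own examples), here is roughly what a
real forward/backward round trip looks like when called from C++; the buffer names and the
size N = 1024 are placeholders:

#include <cstdio>
#include "pffft.h"

int main()
{
  const int N = 1024; // must be of the form 2^a * 3^b * 5^c, see pffft.h
  float* in   = static_cast<float*>(pffft_aligned_malloc(N * sizeof(float)));
  float* freq = static_cast<float*>(pffft_aligned_malloc(N * sizeof(float)));
  float* out  = static_cast<float*>(pffft_aligned_malloc(N * sizeof(float)));
  PFFFT_Setup* s = pffft_new_setup(N, PFFFT_REAL);

  for (int i = 0; i < N; ++i) in[i] = (i == 0) ? 1.f : 0.f; // unit impulse

  pffft_transform_ordered(s, in, freq, NULL, PFFFT_FORWARD);  // NULL work buffer: use the stack
  pffft_transform_ordered(s, freq, out, NULL, PFFFT_BACKWARD);

  std::printf("out[0]/N = %f (expect 1.0, since backward(forward(x)) = N*x)\n", out[0] / N);

  pffft_destroy_setup(s);
  pffft_aligned_free(in); pffft_aligned_free(freq); pffft_aligned_free(out);
  return 0;
}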
|
||||
|
||||
|
||||
Comparison with other FFTs:
|
||||
--
|
||||
|
||||
The idea was not to break speed records, but to get a decently fast
fft that is at least 50% as fast as the fastest FFT -- especially on
slower computers. I'm more focused on getting the best performance
on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than
on getting top performance on today's fastest cpus.
|
||||
|
||||
It can be used in a real-time context as the fft functions do not
|
||||
perform any memory allocation -- that is why they accept a 'work'
|
||||
array in their arguments.
|
||||
|
||||
It is also a bit focused on performing 1D convolutions, that is why it
provides "unordered" FFTs, and a fourier domain convolution
operation.
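
Since forward transforms can be left in pffft's internal "unordered" z-domain layout, a
frequency-domain convolution needs no reordering. A hedged sketch from C++ (the function name
and buffers are illustrative; the setup is assumed to be a PFFFT_REAL setup of size N, and the
functions used are the ones declared in pffft.h, shown further below):

#include <cstring>
#include "pffft.h"

void circular_convolve(PFFFT_Setup* s, const float* a, const float* b, float* ab, int N)
{
  float* A  = static_cast<float*>(pffft_aligned_malloc(N * sizeof(float)));
  float* B  = static_cast<float*>(pffft_aligned_malloc(N * sizeof(float)));
  float* AB = static_cast<float*>(pffft_aligned_malloc(N * sizeof(float)));

  pffft_transform(s, a, A, NULL, PFFFT_FORWARD); // unordered z-domain output is fine here
  pffft_transform(s, b, B, NULL, PFFFT_FORWARD);

  std::memset(AB, 0, N * sizeof(float));            // zconvolve *accumulates* into AB
  pffft_zconvolve_accumulate(s, A, B, AB, 1.f / N); // 1/N folds in the unscaled inverse

  pffft_transform(s, AB, ab, NULL, PFFFT_BACKWARD);

  pffft_aligned_free(A); pffft_aligned_free(B); pffft_aligned_free(AB);
}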
|
||||
|
||||
|
||||
Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9)
|
||||
--
|
||||
|
||||
The benchmark shows the performance of various fft implementations measured in
|
||||
MFlops, with the number of floating point operations being defined as 5Nlog2(N)
|
||||
for a length N complex fft, and 2.5*Nlog2(N) for a real fft.
|
||||
See http://www.fftw.org/speed/method.html for an explanation of these formulas.
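For example, by this metric a real fft of length N = 1024 that sustains about 15000 MFlops
spends roughly 2.5 * 1024 * 10 / 15e9, i.e. about 1.7 microseconds, per transform.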
|
||||
|
||||
MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600
|
||||
|
||||
Built with:
|
||||
|
||||
gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f
|
||||
|
||||
| input len |real FFTPack| real vDSP | real FFTW | real PFFFT | |cplx FFTPack| cplx vDSP | cplx FFTW | cplx PFFFT |
|
||||
|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
|
||||
| 64 | 2816 | 8596 | 7329 | 8187 | | 2887 | 14898 | 14668 | 11108 |
|
||||
| 96 | 3298 | n/a | 8378 | 7727 | | 3953 | n/a | 15680 | 10878 |
|
||||
| 128 | 3507 | 11575 | 9266 | 10108 | | 4233 | 17598 | 16427 | 12000 |
|
||||
| 160 | 3391 | n/a | 9838 | 10711 | | 4220 | n/a | 16653 | 11187 |
|
||||
| 192 | 3919 | n/a | 9868 | 10956 | | 4297 | n/a | 15770 | 12540 |
|
||||
| 256 | 4283 | 13179 | 10694 | 13128 | | 4545 | 19550 | 16350 | 13822 |
|
||||
| 384 | 3136 | n/a | 10810 | 12061 | | 3600 | n/a | 16103 | 13240 |
|
||||
| 480 | 3477 | n/a | 10632 | 12074 | | 3536 | n/a | 11630 | 12522 |
|
||||
| 512 | 3783 | 15141 | 11267 | 13838 | | 3649 | 20002 | 16560 | 13580 |
|
||||
| 640 | 3639 | n/a | 11164 | 13946 | | 3695 | n/a | 15416 | 13890 |
|
||||
| 768 | 3800 | n/a | 11245 | 13495 | | 3590 | n/a | 15802 | 14552 |
|
||||
| 800 | 3440 | n/a | 10499 | 13301 | | 3659 | n/a | 12056 | 13268 |
|
||||
| 1024 | 3924 | 15605 | 11450 | 15339 | | 3769 | 20963 | 13941 | 15467 |
|
||||
| 2048 | 4518 | 16195 | 11551 | 15532 | | 4258 | 20413 | 13723 | 15042 |
|
||||
| 2400 | 4294 | n/a | 10685 | 13078 | | 4093 | n/a | 12777 | 13119 |
|
||||
| 4096 | 4750 | 16596 | 11672 | 15817 | | 4157 | 19662 | 14316 | 14336 |
|
||||
| 8192 | 3820 | 16227 | 11084 | 12555 | | 3691 | 18132 | 12102 | 13813 |
|
||||
| 9216 | 3864 | n/a | 10254 | 12870 | | 3586 | n/a | 12119 | 13994 |
|
||||
| 16384 | 3822 | 15123 | 10454 | 12822 | | 3613 | 16874 | 12370 | 13881 |
|
||||
| 32768 | 4175 | 14512 | 10662 | 11095 | | 3881 | 14702 | 11619 | 11524 |
|
||||
| 262144 | 3317 | 11429 | 6269 | 9517 | | 2810 | 11729 | 7757 | 10179 |
|
||||
| 1048576 | 2913 | 10551 | 4730 | 5867 | | 2661 | 7881 | 3520 | 5350 |
|
||||
|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
|
||||
|
||||
|
||||
Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600
|
||||
|
||||
Built with:
|
||||
gcc -o test_pffft -DHAVE_FFTW -msse2 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L$HOME/local/lib -I$HOME/local/include/ -lfftw3f -lm
|
||||
|
||||
| N (input length) | real FFTPack | real FFTW | real PFFFT | | cplx FFTPack | cplx FFTW | cplx PFFFT |
|
||||
|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
|
||||
| 64 | 3840 | 7680 | 8777 | | 4389 | 20480 | 11171 |
|
||||
| 96 | 4214 | 9633 | 8429 | | 4816 | 22477 | 11238 |
|
||||
| 128 | 3584 | 10240 | 10240 | | 5120 | 23893 | 11947 |
|
||||
| 192 | 4854 | 11095 | 12945 | | 4854 | 22191 | 14121 |
|
||||
| 256 | 4096 | 11703 | 16384 | | 5120 | 23406 | 13653 |
|
||||
| 384 | 4395 | 14651 | 12558 | | 4884 | 19535 | 14651 |
|
||||
| 512 | 5760 | 13166 | 15360 | | 4608 | 23040 | 15360 |
|
||||
| 768 | 4907 | 14020 | 16357 | | 4461 | 19628 | 14020 |
|
||||
| 1024 | 5120 | 14629 | 14629 | | 5120 | 20480 | 15754 |
|
||||
| 2048 | 5632 | 14080 | 18773 | | 4693 | 12516 | 16091 |
|
||||
| 4096 | 5120 | 13653 | 17554 | | 4726 | 7680 | 14456 |
|
||||
| 8192 | 4160 | 7396 | 13312 | | 4437 | 14791 | 13312 |
|
||||
| 9216 | 4210 | 6124 | 13473 | | 4491 | 7282 | 14970 |
|
||||
| 16384 | 3976 | 11010 | 14313 | | 4210 | 11450 | 13631 |
|
||||
| 32768 | 4260 | 10224 | 10954 | | 4260 | 6816 | 11797 |
|
||||
| 262144 | 3736 | 6896 | 9961 | | 2359 | 8965 | 9437 |
|
||||
| 1048576 | 2796 | 4534 | 6453 | | 1864 | 3078 | 5592 |
|
||||
|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
|
||||
|
||||
|
||||
|
||||
MacOS Snow Leopard, gcc 4.0, 32-bit, fftw 3.3 on a 1.83 GHz core 1 duo
|
||||
|
||||
Built with:
|
||||
|
||||
gcc -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework veclib
|
||||
|
||||
| input len |real FFTPack| real vDSP | real FFTW | real PFFFT | |cplx FFTPack| cplx vDSP | cplx FFTW | cplx PFFFT |
|
||||
|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
|
||||
| 64 | 745 | 2145 | 1706 | 2028 | | 961 | 3356 | 3313 | 2300 |
|
||||
| 96 | 877 | n/a | 1976 | 1978 | | 1059 | n/a | 3333 | 2233 |
|
||||
| 128 | 951 | 2808 | 2213 | 2279 | | 1202 | 3803 | 3739 | 2494 |
|
||||
| 192 | 1002 | n/a | 2456 | 2429 | | 1186 | n/a | 3701 | 2508 |
|
||||
| 256 | 1065 | 3205 | 2641 | 2793 | | 1302 | 4013 | 3912 | 2663 |
|
||||
| 384 | 845 | n/a | 2759 | 2499 | | 948 | n/a | 3729 | 2504 |
|
||||
| 512 | 900 | 3476 | 2956 | 2759 | | 974 | 4057 | 3954 | 2645 |
|
||||
| 768 | 910 | n/a | 2912 | 2737 | | 975 | n/a | 3837 | 2614 |
|
||||
| 1024 | 936 | 3583 | 3107 | 3009 | | 1006 | 4124 | 3821 | 2697 |
|
||||
| 2048 | 1057 | 3585 | 3091 | 2837 | | 1089 | 3889 | 3701 | 2513 |
|
||||
| 4096 | 1083 | 3524 | 3092 | 2733 | | 1039 | 3617 | 3462 | 2364 |
|
||||
| 8192 | 874 | 3252 | 2967 | 2363 | | 911 | 3106 | 2789 | 2302 |
|
||||
| 9216 | 898 | n/a | 2420 | 2290 | | 865 | n/a | 2676 | 2204 |
|
||||
| 16384 | 903 | 2892 | 2506 | 2421 | | 899 | 3026 | 2797 | 2289 |
|
||||
| 32768 | 965 | 2837 | 2550 | 2358 | | 920 | 2922 | 2763 | 2240 |
|
||||
| 262144 | 738 | 2422 | 1589 | 1708 | | 610 | 2038 | 1436 | 1091 |
|
||||
| 1048576 | 528 | 1207 | 845 | 880 | | 606 | 1020 | 669 | 1036 |
|
||||
|-----------+------------+------------+------------+------------| |------------+------------+------------+------------|
|
||||
|
||||
|
||||
|
||||
Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.2 on a 2.66 core 2 quad
|
||||
|
||||
Built with:
|
||||
gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
|
||||
|
||||
| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT |
|
||||
|-----------+------------+------------+------------| |------------+------------+------------|
|
||||
| 64 | 1920 | 3614 | 5120 | | 2194 | 7680 | 6467 |
|
||||
| 96 | 1873 | 3549 | 5187 | | 2107 | 8429 | 5863 |
|
||||
| 128 | 2240 | 3773 | 5514 | | 2560 | 7964 | 6827 |
|
||||
| 192 | 1765 | 4569 | 7767 | | 2284 | 9137 | 7061 |
|
||||
| 256 | 2048 | 5461 | 7447 | | 2731 | 9638 | 7802 |
|
||||
| 384 | 1998 | 5861 | 6762 | | 2313 | 9253 | 7644 |
|
||||
| 512 | 2095 | 6144 | 7680 | | 2194 | 10240 | 7089 |
|
||||
| 768 | 2230 | 5773 | 7549 | | 2045 | 10331 | 7010 |
|
||||
| 1024 | 2133 | 6400 | 8533 | | 2133 | 10779 | 7877 |
|
||||
| 2048 | 2011 | 7040 | 8665 | | 1942 | 10240 | 7768 |
|
||||
| 4096 | 2194 | 6827 | 8777 | | 1755 | 9452 | 6827 |
|
||||
| 8192 | 1849 | 6656 | 6656 | | 1752 | 7831 | 6827 |
|
||||
| 9216 | 1871 | 5858 | 6416 | | 1643 | 6909 | 6266 |
|
||||
| 16384 | 1883 | 6223 | 6506 | | 1664 | 7340 | 6982 |
|
||||
| 32768 | 1826 | 6390 | 6667 | | 1631 | 7481 | 6971 |
|
||||
| 262144 | 1546 | 4075 | 5977 | | 1299 | 3415 | 3551 |
|
||||
| 1048576 | 1104 | 2071 | 1730 | | 1104 | 1149 | 1834 |
|
||||
|-----------+------------+------------+------------| |------------+------------+------------|
|
||||
|
||||
|
||||
|
||||
Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.3 on a 1.6 GHz Atom N270
|
||||
|
||||
Built with:
|
||||
gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
|
||||
|
||||
| N (input length) | real FFTPack | real FFTW | real PFFFT | | cplx FFTPack | cplx FFTW | cplx PFFFT |
|
||||
|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
|
||||
| 64 | 452 | 1041 | 1336 | | 549 | 2318 | 1781 |
|
||||
| 96 | 444 | 1297 | 1297 | | 503 | 2408 | 1686 |
|
||||
| 128 | 527 | 1525 | 1707 | | 543 | 2655 | 1886 |
|
||||
| 192 | 498 | 1653 | 1849 | | 539 | 2678 | 1942 |
|
||||
| 256 | 585 | 1862 | 2156 | | 594 | 2777 | 2244 |
|
||||
| 384 | 499 | 1870 | 1998 | | 511 | 2586 | 1890 |
|
||||
| 512 | 562 | 2095 | 2194 | | 542 | 2973 | 2194 |
|
||||
| 768 | 545 | 2045 | 2133 | | 545 | 2365 | 2133 |
|
||||
| 1024 | 595 | 2133 | 2438 | | 569 | 2695 | 2179 |
|
||||
| 2048 | 587 | 2125 | 2347 | | 521 | 2230 | 1707 |
|
||||
| 4096 | 495 | 1890 | 1834 | | 492 | 1876 | 1672 |
|
||||
| 8192 | 469 | 1548 | 1729 | | 438 | 1740 | 1664 |
|
||||
| 9216 | 468 | 1663 | 1663 | | 446 | 1585 | 1531 |
|
||||
| 16384 | 453 | 1608 | 1767 | | 398 | 1476 | 1664 |
|
||||
| 32768 | 456 | 1420 | 1503 | | 387 | 1388 | 1345 |
|
||||
| 262144 | 309 | 385 | 726 | | 262 | 415 | 840 |
|
||||
| 1048576 | 280 | 351 | 739 | | 261 | 313 | 797 |
|
||||
|------------------+--------------+--------------+--------------| |--------------+--------------+--------------|
|
||||
|
||||
|
||||
|
||||
Windows 7, visual c++ 2010 on a 1.6 GHz Atom N270
|
||||
|
||||
Built with:
|
||||
cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
|
||||
|
||||
(visual c++ is definitively not very good with SSE intrinsics...)
|
||||
|
||||
| N (input length) | real FFTPack | real PFFFT | | cplx FFTPack | cplx PFFFT |
|
||||
|------------------+--------------+--------------| |--------------+--------------|
|
||||
| 64 | 173 | 1009 | | 174 | 1159 |
|
||||
| 96 | 169 | 1029 | | 188 | 1201 |
|
||||
| 128 | 195 | 1242 | | 191 | 1275 |
|
||||
| 192 | 178 | 1312 | | 184 | 1276 |
|
||||
| 256 | 196 | 1591 | | 186 | 1281 |
|
||||
| 384 | 172 | 1409 | | 181 | 1281 |
|
||||
| 512 | 187 | 1640 | | 181 | 1313 |
|
||||
| 768 | 171 | 1614 | | 176 | 1258 |
|
||||
| 1024 | 186 | 1812 | | 178 | 1223 |
|
||||
| 2048 | 190 | 1707 | | 186 | 1099 |
|
||||
| 4096 | 182 | 1446 | | 177 | 975 |
|
||||
| 8192 | 175 | 1345 | | 169 | 1034 |
|
||||
| 9216 | 165 | 1271 | | 168 | 1023 |
|
||||
| 16384 | 166 | 1396 | | 165 | 949 |
|
||||
| 32768 | 172 | 1311 | | 161 | 881 |
|
||||
| 262144 | 136 | 632 | | 134 | 629 |
|
||||
| 1048576 | 134 | 698 | | 127 | 623 |
|
||||
|------------------+--------------+--------------| |--------------+--------------|
|
||||
|
||||
|
||||
|
||||
Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3)
|
||||
|
||||
Built with:
|
||||
gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
|
||||
|
||||
| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT |
|
||||
|-----------+------------+------------+------------| |------------+------------+------------|
|
||||
| 64 | 549 | 452 | 731 | | 512 | 602 | 640 |
|
||||
| 96 | 421 | 272 | 702 | | 496 | 571 | 602 |
|
||||
| 128 | 498 | 512 | 815 | | 597 | 618 | 652 |
|
||||
| 160 | 521 | 536 | 815 | | 586 | 669 | 625 |
|
||||
| 192 | 539 | 571 | 883 | | 485 | 597 | 626 |
|
||||
| 256 | 640 | 539 | 975 | | 569 | 611 | 671 |
|
||||
| 384 | 499 | 610 | 879 | | 499 | 602 | 637 |
|
||||
| 480 | 518 | 507 | 877 | | 496 | 661 | 616 |
|
||||
| 512 | 524 | 591 | 1002 | | 549 | 678 | 668 |
|
||||
| 640 | 542 | 612 | 955 | | 568 | 663 | 645 |
|
||||
| 768 | 557 | 613 | 981 | | 491 | 663 | 598 |
|
||||
| 800 | 514 | 353 | 882 | | 514 | 360 | 574 |
|
||||
| 1024 | 640 | 640 | 1067 | | 492 | 683 | 602 |
|
||||
| 2048 | 587 | 640 | 908 | | 486 | 640 | 552 |
|
||||
| 2400 | 479 | 368 | 777 | | 422 | 376 | 518 |
|
||||
| 4096 | 511 | 614 | 853 | | 426 | 640 | 534 |
|
||||
| 8192 | 415 | 584 | 708 | | 386 | 622 | 516 |
|
||||
| 9216 | 419 | 571 | 687 | | 364 | 586 | 506 |
|
||||
| 16384 | 426 | 577 | 716 | | 398 | 606 | 530 |
|
||||
| 32768 | 417 | 572 | 673 | | 399 | 572 | 468 |
|
||||
| 262144 | 219 | 380 | 293 | | 255 | 431 | 343 |
|
||||
| 1048576 | 202 | 274 | 237 | | 265 | 282 | 355 |
|
||||
|-----------+------------+------------+------------| |------------+------------+------------|
|
||||
|
||||
Same platform as above, but this time pffft and fftpack are built with clang 3.2:
|
||||
|
||||
clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f
|
||||
|
||||
| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT |
|
||||
|-----------+------------+------------+------------| |------------+------------+------------|
|
||||
| 64 | 427 | 452 | 853 | | 427 | 602 | 1024 |
|
||||
| 96 | 351 | 276 | 843 | | 337 | 571 | 963 |
|
||||
| 128 | 373 | 512 | 996 | | 390 | 618 | 1054 |
|
||||
| 160 | 426 | 536 | 987 | | 375 | 669 | 914 |
|
||||
| 192 | 404 | 571 | 1079 | | 388 | 588 | 1079 |
|
||||
| 256 | 465 | 539 | 1205 | | 445 | 602 | 1170 |
|
||||
| 384 | 366 | 610 | 1099 | | 343 | 594 | 1099 |
|
||||
| 480 | 356 | 507 | 1140 | | 335 | 651 | 931 |
|
||||
| 512 | 411 | 591 | 1213 | | 384 | 649 | 1124 |
|
||||
| 640 | 398 | 612 | 1193 | | 373 | 654 | 901 |
|
||||
| 768 | 409 | 613 | 1227 | | 383 | 663 | 1044 |
|
||||
| 800 | 411 | 348 | 1073 | | 353 | 358 | 809 |
|
||||
| 1024 | 427 | 640 | 1280 | | 413 | 692 | 1004 |
|
||||
| 2048 | 414 | 626 | 1126 | | 371 | 640 | 853 |
|
||||
| 2400 | 399 | 373 | 898 | | 319 | 368 | 653 |
|
||||
| 4096 | 404 | 602 | 1059 | | 357 | 633 | 778 |
|
||||
| 8192 | 332 | 584 | 792 | | 308 | 616 | 716 |
|
||||
| 9216 | 322 | 561 | 783 | | 299 | 586 | 687 |
|
||||
| 16384 | 344 | 568 | 778 | | 314 | 617 | 745 |
|
||||
| 32768 | 342 | 564 | 737 | | 314 | 552 | 629 |
|
||||
| 262144 | 201 | 383 | 313 | | 227 | 435 | 413 |
|
||||
| 1048576 | 187 | 262 | 251 | | 228 | 281 | 409 |
|
||||
|-----------+------------+------------+------------| |------------+------------+------------|
|
||||
|
||||
So it looks like, on ARM, gcc 4.7 is the best at scalar floating point
|
||||
(the fftpack performance numbers are better with gcc), while clang is
|
||||
the best with neon intrinsics (see how pffft perf has improved with
|
||||
clang 3.2).
|
||||
|
||||
|
||||
NVIDIA Jetson TK1 board, gcc-4.8.2. The cpu is a 2.3GHz cortex A15 (Tegra K1).
|
||||
|
||||
Built with:
|
||||
gcc -O3 -march=armv7-a -mtune=native -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm
|
||||
|
||||
| input len |real FFTPack| real PFFFT | |cplx FFTPack| cplx PFFFT |
|
||||
|-----------+------------+------------| |------------+------------|
|
||||
| 64 | 1735 | 3308 | | 1994 | 3744 |
|
||||
| 96 | 1596 | 3448 | | 1987 | 3572 |
|
||||
| 128 | 1807 | 4076 | | 2255 | 3960 |
|
||||
| 160 | 1769 | 4083 | | 2071 | 3845 |
|
||||
| 192 | 1990 | 4233 | | 2017 | 3939 |
|
||||
| 256 | 2191 | 4882 | | 2254 | 4346 |
|
||||
| 384 | 1878 | 4492 | | 2073 | 4012 |
|
||||
| 480 | 1748 | 4398 | | 1923 | 3951 |
|
||||
| 512 | 2030 | 5064 | | 2267 | 4195 |
|
||||
| 640 | 1918 | 4756 | | 2094 | 4184 |
|
||||
| 768 | 2099 | 4907 | | 2048 | 4297 |
|
||||
| 800 | 1822 | 4555 | | 1880 | 4063 |
|
||||
| 1024 | 2232 | 5355 | | 2187 | 4420 |
|
||||
| 2048 | 2176 | 4983 | | 2027 | 3602 |
|
||||
| 2400 | 1741 | 4256 | | 1710 | 3344 |
|
||||
| 4096 | 1816 | 3914 | | 1851 | 3349 |
|
||||
| 8192 | 1716 | 3481 | | 1700 | 3255 |
|
||||
| 9216 | 1735 | 3589 | | 1653 | 3094 |
|
||||
| 16384 | 1567 | 3483 | | 1637 | 3244 |
|
||||
| 32768 | 1624 | 3240 | | 1655 | 3156 |
|
||||
| 262144 | 1012 | 1898 | | 983 | 1503 |
|
||||
| 1048576 | 876 | 1154 | | 868 | 1341 |
|
||||
|-----------+------------+------------| |------------+------------|
|
||||
|
||||
The performance on the tegra K1 is pretty impressive. I'm not
|
||||
including the FFTW numbers as they are slightly below the scalar
|
||||
fftpack numbers, so something must be wrong (however it seems to be
|
||||
correctly configured and is using neon simd instructions).
|
||||
|
||||
When using clang 3.4 the pffft version is even a bit faster, reaching
|
||||
5.7 GFlops for real ffts of size 1024.
|
||||
1866
libs/staffpad/pffft/pffft.c
Normal file
File diff suppressed because it is too large
215
libs/staffpad/pffft/pffft.h
Normal file
|
|
@ -0,0 +1,215 @@
|
|||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
|
||||
authored by Dr Paul Swarztrauber of NCAR, in 1985.
|
||||
|
||||
As confirmed by the NCAR fftpack software curators, the following
|
||||
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
|
||||
released under the same terms.
|
||||
|
||||
FFTPACK license:
|
||||
|
||||
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
|
||||
|
||||
Copyright (c) 2004 the University Corporation for Atmospheric
|
||||
Research ("UCAR"). All rights reserved. Developed by NCAR's
|
||||
Computational and Information Systems Laboratory, UCAR,
|
||||
www.cisl.ucar.edu.
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
PFFFT : a Pretty Fast FFT.
|
||||
|
||||
This is basically an adaptation of the single precision fftpack
|
||||
(v4) as found on netlib taking advantage of SIMD instruction found
|
||||
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
|
||||
|
||||
For architectures where no SIMD instruction is available, the code
|
||||
falls back to a scalar version.
|
||||
|
||||
Restrictions:
|
||||
|
||||
- 1D transforms only, with 32-bit single precision.
|
||||
|
||||
- supports only transforms for inputs of length N of the form
|
||||
N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
|
||||
144, 160, etc are all acceptable lengths). Performance is best for
|
||||
128<=N<=8192.
|
||||
|
||||
- all (float*) pointers in the functions below are expected to
|
||||
have a "simd-compatible" alignment, that is 16 bytes on x86 and
|
||||
powerpc CPUs.
|
||||
|
||||
You can allocate such buffers with the functions
|
||||
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
|
||||
posix_memalign..)
|
||||
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stddef.h> // for size_t
|
||||
|
||||
#ifndef PFFFT_FLOAT
|
||||
#if 1
|
||||
/* default: float */
|
||||
#define PFFFT_FLOAT float
|
||||
#else
|
||||
#define PFFFT_FLOAT double
|
||||
#ifndef PFFFT_SIMD_DISABLE
|
||||
/* double only with PFFFT_SIMD_DISABLE */
|
||||
#define PFFFT_SIMD_DISABLE 1
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* opaque struct holding internal stuff (precomputed twiddle factors)
|
||||
this struct can be shared by many threads as it contains only
|
||||
read-only data.
|
||||
*/
|
||||
typedef struct PFFFT_Setup PFFFT_Setup;
|
||||
|
||||
/* direction of the transform */
|
||||
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
|
||||
|
||||
/* type of transform */
|
||||
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
|
||||
|
||||
/*
|
||||
prepare for performing transforms of size N -- the returned
|
||||
PFFFT_Setup structure is read-only so it can safely be shared by
|
||||
multiple concurrent threads.
|
||||
*/
|
||||
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
|
||||
void pffft_destroy_setup(PFFFT_Setup *);
|
||||
/*
|
||||
Perform a Fourier transform. The z-domain data is stored in the
|
||||
most efficient order for transforming it back, or using it for
|
||||
convolution. If you need to have its content sorted in the
|
||||
"usual" way, that is as an array of interleaved complex numbers,
|
||||
either use pffft_transform_ordered , or call pffft_zreorder after
|
||||
the forward fft, and before the backward fft.
|
||||
|
||||
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
|
||||
Typically you will want to scale the backward transform by 1/N.
|
||||
|
||||
The 'work' pointer should point to an area of N (2*N for complex
|
||||
fft) floats, properly aligned. If 'work' is NULL, then stack will
|
||||
be used instead (this is probably the best strategy for small
|
||||
FFTs, say for N < 16384). Threads usually have a small stack, so
there may not be a sufficient amount of memory, usually leading to a crash!
|
||||
Use the heap with pffft_aligned_malloc() in this case.
|
||||
|
||||
input and output may alias.
|
||||
*/
|
||||
void pffft_transform(PFFFT_Setup *setup, const PFFFT_FLOAT *input, PFFFT_FLOAT *output, PFFFT_FLOAT *work, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
Similar to pffft_transform, but makes sure that the output is
|
||||
ordered as expected (interleaved complex numbers). This is
|
||||
similar to calling pffft_transform and then pffft_zreorder.
|
||||
|
||||
input and output may alias.
|
||||
*/
|
||||
void pffft_transform_ordered(PFFFT_Setup *setup, const PFFFT_FLOAT *input, PFFFT_FLOAT *output, PFFFT_FLOAT *work, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
|
||||
PFFFT_FORWARD) if you want to have the frequency components in
|
||||
the correct "canonical" order, as interleaved complex numbers.
|
||||
|
||||
(for real transforms, both 0-frequency and half frequency
|
||||
components, which are real, are assembled in the first entry as
|
||||
F(0)+i*F(n/2+1). Note that the original fftpack did place
|
||||
F(n/2+1) at the end of the arrays).
|
||||
|
||||
input and output should not alias.
|
||||
*/
|
||||
void pffft_zreorder(PFFFT_Setup *setup, const PFFFT_FLOAT *input, PFFFT_FLOAT *output, pffft_direction_t direction);
|
||||
|
||||
/*
|
||||
Perform a multiplication of the frequency components of dft_a and
|
||||
dft_b and accumulate them into dft_ab. The arrays should have
|
||||
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||
*not* have been reordered with pffft_zreorder (otherwise just
|
||||
perform the operation yourself as the dft coefs are stored as
|
||||
interleaved complex numbers).
|
||||
|
||||
the operation performed is: dft_ab += (dft_a * dft_b)*scaling
|
||||
|
||||
The dft_a, dft_b and dft_ab pointers may alias.
|
||||
*/
|
||||
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const PFFFT_FLOAT *dft_a, const PFFFT_FLOAT *dft_b, PFFFT_FLOAT *dft_ab, PFFFT_FLOAT scaling);
|
||||
|
||||
/*
|
||||
Perform a multiplication of the frequency components of dft_a and
|
||||
dft_b and put result in dft_ab. The arrays should have
|
||||
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
|
||||
*not* have been reordered with pffft_zreorder (otherwise just
|
||||
perform the operation yourself as the dft coefs are stored as
|
||||
interleaved complex numbers).
|
||||
|
||||
the operation performed is: dft_ab = (dft_a * dft_b)*scaling
|
||||
|
||||
The dft_a, dft_b and dft_ab pointers may alias.
|
||||
*/
|
||||
void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
|
||||
|
||||
/* simple helper to get minimum possible fft size */
|
||||
int pffft_min_fft_size(pffft_transform_t transform);
|
||||
|
||||
/* simple helper to determine next power of 2
|
||||
- without inexact/rounding floating point operations
|
||||
*/
|
||||
int pffft_next_power_of_two(int N);
|
||||
|
||||
/* simple helper to determine if power of 2 - returns bool */
|
||||
int pffft_is_power_of_two(int N);
|
||||
|
||||
/*
|
||||
the float buffers must have the correct alignment (16-byte boundary
|
||||
on intel and powerpc). This function may be used to obtain such
|
||||
correctly aligned buffers.
|
||||
*/
|
||||
void *pffft_aligned_malloc(size_t nb_bytes);
|
||||
void pffft_aligned_free(void *);
|
||||
|
||||
/* returns 4 or 1 depending on whether SSE/Altivec support was enabled when building pffft.c */
|
||||
int pffft_simd_size(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
246
libs/staffpad/pffft/pfsimd_macros.h
Normal file
|
|
@ -0,0 +1,246 @@
|
|||
|
||||
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
|
||||
|
||||
Redistribution and use of the Software in source and binary forms,
|
||||
with or without modification, is permitted provided that the
|
||||
following conditions are met:
|
||||
|
||||
- Neither the names of NCAR's Computational and Information Systems
|
||||
Laboratory, the University Corporation for Atmospheric Research,
|
||||
nor the names of its sponsors or contributors may be used to
|
||||
endorse or promote products derived from this Software without
|
||||
specific prior written permission.
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notices, this list of conditions, and the disclaimer below.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer below in the
|
||||
documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
/*
|
||||
* SIMD reference material:
|
||||
*
|
||||
* general SIMD introduction:
|
||||
* https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
|
||||
*
|
||||
* SSE 1:
|
||||
* https://software.intel.com/sites/landingpage/IntrinsicsGuide/
|
||||
*
|
||||
* ARM NEON:
|
||||
* https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
|
||||
*
|
||||
* Altivec:
|
||||
* https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
|
||||
* https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
|
||||
* better one?
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
Altivec support macros
|
||||
*/
|
||||
#if !defined(PFFFT_SIMD_DISABLE) && ((defined(__ppc__) || defined(__ppc64__)) && defined(__ALTIVEC__))
|
||||
#include <altivec.h>
|
||||
typedef vector float v4sf;
|
||||
# define SIMD_SZ 4
|
||||
# define VREQUIRES_ALIGN 1 /* not sure, if really required */
|
||||
# define VZERO() ((vector float) vec_splat_u8(0))
|
||||
# define VMUL(a,b) vec_madd(a,b, VZERO())
|
||||
# define VADD(a,b) vec_add(a,b)
|
||||
# define VMADD(a,b,c) vec_madd(a,b,c)
|
||||
# define VSUB(a,b) vec_sub(a,b)
|
||||
inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
|
||||
# define LD_PS1(p) ld_ps1(&p)
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
|
||||
vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
|
||||
vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
|
||||
v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
|
||||
}
|
||||
# define VTRANSPOSE4(x0,x1,x2,x3) { \
|
||||
v4sf y0 = vec_mergeh(x0, x2); \
|
||||
v4sf y1 = vec_mergel(x0, x2); \
|
||||
v4sf y2 = vec_mergeh(x1, x3); \
|
||||
v4sf y3 = vec_mergel(x1, x3); \
|
||||
x0 = vec_mergeh(y0, y2); \
|
||||
x1 = vec_mergel(y0, y2); \
|
||||
x2 = vec_mergeh(y1, y3); \
|
||||
x3 = vec_mergel(y1, y3); \
|
||||
}
|
||||
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
|
||||
|
||||
/*
|
||||
SSE1 support macros
|
||||
*/
|
||||
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || defined(_M_IX86))
|
||||
|
||||
#include <xmmintrin.h>
|
||||
typedef __m128 v4sf;
|
||||
/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
|
||||
* anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
|
||||
# define SIMD_SZ 4
|
||||
# define VREQUIRES_ALIGN 1
|
||||
# define VZERO() _mm_setzero_ps()
|
||||
# define VMUL(a,b) _mm_mul_ps(a,b)
|
||||
# define VADD(a,b) _mm_add_ps(a,b)
|
||||
# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
|
||||
# define VSUB(a,b) _mm_sub_ps(a,b)
|
||||
# define LD_PS1(p) _mm_set1_ps(p)
|
||||
# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr)
|
||||
# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr)
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
|
||||
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
|
||||
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
|
||||
/* reverse/flip all floats */
|
||||
# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
|
||||
/* reverse/flip complex floats */
|
||||
# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
|
||||
|
||||
/*
|
||||
ARM NEON support macros
|
||||
*/
|
||||
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64))
|
||||
# include <arm_neon.h>
|
||||
typedef float32x4_t v4sf;
|
||||
# define SIMD_SZ 4
|
||||
# define VREQUIRES_ALIGN 0 /* usually no alignment required */
|
||||
# define VZERO() vdupq_n_f32(0)
|
||||
# define VMUL(a,b) vmulq_f32(a,b)
|
||||
# define VADD(a,b) vaddq_f32(a,b)
|
||||
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
|
||||
# define VSUB(a,b) vsubq_f32(a,b)
|
||||
# define LD_PS1(p) vld1q_dup_f32(&(p))
|
||||
# define VLOAD_UNALIGNED(ptr) (*(ptr))
|
||||
# define VLOAD_ALIGNED(ptr) (*(ptr))
|
||||
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
|
||||
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
|
||||
# define VTRANSPOSE4(x0,x1,x2,x3) { \
|
||||
float32x4x2_t t0_ = vzipq_f32(x0, x2); \
|
||||
float32x4x2_t t1_ = vzipq_f32(x1, x3); \
|
||||
float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
|
||||
float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
|
||||
x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
|
||||
}
|
||||
// marginally faster version
|
||||
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
|
||||
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
|
||||
#else
|
||||
# if !defined(PFFFT_SIMD_DISABLE)
|
||||
# warning "building with simd disabled !\n";
|
||||
# define PFFFT_SIMD_DISABLE // fallback to scalar code
|
||||
# endif
|
||||
#endif
|
||||
|
||||
// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
|
||||
#ifdef PFFFT_SIMD_DISABLE
|
||||
typedef float v4sf;
|
||||
# define SIMD_SZ 1
|
||||
# define VREQUIRES_ALIGN 0
|
||||
# define VZERO() 0.f
|
||||
# define VMUL(a,b) ((a)*(b))
|
||||
# define VADD(a,b) ((a)+(b))
|
||||
# define VMADD(a,b,c) ((a)*(b)+(c))
|
||||
# define VSUB(a,b) ((a)-(b))
|
||||
# define LD_PS1(p) (p)
|
||||
# define VLOAD_UNALIGNED(ptr) (*(ptr))
|
||||
# define VLOAD_ALIGNED(ptr) (*(ptr))
|
||||
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
|
||||
#endif
|
||||
|
||||
// shortcuts for complex multiplications
|
||||
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
|
||||
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
|
||||
#ifndef SVMUL
|
||||
// multiply a scalar with a vector
|
||||
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
|
||||
#endif
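// Hedged, standalone sketch (not part of this header): checking the complex-multiply shortcut
// in scalar mode, where v4sf is a plain float. <cstdlib>/<cstdio> are included first because
// the static helpers further down call malloc/free and are not otherwise self-sufficient.
#define PFFFT_SIMD_DISABLE
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include "pfsimd_macros.h"

int main()
{
  v4sf ar = 1.f, ai = 2.f, br = 3.f, bi = 4.f;
  VCPLXMUL(ar, ai, br, bi); // (1 + 2i) * (3 + 4i) = -5 + 10i, written back into ar/ai
  assert(ar == -5.f && ai == 10.f);
  std::printf("ok\n");
}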
|
||||
|
||||
typedef union v4sf_union {
|
||||
v4sf v;
|
||||
float f[SIMD_SZ];
|
||||
} v4sf_union;
|
||||
|
||||
#if !defined(PFFFT_SIMD_DISABLE)
|
||||
|
||||
#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
|
||||
|
||||
/* detect bugs with the vector support macros */
|
||||
static void Vvalidate_simd() {
|
||||
float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
|
||||
v4sf_union a0, a1, a2, a3, t, u;
|
||||
memcpy(a0.f, f, 4*sizeof(float));
|
||||
memcpy(a1.f, f+4, 4*sizeof(float));
|
||||
memcpy(a2.f, f+8, 4*sizeof(float));
|
||||
memcpy(a3.f, f+12, 4*sizeof(float));
|
||||
|
||||
t = a0; u = a1; t.v = VZERO();
|
||||
printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
|
||||
t.v = VADD(a1.v, a2.v);
|
||||
printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
|
||||
t.v = VMUL(a1.v, a2.v);
|
||||
printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
|
||||
t.v = VMADD(a1.v, a2.v,a0.v);
|
||||
printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
|
||||
|
||||
INTERLEAVE2(a1.v,a2.v,t.v,u.v);
|
||||
printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
|
||||
assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
|
||||
UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
|
||||
printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
|
||||
assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
|
||||
|
||||
t.v=LD_PS1(f[15]);
|
||||
printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
|
||||
assertv4(t, 15, 15, 15, 15);
|
||||
t.v = VSWAPHL(a1.v, a2.v);
|
||||
printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
|
||||
assertv4(t, 8, 9, 6, 7);
|
||||
VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
|
||||
printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
|
||||
a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
|
||||
a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
|
||||
assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
|
||||
}
|
||||
#endif //!PFFFT_SIMD_DISABLE
|
||||
|
||||
/* SSE and co like 16-bytes aligned pointers */
|
||||
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
|
||||
|
||||
static
|
||||
void *Valigned_malloc(size_t nb_bytes) {
|
||||
void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
|
||||
if (!p0) return (void *) 0;
|
||||
p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
|
||||
*((void **) p - 1) = p0;
|
||||
return p;
|
||||
}
|
||||
|
||||
static
|
||||
void Valigned_free(void *p) {
|
||||
if (p) free(*((void **) p - 1));
|
||||
}
|
||||
117
libs/staffpad/staffpad/TimeAndPitch.h
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
//
|
||||
// Arbitrary time and pitch stretching, based on the Phase-locked Vocoder.
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#include <complex>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace staffpad {
|
||||
class TimeAndPitch
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* `magnitude` may be null, in which case the callback should re-compute it.
|
||||
*/
|
||||
using ShiftTimbreCb = std::function<void (
|
||||
double factor, std::complex<float>* spectrum, const float* magnitude)>;
|
||||
|
||||
TimeAndPitch(
|
||||
int fftSize, bool reduceImaging = true, ShiftTimbreCb shiftTimbreCb = {});
|
||||
~TimeAndPitch();
|
||||
/**
|
||||
Setup at least once before processing.
|
||||
\param numChannels Must be 1 or 2
|
||||
\param maxBlockSize The caller's maximum block size, e.g. 1024 samples
|
||||
*/
|
||||
void setup(int numChannels, int maxBlockSize);
|
||||
|
||||
/**
|
||||
Set independent time stretch and pitch factors (synchronously to processing thread).
|
||||
The factor change resolution follows the processing block size, with some approximation.
|
||||
As a general rule, smaller block sizes result in finer changes, although there's a lower
|
||||
bound due to internal windowed segmentation and OLA resynthesis.
|
||||
*/
|
||||
void setTimeStretchAndPitchFactor(double timeStretch, double pitchFactor);
|
||||
|
||||
/**
|
||||
Return the ~number of samples until the next hop gets analysed (might be 1
|
||||
or 2 too high). For extremely small stretch ratios, several analyses might
|
||||
be needed to produce samples, so you may have to repeat the process.
|
||||
Eventually, more output samples are produced.
|
||||
*/
|
||||
int getSamplesToNextHop() const;
|
||||
|
||||
/**
|
||||
Tells the number of already processed output samples (after time-stretching) that
|
||||
are ready for consumption.
|
||||
*/
|
||||
int getNumAvailableOutputSamples() const;
|
||||
|
||||
/**
|
||||
Feed more input samples. \see `getSamplesToNextHop()` to know how much to feed to
|
||||
produce more output samples.
|
||||
*/
|
||||
void feedAudio(const float* const* in_smp, int numSamples);
|
||||
|
||||
/**
|
||||
Retrieve shifted/stretched samples into the output. \see `getNumAvailableOutputSamples()`
to query a valid amount for `numSamples`.
|
||||
*/
|
||||
void retrieveAudio(float* const* out_smp, int numSamples);
|
||||
|
||||
/**
|
||||
Pitch-shift only in-place processing.
|
||||
*/
|
||||
void processPitchShift(float* const* smp, int numSamples, double pitchFactor);
|
||||
|
||||
/**
|
||||
Latency in input samples
|
||||
*/
|
||||
int getLatencySamples() const;
|
||||
|
||||
/**
|
||||
Latency in output samples, calculated on the given timeStretch factor
|
||||
*/
|
||||
int getLatencySamplesForStretchRatio(float timeStretch) const;
|
||||
|
||||
/**
|
||||
Resets the internal state, discards any buffered input
|
||||
*/
|
||||
void reset();
|
||||
|
||||
private:
|
||||
const int fftSize;
|
||||
static constexpr int overlap = 4;
|
||||
static constexpr bool normalize_window = true; // compensate for ola window overlaps
|
||||
static constexpr bool modulate_synthesis_hop = true;
|
||||
|
||||
void _process_hop(int hop_a, int hop_s);
|
||||
template<int num_channels>
|
||||
void _time_stretch(float hop_a, float hop_s);
|
||||
void _applyImagingReduction();
|
||||
|
||||
struct impl;
|
||||
std::shared_ptr<impl> d;
|
||||
|
||||
const bool _reduceImaging;
|
||||
const ShiftTimbreCb _shiftTimbreCb;
|
||||
|
||||
int _numChannels = 1;
|
||||
int _maxBlockSize = 1024;
|
||||
double _resampleReadPos = 0.0;
|
||||
int _availableOutputSamples = 0;
|
||||
int _numBins = fftSize / 2 + 1;
|
||||
double _overlap_a = overlap;
|
||||
|
||||
int _analysis_hop_counter = 0;
|
||||
|
||||
double _expectedPhaseChangePerBinPerSample = 0.01;
|
||||
double _timeStretch = 1.0;
|
||||
double _pitchFactor = 1.0;
|
||||
|
||||
int _outBufferWriteOffset = 0;
|
||||
};
|
||||
} // namespace staffpad
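// Hedged usage sketch, not part of this header: one feed/drain step for a mono stream.
// Assumes tp.setup(1, blockSize) and tp.setTimeStretchAndPitchFactor(...) were called, and
// that numIn does not exceed blockSize; the names below are illustrative.
#include <algorithm>
#include <vector>

inline void stretch_block(staffpad::TimeAndPitch& tp, const float* in, int numIn,
                          int blockSize, std::vector<float>& out)
{
  const float* inPtrs[1] = { in };
  tp.feedAudio(inPtrs, numIn);

  out.clear();
  std::vector<float> chunk(blockSize);
  // drain whatever is ready, in chunks no larger than the block size given to setup()
  while (int avail = std::min(tp.getNumAvailableOutputSamples(), blockSize)) {
    float* outPtrs[1] = { chunk.data() };
    tp.retrieveAudio(outPtrs, avail);
    out.insert(out.end(), chunk.begin(), chunk.begin() + avail);
  }
}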
|
||||
37
libs/staffpad/wscript
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# Version of this package (even if built as a child)
|
||||
MAJOR = '0'
|
||||
MINOR = '0'
|
||||
MICRO = '0'
|
||||
STAFFPAD_VERSION = "%s.%s.%s" % (MAJOR, MINOR, MICRO)
|
||||
|
||||
# Library version (UNIX style major, minor, micro)
|
||||
# major increment <=> incompatible changes
|
||||
# minor increment <=> compatible changes (additions)
|
||||
# micro increment <=> no interface changes
|
||||
STAFFPAD_LIB_VERSION = '0.0.0'
|
||||
|
||||
I18N_PACKAGE = 'staffpad'
|
||||
|
||||
staffpad_sources = [
|
||||
'TimeAndPitch.cpp',
|
||||
'FourierTransform_pffft.cpp',
|
||||
'pffft/pffft.c'
|
||||
]
|
||||
|
||||
def options(opt):
|
||||
pass
|
||||
|
||||
def configure(conf):
|
||||
pass
|
||||
|
||||
def build(bld):
|
||||
obj = bld.stlib(features = 'cxx cxxstlib', source = staffpad_sources)
|
||||
obj.cxxflags = [ bld.env['compiler_flags_dict']['pic'], '-O3', '-ffast-math' ]
|
||||
obj.export_includes = ['.']
|
||||
obj.includes = ['.', 'pffft']
|
||||
obj.name = 'staffpad'
|
||||
obj.target = 'staffpad'
|
||||
obj.vnum = STAFFPAD_LIB_VERSION
|
||||
obj.defines = [ 'PACKAGE="' + I18N_PACKAGE + '"' ]
|
||||
1
wscript
|
|
@ -332,6 +332,7 @@ children = [
|
|||
'libs/clearlooks-newer',
|
||||
'libs/zita-resampler',
|
||||
'libs/zita-convolver',
|
||||
'libs/staffpad',
|
||||
# optionally external libraries
|
||||
'libs/fluidsynth',
|
||||
'libs/hidapi',