diff --git a/libs/staffpad/CircularSampleBuffer.h b/libs/staffpad/CircularSampleBuffer.h new file mode 100644 index 0000000000..4b67c7319d --- /dev/null +++ b/libs/staffpad/CircularSampleBuffer.h @@ -0,0 +1,159 @@ +#pragma once + +#include + +#include "VectorOps.h" + +/* + * Utility buffer class for delay-based effects + * Allocates a 2^n size buffer for branchless write and read using a bitmask + * It can read and write to past and future samples, but does not check for overlaps + */ + +namespace staffpad::audio { +template +class CircularSampleBuffer +{ +public: + CircularSampleBuffer() + { + } + + ~CircularSampleBuffer() + { + if (_buffer) { + free(_buffer); + } + } + + void setSize(int n) + { + if (n > _allocatedSize) { + auto oldSize = _allocatedSize; + + auto findLargerPowerOfTwo = [](int32_t number) { + int32_t powerOf2 = 1; + while (powerOf2 < number) { + powerOf2 *= 2; + } + return powerOf2; + }; + + _allocatedSize = findLargerPowerOfTwo(n); + _bufferSizeMask = _allocatedSize - 1; + _buffer = (SampleT*)std::realloc(_buffer, _allocatedSize * sizeof(SampleT)); + + // Reset the new memory region + assert(_buffer); + std::fill(_buffer + oldSize, _buffer + _allocatedSize, 0.f); + } + } + + int getAllocatedSize() const + { + return _allocatedSize; + } + + void reset() + { + if (_buffer && _allocatedSize > 0) { + memset(_buffer, 0, sizeof(SampleT) * _allocatedSize); + } + _position0 = 0; + } + + void write(int offset, const SampleT& sample) + { + _buffer[(_position0 + offset) & _bufferSizeMask] = sample; + } + + void writeOffset0(const SampleT& sample) + { + _buffer[_position0] = sample; + } + + const SampleT& read(int offset) const + { + return _buffer[(_position0 + offset) & _bufferSizeMask]; + } + + /// change the 0 position by n + void advance(int n) + { + _position0 += n; + _position0 &= _bufferSizeMask; + } + +private: + template + void _splitBlockOffsetFunction(int startOffset, int n, fnc f) const + { + assert(n <= _allocatedSize); + int firstIndex = (_position0 + startOffset) & _bufferSizeMask; + int n_to_end = _allocatedSize - firstIndex; + if (n_to_end > n) { + f(firstIndex, 0, n); + } else { + f(firstIndex, 0, n_to_end); + f(0, n_to_end, n - n_to_end); + } + } + +public: + void writeBlock(int startOffset, int n, const SampleT* sourceBlock) + { + _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) { + vo::copy(&sourceBlock[sampleOff], &_buffer[bufferOff], n); + }); + } + + void readBlockWithGain(int startOffset, int n, SampleT* targetBlock, float gainFactor) const + { + _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) { + vo::constantMultiply(&_buffer[bufferOff], gainFactor, &targetBlock[sampleOff], n); + }); + } + + void readAddBlockWithGain(int startOffset, int n, SampleT* targetBlock, float gainFactor) const + { + _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) { + vo::constantMultiplyAndAdd(&_buffer[bufferOff], gainFactor, &targetBlock[sampleOff], n); + }); + } + + void writeAddBlockWithGain(int startOffset, int n, const SampleT* sourceBlock, float gainFactor) + { + _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) { + vo::constantMultiplyAndAdd(&sourceBlock[sampleOff], gainFactor, &_buffer[bufferOff], n); + }); + } + + void readBlock(int startOffset, int n, SampleT* targetBlock) const + { + _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) { + vo::copy(&_buffer[bufferOff], &targetBlock[sampleOff], n); + }); + 
} + + void readAndClearBlock(int startOffset, int n, SampleT* targetBlock) + { + _splitBlockOffsetFunction(startOffset, n, [=](int bufferOff, int sampleOff, int n) { + vo::copy(&_buffer[bufferOff], &targetBlock[sampleOff], n); + vo::setToZero(&_buffer[bufferOff], n); + }); + } + + void clearBlock(int startOffset, int n) + { + _splitBlockOffsetFunction(startOffset, n, + [=](int bufferOff, int sampleOff, int n) { vo::setToZero(&_buffer[bufferOff], n); }); + } + +private: + SampleT* _buffer = nullptr; + + int _position0 = 0; // position of sample index 0 inside _buffer. This is where the next sample will be written to + int _allocatedSize = 0; // 2^n buffer size + int _bufferSizeMask = 0; // 2^n-1 buffer mask +}; +} // namespace staffpad::audio diff --git a/libs/staffpad/FourierTransform_pffft.cpp b/libs/staffpad/FourierTransform_pffft.cpp new file mode 100644 index 0000000000..d19b2227c5 --- /dev/null +++ b/libs/staffpad/FourierTransform_pffft.cpp @@ -0,0 +1,56 @@ +#include "FourierTransform_pffft.h" + +#include "pffft.h" + +namespace staffpad::audio { +FourierTransform::FourierTransform(int32_t newBlockSize) + : _blockSize{newBlockSize} +{ + _pffft_scratch = (float*)pffft_aligned_malloc(_blockSize * sizeof(float)); + realFftSpec = pffft_new_setup(_blockSize, PFFFT_REAL); +} + +FourierTransform::~FourierTransform() +{ + if (_pffft_scratch) { + pffft_aligned_free(_pffft_scratch); + _pffft_scratch = nullptr; + } + if (realFftSpec) { + pffft_destroy_setup(realFftSpec); + realFftSpec = nullptr; + } +} + +void FourierTransform::forwardReal(const SamplesReal& t, SamplesComplex& c) +{ + assert(t.getNumSamples() == _blockSize); + + for (auto ch = 0; ch < t.getNumChannels(); ++ch) { + auto* spec = c.getPtr(ch); // interleaved complex numbers, size _blockSize + 2 + auto* cpx_flt = (float*)spec; + pffft_transform_ordered(realFftSpec, t.getPtr(ch), cpx_flt, _pffft_scratch, PFFFT_FORWARD); + // pffft combines dc and nyq values into the first complex value, + // adjust to CCS format. + auto dc = cpx_flt[0]; + auto nyq = cpx_flt[1]; + spec[0] = { dc, 0.f }; + spec[c.getNumSamples() - 1] = { nyq, 0.f }; + } +} + +void FourierTransform::inverseReal(const SamplesComplex& c, SamplesReal& t) +{ + assert(c.getNumSamples() == _blockSize / 2 + 1); + + for (auto ch = 0; ch < c.getNumChannels(); ++ch) { + auto* spec = c.getPtr(ch); + // Use t to convert in-place from CCS to pffft format + t.assignSamples(ch, (float*)spec); + auto* ts = t.getPtr(ch); + ts[0] = spec[0].real(); + ts[1] = spec[c.getNumSamples() - 1].real(); + pffft_transform_ordered(realFftSpec, ts, ts, _pffft_scratch, PFFFT_BACKWARD); + } +} +} // namespace staffpad::audio diff --git a/libs/staffpad/FourierTransform_pffft.h b/libs/staffpad/FourierTransform_pffft.h new file mode 100644 index 0000000000..8f301e270f --- /dev/null +++ b/libs/staffpad/FourierTransform_pffft.h @@ -0,0 +1,32 @@ +// FFT wrapper for the PFFFT library. It is possible to use different wrappers for +// other FFT libraries or platforms, as long as the CSS complex data format is used. 
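//
// (Illustrative note, not part of the upstream header.) "CCS" is the complex
// conjugate-symmetric packing for real FFTs: a real transform of size N yields
// N/2 + 1 interleaved complex bins, and bins 0 (DC) and N/2 (Nyquist) carry a zero
// imaginary part, which is what forwardReal() reconstructs from pffft's packed
// DC/Nyquist pair. A minimal call sketch, assuming a block size of 1024 and the
// SamplesFloat types declared in SamplesFloat.h:
//
//   staffpad::audio::FourierTransform fft(1024);            // N = 1024
//   staffpad::SamplesReal t;    t.setSize(1, 1024);          // 1 channel, N time samples
//   staffpad::SamplesComplex c; c.setSize(1, 1024 / 2 + 1);  // 513 CCS bins
//   fft.forwardReal(t, c);   // bins 0 and 512 end up purely real
//   fft.inverseReal(c, t);   // unscaled; TimeAndPitch multiplies by 1.f / fftSize afterwards
//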
+ +#pragma once + +#include + +#include "SamplesFloat.h" + +struct PFFFT_Setup; + +namespace staffpad::audio { +class FourierTransform +{ +public: + FourierTransform(int32_t newBlockSize); + ~FourierTransform(); + + int getSize() const { return static_cast<int>(_blockSize); } + + void forwardReal(const SamplesReal& t, SamplesComplex& c); + void inverseReal(const SamplesComplex& c, SamplesReal& t); + +private: + PFFFT_Setup* realFftSpec = nullptr; + PFFFT_Setup* complexFftSpec = nullptr; + float* _pffft_scratch = nullptr; + + const int32_t _blockSize; + int32_t _order = 0; +}; +} // namespace staffpad::audio diff --git a/libs/staffpad/README b/libs/staffpad/README new file mode 100644 index 0000000000..324f17f7a9 --- /dev/null +++ b/libs/staffpad/README @@ -0,0 +1,10 @@ +StaffPad - Time and Pitch +========================= + +This library was copied from Audacity [1] in October 2025. + +Licensed under the terms of the GNU GENERAL PUBLIC LICENSE V2 or later, unless otherwise noted. +(https://github.com/audacity/audacity?tab=readme-ov-file#license) + +[1] https://github.com/audacity/audacity/tree/master/au3/libraries/lib-time-and-pitch/StaffPad + git rev 4cdd78b4326456cb77c7102a4f9a6435e0d474a2 diff --git a/libs/staffpad/SamplesFloat.h b/libs/staffpad/SamplesFloat.h new file mode 100644 index 0000000000..3ed2c565f6 --- /dev/null +++ b/libs/staffpad/SamplesFloat.h @@ -0,0 +1,114 @@ +#pragma once + +#include +#include + +#include "SimdTypes.h" +#include "VectorOps.h" + +namespace staffpad { +template <typename T> +class SamplesFloat +{ +public: + ~SamplesFloat() + { + for (int ch = 0; ch < num_channels; ch++) { + dealloc(ch); + } + } + + void setSize(int32_t numChannels, int32_t samples) + { + for (int ch = 0; ch < num_channels; ch++) { + dealloc(ch); + } + + num_channels = numChannels; + num_samples = samples; + data.resize(num_channels); + for (int ch = 0; ch < num_channels; ch++) { + alloc(ch, num_samples); + } + } + + int32_t getNumChannels() const + { + return num_channels; + } + + int32_t getNumSamples() const + { + return num_samples; + } + + float** getPtrs() + { + return data.data(); + } + + T* getPtr(int32_t channel) + { + assert(channel < num_channels); + assert(data[channel]); + return data[channel]; + } + + const T* getPtr(int32_t channel) const + { + assert(channel < num_channels); + assert(data[channel]); + return data[channel]; + } + + void assignSamples(int32_t channel, const T* input) + { + assert(channel < num_channels); + assert(data[channel]); + vo::copy(input, data[channel], num_samples); + } + + void assignSamples(const SamplesFloat& rhs) + { + assert(num_channels == rhs.num_channels); + assert(num_samples == rhs.num_samples); + for (int ch = 0; ch < num_channels; ch++) { + assert(data[ch]); + vo::copy(rhs.getPtr(ch), getPtr(ch), num_samples); + } + } + + void zeroOut() + { + for (int ch = 0; ch < num_channels; ch++) { + vo::setToZero(data[ch], num_samples); + } + } + +private: + int32_t num_channels{ 0 }; + int32_t num_samples{ 0 }; + std::vector<T*> data; + + void alloc(int32_t channel, int32_t samples) + { + assert(channel < num_channels); + if (data[channel]) { + dealloc(channel); + } + data[channel] = (T*)audio::simd::aligned_malloc(samples * sizeof(T), 64); + } + + void dealloc(int32_t channel) + { + assert(channel < num_channels); + if (data[channel]) { + audio::simd::aligned_free(data[channel]); + data[channel] = nullptr; + } + } +}; + +typedef SamplesFloat<float> SamplesReal; +typedef SamplesFloat<std::complex<float> > SamplesComplex; +} // namespace staffpad diff --git
a/libs/staffpad/SimdComplexConversions_sse2.h b/libs/staffpad/SimdComplexConversions_sse2.h new file mode 100644 index 0000000000..c0eb1432f8 --- /dev/null +++ b/libs/staffpad/SimdComplexConversions_sse2.h @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: zlib */ +/* + * This code is cleaned up and modernized version of https://github.com/to-miz/sse_mathfun_extension + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace simd_complex_conversions { +// Before C++20 there is no standard and correct way to implement bit_cast. +// It was verified that MSVC generates the correct code for the current use +// cases +namespace details { +template +std::enable_if_t< + sizeof(To) == sizeof(From) && std::is_trivially_copyable_v + && std::is_trivially_copyable_v, + To> +// constexpr support needs compiler magic +bit_cast(const From& src) noexcept +{ + static_assert( + std::is_trivially_constructible_v, + "This implementation additionally requires " + "destination type to be trivially constructible"); + + To dst; + std::memcpy(&dst, &src, sizeof(To)); + return dst; +} + +constexpr float PIF = 3.141592653589793238f; +constexpr float PIO2F = 1.5707963267948966192f; + +constexpr float cephes_PIF = 3.141592653589793238f; +constexpr float cephes_PIO2F = 1.5707963267948966192f; +constexpr float cephes_PIO4F = 0.7853981633974483096f; +constexpr float cephes_FOPI = 1.27323954473516f; // 4 / M_PI +constexpr float minus_cephes_DP1 = -0.78515625f; +constexpr float minus_cephes_DP2 = -2.4187564849853515625e-4f; +constexpr float minus_cephes_DP3 = -3.77489497744594108e-8f; +constexpr float sincof_p0 = -1.9515295891e-4f; +constexpr float sincof_p1 = 8.3321608736e-3f; +constexpr float sincof_p2 = -1.6666654611e-1f; +constexpr float coscof_p0 = 2.443315711809948e-005f; +constexpr float coscof_p1 = -1.388731625493765e-003f; +constexpr float coscof_p2 = 4.166664568298827e-002f; + +constexpr float atancof_p0 = 8.05374449538e-2f; +constexpr float atancof_p1 = 1.38776856032e-1f; +constexpr float atancof_p2 = 1.99777106478e-1f; +constexpr float atancof_p3 = 3.33329491539e-1f; + +static const float sign_mask = bit_cast(0x80000000); +static const float inv_sign_mask = bit_cast(~0x80000000); +} // namespace details + +inline __m128 atan_ps(__m128 x) +{ + using namespace details; + + __m128 sign_bit, y; + + sign_bit = x; + /* take the absolute value */ + x = _mm_and_ps(x, _mm_set1_ps(inv_sign_mask)); + /* extract the sign bit (upper one) */ + sign_bit = _mm_and_ps(sign_bit, _mm_set1_ps(sign_mask)); + + /* range reduction, init x and y depending on range */ + /* x > 2.414213562373095 */ + __m128 cmp0 = _mm_cmpgt_ps(x, _mm_set1_ps(2.414213562373095f)); + /* x > 0.4142135623730950 */ + __m128 cmp1 = _mm_cmpgt_ps(x, _mm_set1_ps(0.4142135623730950f)); + + /* x > 0.4142135623730950 && !( x > 2.414213562373095 ) */ + __m128 cmp2 = _mm_andnot_ps(cmp0, cmp1); + + /* -( 1.0/x ) */ + __m128 y0 = _mm_and_ps(cmp0, _mm_set1_ps(cephes_PIO2F)); + __m128 x0 = _mm_div_ps(_mm_set1_ps(1.0f), x); + x0 = _mm_xor_ps(x0, _mm_set1_ps(sign_mask)); + + __m128 y1 = _mm_and_ps(cmp2, _mm_set1_ps(cephes_PIO4F)); + /* (x-1.0)/(x+1.0) */ + __m128 x1_o = _mm_sub_ps(x, _mm_set1_ps(1.0f)); + __m128 x1_u = _mm_add_ps(x, _mm_set1_ps(1.0f)); + __m128 x1 = _mm_div_ps(x1_o, x1_u); + + __m128 x2 = _mm_and_ps(cmp2, x1); + x0 = _mm_and_ps(cmp0, x0); + x2 = _mm_or_ps(x2, x0); + cmp1 = _mm_or_ps(cmp0, cmp2); + x2 = _mm_and_ps(cmp1, x2); + x = _mm_andnot_ps(cmp1, x); + x = _mm_or_ps(x2, x); + + y = _mm_or_ps(y0, y1); + + 
__m128 zz = _mm_mul_ps(x, x); + __m128 acc = _mm_set1_ps(atancof_p0); + acc = _mm_mul_ps(acc, zz); + acc = _mm_sub_ps(acc, _mm_set1_ps(atancof_p1)); + acc = _mm_mul_ps(acc, zz); + acc = _mm_add_ps(acc, _mm_set1_ps(atancof_p2)); + acc = _mm_mul_ps(acc, zz); + acc = _mm_sub_ps(acc, _mm_set1_ps(atancof_p3)); + acc = _mm_mul_ps(acc, zz); + acc = _mm_mul_ps(acc, x); + acc = _mm_add_ps(acc, x); + y = _mm_add_ps(y, acc); + + /* update the sign */ + y = _mm_xor_ps(y, sign_bit); + + return y; +} + +inline __m128 atan2_ps(__m128 y, __m128 x) +{ + using namespace details; + + __m128 zero = _mm_setzero_ps(); + __m128 x_eq_0 = _mm_cmpeq_ps(x, zero); + __m128 x_gt_0 = _mm_cmpgt_ps(x, zero); + __m128 x_le_0 = _mm_cmple_ps(x, zero); + __m128 y_eq_0 = _mm_cmpeq_ps(y, zero); + __m128 x_lt_0 = _mm_cmplt_ps(x, zero); + __m128 y_lt_0 = _mm_cmplt_ps(y, zero); + + __m128 zero_mask = _mm_and_ps(x_eq_0, y_eq_0); + __m128 zero_mask_other_case = _mm_and_ps(y_eq_0, x_gt_0); + zero_mask = _mm_or_ps(zero_mask, zero_mask_other_case); + + __m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0); + __m128 pio2_mask_sign = _mm_and_ps(y_lt_0, _mm_set1_ps(sign_mask)); + __m128 pio2_result = _mm_set1_ps(cephes_PIO2F); + pio2_result = _mm_xor_ps(pio2_result, pio2_mask_sign); + pio2_result = _mm_and_ps(pio2_mask, pio2_result); + + __m128 pi_mask = _mm_and_ps(y_eq_0, x_lt_0); + __m128 pi = _mm_set1_ps(cephes_PIF); + __m128 pi_result = _mm_and_ps(pi_mask, pi); + + __m128 swap_sign_mask_offset = _mm_and_ps(x_lt_0, y_lt_0); + swap_sign_mask_offset + =_mm_and_ps(swap_sign_mask_offset, _mm_set1_ps(sign_mask)); + + __m128 offset0 = _mm_setzero_ps(); + __m128 offset1 = _mm_set1_ps(cephes_PIF); + offset1 = _mm_xor_ps(offset1, swap_sign_mask_offset); + + __m128 offset = _mm_andnot_ps(x_lt_0, offset0); + offset = _mm_and_ps(x_lt_0, offset1); + + __m128 arg = _mm_div_ps(y, x); + __m128 atan_result = atan_ps(arg); + atan_result = _mm_add_ps(atan_result, offset); + + /* select between zero_result, pio2_result and atan_result */ + + __m128 result = _mm_andnot_ps(zero_mask, pio2_result); + atan_result = _mm_andnot_ps(zero_mask, atan_result); + atan_result = _mm_andnot_ps(pio2_mask, atan_result); + result = _mm_or_ps(result, atan_result); + result = _mm_or_ps(result, pi_result); + + return result; +} + +inline std::pair<__m128, __m128> sincos_ps(__m128 x) +{ + using namespace details; + __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; + __m128i emm0, emm2, emm4; + + sign_bit_sin = x; + /* take the absolute value */ + x = _mm_and_ps(x, _mm_set1_ps(inv_sign_mask)); + /* extract the sign bit (upper one) */ + sign_bit_sin = _mm_and_ps(sign_bit_sin, _mm_set1_ps(sign_mask)); + + /* scale by 4/Pi */ + y = _mm_mul_ps(x, _mm_set1_ps(cephes_FOPI)); + + /* store the integer part of y in emm2 */ + emm2 = _mm_cvttps_epi32(y); + + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1)); + emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1)); + y = _mm_cvtepi32_ps(emm2); + + emm4 = emm2; + + /* get the swap sign flag for the sine */ + emm0 = _mm_and_si128(emm2, _mm_set1_epi32(4)); + emm0 = _mm_slli_epi32(emm0, 29); + __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); + + /* get the polynom selection mask for the sine*/ + emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2)); + emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + __m128 poly_mask = _mm_castsi128_ps(emm2); + + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = _mm_set1_ps(minus_cephes_DP1); + xmm2 = 
_mm_set1_ps(minus_cephes_DP2); + xmm3 = _mm_set1_ps(minus_cephes_DP3); + xmm1 = _mm_mul_ps(y, xmm1); + xmm2 = _mm_mul_ps(y, xmm2); + xmm3 = _mm_mul_ps(y, xmm3); + x = _mm_add_ps(x, xmm1); + x = _mm_add_ps(x, xmm2); + x = _mm_add_ps(x, xmm3); + + emm4 = _mm_sub_epi32(emm4, _mm_set1_epi32(2)); + emm4 = _mm_andnot_si128(emm4, _mm_set1_epi32(4)); + emm4 = _mm_slli_epi32(emm4, 29); + __m128 sign_bit_cos = _mm_castsi128_ps(emm4); + + sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + __m128 z = _mm_mul_ps(x, x); + y = _mm_set1_ps(coscof_p0); + + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, _mm_set1_ps(coscof_p1)); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, _mm_set1_ps(coscof_p2)); + y = _mm_mul_ps(y, z); + y = _mm_mul_ps(y, z); + __m128 tmp = _mm_mul_ps(z, _mm_set1_ps(0.5f)); + y = _mm_sub_ps(y, tmp); + y = _mm_add_ps(y, _mm_set1_ps(1)); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + + __m128 y2 = _mm_set1_ps(sincof_p0); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, _mm_set1_ps(sincof_p1)); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, _mm_set1_ps(sincof_p2)); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_mul_ps(y2, x); + y2 = _mm_add_ps(y2, x); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + __m128 ysin2 = _mm_and_ps(xmm3, y2); + __m128 ysin1 = _mm_andnot_ps(xmm3, y); + y2 = _mm_sub_ps(y2, ysin2); + y = _mm_sub_ps(y, ysin1); + + xmm1 = _mm_add_ps(ysin1, ysin2); + xmm2 = _mm_add_ps(y, y2); + + /* update the sign */ + return std::make_pair( + _mm_xor_ps(xmm1, sign_bit_sin), _mm_xor_ps(xmm2, sign_bit_cos)); +} + +inline float atan2_ss(float y, float x) +{ + return _mm_cvtss_f32(atan2_ps(_mm_set_ss(y), _mm_set_ss(x))); +} + +inline std::pair sincos_ss(float angle) +{ + auto res = sincos_ps(_mm_set_ss(angle)); + return std::make_pair(_mm_cvtss_f32(res.first), _mm_cvtss_f32(res.second)); +} + +inline __m128 norm(__m128 x, __m128 y) +{ + return _mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)); +} + +inline float sqrt_ss(float x) +{ + __m128 sse_value = _mm_set_ss(x); + sse_value = _mm_sqrt_ss(sse_value); + return _mm_cvtss_f32(sse_value); +} + +template +void perform_parallel_simd_aligned( + const std::complex* input, float* output, int n, const fnc& f) +{ + for (int i = 0; i <= n - 4; i += 4) { + // Safe according to C++ standard + auto p1 = _mm_load_ps(reinterpret_cast(input + i)); + auto p2 = _mm_load_ps(reinterpret_cast(input + i + 2)); + + // p1 = {real(c1), imag(c1), real(c2), imag(c2)} + // p2 = {real(c3), imag(c3), real(c4), imag(c4)} + + auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0)); + auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1)); + + __m128 out; + f(rp, ip, out); + + _mm_store_ps(output + i, out); + } + // deal with last partial packet + for (int i = n & (~3); i < n; ++i) { + __m128 out; + f(_mm_set_ss(real(input[i])), _mm_set_ss(imag(input[i])), out); + output[i] = _mm_cvtss_f32(out); + } +} + +inline void rotate_parallel_simd_aligned( + const float* oldPhase, const float* newPhase, std::complex* output, + int n) +{ + for (int i = 0; i <= n - 4; i += 4) { + auto [sin, cos] = sincos_ps( + oldPhase + ? 
_mm_sub_ps(_mm_load_ps(newPhase + i), _mm_load_ps(oldPhase + i)) + : _mm_load_ps(newPhase + i)); + + // Safe according to C++ standard + auto p1 = _mm_load_ps(reinterpret_cast(output + i)); + auto p2 = _mm_load_ps(reinterpret_cast(output + i + 2)); + + // p1 = {real(c1), imag(c1), real(c2), imag(c2)} + // p2 = {real(c3), imag(c3), real(c4), imag(c4)} + + auto rp = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(2, 0, 2, 0)); + auto ip = _mm_shuffle_ps(p1, p2, _MM_SHUFFLE(3, 1, 3, 1)); + + // We need to calculate (rp, ip) * (cos, sin) -> (rp*cos - ip*sin, rp*sin + ip*cos) + + auto out_rp = _mm_sub_ps(_mm_mul_ps(rp, cos), _mm_mul_ps(ip, sin)); + auto out_ip = _mm_add_ps(_mm_mul_ps(rp, sin), _mm_mul_ps(ip, cos)); + + p1 = _mm_unpacklo_ps(out_rp, out_ip); + p2 = _mm_unpackhi_ps(out_rp, out_ip); + + _mm_store_ps(reinterpret_cast(output + i), p1); + _mm_store_ps(reinterpret_cast(output + i + 2), p2); + } + // deal with last partial packet + for (int i = n & (~3); i < n; ++i) { + const auto theta = oldPhase ? newPhase[i] - oldPhase[i] : newPhase[i]; + output[i] *= std::complex(cosf(theta), sinf(theta)); + } +} +} // namespace simd_complex_conversions diff --git a/libs/staffpad/SimdTypes.h b/libs/staffpad/SimdTypes.h new file mode 100644 index 0000000000..4c0433dd32 --- /dev/null +++ b/libs/staffpad/SimdTypes.h @@ -0,0 +1,128 @@ +/* + Simd-types for parallel dsp processing. + Aligned memory allocation for simd vectors. + */ + +#pragma once +#include +#include + +#if _MSC_VER +#define __finl __forceinline +#define __vecc __vectorcall +#else +#define __finl inline __attribute__((always_inline)) +#define __vecc +#endif + +#if defined(__SSE2__) || (defined(_M_AMD64) || defined(_M_X64)) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +#include "SimdTypes_sse2.h" +#elif defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM64) +#include "SimdTypes_neon.h" +#else +#include "SimdTypes_scalar.h" +#endif + +namespace staffpad::audio::simd { +/// reserve aligned memory. Needs to be freed with aligned_free() +inline void* aligned_malloc(size_t required_bytes, size_t alignment) +{ + auto offset = alignment - 1 + sizeof(void*); + auto p1 = std::malloc(required_bytes + offset); + if (p1 == nullptr) { + return nullptr; + } + // figure out aligned position + void* p2 = (void*)(((size_t)(p1) + offset) & ~(alignment - 1)); + // write malloced pointer in front of aligned data + ((void**)p2)[-1] = p1; + return p2; +} + +/// free memory allocated with aligned_malloc +inline void aligned_free(void* p) +{ + if (p) { + free(((void**)p)[-1]); + } +} + +/// create a c++ class at an memory-aligned spot that needs to be deleted using aligned_delete +template +inline cls* aligned_new(int alignment) +{ + void* mem = aligned_malloc(sizeof(cls), alignment); + return new (mem) cls(); +} + +/** delete objects created using aligned_new */ +template +inline void aligned_delete(cls* obj) +{ + if (obj != nullptr) { + obj->~cls(); + aligned_free((void*)obj); + } +} + +template +inline bool is_aligned(T* obj, int alignment) +{ + return (((size_t)obj) & (alignment - 1)) == 0; +} + +/** this template allows to write float SIMD code with the supported operators the following way: + +float *a_vec; +const float *b_vec; +perform_parallel_simd_aligned(a_vec, b_vec, 512, [](auto &a, auto &b) { +auto t = a; +a = 3.f * a + 12.f * b; +b = 0.25f * a + 3.f * b; +}); +*/ + +// two buffers read/write +template +__finl void perform_parallel_simd_aligned(float* a, float* b, int n, const fnc& f) +{ + // fnc& f needs to be a lambda of type [](auto &a, auto &b){}. 
+ // the autos will be float_x4/float + constexpr int N = 4; + constexpr int byte_size = sizeof(float); + + assert(is_aligned(a, N * byte_size) && is_aligned(b, N * byte_size)); + + for (int i = 0; i <= n - N; i += N) { + auto x = float_x4_load_aligned(a + i); + auto y = float_x4_load_aligned(b + i); + f(x, y); + store_aligned(x, a + i); + store_aligned(y, b + i); + } + // deal with last partial packet + for (int i = n & (~(N - 1)); i < n; ++i) { + f(a[i], b[i]); + } +} + +/// template for applying math to one data buffer +template +__finl void perform_parallel_simd_aligned(float* a, int n, const fnc& f) +{ + // fnc& f needs to be a lambda of type [](auto &a){}. + constexpr int N = 4; + constexpr int byte_size = sizeof(float); + assert(is_aligned(a, N * byte_size)); + + for (int i = 0; i <= n - N; i += N) { + auto x = float_x4_load_aligned(a + i); + f(x); + store_aligned(x, a + i); + } + // deal with last partial packet + for (int i = n & (~(N - 1)); i < n; ++i) { + f(a[i]); + } +} +} // namespace staffpad::audio::simd diff --git a/libs/staffpad/SimdTypes_neon.h b/libs/staffpad/SimdTypes_neon.h new file mode 100644 index 0000000000..9d9e2c58e6 --- /dev/null +++ b/libs/staffpad/SimdTypes_neon.h @@ -0,0 +1,161 @@ +/* + Neon version of SIMD types. + */ + +#pragma once + +#if _MSC_VER +#include +#define __finl __forceinline +#define __vecc __vectorcall +#else +#include +#define __finl inline __attribute__((always_inline)) +#define __vecc +#endif + +#include + +namespace staffpad::audio::simd { +struct float_x4 +{ + float32x4_t s; + + __finl float_x4() + { + } + + /// enables math like: float_x4 a = 0.5f * float_x4{1.f, 2.f, 3.f, 4.f}; + __finl float_x4(float val) + { + s = vdupq_n_f32(val); + } + + __finl float_x4(const float32x4_t& val) + : s(val) + { + } + + /// enables assignments like: float_x4 a = {1.f, 2.f, 3.f, 4.f}; + __finl float_x4(float v0, float v1, float v2, float v3) + { +#if _MSC_VER // aggregate initializer won't work unless we have {.n128_f32 = ..} in c++20 + s.n128_f32[0] = v0; + s.n128_f32[1] = v1; + s.n128_f32[2] = v2; + s.n128_f32[3] = v3; +#elif __clang__ + s = { v0, v1, v2, v3 }; +#else + float f[4] = { v0, v1, v2, v3 }; + s = vld1q_f32(f); +#endif + } + +#if __clang__ +private: + // this helper class allows writing to the single registers for clang + // __mm128 is a built-in type -> we can't return a float& reference. + // this is just syntax sugar and clang will remove it during builds. 
+ // + // it allows to write + // float_x4 a; + // a[1] = 2.f; + struct RegisterAccessWrapper + { + float32x4_t& val; + int i; + + void operator=(float x) + { + val[i] = x; + } + + operator float() noexcept + { + return val[i]; + } + }; + +public: + __finl RegisterAccessWrapper operator[](int n) + { + RegisterAccessWrapper raw = { s, n }; + return raw; + } + + __finl const float operator[](int n) const + { + return s[n]; + } + +#elif _MSC_VER + // on msvc returning a ref to a sub-register is possible + __finl float& operator[](int n) + { + return s.n128_f32[n]; + } + + __finl const float operator[](int n) const + { + return s.n128_f32[n]; + } + +#endif +}; + +__finl float_x4 __vecc float_x4_from_float(float x) +{ + return vdupq_n_f32(x); +} + +__finl float_x4 __vecc float_x4_load_aligned(const float* x) +{ + return vld1q_f32(x); +} + +__finl void __vecc store_aligned(const float_x4& a, float* x) +{ + vst1q_f32(x, a.s); +} + +__finl float_x4 __vecc unzip1(const float_x4& a, const float_x4& b) +{ + return vuzp1q_f32(a.s, b.s); +} + +__finl float_x4 __vecc unzip2(const float_x4& a, const float_x4& b) +{ + return vuzp2q_f32(a.s, b.s); +} + +__finl float_x4 __vecc operator+(float_x4 a, float_x4 b) +{ + return vaddq_f32(a.s, b.s); +} + +__finl float_x4 __vecc operator-(float_x4 a, float_x4 b) +{ + return vsubq_f32(a.s, b.s); +} + +__finl float_x4 __vecc operator*(float_x4 a, float_x4 b) +{ + return vmulq_f32(a.s, b.s); +} + +__finl float_x4 __vecc sqrt(const float_x4& a) +{ + return vsqrtq_f32(a.s); +} + +__finl float __vecc rint(float a) +{ + return std::rint(a); +} + +__finl float_x4 __vecc rint(const float_x4& a) +{ + return vrndnq_f32(a.s); +} +} // namespace staffpad::audio::simd diff --git a/libs/staffpad/SimdTypes_scalar.h b/libs/staffpad/SimdTypes_scalar.h new file mode 100644 index 0000000000..1b3dfae146 --- /dev/null +++ b/libs/staffpad/SimdTypes_scalar.h @@ -0,0 +1,108 @@ +/* + scalar emulation of simd types. 
+ */ + +#pragma once +#include +#include + +#if _MSC_VER +#define __finl __forceinline +#define __vecc __vectorcall +#else +#define __finl inline __attribute__((always_inline)) +#define __vecc +#endif + +namespace staffpad::audio::simd { +struct float_x4 +{ + float v[4]; + + __finl float_x4() + { + } + + /// enables math like: float_x4 a = 0.5f * float_x4{1.f, 2.f, 3.f, 4.f}; + __finl float_x4(float val) + { + v[0] = v[1] = v[2] = v[3] = val; + } + + /// enables assignments like: float_x4 a = {1.f, 2.f, 3.f, 4.f}; + __finl float_x4(float v0, float v1, float v2, float v3) + { + v[0] = v0; + v[1] = v1; + v[2] = v2; + v[3] = v3; + } + + __finl float& operator[](int n) + { + return v[n]; + } + + __finl const float& operator[](int n) const + { + return v[n]; + } +}; + +__finl float_x4 __vecc float_x4_from_float(float x) +{ + return { x, x, x, x }; +} + +__finl float_x4 __vecc float_x4_load_aligned(const float* x) +{ + return { x[0], x[1], x[2], x[3] }; +} + +__finl void __vecc store_aligned(const float_x4& a, float* x) +{ + for (int i = 0; i < 4; ++i) { + x[i] = a[i]; + } +} + +__finl float_x4 __vecc unzip1(const float_x4& a, const float_x4& b) +{ + return { a[0], a[2], b[0], b[2] }; +} + +__finl float_x4 __vecc unzip2(const float_x4& a, const float_x4& b) +{ + return { a[1], a[1], b[3], b[3] }; +} + +__finl float_x4 __vecc operator+(float_x4 a, float_x4 b) +{ + return { a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3] }; +} + +__finl float_x4 __vecc operator-(float_x4 a, float_x4 b) +{ + return { a[0] - b[0], a[1] - b[1], a[2] - b[2], a[3] - b[3] }; +} + +__finl float_x4 __vecc operator*(float_x4 a, float_x4 b) +{ + return { a[0] * b[0], a[1] * b[1], a[2] * b[2], a[3] * b[3] }; +} + +__finl float_x4 __vecc sqrt(const float_x4& a) +{ + return { std::sqrt(a[0]), std::sqrt(a[1]), std::sqrt(a[2]), std::sqrt(a[3]) }; +} + +__finl float __vecc rint(float a) +{ + return std::rint(a); +} + +__finl float_x4 __vecc rint(const float_x4& a) +{ + return { std::rint(a[0]), std::rint(a[1]), std::rint(a[2]), std::rint(a[3]) }; +} +} // namespace staffpad::audio::simd diff --git a/libs/staffpad/SimdTypes_sse2.h b/libs/staffpad/SimdTypes_sse2.h new file mode 100644 index 0000000000..4dfad7c6d5 --- /dev/null +++ b/libs/staffpad/SimdTypes_sse2.h @@ -0,0 +1,150 @@ +/* +SSE2 simd types +*/ + +#pragma once + +#if _MSC_VER +#define __finl __forceinline +#define __vecc __vectorcall +#else +#define __finl inline __attribute__((always_inline)) +#define __vecc +#endif + +#include + +namespace staffpad::audio::simd { +// this is jumping through some hoops to get the same level of support +// for clang and msvc. With clang, the sse2 types are built-in and have +// some arithmetic operators defined. +// On msvc the sse2 types are structs with no operators. +// to get to the same level and to be able to write algorithms "naturally", +// everything needs to be wrapped in a struct. +struct float_x4 +{ + __m128 s; + __finl float_x4() + { + } + + /// enables math like: float_x4 a = 0.5f * float_x4{1.f, 2.f, 3.f, 4.f}; + __finl float_x4(float val) + { + s = _mm_set1_ps(val); + } + + __finl float_x4(const __m128& val) + : s(val) + { + } + +#if __clang__ +private: + // this helper class allows writing to the single registers for clang + // __mm128 is a built-in type -> we can't return a float& reference. + // this is just syntax sugar and clang will remove it during builds. 
+ // + // it allows to write + // float_x4 a; + // a[1] = 2.f; + struct RegisterAccessWrapper + { + __m128& val; + int i; + + void operator=(float x) + { + val[i] = x; + } + + operator float() noexcept + { + return val[i]; + } + }; + +public: + __finl RegisterAccessWrapper operator[](int n) + { + RegisterAccessWrapper raw = { s, n }; + return raw; + } + + __finl const float operator[](int n) const + { + return s[n]; + } + +#elif _MSC_VER + // on msvc returning a ref to a sub-register is possible + __finl float& operator[](int n) + { + return s.m128_f32[n]; + } + + __finl const float operator[](int n) const + { + return s.m128_f32[n]; + } + +#endif +}; + +__finl float_x4 __vecc float_x4_from_float(float x) +{ + return _mm_set1_ps(x); +} + +__finl float_x4 __vecc float_x4_load_aligned(const float* x) +{ + return _mm_load_ps(x); +} + +__finl void __vecc store_aligned(const float_x4& a, float* x) +{ + _mm_store_ps(x, a.s); +} + +__finl float_x4 __vecc unzip1(const float_x4& a, const float_x4& b) +{ + return _mm_shuffle_ps(a.s, b.s, _MM_SHUFFLE(2, 0, 2, 0)); +} + +__finl float_x4 __vecc unzip2(const float_x4& a, const float_x4& b) +{ + return _mm_shuffle_ps(a.s, b.s, _MM_SHUFFLE(3, 1, 3, 1)); +} + +__finl float_x4 __vecc operator+(float_x4 a, float_x4 b) +{ + return _mm_add_ps(a.s, b.s); +} + +__finl float_x4 __vecc operator-(float_x4 a, float_x4 b) +{ + return _mm_sub_ps(a.s, b.s); +} + +__finl float_x4 __vecc operator*(float_x4 a, float_x4 b) +{ + return _mm_mul_ps(a.s, b.s); +} + +__finl float_x4 __vecc sqrt(const float_x4& a) +{ + return _mm_sqrt_ps(a.s); +} + +__finl float __vecc rint(float x) +{ + __m128i A = _mm_cvtps_epi32(_mm_set_ss(x)); + return _mm_cvtss_f32(_mm_cvtepi32_ps(A)); +} + +__finl float_x4 __vecc rint(const float_x4& a) +{ + __m128i A = _mm_cvtps_epi32(a.s); + return _mm_cvtepi32_ps(A); +} +} // namespace staffpad::audio::simd diff --git a/libs/staffpad/TimeAndPitch.cpp b/libs/staffpad/TimeAndPitch.cpp new file mode 100644 index 0000000000..bf93b469fe --- /dev/null +++ b/libs/staffpad/TimeAndPitch.cpp @@ -0,0 +1,559 @@ +#include "staffpad/TimeAndPitch.h" + +#include +#include +#include +#include +#include +#include + +#include "CircularSampleBuffer.h" +#include "FourierTransform_pffft.h" +#include "SamplesFloat.h" +#include "SimdTypes.h" + +using namespace staffpad::audio; + +namespace staffpad { +namespace { +constexpr double twoPi = 6.28318530717958647692f; + +/** + * 6-point lagrange interpolator SNR ~ -35.2db + * interpolates between samples 2 and 3 using t. 
+ * + * smp pointer to a memory region containing 6 samples + * t fractional sample to interpolate t >= 0 && t < 1 + * return value the interpolated sample + */ +inline float lagrange6(const float (& smp)[6], float t) +{ + auto* y = &smp[2]; + + auto ym1_y1 = y[-1] + y[1]; + auto twentyfourth_ym2_y2 = 1.f / 24.f * (y[-2] + y[2]); + auto c0 = y[0]; + auto c1 = 0.05f * y[-2] - 0.5f * y[-1] - 1.f / 3.f * y[0] + y[1] - 0.25f * y[2] + 1.f / 30.f * y[3]; + auto c2 = 2.f / 3.f * ym1_y1 - 1.25f * y[0] - twentyfourth_ym2_y2; + auto c3 = 5.f / 12.f * y[0] - 7.f / 12.f * y[1] + 7.f / 24.f * y[2] - 1.f / 24.f * (y[-2] + y[-1] + y[3]); + auto c4 = 0.25f * y[0] - 1.f / 6.f * ym1_y1 + twentyfourth_ym2_y2; + auto c5 = 1.f / 120.f * (y[3] - y[-2]) + 1.f / 24.f * (y[-1] - y[2]) + 1.f / 12.f * (y[1] - y[0]); + + // Estrin's scheme + auto t2 = t * t; + return (c0 + c1 * t) + (c2 + c3 * t) * t2 + (c4 + c5 * t) * t2 * t2; +} + +void generateRandomPhaseVector(float* dst, size_t size, std::mt19937& gen) +{ + std::vector > v(size); + std::uniform_real_distribution dis( + -3.14159265358979323846f, 3.14159265358979323846f); + std::for_each(dst, dst + size, [&dis, &gen](auto& a) { a = dis(gen); }); +} +} // namespace + +struct TimeAndPitch::impl +{ + impl(int fft_size) + : fft(fft_size) + { + randomGenerator.seed(0); + } + + FourierTransform fft; + std::mt19937 randomGenerator; + CircularSampleBuffer inResampleInputBuffer[2]; + CircularSampleBuffer inCircularBuffer[2]; + CircularSampleBuffer outCircularBuffer[2]; + CircularSampleBuffer normalizationBuffer; + + SamplesReal fft_timeseries; + SamplesComplex spectrum; + SamplesReal norm; + SamplesReal phase; + SamplesReal last_phase; + SamplesReal phase_accum; + SamplesReal cosWindow; + SamplesReal sqWindow; + SamplesReal last_norm; + SamplesReal random_phases; + + double exact_hop_a = 512.0, hop_a_err = 0.0; + double exact_hop_s = 0.0; + double next_exact_hop_s = 512.0; + double hop_s_err = 0.0; + + std::vector peak_index, trough_index; +}; + +TimeAndPitch::TimeAndPitch( + int fftSize, bool reduceImaging, ShiftTimbreCb shiftTimbreCb) + : fftSize(fftSize) + , _reduceImaging(reduceImaging) + , _shiftTimbreCb(std::move(shiftTimbreCb)) +{ +} + +TimeAndPitch::~TimeAndPitch() +{ + // Here for ~std::shared_ptr() to know ~impl() +} + +void TimeAndPitch::setup(int numChannels, int maxBlockSize) +{ + assert(numChannels == 1 || numChannels == 2); + _numChannels = numChannels; + + d = std::make_unique(fftSize); + _maxBlockSize = maxBlockSize; + _numBins = fftSize / 2 + 1; + + // audio sample buffers + d->fft_timeseries.setSize(_numChannels, fftSize); + int outBufferSize = fftSize + 2 * _maxBlockSize; // 1 block extra for safety + for (int ch = 0; ch < _numChannels; ++ch) { + d->inResampleInputBuffer[ch].setSize(_maxBlockSize + + 6); // needs additional history for resampling. 
more in the future + d->inCircularBuffer[ch].setSize(fftSize); + d->outCircularBuffer[ch].setSize(outBufferSize); + } + d->normalizationBuffer.setSize(outBufferSize); + + // fft coefficient buffers + d->spectrum.setSize(_numChannels, _numBins); + d->norm.setSize(1, _numBins); + d->last_norm.setSize(1, _numBins); + d->phase.setSize(_numChannels, _numBins); + d->last_phase.setSize(_numChannels, _numBins); + d->phase_accum.setSize(_numChannels, _numBins); + d->random_phases.setSize(1, _numBins); + generateRandomPhaseVector( + d->random_phases.getPtr(0), _numBins, d->randomGenerator); + + _expectedPhaseChangePerBinPerSample = twoPi / double(fftSize); + + // Cos window + // For perfect int-factor overlaps the first sample needs to be 0 and the last non-zero. + d->cosWindow.setSize(1, fftSize); + d->sqWindow.setSize(1, fftSize); + auto* w = d->cosWindow.getPtr(0); + auto* sqw = d->sqWindow.getPtr(0); + for (int i = 0; i < fftSize; ++i) { + w[i] = -0.5f * std::cos(float(twoPi * (float)i / (float)fftSize)) + 0.5f; + sqw[i] = w[i] * w[i]; + } + + d->peak_index.reserve(_numBins); + d->trough_index.reserve(_numBins); + + // force fft to allocate all buffers now. Currently there is no other way in the fft class + d->fft.forwardReal(d->fft_timeseries, d->spectrum); + + reset(); +} + +int TimeAndPitch::getSamplesToNextHop() const +{ + return std::max(0, int(std::ceil(d->exact_hop_a)) - _analysis_hop_counter + + 1); // 1 extra to counter pitch factor error accumulation +} + +int TimeAndPitch::getNumAvailableOutputSamples() const +{ + return _availableOutputSamples; +} + +void TimeAndPitch::reset() +{ + _analysis_hop_counter = 0; + _availableOutputSamples = 0; + for (int ch = 0; ch < _numChannels; ++ch) { + d->inResampleInputBuffer[ch].reset(); + d->inCircularBuffer[ch].reset(); + d->outCircularBuffer[ch].reset(); + } + d->normalizationBuffer.reset(); + d->last_norm.zeroOut(); + d->last_phase.zeroOut(); + d->phase_accum.zeroOut(); + _outBufferWriteOffset = 0; + d->hop_a_err = 0.0; + d->hop_s_err = 0.0; + d->exact_hop_s = 0.0; + _resampleReadPos = 0.0; +} + +namespace { +// wrap a phase value into -PI..PI +inline float _unwrapPhase(float arg) +{ + using namespace audio::simd; + return arg - rint(arg * 0.15915494309f) * 6.283185307f; +} + +void _unwrapPhaseVec(float* v, int n) +{ + audio::simd::perform_parallel_simd_aligned(v, n, [](auto& a) { a = a - rint(a * 0.15915494309f) * 6.283185307f; }); +} + +/// rotate even-sized array by half its size to align fft phase at the center +void _fft_shift(float* v, int n) +{ + assert((n & 1) == 0); + int n2 = n >> 1; + audio::simd::perform_parallel_simd_aligned(v, v + n2, n2, [](auto& a, auto& b) { + auto tmp = a; + a = b; + b = tmp; + }); +} + +void _lr_to_ms(float* ch1, float* ch2, int n) +{ + audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) { + auto l = a, r = b; + a = 0.5f * (l + r); + b = 0.5f * (l - r); + }); +} + +void _ms_to_lr(float* ch1, float* ch2, int n) +{ + audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) { + auto m = a, s = b; + a = m + s; + b = m - s; + }); +} +} // namespace + +// ---------------------------------------------------------------------------- + +template +void TimeAndPitch::_time_stretch(float a_a, float a_s) +{ + auto alpha = a_s / a_a; // this is the real stretch factor based on integer hop sizes + + // Create a norm array + const auto* norms = d->norm.getPtr(0); // for stereo, just use the mid-channel + const auto* norms_last = d->last_norm.getPtr(0); + + d->peak_index.clear(); + 
d->trough_index.clear(); + float lowest = norms[0]; + int trough = 0; + if (norms_last[0] >= norms[1]) { + d->peak_index.emplace_back(0); + d->trough_index.emplace_back(0); + } + for (int i = 1; i < _numBins - 1; ++i) { + if (norms_last[i] >= norms[i - 1] && norms_last[i] >= norms[i + 1]) { + d->peak_index.emplace_back(i); + d->trough_index.emplace_back(trough); + trough = i; + lowest = norms[i]; + } else if (norms[i] < lowest) { + lowest = norms[i]; + trough = i; + } + } + if (norms_last[_numBins - 1] > norms[_numBins - 2]) { + d->peak_index.emplace_back(_numBins - 1); + d->trough_index.emplace_back(trough); + } + + if (d->peak_index.size() == 0) { + // use max norm of last frame + int max_idx = 0; + float max_norm = 0.f; + vo::findMaxElement(norms_last, _numBins, max_idx, max_norm); + d->peak_index.emplace_back(max_idx); + } + + const float** p = const_cast(d->phase.getPtrs()); + const float** p_l = const_cast(d->last_phase.getPtrs()); + float** acc = d->phase_accum.getPtrs(); + + float expChange_a = a_a * float(_expectedPhaseChangePerBinPerSample); + float expChange_s = a_s * float(_expectedPhaseChangePerBinPerSample); + + int num_peaks = (int)d->peak_index.size(); + // time integration for all peaks + for (int i = 0; i < num_peaks; ++i) { + const int n = d->peak_index[i]; + auto fn = float(n); + + // determine phase from last frame + float fn_expChange_a = fn * expChange_a; + float fn_expChange_s = fn * expChange_s; + + for (int ch = 0; ch < num_channels; ++ch) { + acc[ch][n] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n] - p_l[ch][n] - fn_expChange_a) + fn_expChange_s; + } + } + + // go from first peak to 0 + for (int n = d->peak_index[0]; n > 0; --n) { + for (int ch = 0; ch < num_channels; ++ch) { + acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]); + } + } + + // 'grow' from pairs of peaks to the lowest norm in between + for (int i = 0; i < num_peaks - 1; ++i) { + const int mid = d->trough_index[i + 1]; + for (int n = d->peak_index[i]; n < mid; ++n) { + for (int ch = 0; ch < num_channels; ++ch) { + acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]); + } + } + for (int n = d->peak_index[i + 1]; n > mid + 1; --n) { + for (int ch = 0; ch < num_channels; ++ch) { + acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]); + } + } + } + + // last peak to the end + for (int n = d->peak_index[num_peaks - 1]; n < _numBins - 1; ++n) { + for (int ch = 0; ch < num_channels; ++ch) { + acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]); + } + } + + d->last_norm.assignSamples(d->norm); + d->last_phase.assignSamples(d->phase); +} + +void TimeAndPitch::_applyImagingReduction() +{ + // Upsampling brings spectral components down, including those that were + // beyond the Nyquist. From `imagingBeginBin`, we have a mirroring of the + // spectrum, which, if not lowpass-filtered by the resampler, may yield + // an aliasing-like quality if what is mirrored is a harmonic series. A + // simple thing to do against that is to scramble the phase values there. + // This way we preserve natural energy fluctuations, but the artifact + // isn't noticeable as much anymore, because much more noise-like, which + // is what one typically gets at those frequencies. 
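// To put numbers on this (example values only, not taken from the original comment):
// with fftSize = 4096 and _pitchFactor = 0.5, everything above bin 4096 / 2 * 0.5 = 1024
// is mirrored content; rounded up to the 16-float alignment computed below,
// imagingBeginBin becomes ceil(1025 / 16) * 16 = 1040, so with _numBins = 2049 only the
// topmost ~1000 bins get their phases scrambled.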
+ constexpr auto alignment = 64 / sizeof(float); + const int imagingBeginBin + =std::ceil((fftSize / 2 * _pitchFactor + 1) / alignment) * alignment; + + if (imagingBeginBin >= _numBins) { + return; + } + + const auto n = _numBins - imagingBeginBin; + auto pSpec = d->spectrum.getPtr(0); + auto pRand = d->random_phases.getPtr(0); + vo::rotate(nullptr, pRand, pSpec + imagingBeginBin, n); + // Just rotating the random phase vector to produce pseudo-random + // sequences of phases. + std::uniform_int_distribution dis(0, n - 1); + const auto middle = dis(d->randomGenerator); + std::rotate(pRand, pRand + middle, pRand + n); +} + +/// process one hop/chunk in _fft_timeSeries and add the result to output circular buffer +void TimeAndPitch::_process_hop(int hop_a, int hop_s) +{ + if (d->exact_hop_a != d->exact_hop_s) { + if (_numChannels == 2) { + _lr_to_ms(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize); + } + + for (int ch = 0; ch < _numChannels; ++ch) { + vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize); + _fft_shift(d->fft_timeseries.getPtr(ch), fftSize); + } + + // determine norm/phase + d->fft.forwardReal(d->fft_timeseries, d->spectrum); + // norms of the mid channel only (or sole channel) are needed in + // _time_stretch + vo::calcNorms(d->spectrum.getPtr(0), d->norm.getPtr(0), d->spectrum.getNumSamples()); + for (int ch = 0; ch < _numChannels; ++ch) { + vo::calcPhases(d->spectrum.getPtr(ch), d->phase.getPtr(ch), d->spectrum.getNumSamples()); + } + + if (_shiftTimbreCb) { + _shiftTimbreCb( + 1 / _pitchFactor, d->spectrum.getPtr(0), d->norm.getPtr(0)); + } + + if (_reduceImaging && _pitchFactor < 1.) { + _applyImagingReduction(); + } + + if (_numChannels == 1) { + _time_stretch<1>((float)hop_a, (float)hop_s); + } else if (_numChannels == 2) { + _time_stretch<2>((float)hop_a, (float)hop_s); + } + + for (int ch = 0; ch < _numChannels; ++ch) { + _unwrapPhaseVec(d->phase_accum.getPtr(ch), _numBins); + } + + for (int ch = 0; ch < _numChannels; ++ch) { + vo::rotate(d->phase.getPtr(ch), d->phase_accum.getPtr(ch), d->spectrum.getPtr(ch), + d->spectrum.getNumSamples()); + } + d->fft.inverseReal(d->spectrum, d->fft_timeseries); + + for (int ch = 0; ch < _numChannels; ++ch) { + vo::constantMultiply(d->fft_timeseries.getPtr(ch), 1.f / fftSize, d->fft_timeseries.getPtr(ch), + d->fft_timeseries.getNumSamples()); + } + + if (_numChannels == 2) { + _ms_to_lr(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize); + } + + for (int ch = 0; ch < _numChannels; ++ch) { + _fft_shift(d->fft_timeseries.getPtr(ch), fftSize); + vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize); + } + } else { // stretch factor == 1.0 => just apply window + for (int ch = 0; ch < _numChannels; ++ch) { + vo::multiply(d->fft_timeseries.getPtr(ch), d->sqWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize); + } + } + + { + float gainFact = float(_timeStretch * ((8.f / 3.f) / _overlap_a)); // overlap add normalization factor + for (int ch = 0; ch < _numChannels; ++ch) { + d->outCircularBuffer[ch].writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->fft_timeseries.getPtr(ch), + gainFact); + } + + if (normalize_window) { + d->normalizationBuffer.writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->sqWindow.getPtr(0), gainFact); + } + } + _outBufferWriteOffset += hop_s; + _availableOutputSamples += hop_s; +} + +void TimeAndPitch::setTimeStretchAndPitchFactor(double timeScale, double 
pitchFactor) +{ + assert(timeScale > 0.0); + assert(pitchFactor > 0.0); + _pitchFactor = pitchFactor; + _timeStretch = timeScale * pitchFactor; + + _overlap_a = overlap; + double overlap_s = overlap; + if (_timeStretch > 1.0 || !modulate_synthesis_hop) { + _overlap_a *= _timeStretch; + } else { + overlap_s /= _timeStretch; + } + + d->exact_hop_a = double(fftSize) / _overlap_a; + if (!modulate_synthesis_hop) { + d->exact_hop_s = double(fftSize) / overlap_s; + } else { + // switch after processing next block, unless it's the first one + d->next_exact_hop_s = double(fftSize) / overlap_s; + if (d->exact_hop_s == 0.0) { // on first chunk set it immediately + d->exact_hop_s = d->next_exact_hop_s; + } + } +} + +void TimeAndPitch::feedAudio(const float* const* input_smp, int numSamples) +{ + assert(numSamples <= _maxBlockSize); + for (int ch = 0; ch < _numChannels; ++ch) { + d->inResampleInputBuffer[ch].writeBlock(0, numSamples, input_smp[ch]); + d->inResampleInputBuffer[ch].advance(numSamples); + } + _resampleReadPos -= numSamples; + + // determine integer hop size for the next hop. + // The error accumulators will make the value fluctuate by one to reach arbitrary scale + // ratios + if (d->exact_hop_s == 0.0) { // this happens if feedAudio is called without setting up stretch factors + d->exact_hop_s = d->next_exact_hop_s; + } + + const int hop_s = int(d->exact_hop_s + d->hop_s_err); + const int hop_a = int(d->exact_hop_a + d->hop_a_err); + + double step = 0.0; + double read_pos = _resampleReadPos; + while (read_pos < 0.0) + { + auto int_pos = int(std::floor(read_pos)); + float frac_pos = float(read_pos - int_pos); + for (int ch = 0; ch < _numChannels; ++ch) { + float smp[6]; + d->inResampleInputBuffer[ch].readBlock(int_pos - 6, 6, smp); + float s = (frac_pos == 0) ? 
smp[2] : lagrange6(smp, frac_pos); + d->inCircularBuffer[ch].writeOffset0(s); + d->inCircularBuffer[ch].advance(1); + } + _analysis_hop_counter++; + step++; + + if (_analysis_hop_counter >= hop_a) { + _analysis_hop_counter -= hop_a; + d->hop_s_err += d->exact_hop_s - hop_s; + d->hop_a_err += d->exact_hop_a - hop_a; + for (int ch = 0; ch < _numChannels; ++ch) { + d->inCircularBuffer[ch].readBlock(-fftSize, fftSize, d->fft_timeseries.getPtr(ch)); + } + _process_hop(hop_a, hop_s); + } + read_pos = _resampleReadPos + step * _pitchFactor; + } + _resampleReadPos = read_pos; +} + +void TimeAndPitch::retrieveAudio(float* const* out_smp, int numSamples) +{ + assert(numSamples <= _maxBlockSize); + for (int ch = 0; ch < _numChannels; ++ch) { + d->outCircularBuffer[ch].readAndClearBlock(0, numSamples, out_smp[ch]); + if (normalize_window) { + constexpr float curve + =4.f * 4.f; // the curve approximates 1/x over 1 but fades to 0 near 0 to avoid fade-in clicks + for (int i = 0; i < numSamples; ++i) { + float x = d->normalizationBuffer.read(i); + out_smp[ch][i] *= x / (x * x + 1 / curve); + } + } + d->outCircularBuffer[ch].advance(numSamples); + } + + d->normalizationBuffer.clearBlock(0, numSamples); + d->normalizationBuffer.advance(numSamples); + + _outBufferWriteOffset -= numSamples; + _availableOutputSamples -= numSamples; + + if (modulate_synthesis_hop) { + d->exact_hop_s = d->next_exact_hop_s; + } +} + +void TimeAndPitch::processPitchShift(float* const* smp, int numSamples, double pitchFactor) +{ + setTimeStretchAndPitchFactor(1.0, pitchFactor); + feedAudio(smp, numSamples); + retrieveAudio(smp, numSamples); +} + +int TimeAndPitch::getLatencySamples() const +{ + return fftSize - fftSize / overlap + 3; // 3 for resampling +} + +int TimeAndPitch::getLatencySamplesForStretchRatio(float timeStretch) const +{ + const float coeff = (timeStretch < 1.f) ? (1.f / 3.f) : (2.f / 3.f); + return int(getLatencySamples() * (timeStretch * coeff + (1 - coeff))); +} +} // namespace staffpad diff --git a/libs/staffpad/VectorOps.h b/libs/staffpad/VectorOps.h new file mode 100644 index 0000000000..7179dbfbb8 --- /dev/null +++ b/libs/staffpad/VectorOps.h @@ -0,0 +1,155 @@ +// +// This header is provided for convenience, to easily wrap vector operations around +// their platform-specific optimised libraries (e.g. IPP, vDSP), if desired. +// This can be done by adding #if defined(...) compile-time branches to each function +// and calling the corresponding library function, e.g. ippsAdd_32f or vDSP_vadd +// inside add(). 
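// As a sketch of such a branch for the float case (the Accelerate call is an
// assumption about the target platform, not something this library provides):
//
//   #if defined(__APPLE__)
//     // inside add(): strided vector add from <Accelerate/Accelerate.h>, stride 1
//     vDSP_vadd(src1, 1, src2, 1, dst, 1, (vDSP_Length)n);
//   #else
//     for (int32_t i = 0; i < n; i++) {
//       dst[i] = src1[i] + src2[i];
//     }
//   #endif
//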
+// + +#pragma once + +#include + +#include +#include +#include + +#if defined(_MSC_VER) && (defined(_M_AMD64) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 2)) +#define USE_SSE2_COMPLEX 1 +#endif + +#if USE_SSE2_COMPLEX +# include "SimdComplexConversions_sse2.h" +#endif + +namespace staffpad { +namespace vo { +inline void* allocate(int32_t bytes) +{ + return ::malloc(bytes); +} + +inline void free(void* ptr) +{ + return ::free(ptr); +} + +template +inline void copy(const T* src, T* dst, int32_t n) +{ + memcpy(dst, src, n * sizeof(T)); +} + +template +inline void add(const T* src1, const T* src2, T* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] = src1[i] + src2[i]; + } +} + +template +inline void subtract(const T* src1, const T* src2, T* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] = src2[i] - src1[i]; + } +} + +template +inline void constantMultiply(const T* src, T constant, T* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] = src[i] * constant; + } +} + +template +inline void constantMultiplyAndAdd(const T* src, T constant, T* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] += src[i] * constant; + } +} + +template +inline void multiply(const T* src1, const T* src2, T* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] = src1[i] * src2[i]; + } +} + +template +inline void setToZero(T* dst, int32_t n) +{ + std::fill(dst, dst + n, 0.f); +} + +template +inline void findMaxElement(const T* src, int32_t n, int32_t& maxIndex, T& maxValue) +{ + maxIndex = 0; + maxValue = n > 0 ? src[0] : std::numeric_limits::min(); + + for (int32_t i = 1; i < n; i++) { + if (src[i] > maxValue) { + maxValue = src[i]; + maxIndex = i; + } + } +} + +#if USE_SSE2_COMPLEX + +inline void calcPhases(const std::complex* src, float* dst, int32_t n) +{ + simd_complex_conversions::perform_parallel_simd_aligned( + src, dst, n, + [](const __m128 rp, const __m128 ip, __m128& out) + { out = simd_complex_conversions::atan2_ps(ip, rp); }); +} + +inline void calcNorms(const std::complex* src, float* dst, int32_t n) +{ + simd_complex_conversions::perform_parallel_simd_aligned( + src, dst, n, + [](const __m128 rp, const __m128 ip, __m128& out) + { out = simd_complex_conversions::norm(rp, ip); }); +} + +inline void rotate( + const float* oldPhase, const float* newPhase, std::complex* dst, + int32_t n) +{ + simd_complex_conversions::rotate_parallel_simd_aligned( + oldPhase, newPhase, dst, n); +} + +#else +inline void calcPhases(const std::complex* src, float* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] = std::arg(src[i]); + } +} + +inline void calcNorms(const std::complex* src, float* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + dst[i] = std::norm(src[i]); + } +} + +inline void rotate(const float* oldPhase, const float* newPhase, std::complex* dst, int32_t n) +{ + for (int32_t i = 0; i < n; i++) { + const auto theta = oldPhase ? newPhase[i] - oldPhase[i] : newPhase[i]; + dst[i] *= std::complex(cosf(theta), sinf(theta)); + } +} + +#endif +} // namespace vo +} // namespace staffpad diff --git a/libs/staffpad/pffft/README.txt b/libs/staffpad/pffft/README.txt new file mode 100644 index 0000000000..878f4a67ee --- /dev/null +++ b/libs/staffpad/pffft/README.txt @@ -0,0 +1,379 @@ +PFFFT: a pretty fast FFT. + +TL;DR +-- + +PFFFT does 1D Fast Fourier Transforms, of single precision real and +complex vectors. It tries do it fast, it tries to be correct, and it +tries to be small. 
Computations do take advantage of SSE1 instructions +on x86 cpus, Altivec on powerpc cpus, and NEON on ARM cpus. The +license is BSD-like. + + +Why does it exist: +-- + +I was in search of a good performing FFT library , preferably very +small and with a very liberal license. + +When one says "fft library", FFTW ("Fastest Fourier Transform in the +West") is probably the first name that comes to mind -- I guess that +99% of open-source projects that need a FFT do use FFTW, and are happy +with it. However, it is quite a large library , which does everything +fft related (2d transforms, 3d transforms, other transformations such +as discrete cosine , or fast hartley). And it is licensed under the +GNU GPL , which means that it cannot be used in non open-source +products. + +An alternative to FFTW that is really small, is the venerable FFTPACK +v4, which is available on NETLIB. A more recent version (v5) exists, +but it is larger as it deals with multi-dimensional transforms. This +is a library that is written in FORTRAN 77, a language that is now +considered as a bit antiquated by many. FFTPACKv4 was written in 1985, +by Dr Paul Swarztrauber of NCAR, more than 25 years ago ! And despite +its age, benchmarks show it that it still a very good performing FFT +library, see for example the 1d single precision benchmarks here: +http://www.fftw.org/speed/opteron-2.2GHz-32bit/ . It is however not +competitive with the fastest ones, such as FFTW, Intel MKL, AMD ACML, +Apple vDSP. The reason for that is that those libraries do take +advantage of the SSE SIMD instructions available on Intel CPUs, +available since the days of the Pentium III. These instructions deal +with small vectors of 4 floats at a time, instead of a single float +for a traditionnal FPU, so when using these instructions one may expect +a 4-fold performance improvement. + +The idea was to take this fortran fftpack v4 code, translate to C, +modify it to deal with those SSE instructions, and check that the +final performance is not completely ridiculous when compared to other +SIMD FFT libraries. Translation to C was performed with f2c ( +http://www.netlib.org/f2c/ ). The resulting file was a bit edited in +order to remove the thousands of gotos that were introduced by +f2c. You will find the fftpack.h and fftpack.c sources in the +repository, this a complete translation of +http://www.netlib.org/fftpack/ , with the discrete cosine transform +and the test program. There is no license information in the netlib +repository, but it was confirmed to me by the fftpack v5 curators that +the same terms do apply to fftpack v4: +http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html . This is a +"BSD-like" license, it is compatible with proprietary projects. + +Adapting fftpack to deal with the SIMD 4-element vectors instead of +scalar single precision numbers was more complex than I originally +thought, especially with the real transforms, and I ended up writing +more code than I planned.. + + +The code: +-- + +Only two files, in good old C, pffft.c and pffft.h . The API is very +very simple, just make sure that you read the comments in pffft.h. + + +Comparison with other FFTs: +-- + +The idea was not to break speed records, but to get a decently fast +fft that is at least 50% as fast as the fastest FFT -- especially on +slowest computers . I'm more focused on getting the best performance +on slow cpus (Atom, Intel Core 1, old Athlons, ARM Cortex-A9...), than +on getting top performance on today fastest cpus. 
+ +It can be used in a real-time context as the fft functions do not +perform any memory allocation -- that is why they accept a 'work' +array in their arguments. + +It is also a bit focused on performing 1D convolutions, that is why it +provides "unordered" FFTs , and a fourier domain convolution +operation. + + +Benchmark results (cpu tested: core i7 2600, core 2 quad, core 1 duo, atom N270, cortex-A9) +-- + +The benchmark shows the performance of various fft implementations measured in +MFlops, with the number of floating point operations being defined as 5Nlog2(N) +for a length N complex fft, and 2.5*Nlog2(N) for a real fft. +See http://www.fftw.org/speed/method.html for an explanation of these formulas. + +MacOS Lion, gcc 4.2, 64-bit, fftw 3.3 on a 3.4 GHz core i7 2600 + +Built with: + + gcc-4.2 -o test_pffft -arch x86_64 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -DHAVE_VECLIB -framework veclib -DHAVE_FFTW -lfftw3f + +| input len |real FFTPack| real vDSP | real FFTW | real PFFFT | |cplx FFTPack| cplx vDSP | cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------+------------| |------------+------------+------------+------------| +| 64 | 2816 | 8596 | 7329 | 8187 | | 2887 | 14898 | 14668 | 11108 | +| 96 | 3298 | n/a | 8378 | 7727 | | 3953 | n/a | 15680 | 10878 | +| 128 | 3507 | 11575 | 9266 | 10108 | | 4233 | 17598 | 16427 | 12000 | +| 160 | 3391 | n/a | 9838 | 10711 | | 4220 | n/a | 16653 | 11187 | +| 192 | 3919 | n/a | 9868 | 10956 | | 4297 | n/a | 15770 | 12540 | +| 256 | 4283 | 13179 | 10694 | 13128 | | 4545 | 19550 | 16350 | 13822 | +| 384 | 3136 | n/a | 10810 | 12061 | | 3600 | n/a | 16103 | 13240 | +| 480 | 3477 | n/a | 10632 | 12074 | | 3536 | n/a | 11630 | 12522 | +| 512 | 3783 | 15141 | 11267 | 13838 | | 3649 | 20002 | 16560 | 13580 | +| 640 | 3639 | n/a | 11164 | 13946 | | 3695 | n/a | 15416 | 13890 | +| 768 | 3800 | n/a | 11245 | 13495 | | 3590 | n/a | 15802 | 14552 | +| 800 | 3440 | n/a | 10499 | 13301 | | 3659 | n/a | 12056 | 13268 | +| 1024 | 3924 | 15605 | 11450 | 15339 | | 3769 | 20963 | 13941 | 15467 | +| 2048 | 4518 | 16195 | 11551 | 15532 | | 4258 | 20413 | 13723 | 15042 | +| 2400 | 4294 | n/a | 10685 | 13078 | | 4093 | n/a | 12777 | 13119 | +| 4096 | 4750 | 16596 | 11672 | 15817 | | 4157 | 19662 | 14316 | 14336 | +| 8192 | 3820 | 16227 | 11084 | 12555 | | 3691 | 18132 | 12102 | 13813 | +| 9216 | 3864 | n/a | 10254 | 12870 | | 3586 | n/a | 12119 | 13994 | +| 16384 | 3822 | 15123 | 10454 | 12822 | | 3613 | 16874 | 12370 | 13881 | +| 32768 | 4175 | 14512 | 10662 | 11095 | | 3881 | 14702 | 11619 | 11524 | +| 262144 | 3317 | 11429 | 6269 | 9517 | | 2810 | 11729 | 7757 | 10179 | +| 1048576 | 2913 | 10551 | 4730 | 5867 | | 2661 | 7881 | 3520 | 5350 | +|-----------+------------+------------+------------+------------| |------------+------------+------------+------------| + + +Debian 6, gcc 4.4.5, 64-bit, fftw 3.3.1 on a 3.4 GHz core i7 2600 + +Built with: +gcc -o test_pffft -DHAVE_FFTW -msse2 -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L$HOME/local/lib -I$HOME/local/include/ -lfftw3f -lm + +| N (input length) | real FFTPack | real FFTW | real PFFFT | | cplx FFTPack | cplx FFTW | cplx PFFFT | +|------------------+--------------+--------------+--------------| |--------------+--------------+--------------| +| 64 | 3840 | 7680 | 8777 | | 4389 | 20480 | 11171 | +| 96 | 4214 | 9633 | 8429 | | 4816 | 22477 | 11238 | +| 128 | 3584 | 10240 | 10240 | | 5120 | 23893 | 11947 | +| 192 | 4854 | 11095 | 12945 | | 4854 | 22191 
| 14121 | +| 256 | 4096 | 11703 | 16384 | | 5120 | 23406 | 13653 | +| 384 | 4395 | 14651 | 12558 | | 4884 | 19535 | 14651 | +| 512 | 5760 | 13166 | 15360 | | 4608 | 23040 | 15360 | +| 768 | 4907 | 14020 | 16357 | | 4461 | 19628 | 14020 | +| 1024 | 5120 | 14629 | 14629 | | 5120 | 20480 | 15754 | +| 2048 | 5632 | 14080 | 18773 | | 4693 | 12516 | 16091 | +| 4096 | 5120 | 13653 | 17554 | | 4726 | 7680 | 14456 | +| 8192 | 4160 | 7396 | 13312 | | 4437 | 14791 | 13312 | +| 9216 | 4210 | 6124 | 13473 | | 4491 | 7282 | 14970 | +| 16384 | 3976 | 11010 | 14313 | | 4210 | 11450 | 13631 | +| 32768 | 4260 | 10224 | 10954 | | 4260 | 6816 | 11797 | +| 262144 | 3736 | 6896 | 9961 | | 2359 | 8965 | 9437 | +| 1048576 | 2796 | 4534 | 6453 | | 1864 | 3078 | 5592 | +|------------------+--------------+--------------+--------------| |--------------+--------------+--------------| + + + +MacOS Snow Leopard, gcc 4.0, 32-bit, fftw 3.3 on a 1.83 GHz core 1 duo + +Built with: + + gcc -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework veclib + +| input len |real FFTPack| real vDSP | real FFTW | real PFFFT | |cplx FFTPack| cplx vDSP | cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------+------------| |------------+------------+------------+------------| +| 64 | 745 | 2145 | 1706 | 2028 | | 961 | 3356 | 3313 | 2300 | +| 96 | 877 | n/a | 1976 | 1978 | | 1059 | n/a | 3333 | 2233 | +| 128 | 951 | 2808 | 2213 | 2279 | | 1202 | 3803 | 3739 | 2494 | +| 192 | 1002 | n/a | 2456 | 2429 | | 1186 | n/a | 3701 | 2508 | +| 256 | 1065 | 3205 | 2641 | 2793 | | 1302 | 4013 | 3912 | 2663 | +| 384 | 845 | n/a | 2759 | 2499 | | 948 | n/a | 3729 | 2504 | +| 512 | 900 | 3476 | 2956 | 2759 | | 974 | 4057 | 3954 | 2645 | +| 768 | 910 | n/a | 2912 | 2737 | | 975 | n/a | 3837 | 2614 | +| 1024 | 936 | 3583 | 3107 | 3009 | | 1006 | 4124 | 3821 | 2697 | +| 2048 | 1057 | 3585 | 3091 | 2837 | | 1089 | 3889 | 3701 | 2513 | +| 4096 | 1083 | 3524 | 3092 | 2733 | | 1039 | 3617 | 3462 | 2364 | +| 8192 | 874 | 3252 | 2967 | 2363 | | 911 | 3106 | 2789 | 2302 | +| 9216 | 898 | n/a | 2420 | 2290 | | 865 | n/a | 2676 | 2204 | +| 16384 | 903 | 2892 | 2506 | 2421 | | 899 | 3026 | 2797 | 2289 | +| 32768 | 965 | 2837 | 2550 | 2358 | | 920 | 2922 | 2763 | 2240 | +| 262144 | 738 | 2422 | 1589 | 1708 | | 610 | 2038 | 1436 | 1091 | +| 1048576 | 528 | 1207 | 845 | 880 | | 606 | 1020 | 669 | 1036 | +|-----------+------------+------------+------------+------------| |------------+------------+------------+------------| + + + +Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.2 on a 2.66 core 2 quad + +Built with: +gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm + +| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------| |------------+------------+------------| +| 64 | 1920 | 3614 | 5120 | | 2194 | 7680 | 6467 | +| 96 | 1873 | 3549 | 5187 | | 2107 | 8429 | 5863 | +| 128 | 2240 | 3773 | 5514 | | 2560 | 7964 | 6827 | +| 192 | 1765 | 4569 | 7767 | | 2284 | 9137 | 7061 | +| 256 | 2048 | 5461 | 7447 | | 2731 | 9638 | 7802 | +| 384 | 1998 | 5861 | 6762 | | 2313 | 9253 | 7644 | +| 512 | 2095 | 6144 | 7680 | | 2194 | 10240 | 7089 | +| 768 | 2230 | 5773 | 7549 | | 2045 | 10331 | 7010 | +| 1024 | 2133 | 6400 | 8533 | | 2133 | 10779 | 7877 | +| 2048 | 2011 | 7040 | 8665 | | 1942 | 10240 | 7768 | +| 
4096 | 2194 | 6827 | 8777 | | 1755 | 9452 | 6827 | +| 8192 | 1849 | 6656 | 6656 | | 1752 | 7831 | 6827 | +| 9216 | 1871 | 5858 | 6416 | | 1643 | 6909 | 6266 | +| 16384 | 1883 | 6223 | 6506 | | 1664 | 7340 | 6982 | +| 32768 | 1826 | 6390 | 6667 | | 1631 | 7481 | 6971 | +| 262144 | 1546 | 4075 | 5977 | | 1299 | 3415 | 3551 | +| 1048576 | 1104 | 2071 | 1730 | | 1104 | 1149 | 1834 | +|-----------+------------+------------+------------| |------------+------------+------------| + + + +Ubuntu 11.04, gcc 4.5, 32-bit, fftw 3.3 on a 1.6 GHz Atom N270 + +Built with: +gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm + +| N (input length) | real FFTPack | real FFTW | real PFFFT | | cplx FFTPack | cplx FFTW | cplx PFFFT | +|------------------+--------------+--------------+--------------| |--------------+--------------+--------------| +| 64 | 452 | 1041 | 1336 | | 549 | 2318 | 1781 | +| 96 | 444 | 1297 | 1297 | | 503 | 2408 | 1686 | +| 128 | 527 | 1525 | 1707 | | 543 | 2655 | 1886 | +| 192 | 498 | 1653 | 1849 | | 539 | 2678 | 1942 | +| 256 | 585 | 1862 | 2156 | | 594 | 2777 | 2244 | +| 384 | 499 | 1870 | 1998 | | 511 | 2586 | 1890 | +| 512 | 562 | 2095 | 2194 | | 542 | 2973 | 2194 | +| 768 | 545 | 2045 | 2133 | | 545 | 2365 | 2133 | +| 1024 | 595 | 2133 | 2438 | | 569 | 2695 | 2179 | +| 2048 | 587 | 2125 | 2347 | | 521 | 2230 | 1707 | +| 4096 | 495 | 1890 | 1834 | | 492 | 1876 | 1672 | +| 8192 | 469 | 1548 | 1729 | | 438 | 1740 | 1664 | +| 9216 | 468 | 1663 | 1663 | | 446 | 1585 | 1531 | +| 16384 | 453 | 1608 | 1767 | | 398 | 1476 | 1664 | +| 32768 | 456 | 1420 | 1503 | | 387 | 1388 | 1345 | +| 262144 | 309 | 385 | 726 | | 262 | 415 | 840 | +| 1048576 | 280 | 351 | 739 | | 261 | 313 | 797 | +|------------------+--------------+--------------+--------------| |--------------+--------------+--------------| + + + +Windows 7, visual c++ 2010 on a 1.6 GHz Atom N270 + +Built with: +cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c + +(visual c++ is definitively not very good with SSE intrinsics...) 
+ +| N (input length) | real FFTPack | real PFFFT | | cplx FFTPack | cplx PFFFT | +|------------------+--------------+--------------| |--------------+--------------| +| 64 | 173 | 1009 | | 174 | 1159 | +| 96 | 169 | 1029 | | 188 | 1201 | +| 128 | 195 | 1242 | | 191 | 1275 | +| 192 | 178 | 1312 | | 184 | 1276 | +| 256 | 196 | 1591 | | 186 | 1281 | +| 384 | 172 | 1409 | | 181 | 1281 | +| 512 | 187 | 1640 | | 181 | 1313 | +| 768 | 171 | 1614 | | 176 | 1258 | +| 1024 | 186 | 1812 | | 178 | 1223 | +| 2048 | 190 | 1707 | | 186 | 1099 | +| 4096 | 182 | 1446 | | 177 | 975 | +| 8192 | 175 | 1345 | | 169 | 1034 | +| 9216 | 165 | 1271 | | 168 | 1023 | +| 16384 | 166 | 1396 | | 165 | 949 | +| 32768 | 172 | 1311 | | 161 | 881 | +| 262144 | 136 | 632 | | 134 | 629 | +| 1048576 | 134 | 698 | | 127 | 623 | +|------------------+--------------+--------------| |--------------+--------------| + + + +Ubuntu 12.04, gcc-4.7.3, 32-bit, with fftw 3.3.3 (built with --enable-neon), on a 1.2GHz ARM Cortex A9 (Tegra 3) + +Built with: +gcc-4.7 -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f + +| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------| |------------+------------+------------| +| 64 | 549 | 452 | 731 | | 512 | 602 | 640 | +| 96 | 421 | 272 | 702 | | 496 | 571 | 602 | +| 128 | 498 | 512 | 815 | | 597 | 618 | 652 | +| 160 | 521 | 536 | 815 | | 586 | 669 | 625 | +| 192 | 539 | 571 | 883 | | 485 | 597 | 626 | +| 256 | 640 | 539 | 975 | | 569 | 611 | 671 | +| 384 | 499 | 610 | 879 | | 499 | 602 | 637 | +| 480 | 518 | 507 | 877 | | 496 | 661 | 616 | +| 512 | 524 | 591 | 1002 | | 549 | 678 | 668 | +| 640 | 542 | 612 | 955 | | 568 | 663 | 645 | +| 768 | 557 | 613 | 981 | | 491 | 663 | 598 | +| 800 | 514 | 353 | 882 | | 514 | 360 | 574 | +| 1024 | 640 | 640 | 1067 | | 492 | 683 | 602 | +| 2048 | 587 | 640 | 908 | | 486 | 640 | 552 | +| 2400 | 479 | 368 | 777 | | 422 | 376 | 518 | +| 4096 | 511 | 614 | 853 | | 426 | 640 | 534 | +| 8192 | 415 | 584 | 708 | | 386 | 622 | 516 | +| 9216 | 419 | 571 | 687 | | 364 | 586 | 506 | +| 16384 | 426 | 577 | 716 | | 398 | 606 | 530 | +| 32768 | 417 | 572 | 673 | | 399 | 572 | 468 | +| 262144 | 219 | 380 | 293 | | 255 | 431 | 343 | +| 1048576 | 202 | 274 | 237 | | 265 | 282 | 355 | +|-----------+------------+------------+------------| |------------+------------+------------| + +Same platform as above, but this time pffft and fftpack are built with clang 3.2: + +clang -O3 -DHAVE_FFTW -march=armv7-a -mtune=cortex-a9 -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm -I/usr/local/include/ -L/usr/local/lib/ -lfftw3f + +| input len |real FFTPack| real FFTW | real PFFFT | |cplx FFTPack| cplx FFTW | cplx PFFFT | +|-----------+------------+------------+------------| |------------+------------+------------| +| 64 | 427 | 452 | 853 | | 427 | 602 | 1024 | +| 96 | 351 | 276 | 843 | | 337 | 571 | 963 | +| 128 | 373 | 512 | 996 | | 390 | 618 | 1054 | +| 160 | 426 | 536 | 987 | | 375 | 669 | 914 | +| 192 | 404 | 571 | 1079 | | 388 | 588 | 1079 | +| 256 | 465 | 539 | 1205 | | 445 | 602 | 1170 | +| 384 | 366 | 610 | 1099 | | 343 | 594 | 1099 | +| 480 | 356 | 507 | 1140 | | 335 | 651 | 931 | +| 512 | 411 | 591 | 1213 | | 384 | 649 | 1124 | +| 640 | 398 | 612 | 1193 | | 373 | 654 | 901 | +| 768 | 409 | 613 | 1227 | | 383 | 663 | 
1044 | +| 800 | 411 | 348 | 1073 | | 353 | 358 | 809 | +| 1024 | 427 | 640 | 1280 | | 413 | 692 | 1004 | +| 2048 | 414 | 626 | 1126 | | 371 | 640 | 853 | +| 2400 | 399 | 373 | 898 | | 319 | 368 | 653 | +| 4096 | 404 | 602 | 1059 | | 357 | 633 | 778 | +| 8192 | 332 | 584 | 792 | | 308 | 616 | 716 | +| 9216 | 322 | 561 | 783 | | 299 | 586 | 687 | +| 16384 | 344 | 568 | 778 | | 314 | 617 | 745 | +| 32768 | 342 | 564 | 737 | | 314 | 552 | 629 | +| 262144 | 201 | 383 | 313 | | 227 | 435 | 413 | +| 1048576 | 187 | 262 | 251 | | 228 | 281 | 409 | +|-----------+------------+------------+------------| |------------+------------+------------| + +So it looks like, on ARM, gcc 4.7 is the best at scalar floating point +(the fftpack performance numbers are better with gcc), while clang is +the best with neon intrinsics (see how pffft perf has improved with +clang 3.2). + + +NVIDIA Jetson TK1 board, gcc-4.8.2. The cpu is a 2.3GHz cortex A15 (Tegra K1). + +Built with: +gcc -O3 -march=armv7-a -mtune=native -mfloat-abi=hard -mfpu=neon -ffast-math test_pffft.c pffft.c -o test_pffft_arm fftpack.c -lm + +| input len |real FFTPack| real PFFFT | |cplx FFTPack| cplx PFFFT | +|-----------+------------+------------| |------------+------------| +| 64 | 1735 | 3308 | | 1994 | 3744 | +| 96 | 1596 | 3448 | | 1987 | 3572 | +| 128 | 1807 | 4076 | | 2255 | 3960 | +| 160 | 1769 | 4083 | | 2071 | 3845 | +| 192 | 1990 | 4233 | | 2017 | 3939 | +| 256 | 2191 | 4882 | | 2254 | 4346 | +| 384 | 1878 | 4492 | | 2073 | 4012 | +| 480 | 1748 | 4398 | | 1923 | 3951 | +| 512 | 2030 | 5064 | | 2267 | 4195 | +| 640 | 1918 | 4756 | | 2094 | 4184 | +| 768 | 2099 | 4907 | | 2048 | 4297 | +| 800 | 1822 | 4555 | | 1880 | 4063 | +| 1024 | 2232 | 5355 | | 2187 | 4420 | +| 2048 | 2176 | 4983 | | 2027 | 3602 | +| 2400 | 1741 | 4256 | | 1710 | 3344 | +| 4096 | 1816 | 3914 | | 1851 | 3349 | +| 8192 | 1716 | 3481 | | 1700 | 3255 | +| 9216 | 1735 | 3589 | | 1653 | 3094 | +| 16384 | 1567 | 3483 | | 1637 | 3244 | +| 32768 | 1624 | 3240 | | 1655 | 3156 | +| 262144 | 1012 | 1898 | | 983 | 1503 | +| 1048576 | 876 | 1154 | | 868 | 1341 | +|-----------+------------+------------| |------------+------------| + +The performance on the tegra K1 is pretty impressive. I'm not +including the FFTW numbers as they as slightly below the scalar +fftpack numbers, so something must be wrong (however it seems to be +correctly configured and is using neon simd instructions). + +When using clang 3.4 the pffft version is even a bit faster, reaching +5.7 GFlops for real ffts of size 1024. diff --git a/libs/staffpad/pffft/pffft.c b/libs/staffpad/pffft/pffft.c new file mode 100644 index 0000000000..779f266831 --- /dev/null +++ b/libs/staffpad/pffft/pffft.c @@ -0,0 +1,1866 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + Copyright (c) 2020 Hayati Ayguen ( h_ayguen@web.de ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB + (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber + of NCAR, in 1985. + + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. 
+ + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. + + + PFFFT : a Pretty Fast FFT. + + This file is largerly based on the original FFTPACK implementation, modified in + order to take advantage of SIMD instructions of modern CPUs. +*/ + +/* + ChangeLog: + - 2011/10/02, version 1: This is the very first release of this file. +*/ + +#include "pffft.h" + +/* detect compiler flavour */ +#if defined(_MSC_VER) +# define COMPILER_MSVC +#elif defined(__GNUC__) +# define COMPILER_GCC +#endif + +#include +#include +#include +#include +#include + +#ifndef M_PI +#define M_PI (3.14159265358979323846) +#endif + +#ifndef M_SQRT2 +#define M_SQRT2 (1.41421356237309504880) +#endif + +#if defined(COMPILER_GCC) +# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) +# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) +# define RESTRICT __restrict +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; +#elif defined(COMPILER_MSVC) +# define ALWAYS_INLINE(return_type) __forceinline return_type +# define NEVER_INLINE(return_type) __declspec(noinline) return_type +# define RESTRICT __restrict +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) +#endif + + +#ifdef COMPILER_MSVC +#pragma warning( disable : 4244 4305 4204 4456 ) +#endif + +/* + vector support macros: the rest of the code is independant of + SSE/Altivec/NEON -- adding support for other platforms with 4-element + vectors should be limited to these macros +*/ + + +// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code +//#define PFFFT_SIMD_DISABLE + +#ifdef PFFFT_FLOAT +#define float PFFFT_FLOAT +#endif + + +#include "pfsimd_macros.h" + +/* detect bugs with the vector support macros */ +void validate_pffft_simd(void); // silence warning +void validate_pffft_simd(void) { +#ifndef PFFFT_SIMD_DISABLE + Vvalidate_simd(); +#endif +} + +/* SSE and co like 16-bytes aligned pointers */ +#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines... 
+void *pffft_aligned_malloc(size_t nb_bytes) { + return Valigned_malloc(nb_bytes); +} + +void pffft_aligned_free(void *p) { + Valigned_free(p); +} + +int pffft_simd_size(void) { return SIMD_SZ; } + +/* + passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 +*/ +static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) { + int k, i; + int l1ido = l1*ido; + if (ido <= 2) { + for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) { + ch[0] = VADD(cc[0], cc[ido+0]); + ch[l1ido] = VSUB(cc[0], cc[ido+0]); + ch[1] = VADD(cc[1], cc[ido+1]); + ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]); + } + } else { + for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) { + for (i=0; i 2); + for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) { + for (i=0; i 2); + for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) { + for (i = 0; i < ido-1; i += 2) { + ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5)); + ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5)); + ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4)); + ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4)); + tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5)); + tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5)); + tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4)); + tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4)); + ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); + ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); + cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3))); + ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3))); + cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3))); + ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3))); + cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); + ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); + cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); + ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); + dr3 = VSUB(cr3, ci4); + dr4 = VADD(cr3, ci4); + di3 = VADD(ci3, cr4); + di4 = VSUB(ci3, cr4); + dr5 = VADD(cr2, ci5); + dr2 = VSUB(cr2, ci5); + di5 = VSUB(ci2, cr5); + di2 = VADD(ci2, cr5); + wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1]; + wr3=wa3[i]; wi3=fsign*wa3[i+1]; wr4=wa4[i]; wi4=fsign*wa4[i+1]; + VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + ch_ref(i - 1, 2) = dr2; + ch_ref(i, 2) = di2; + VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + ch_ref(i - 1, 3) = dr3; + ch_ref(i, 3) = di3; + VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); + ch_ref(i - 1, 4) = dr4; + ch_ref(i, 4) = di4; + VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); + ch_ref(i - 1, 5) = dr5; + ch_ref(i, 5) = di5; + } + } +#undef ch_ref +#undef cc_ref +} + +static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) { + static const float minus_one = -1.f; + int i, k, l1ido = l1*ido; + for (k=0; k < l1ido; k += ido) { + v4sf a = cc[k], b = cc[k + l1ido]; + ch[2*k] = VADD(a, b); + ch[2*(k+ido)-1] = VSUB(a, b); + } + if (ido < 2) return; + if (ido != 2) { + for (k=0; k < l1ido; k += ido) { + for (i=2; i 5) { + wa[i1-1] = wa[i-1]; + wa[i1] = wa[i]; + } + } + l1 = l2; + } +} /* cffti1 */ + + +v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) { + v4sf *in = (v4sf*)input_readonly; + v4sf *out = (in == work2 ? 
work1 : work2); + int nf = ifac[1], k1; + int l1 = 1; + int iw = 0; + assert(in != out && work1 != work2); + for (k1=2; k1<=nf+1; k1++) { + int ip = ifac[k1]; + int l2 = ip*l1; + int ido = n / l2; + int idot = ido + ido; + switch (ip) { + case 5: { + int ix2 = iw + idot; + int ix3 = ix2 + idot; + int ix4 = ix3 + idot; + passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign); + } break; + case 4: { + int ix2 = iw + idot; + int ix3 = ix2 + idot; + passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign); + } break; + case 2: { + passf2_ps(idot, l1, in, out, &wa[iw], isign); + } break; + case 3: { + int ix2 = iw + idot; + passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign); + } break; + default: + assert(0); + } + l1 = l2; + iw += (ip - 1)*idot; + if (out == work2) { + out = work1; in = work2; + } else { + out = work2; in = work1; + } + } + + return in; /* this is in fact the output .. */ +} + + +struct PFFFT_Setup { + int N; + int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) + int ifac[15]; + pffft_transform_t transform; + v4sf *data; // allocated room for twiddle coefs + float *e; // points into 'data' , N/4*3 elements + float *twiddle; // points into 'data', N/4 elements +}; + +PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) { + PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup)); + int k, m; + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + and 32 for real FFTs -- a lot of stuff would need to be rewritten to + handle other cases (or maybe just switch to a scalar fft, I don't know..) */ + if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); } + if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); } + //assert((N % 32) == 0); + s->N = N; + s->transform = transform; + /* nb of complex simd vectors */ + s->Ncvec = (transform == PFFFT_REAL ? 
N/2 : N)/SIMD_SZ; + s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf)); + s->e = (float*)s->data; + s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ); + + if (transform == PFFFT_REAL) { + for (k=0; k < s->Ncvec; ++k) { + int i = k/SIMD_SZ; + int j = k%SIMD_SZ; + for (m=0; m < SIMD_SZ-1; ++m) { + float A = -2*(float)M_PI*(m+1)*k / N; + s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cosf(A); + s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sinf(A); + } + } + rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + } else { + for (k=0; k < s->Ncvec; ++k) { + int i = k/SIMD_SZ; + int j = k%SIMD_SZ; + for (m=0; m < SIMD_SZ-1; ++m) { + float A = -2*(float)M_PI*(m+1)*k / N; + s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cosf(A); + s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sinf(A); + } + } + cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); + } + + /* check that N is decomposable with allowed prime factors */ + for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; } + if (m != N/SIMD_SZ) { + pffft_destroy_setup(s); s = 0; + } + + return s; +} + + +void pffft_destroy_setup(PFFFT_Setup *s) { + pffft_aligned_free(s->data); + free(s); +} + +#if !defined(PFFFT_SIMD_DISABLE) + +/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ +static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) { + v4sf g0, g1; + int k; + INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride; + + *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] + for (k=1; k < N; ++k) { + v4sf h0, h1; + INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride; + *--out = VSWAPHL(g1, h0); + *--out = VSWAPHL(h0, h1); + g1 = h1; + } + *--out = VSWAPHL(g1, g0); +} + +static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) { + v4sf g0, g1, h0, h1; + int k; + g0 = g1 = in[0]; ++in; + for (k=1; k < N; ++k) { + h0 = *in++; h1 = *in++; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride; + g1 = h1; + } + h0 = *in++; h1 = g0; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + UNINTERLEAVE2(h0, g1, out[0], out[1]); +} + +void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { + int k, N = setup->N, Ncvec = setup->Ncvec; + const v4sf *vin = (const v4sf*)in; + v4sf *vout = (v4sf*)out; + assert(in != out); + if (setup->transform == PFFFT_REAL) { + int k, dk = N/32; + if (direction == PFFFT_FORWARD) { + for (k=0; k < dk; ++k) { + INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); + INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); + } + reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2)); + reversed_copy(dk, vin+6, 8, (v4sf*)(out + N)); + } else { + for (k=0; k < dk; ++k) { + UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); + UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); + } + unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8); + unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8); + } + } else { + if (direction == PFFFT_FORWARD) { + for (k=0; k < Ncvec; ++k) { + int kk = (k/4) + (k%4)*(Ncvec/4); + INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); + } + } else { + for (k=0; k < Ncvec; ++k) { + int kk = (k/4) + (k%4)*(Ncvec/4); + UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); + } + } + } +} + +void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; // 
number of 4x4 matrix blocks + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + assert(in != out); + for (k=0; k < dk; ++k) { + r0 = in[8*k+0]; i0 = in[8*k+1]; + r1 = in[8*k+2]; i1 = in[8*k+3]; + r2 = in[8*k+4]; i2 = in[8*k+5]; + r3 = in[8*k+6]; i3 = in[8*k+7]; + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]); + VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]); + VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]); + + sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); + si0 = VADD(i0,i2); di0 = VSUB(i0, i2); + si1 = VADD(i1,i3); di1 = VSUB(i1, i3); + + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 -1 0 0 -1 0 1] [r1] + [1 -1 1 -1 0 0 0 0] [r2] + [1 0 -1 0 0 1 0 -1] [r3] + [0 0 0 0 1 1 1 1] * [i0] + [0 1 0 -1 1 0 -1 0] [i1] + [0 0 0 0 1 -1 1 -1] [i2] + [0 -1 0 1 1 0 -1 0] [i3] + */ + + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); + r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1); + r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); + r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1); + + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; + *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; + } +} + +void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + assert(in != out); + for (k=0; k < dk; ++k) { + r0 = in[8*k+0]; i0 = in[8*k+1]; + r1 = in[8*k+2]; i1 = in[8*k+3]; + r2 = in[8*k+4]; i2 = in[8*k+5]; + r3 = in[8*k+6]; i3 = in[8*k+7]; + + sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); + si0 = VADD(i0,i2); di0 = VSUB(i0, i2); + si1 = VADD(i1,i3); di1 = VSUB(i1, i3); + + r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); + r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1); + r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); + r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1); + + VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]); + VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]); + VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]); + + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; + *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; + } +} + + +static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, + const v4sf *e, v4sf *out) { + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + r0 = *in0; i0 = *in1; + r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++; + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 -1 0 0 -1 0 1] [r1] + [1 0 -1 0 0 1 0 -1] [r2] + [1 -1 1 -1 0 0 0 0] [r3] + [0 0 0 0 1 1 1 1] * [i0] + [0 -1 0 1 -1 0 1 0] [i1] + [0 -1 0 1 1 0 -1 0] [i2] + [0 0 0 0 -1 1 -1 1] [i3] + */ + + //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; + //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; + + VCPLXMUL(r1,i1,e[0],e[1]); + VCPLXMUL(r2,i2,e[2],e[3]); + VCPLXMUL(r3,i3,e[4],e[5]); + + //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; + //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; + + sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); + sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1); + si0 = 
VADD(i0,i2); di0 = VSUB(i0,i2); + si1 = VADD(i1,i3); di1 = VSUB(i3,i1); + + r0 = VADD(sr0, sr1); + r3 = VSUB(sr0, sr1); + i0 = VADD(si0, si1); + i3 = VSUB(si1, si0); + r1 = VADD(dr0, di1); + r2 = VSUB(dr0, di1); + i1 = VSUB(dr1, di0); + i2 = VADD(dr1, di0); + + *out++ = r0; + *out++ = i0; + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; + +} + +static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ + + v4sf_union cr, ci, *uout = (v4sf_union*)out; + v4sf save = in[7], zero=VZERO(); + float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3; + static const float s = (float)M_SQRT2/2; + + cr.v = in[0]; ci.v = in[Ncvec*2-1]; + assert(in != out); + pffft_real_finalize_4x4(&zero, &zero, in+1, e, out); + + /* + [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3] + + [Xr(1)] ] [1 1 1 1 0 0 0 0] + [Xr(N/4) ] [0 0 0 0 1 s 0 -s] + [Xr(N/2) ] [1 0 -1 0 0 0 0 0] + [Xr(3N/4)] [0 0 0 0 1 -s 0 s] + [Xi(1) ] [1 -1 1 -1 0 0 0 0] + [Xi(N/4) ] [0 0 0 0 0 -s -1 -s] + [Xi(N/2) ] [0 -1 0 1 0 0 0 0] + [Xi(3N/4)] [0 0 0 0 0 -s 1 -s] + */ + + xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0; + xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0; + xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2; + xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2; + xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1; + xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1; + xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3; + xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3; + + for (k=1; k < dk; ++k) { + v4sf save_next = in[8*k+7]; + pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1, + e + k*6, out + k*8); + save = save_next; + } + +} + +static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, + const v4sf *e, v4sf *out, int first) { + v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 0 -1 0 -1 -1 0] [r1] + [1 -1 -1 1 0 0 0 0] [r2] + [1 0 0 -1 0 1 1 0] [r3] + [0 0 0 0 1 -1 1 -1] * [i0] + [0 -1 1 0 1 0 0 1] [i1] + [0 0 0 0 1 1 -1 -1] [i2] + [0 1 -1 0 1 0 0 1] [i3] + */ + + v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); + v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2); + v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); + v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2); + + r0 = VADD(sr0, sr1); + r2 = VSUB(sr0, sr1); + r1 = VSUB(dr0, si1); + r3 = VADD(dr0, si1); + i0 = VSUB(di0, di1); + i2 = VADD(di0, di1); + i1 = VSUB(si0, dr1); + i3 = VADD(si0, dr1); + + VCPLXMULCONJ(r1,i1,e[0],e[1]); + VCPLXMULCONJ(r2,i2,e[2],e[3]); + VCPLXMULCONJ(r3,i3,e[4],e[5]); + + VTRANSPOSE4(r0,r1,r2,r3); + VTRANSPOSE4(i0,i1,i2,i3); + + if (!first) { + *out++ = r0; + *out++ = i0; + } + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; +} + +static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { + int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... 
f(n-1)r f(n-1)i f(n)r */ + + v4sf_union Xr, Xi, *uout = (v4sf_union*)out; + float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3; + static const float s = (float)M_SQRT2; + assert(in != out); + for (k=0; k < 4; ++k) { + Xr.f[k] = ((float*)in)[8*k]; + Xi.f[k] = ((float*)in)[8*k+4]; + } + + pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values + + /* + [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] + + [cr0] [1 0 2 0 1 0 0 0] + [cr1] [1 0 0 0 -1 0 -2 0] + [cr2] [1 0 -2 0 1 0 0 0] + [cr3] [1 0 0 0 -1 0 2 0] + [ci0] [0 2 0 2 0 0 0 0] + [ci1] [0 s 0 -s 0 -s 0 -s] + [ci2] [0 0 0 0 0 -2 0 2] + [ci3] [0 -s 0 s 0 -s 0 -s] + */ + for (k=1; k < dk; ++k) { + pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0); + } + + cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0; + cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1; + cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2; + cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3; + ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0; + ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1; + ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2; + ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3; +} + + +void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch, + pffft_direction_t direction, int ordered) { + int k, Ncvec = setup->Ncvec; + int nf_odd = (setup->ifac[1] & 1); + + // temporary buffer is allocated on the stack if the scratch pointer is NULL + int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); + + const v4sf *vinput = (const v4sf*)finput; + v4sf *voutput = (v4sf*)foutput; + v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack }; + int ib = (nf_odd ^ ordered ? 1 : 0); + + assert(VALIGNED(finput) && VALIGNED(foutput)); + + //assert(finput != foutput); + if (direction == PFFFT_FORWARD) { + ib = !ib; + if (setup->transform == PFFFT_REAL) { + ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); + } else { + v4sf *tmp = buff[ib]; + for (k=0; k < Ncvec; ++k) { + UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); + } + ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], + setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); + pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e); + } + if (ordered) { + pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD); + } else ib = !ib; + } else { + if (vinput == buff[ib]) { + ib = !ib; // may happen when finput == foutput + } + if (ordered) { + pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD); + vinput = buff[ib]; ib = !ib; + } + if (setup->transform == PFFFT_REAL) { + pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e); + ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e); + ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], + setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 
0 : 1); + for (k=0; k < Ncvec; ++k) { + INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); + } + } + } + + if (buff[ib] != voutput) { + /* extra copy required -- this situation should only happen when finput == foutput */ + assert(finput==foutput); + for (k=0; k < Ncvec; ++k) { + v4sf a = buff[ib][2*k], b = buff[ib][2*k+1]; + voutput[2*k] = a; voutput[2*k+1] = b; + } + ib = !ib; + } + assert(buff[ib] == voutput); +} + +void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { + int Ncvec = s->Ncvec; + const v4sf * RESTRICT va = (const v4sf*)a; + const v4sf * RESTRICT vb = (const v4sf*)b; + v4sf * RESTRICT vab = (v4sf*)ab; + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(vab+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(vab+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); + __builtin_prefetch(vab+6); +# ifndef __clang__ +# define ZCONVOLVE_USING_INLINE_NEON_ASM +# endif +#endif + + float ar, ai, br, bi, abr, abi; +#ifndef ZCONVOLVE_USING_INLINE_ASM + v4sf vscal = LD_PS1(scaling); + int i; +#endif + + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + ar = ((v4sf_union*)va)[0].f[0]; + ai = ((v4sf_union*)va)[1].f[0]; + br = ((v4sf_union*)vb)[0].f[0]; + bi = ((v4sf_union*)vb)[1].f[0]; + abr = ((v4sf_union*)vab)[0].f[0]; + abi = ((v4sf_union*)vab)[1].f[0]; + +#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc + const float *a_ = a, *b_ = b; float *ab_ = ab; + int N = Ncvec; + asm volatile("mov r8, %2 \n" + "vdup.f32 q15, %4 \n" + "1: \n" + "pld [%0,#64] \n" + "pld [%1,#64] \n" + "pld [%2,#64] \n" + "pld [%0,#96] \n" + "pld [%1,#96] \n" + "pld [%2,#96] \n" + "vld1.f32 {q0,q1}, [%0,:128]! \n" + "vld1.f32 {q4,q5}, [%1,:128]! \n" + "vld1.f32 {q2,q3}, [%0,:128]! \n" + "vld1.f32 {q6,q7}, [%1,:128]! \n" + "vld1.f32 {q8,q9}, [r8,:128]! \n" + + "vmul.f32 q10, q0, q4 \n" + "vmul.f32 q11, q0, q5 \n" + "vmul.f32 q12, q2, q6 \n" + "vmul.f32 q13, q2, q7 \n" + "vmls.f32 q10, q1, q5 \n" + "vmla.f32 q11, q1, q4 \n" + "vld1.f32 {q0,q1}, [r8,:128]! \n" + "vmls.f32 q12, q3, q7 \n" + "vmla.f32 q13, q3, q6 \n" + "vmla.f32 q8, q10, q15 \n" + "vmla.f32 q9, q11, q15 \n" + "vmla.f32 q0, q12, q15 \n" + "vmla.f32 q1, q13, q15 \n" + "vst1.f32 {q8,q9},[%2,:128]! \n" + "vst1.f32 {q0,q1},[%2,:128]! 
\n" + "subs %3, #2 \n" + "bne 1b \n" + : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory"); +#else // default routine, works fine for non-arm cpus with current compilers + for (i=0; i < Ncvec; i += 2) { + v4sf ar, ai, br, bi; + ar = va[2*i+0]; ai = va[2*i+1]; + br = vb[2*i+0]; bi = vb[2*i+1]; + VCPLXMUL(ar, ai, br, bi); + vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]); + vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]); + ar = va[2*i+2]; ai = va[2*i+3]; + br = vb[2*i+2]; bi = vb[2*i+3]; + VCPLXMUL(ar, ai, br, bi); + vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]); + vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]); + } +#endif + if (s->transform == PFFFT_REAL) { + ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling; + ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling; + } +} + + +void pffft_zconvolve_no_accu(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { + v4sf vscal = LD_PS1(scaling); + const v4sf * RESTRICT va = (const v4sf*)a; + const v4sf * RESTRICT vb = (const v4sf*)b; + v4sf * RESTRICT vab = (v4sf*)ab; + float sar, sai, sbr, sbi; + const int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */ + int k; /* was i -- but always used "2*i" - except at for() */ + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(vab+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(vab+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); + __builtin_prefetch(vab+6); +# ifndef __clang__ +# define ZCONVOLVE_USING_INLINE_NEON_ASM +# endif +#endif + + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + sar = ((v4sf_union*)va)[0].f[0]; + sai = ((v4sf_union*)va)[1].f[0]; + sbr = ((v4sf_union*)vb)[0].f[0]; + sbi = ((v4sf_union*)vb)[1].f[0]; + + /* default routine, works fine for non-arm cpus with current compilers */ + for (k=0; k < NcvecMulTwo; k += 4) { + v4sf var, vai, vbr, vbi; + var = va[k+0]; vai = va[k+1]; + vbr = vb[k+0]; vbi = vb[k+1]; + VCPLXMUL(var, vai, vbr, vbi); + vab[k+0] = VMUL(var, vscal); + vab[k+1] = VMUL(vai, vscal); + var = va[k+2]; vai = va[k+3]; + vbr = vb[k+2]; vbi = vb[k+3]; + VCPLXMUL(var, vai, vbr, vbi); + vab[k+2] = VMUL(var, vscal); + vab[k+3] = VMUL(vai, vscal); + } + + if (s->transform == PFFFT_REAL) { + ((v4sf_union*)vab)[0].f[0] = sar*sbr*scaling; + ((v4sf_union*)vab)[1].f[0] = sai*sbi*scaling; + } +} + + +#else // defined(PFFFT_SIMD_DISABLE) + +// standard routine using scalar floats, without SIMD stuff. 
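// For reference, a sketch of what the vector macros from pfsimd_macros.h reduce
// to in this mode: v4sf is a plain float and SIMD_SZ == 1, so e.g.
// VCPLXMUL(ar, ai, br, bi) is just the scalar complex product
//
//   tmp = ar*bi;  ar = ar*br - ai*bi;  ai = ai*br + tmp;
//
// i.e. (ar + i*ai) *= (br + i*bi), which is what the convolution routines
// further below rely on.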
+ +#define pffft_zreorder_nosimd pffft_zreorder +void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { + int k, N = setup->N; + if (setup->transform == PFFFT_COMPLEX) { + for (k=0; k < 2*N; ++k) out[k] = in[k]; + return; + } + else if (direction == PFFFT_FORWARD) { + float x_N = in[N-1]; + for (k=N-1; k > 1; --k) out[k] = in[k-1]; + out[0] = in[0]; + out[1] = x_N; + } else { + float x_N = in[1]; + for (k=1; k < N-1; ++k) out[k] = in[k+1]; + out[0] = in[0]; + out[N-1] = x_N; + } +} + +#define pffft_transform_internal_nosimd pffft_transform_internal +void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch, + pffft_direction_t direction, int ordered) { + int Ncvec = setup->Ncvec; + int nf_odd = (setup->ifac[1] & 1); + + // temporary buffer is allocated on the stack if the scratch pointer is NULL + int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); + float *buff[2]; + int ib; + if (scratch == 0) scratch = scratch_on_stack; + buff[0] = output; buff[1] = scratch; + + if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered. + ib = (nf_odd ^ ordered ? 1 : 0); + + if (direction == PFFFT_FORWARD) { + if (setup->transform == PFFFT_REAL) { + ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); + } + if (ordered) { + pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib; + } + } else { + if (input == buff[ib]) { + ib = !ib; // may happen when finput == foutput + } + if (ordered) { + pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); + input = buff[!ib]; + } + if (setup->transform == PFFFT_REAL) { + ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 
0 : 1); + } + } + if (buff[ib] != output) { + int k; + // extra copy required -- this situation should happens only when finput == foutput + assert(input==output); + for (k=0; k < Ncvec; ++k) { + float a = buff[ib][2*k], b = buff[ib][2*k+1]; + output[2*k] = a; output[2*k+1] = b; + } + ib = !ib; + } + assert(buff[ib] == output); +} + +#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate +void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, + float *ab, float scaling) { + int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */ + int k; /* was i -- but always used "2*i" - except at for() */ + + if (s->transform == PFFFT_REAL) { + // take care of the fftpack ordering + ab[0] += a[0]*b[0]*scaling; + ab[NcvecMulTwo-1] += a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling; + ++ab; ++a; ++b; NcvecMulTwo -= 2; + } + for (k=0; k < NcvecMulTwo; k += 2) { + float ar, ai, br, bi; + ar = a[k+0]; ai = a[k+1]; + br = b[k+0]; bi = b[k+1]; + VCPLXMUL(ar, ai, br, bi); + ab[k+0] += ar*scaling; + ab[k+1] += ai*scaling; + } +} + + +#define pffft_zconvolve_no_accu_nosimd pffft_zconvolve_no_accu +void pffft_zconvolve_no_accu_nosimd(PFFFT_Setup *s, const float *a, const float *b, + float *ab, float scaling) { + int NcvecMulTwo = 2*s->Ncvec; /* int Ncvec = s->Ncvec; */ + int k; /* was i -- but always used "2*i" - except at for() */ + + if (s->transform == PFFFT_REAL) { + // take care of the fftpack ordering + ab[0] += a[0]*b[0]*scaling; + ab[NcvecMulTwo-1] += a[NcvecMulTwo-1]*b[NcvecMulTwo-1]*scaling; + ++ab; ++a; ++b; NcvecMulTwo -= 2; + } + for (k=0; k < NcvecMulTwo; k += 2) { + float ar, ai, br, bi; + ar = a[k+0]; ai = a[k+1]; + br = b[k+0]; bi = b[k+1]; + VCPLXMUL(ar, ai, br, bi); + ab[k+0] = ar*scaling; + ab[k+1] = ai*scaling; + } +} + + +#endif // defined(PFFFT_SIMD_DISABLE) + +void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { + pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0); +} + +void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { + pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1); +} + + +int pffft_next_power_of_two(int N) { + /* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 */ + /* compute the next highest power of 2 of 32-bit v */ + unsigned v = N; + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; +} + +int pffft_is_power_of_two(int N) { + /* https://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 */ + int f = N && !(N & (N - 1)); + return f; +} + +int pffft_min_fft_size(pffft_transform_t transform) { + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + and 32 for real FFTs -- a lot of stuff would need to be rewritten to + handle other cases (or maybe just switch to a scalar fft, I don't know..) */ + if (transform == PFFFT_REAL) + return ( 2 * SIMD_SZ * SIMD_SZ ); + else if (transform == PFFFT_COMPLEX) + return ( SIMD_SZ * SIMD_SZ ); + else + return 1; +} + diff --git a/libs/staffpad/pffft/pffft.h b/libs/staffpad/pffft/pffft.h new file mode 100644 index 0000000000..66459ea8dd --- /dev/null +++ b/libs/staffpad/pffft/pffft.h @@ -0,0 +1,215 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB, + authored by Dr Paul Swarztrauber of NCAR, in 1985. 
+ + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +/* + PFFFT : a Pretty Fast FFT. + + This is basically an adaptation of the single precision fftpack + (v4) as found on netlib taking advantage of SIMD instruction found + on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON). + + For architectures where no SIMD instruction is available, the code + falls back to a scalar version. + + Restrictions: + + - 1D transforms only, with 32-bit single precision. + + - supports only transforms for inputs of length N of the form + N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, + 144, 160, etc are all acceptable lengths). Performance is best for + 128<=N<=8192. + + - all (float*) pointers in the functions below are expected to + have an "simd-compatible" alignment, that is 16 bytes on x86 and + powerpc CPUs. + + You can allocate such buffers with the functions + pffft_aligned_malloc / pffft_aligned_free (or with stuff like + posix_memalign..) + +*/ + +#pragma once + +#include // for size_t + +#ifndef PFFFT_FLOAT + #if 1 + /* default: float */ + #define PFFFT_FLOAT float + #else + #define PFFFT_FLOAT double + #ifndef PFFFT_SIMD_DISABLE + /* double only with PFFFT_SIMD_DISABLE */ + #define PFFFT_SIMD_DISABLE 1 + #endif + #endif +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + + /* opaque struct holding internal stuff (precomputed twiddle factors) + this struct can be shared by many threads as it contains only + read-only data. 
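+
+     As a sketch of one possible arrangement (not part of the API itself): a
+     single setup can serve several worker threads, provided every thread
+     supplies its own input/output/work buffers, e.g.
+
+         PFFFT_Setup *shared = pffft_new_setup(1024, PFFFT_REAL);
+         ... each worker thread, with its own aligned buffers, calls
+             pffft_transform_ordered(shared, in, out, work, PFFFT_FORWARD);
+         ... and the setup is destroyed once, after all workers have finished:
+         pffft_destroy_setup(shared);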
+ */ + typedef struct PFFFT_Setup PFFFT_Setup; + + /* direction of the transform */ + typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t; + + /* type of transform */ + typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t; + + /* + prepare for performing transforms of size N -- the returned + PFFFT_Setup structure is read-only so it can safely be shared by + multiple concurrent threads. + */ + PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); + void pffft_destroy_setup(PFFFT_Setup *); + /* + Perform a Fourier transform , The z-domain data is stored in the + most efficient order for transforming it back, or using it for + convolution. If you need to have its content sorted in the + "usual" way, that is as an array of interleaved complex numbers, + either use pffft_transform_ordered , or call pffft_zreorder after + the forward fft, and before the backward fft. + + Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. + Typically you will want to scale the backward transform by 1/N. + + The 'work' pointer should point to an area of N (2*N for complex + fft) floats, properly aligned. If 'work' is NULL, then stack will + be used instead (this is probably the best strategy for small + FFTs, say for N < 16384). Threads usually have a small stack, that + there's no sufficient amount of memory, usually leading to a creash! + Use the heap with pffft_aligned_malloc() in this case. + + input and output may alias. + */ + void pffft_transform(PFFFT_Setup *setup, const PFFFT_FLOAT *input, PFFFT_FLOAT *output, PFFFT_FLOAT *work, pffft_direction_t direction); + + /* + Similar to pffft_transform, but makes sure that the output is + ordered as expected (interleaved complex numbers). This is + similar to calling pffft_transform and then pffft_zreorder. + + input and output may alias. + */ + void pffft_transform_ordered(PFFFT_Setup *setup, const PFFFT_FLOAT *input, PFFFT_FLOAT *output, PFFFT_FLOAT *work, pffft_direction_t direction); + + /* + call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(..., + PFFFT_FORWARD) if you want to have the frequency components in + the correct "canonical" order, as interleaved complex numbers. + + (for real transforms, both 0-frequency and half frequency + components, which are real, are assembled in the first entry as + F(0)+i*F(n/2+1). Note that the original fftpack did place + F(n/2+1) at the end of the arrays). + + input and output should not alias. + */ + void pffft_zreorder(PFFFT_Setup *setup, const PFFFT_FLOAT *input, PFFFT_FLOAT *output, pffft_direction_t direction); + + /* + Perform a multiplication of the frequency components of dft_a and + dft_b and accumulate them into dft_ab. The arrays should have + been obtained with pffft_transform(.., PFFFT_FORWARD) and should + *not* have been reordered with pffft_zreorder (otherwise just + perform the operation yourself as the dft coefs are stored as + interleaved complex numbers). + + the operation performed is: dft_ab += (dft_a * fdt_b)*scaling + + The dft_a, dft_b and dft_ab pointers may alias. + */ + void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const PFFFT_FLOAT *dft_a, const PFFFT_FLOAT *dft_b, PFFFT_FLOAT *dft_ab, PFFFT_FLOAT scaling); + + /* + Perform a multiplication of the frequency components of dft_a and + dft_b and put result in dft_ab. 
The arrays should have + been obtained with pffft_transform(.., PFFFT_FORWARD) and should + *not* have been reordered with pffft_zreorder (otherwise just + perform the operation yourself as the dft coefs are stored as + interleaved complex numbers). + + the operation performed is: dft_ab = (dft_a * fdt_b)*scaling + + The dft_a, dft_b and dft_ab pointers may alias. + */ + void pffft_zconvolve_no_accu(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); + + /* simple helper to get minimum possible fft size */ + int pffft_min_fft_size(pffft_transform_t transform); + + /* simple helper to determine next power of 2 + - without inexact/rounding floating point operations + */ + int pffft_next_power_of_two(int N); + + /* simple helper to determine if power of 2 - returns bool */ + int pffft_is_power_of_two(int N); + + /* + the float buffers must have the correct alignment (16-byte boundary + on intel and powerpc). This function may be used to obtain such + correctly aligned buffers. + */ + void *pffft_aligned_malloc(size_t nb_bytes); + void pffft_aligned_free(void *); + + /* return 4 or 1 wether support SSE/Altivec instructions was enabled when building pffft.c */ + int pffft_simd_size(void); + +#ifdef __cplusplus +} +#endif diff --git a/libs/staffpad/pffft/pfsimd_macros.h b/libs/staffpad/pffft/pfsimd_macros.h new file mode 100644 index 0000000000..ee9a0f80d2 --- /dev/null +++ b/libs/staffpad/pffft/pfsimd_macros.h @@ -0,0 +1,246 @@ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. +*/ + +#pragma once + +#include +#include +#include + + +/* + * SIMD reference material: + * + * general SIMD introduction: + * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing + * + * SSE 1: + * https://software.intel.com/sites/landingpage/IntrinsicsGuide/ + * + * ARM NEON: + * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics + * + * Altivec: + * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf + * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html + * better one? 
+ * + */ + + +/* + Altivec support macros +*/ +#if !defined(PFFFT_SIMD_DISABLE) && ((defined(__ppc__) || defined(__ppc64__)) && defined(__ALTIVEC__)) +#include +typedef vector float v4sf; +# define SIMD_SZ 4 +# define VREQUIRES_ALIGN 1 /* not sure, if really required */ +# define VZERO() ((vector float) vec_splat_u8(0)) +# define VMUL(a,b) vec_madd(a,b, VZERO()) +# define VADD(a,b) vec_add(a,b) +# define VMADD(a,b,c) vec_madd(a,b,c) +# define VSUB(a,b) vec_sub(a,b) +inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); } +# define LD_PS1(p) ld_ps1(&p) +# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } +# define UNINTERLEAVE2(in1, in2, out1, out2) { \ + vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \ + vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \ + v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \ + } +# define VTRANSPOSE4(x0,x1,x2,x3) { \ + v4sf y0 = vec_mergeh(x0, x2); \ + v4sf y1 = vec_mergel(x0, x2); \ + v4sf y2 = vec_mergeh(x1, x3); \ + v4sf y3 = vec_mergel(x1, x3); \ + x0 = vec_mergeh(y0, y2); \ + x1 = vec_mergel(y0, y2); \ + x2 = vec_mergeh(y1, y3); \ + x3 = vec_mergel(y1, y3); \ + } +# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15)) +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0) + +/* + SSE1 support macros +*/ +#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || defined(_M_IX86)) + +#include +typedef __m128 v4sf; +/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions + * anyway so you will have to work if you want to enable AVX with its 256-bit vectors. 
*/ +# define SIMD_SZ 4 +# define VREQUIRES_ALIGN 1 +# define VZERO() _mm_setzero_ps() +# define VMUL(a,b) _mm_mul_ps(a,b) +# define VADD(a,b) _mm_add_ps(a,b) +# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) +# define VSUB(a,b) _mm_sub_ps(a,b) +# define LD_PS1(p) _mm_set1_ps(p) +# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr) +# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr) +# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } +# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } +# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) +# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) +/* reverse/flip all floats */ +# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3)) +/* reverse/flip complex floats */ +# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2)) +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0) + +/* + ARM NEON support macros +*/ +#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)) +# include +typedef float32x4_t v4sf; +# define SIMD_SZ 4 +# define VREQUIRES_ALIGN 0 /* usually no alignment required */ +# define VZERO() vdupq_n_f32(0) +# define VMUL(a,b) vmulq_f32(a,b) +# define VADD(a,b) vaddq_f32(a,b) +# define VMADD(a,b,c) vmlaq_f32(c,a,b) +# define VSUB(a,b) vsubq_f32(a,b) +# define LD_PS1(p) vld1q_dup_f32(&(p)) +# define VLOAD_UNALIGNED(ptr) (*(ptr)) +# define VLOAD_ALIGNED(ptr) (*(ptr)) +# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } +# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } +# define VTRANSPOSE4(x0,x1,x2,x3) { \ + float32x4x2_t t0_ = vzipq_f32(x0, x2); \ + float32x4x2_t t1_ = vzipq_f32(x1, x3); \ + float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \ + float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \ + x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ + } +// marginally faster version +//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } +# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0) +#else +# if !defined(PFFFT_SIMD_DISABLE) +# warning "building with simd disabled !\n"; +# define PFFFT_SIMD_DISABLE // fallback to scalar code +# endif +#endif + +// fallback mode for situations where SSE/Altivec are not available; use scalar mode instead +#ifdef PFFFT_SIMD_DISABLE +typedef float v4sf; +# define SIMD_SZ 1 +# define VREQUIRES_ALIGN 0 +# define VZERO() 0.f +# define VMUL(a,b) ((a)*(b)) +# define VADD(a,b) ((a)+(b)) +# define VMADD(a,b,c) ((a)*(b)+(c)) +# define VSUB(a,b) ((a)-(b)) +# define LD_PS1(p) (p) +# define VLOAD_UNALIGNED(ptr) (*(ptr)) +# define VLOAD_ALIGNED(ptr) (*(ptr)) +# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0) +#endif + +// shortcuts for complex multiplications +#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } +#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } +#ifndef SVMUL +// multiply a scalar
with a vector +#define SVMUL(f,v) VMUL(LD_PS1(f),v) +#endif + +typedef union v4sf_union { + v4sf v; + float f[SIMD_SZ]; +} v4sf_union; + +#if !defined(PFFFT_SIMD_DISABLE) + +#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3)) + +/* detect bugs with the vector support macros */ +static void Vvalidate_simd() { + float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; + v4sf_union a0, a1, a2, a3, t, u; + memcpy(a0.f, f, 4*sizeof(float)); + memcpy(a1.f, f+4, 4*sizeof(float)); + memcpy(a2.f, f+8, 4*sizeof(float)); + memcpy(a3.f, f+12, 4*sizeof(float)); + + t = a0; u = a1; t.v = VZERO(); + printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0); + t.v = VADD(a1.v, a2.v); + printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18); + t.v = VMUL(a1.v, a2.v); + printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77); + t.v = VMADD(a1.v, a2.v,a0.v); + printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80); + + INTERLEAVE2(a1.v,a2.v,t.v,u.v); + printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11); + UNINTERLEAVE2(a1.v,a2.v,t.v,u.v); + printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11); + + t.v=LD_PS1(f[15]); + printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + assertv4(t, 15, 15, 15, 15); + t.v = VSWAPHL(a1.v, a2.v); + printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + assertv4(t, 8, 9, 6, 7); + VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v); + printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", + a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3], + a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]); + assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15); +} +#endif //!PFFFT_SIMD_DISABLE + +/* SSE and co like 16-bytes aligned pointers */ +#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines... + +static +void *Valigned_malloc(size_t nb_bytes) { + void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT); + if (!p0) return (void *) 0; + p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1)))); + *((void **) p - 1) = p0; + return p; +} + +static +void Valigned_free(void *p) { + if (p) free(*((void **) p - 1)); +} diff --git a/libs/staffpad/staffpad/TimeAndPitch.h b/libs/staffpad/staffpad/TimeAndPitch.h new file mode 100644 index 0000000000..e1277bfb12 --- /dev/null +++ b/libs/staffpad/staffpad/TimeAndPitch.h @@ -0,0 +1,117 @@ +// +// Arbitrary time and pitch stretching, based on the Phase-locked Vocoder. +// +#pragma once + +#include +#include +#include +#include + +namespace staffpad { +class TimeAndPitch +{ +public: + /** + * `magnitude` may be null, in which case the callback should re-compute it. 
+ */ + using ShiftTimbreCb = std::function* spectrum, const float* magnitude)>; + + TimeAndPitch( + int fftSize, bool reduceImaging = true, ShiftTimbreCb shiftTimbreCb = {}); + ~TimeAndPitch(); + /** + Set up at least once before processing. + \param numChannels Must be 1 or 2 + \param maxBlockSize The caller's maximum block size, e.g. 1024 samples + */ + void setup(int numChannels, int maxBlockSize); + + /** + Set independent time stretch and pitch factors (synchronously with the processing thread). + The factor change resolution follows the processing block size, with some approximation. + As a general rule, smaller block sizes result in finer changes, although there's a lower + bound due to internal windowed segmentation and OLA resynthesis. + */ + void setTimeStretchAndPitchFactor(double timeStretch, double pitchFactor); + + /** + Returns the approximate number of samples until the next hop gets analysed (it might be 1 + or 2 too high). For extremely small stretch ratios, several analyses might + be needed before output samples are produced, so you may have to repeat the process + until more output samples become available. + */ + int getSamplesToNextHop() const; + + /** + Returns the number of already processed output samples (after time-stretching) that + are ready for consumption. + */ + int getNumAvailableOutputSamples() const; + + /** + Feed more input samples. \see `getSamplesToNextHop()` to know how much to feed to + produce more output samples. + */ + void feedAudio(const float* const* in_smp, int numSamples); + + /** + Retrieve shifted/stretched samples into the output. \see `getNumAvailableOutputSamples()` + to determine a valid amount for `numSamples`. + */ + void retrieveAudio(float* const* out_smp, int numSamples); + + /** + Pitch-shift-only, in-place processing. + */ + void processPitchShift(float* const* smp, int numSamples, double pitchFactor); + + /** + Latency in input samples + */ + int getLatencySamples() const; + + /** + Latency in output samples, calculated from the given timeStretch factor + */ + int getLatencySamplesForStretchRatio(float timeStretch) const; + + /** + Resets the internal state and discards any buffered input + */ + void reset(); + +private: + const int fftSize; + static constexpr int overlap = 4; + static constexpr bool normalize_window = true; // compensate for OLA window overlaps + static constexpr bool modulate_synthesis_hop = true; + + void _process_hop(int hop_a, int hop_s); + template + void _time_stretch(float hop_a, float hop_s); + void _applyImagingReduction(); + + struct impl; + std::shared_ptr d; + + const bool _reduceImaging; + const ShiftTimbreCb _shiftTimbreCb; + + int _numChannels = 1; + int _maxBlockSize = 1024; + double _resampleReadPos = 0.0; + int _availableOutputSamples = 0; + int _numBins = fftSize / 2 + 1; + double _overlap_a = overlap; + + int _analysis_hop_counter = 0; + + double _expectedPhaseChangePerBinPerSample = 0.01; + double _timeStretch = 1.0; + double _pitchFactor = 1.0; + + int _outBufferWriteOffset = 0; +}; +} // namespace staffpad diff --git a/libs/staffpad/wscript b/libs/staffpad/wscript new file mode 100644 index 0000000000..0d2e4ddf17 --- /dev/null +++ b/libs/staffpad/wscript @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +# Version of this package (even if built as a child) +MAJOR = '0' +MINOR = '0' +MICRO = '0' +STAFFPAD_VERSION = "%s.%s.%s" % (MAJOR, MINOR, MICRO) + +# Library version (UNIX style major, minor, micro) +# major increment <=> incompatible changes +# minor increment <=> compatible changes (additions) +# micro increment <=> no interface changes
+STAFFPAD_LIB_VERSION = '0.0.0' + +I18N_PACKAGE = 'staffpad' + +staffpad_sources = [ + 'TimeAndPitch.cpp', + 'FourierTransform_pffft.cpp', + 'pffft/pffft.c' +] + +def options(opt): + pass + +def configure(conf): + pass + +def build(bld): + obj = bld.stlib(features = 'cxx cxxstlib', source = staffpad_sources) + obj.cxxflags = [ bld.env['compiler_flags_dict']['pic'], '-O3', '-ffast-math' ] + obj.export_includes = ['.'] + obj.includes = ['.', 'pffft'] + obj.name = 'staffpad' + obj.target = 'staffpad' + obj.vnum = STAFFPAD_LIB_VERSION + obj.defines = [ 'PACKAGE="' + I18N_PACKAGE + '"' ] diff --git a/wscript b/wscript index c1ce824c80..dd145a86f6 100644 --- a/wscript +++ b/wscript @@ -332,6 +332,7 @@ children = [ 'libs/clearlooks-newer', 'libs/zita-resampler', 'libs/zita-convolver', + 'libs/staffpad', # optionally external libraries 'libs/fluidsynth', 'libs/hidapi',
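A minimal usage sketch for the pffft API added above, illustrating the setup, work-buffer and scaling rules documented in pffft.h. It assumes PFFFT_FLOAT is plain float and that N is a transform size accepted by pffft_new_setup(); the function and buffer names are illustrative only, not part of the patch.

#include "pffft.h"

// Real forward FFT of size N followed by the inverse transform, rescaled by 1/N
// (transforms are unscaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x).
void pffft_roundtrip_example(const float* in, float* out, int N)
{
    PFFFT_Setup* setup = pffft_new_setup(N, PFFFT_REAL);

    // pffft_aligned_malloc() provides the 16-byte alignment the SIMD code paths expect.
    // A heap-allocated 'work' area avoids stack overflows for large N (>= 16384).
    float* freq = (float*)pffft_aligned_malloc(N * sizeof(float));
    float* work = (float*)pffft_aligned_malloc(N * sizeof(float));

    // Ordered variant: frequency components come out as interleaved complex numbers.
    pffft_transform_ordered(setup, in, freq, work, PFFFT_FORWARD);

    // ... operate on freq[] here ...

    pffft_transform_ordered(setup, freq, out, work, PFFFT_BACKWARD);
    for (int i = 0; i < N; ++i)
        out[i] *= 1.0f / N;   // undo the N*x scaling of the round trip

    pffft_aligned_free(work);
    pffft_aligned_free(freq);
    pffft_destroy_setup(setup);
}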
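Likewise, a rough sketch of the feed/retrieve cycle described in TimeAndPitch.h, under stated assumptions: stereo input, an arbitrarily chosen fftSize, and no handling of input exhaustion, maxBlockSize chunking or latency compensation. All names and factor values are illustrative only.

#include "TimeAndPitch.h"

// Produce numOut time-stretched output samples from a stereo input stream.
void time_stretch_example(const float* const* in, float* const* out, int numOut)
{
    staffpad::TimeAndPitch tp(4096);             // fftSize: an assumption for this sketch
    tp.setup(2, 1024);                           // 2 channels, caller block size 1024
    tp.setTimeStretchAndPitchFactor(1.25, 1.0);  // illustrative stretch, unchanged pitch

    int read = 0;
    while (tp.getNumAvailableOutputSamples() < numOut) {
        // Per the header, feeding getSamplesToNextHop() samples may have to be
        // repeated several times before enough output becomes available.
        int n = tp.getSamplesToNextHop();
        const float* chans[2] = { in[0] + read, in[1] + read };
        tp.feedAudio(chans, n);
        read += n;
    }
    tp.retrieveAudio(out, numOut);
}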