// ardour/libs/staffpad/TimeAndPitch.cpp
#include "staffpad/TimeAndPitch.h"
#include <algorithm>
#include <array>
#include <cassert>
#include <cmath>
#include <random>
#include <stdlib.h>
#include <utility>
#include <vector>
#include "CircularSampleBuffer.h"
#include "FourierTransform_pffft.h"
#include "SamplesFloat.h"
#include "SimdTypes.h"
using namespace staffpad::audio;
namespace staffpad {
namespace {
constexpr double twoPi = 6.28318530717958647692;
/**
 * 6-point Lagrange interpolator, SNR ~ -35.2 dB.
 * Interpolates between samples 2 and 3 using t.
 *
 * smp          pointer to a memory region containing 6 samples
 * t            fractional sample position to interpolate, t >= 0 && t < 1
 * return value the interpolated sample
 */
inline float lagrange6(const float (& smp)[6], float t)
{
auto* y = &smp[2];
auto ym1_y1 = y[-1] + y[1];
auto twentyfourth_ym2_y2 = 1.f / 24.f * (y[-2] + y[2]);
auto c0 = y[0];
auto c1 = 0.05f * y[-2] - 0.5f * y[-1] - 1.f / 3.f * y[0] + y[1] - 0.25f * y[2] + 1.f / 30.f * y[3];
auto c2 = 2.f / 3.f * ym1_y1 - 1.25f * y[0] - twentyfourth_ym2_y2;
auto c3 = 5.f / 12.f * y[0] - 7.f / 12.f * y[1] + 7.f / 24.f * y[2] - 1.f / 24.f * (y[-2] + y[-1] + y[3]);
auto c4 = 0.25f * y[0] - 1.f / 6.f * ym1_y1 + twentyfourth_ym2_y2;
auto c5 = 1.f / 120.f * (y[3] - y[-2]) + 1.f / 24.f * (y[-1] - y[2]) + 1.f / 12.f * (y[1] - y[0]);
// Estrin's scheme
auto t2 = t * t;
return (c0 + c1 * t) + (c2 + c3 * t) * t2 + (c4 + c5 * t) * t2 * t2;
}
void generateRandomPhaseVector(float* dst, size_t size, std::mt19937& gen)
{
  // Fill dst with uniformly distributed random phases in [-pi, pi).
  std::uniform_real_distribution<float> dis(
    -3.14159265358979323846f, 3.14159265358979323846f);
  std::for_each(dst, dst + size, [&dis, &gen](auto& a) { a = dis(gen); });
}
} // namespace
struct TimeAndPitch::impl
{
impl(int fft_size)
: fft(fft_size)
{
randomGenerator.seed(0);
}
FourierTransform fft;
std::mt19937 randomGenerator;
CircularSampleBuffer<float> inResampleInputBuffer[2];
CircularSampleBuffer<float> inCircularBuffer[2];
CircularSampleBuffer<float> outCircularBuffer[2];
CircularSampleBuffer<float> normalizationBuffer;
SamplesReal fft_timeseries;
SamplesComplex spectrum;
SamplesReal norm;
SamplesReal phase;
SamplesReal last_phase;
SamplesReal phase_accum;
SamplesReal cosWindow;
SamplesReal sqWindow;
SamplesReal last_norm;
SamplesReal random_phases;
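  // Fractional analysis/synthesis hop sizes and their error accumulators; the
  // integer hop used for each frame fluctuates by one sample so the exact
  // (non-integer) hop sizes are met on average.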
double exact_hop_a = 512.0, hop_a_err = 0.0;
double exact_hop_s = 0.0;
double next_exact_hop_s = 512.0;
double hop_s_err = 0.0;
std::vector<int> peak_index, trough_index;
};
TimeAndPitch::TimeAndPitch(
int fftSize, bool reduceImaging, ShiftTimbreCb shiftTimbreCb)
: fftSize(fftSize)
, _reduceImaging(reduceImaging)
, _shiftTimbreCb(std::move(shiftTimbreCb))
{
}
TimeAndPitch::~TimeAndPitch()
{
// Defined out of line so the smart-pointer destructor of `d` sees the complete type of impl.
}
void TimeAndPitch::setup(int numChannels, int maxBlockSize)
{
assert(numChannels == 1 || numChannels == 2);
_numChannels = numChannels;
d = std::make_unique<impl>(fftSize);
_maxBlockSize = maxBlockSize;
_numBins = fftSize / 2 + 1;
// audio sample buffers
d->fft_timeseries.setSize(_numChannels, fftSize);
int outBufferSize = fftSize + 2 * _maxBlockSize; // 1 block extra for safety
for (int ch = 0; ch < _numChannels; ++ch) {
    d->inResampleInputBuffer[ch].setSize(
      _maxBlockSize + 6); // + 6: history for the 6-point resampling interpolator; may grow in the future
d->inCircularBuffer[ch].setSize(fftSize);
d->outCircularBuffer[ch].setSize(outBufferSize);
}
d->normalizationBuffer.setSize(outBufferSize);
// fft coefficient buffers
d->spectrum.setSize(_numChannels, _numBins);
d->norm.setSize(1, _numBins);
d->last_norm.setSize(1, _numBins);
d->phase.setSize(_numChannels, _numBins);
d->last_phase.setSize(_numChannels, _numBins);
d->phase_accum.setSize(_numChannels, _numBins);
d->random_phases.setSize(1, _numBins);
generateRandomPhaseVector(
d->random_phases.getPtr(0), _numBins, d->randomGenerator);
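  // A bin's center frequency advances its phase by 2*pi*bin/fftSize per
  // sample; store the per-bin, per-sample factor.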
_expectedPhaseChangePerBinPerSample = twoPi / double(fftSize);
// Cos window
// For perfect int-factor overlaps the first sample needs to be 0 and the last non-zero.
d->cosWindow.setSize(1, fftSize);
d->sqWindow.setSize(1, fftSize);
auto* w = d->cosWindow.getPtr(0);
auto* sqw = d->sqWindow.getPtr(0);
for (int i = 0; i < fftSize; ++i) {
w[i] = -0.5f * std::cos(float(twoPi * (float)i / (float)fftSize)) + 0.5f;
sqw[i] = w[i] * w[i];
}
d->peak_index.reserve(_numBins);
d->trough_index.reserve(_numBins);
// force fft to allocate all buffers now. Currently there is no other way in the fft class
d->fft.forwardReal(d->fft_timeseries, d->spectrum);
reset();
}
int TimeAndPitch::getSamplesToNextHop() const
{
return std::max(0, int(std::ceil(d->exact_hop_a)) - _analysis_hop_counter
+ 1); // 1 extra to counter pitch factor error accumulation
}
int TimeAndPitch::getNumAvailableOutputSamples() const
{
return _availableOutputSamples;
}
void TimeAndPitch::reset()
{
_analysis_hop_counter = 0;
_availableOutputSamples = 0;
for (int ch = 0; ch < _numChannels; ++ch) {
d->inResampleInputBuffer[ch].reset();
d->inCircularBuffer[ch].reset();
d->outCircularBuffer[ch].reset();
}
d->normalizationBuffer.reset();
d->last_norm.zeroOut();
d->last_phase.zeroOut();
d->phase_accum.zeroOut();
_outBufferWriteOffset = 0;
d->hop_a_err = 0.0;
d->hop_s_err = 0.0;
d->exact_hop_s = 0.0;
_resampleReadPos = 0.0;
}
namespace {
// wrap a phase value into -PI..PI
inline float _unwrapPhase(float arg)
{
using namespace audio::simd;
return arg - rint(arg * 0.15915494309f) * 6.283185307f;
}
void _unwrapPhaseVec(float* v, int n)
{
audio::simd::perform_parallel_simd_aligned(v, n, [](auto& a) { a = a - rint(a * 0.15915494309f) * 6.283185307f; });
}
/// rotate even-sized array by half its size to align fft phase at the center
void _fft_shift(float* v, int n)
{
assert((n & 1) == 0);
int n2 = n >> 1;
audio::simd::perform_parallel_simd_aligned(v, v + n2, n2, [](auto& a, auto& b) {
auto tmp = a;
a = b;
b = tmp;
});
}
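/// convert a left/right sample pair into mid/side representation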
void _lr_to_ms(float* ch1, float* ch2, int n)
{
audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) {
auto l = a, r = b;
a = 0.5f * (l + r);
b = 0.5f * (l - r);
});
}
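/// convert a mid/side sample pair back into left/right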
void _ms_to_lr(float* ch1, float* ch2, int n)
{
audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) {
auto m = a, s = b;
a = m + s;
b = m - s;
});
}
} // namespace
// ----------------------------------------------------------------------------
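// Phase-vocoder time stretching with peak-based phase locking: phases are
// integrated over time only at spectral peaks, and the surrounding bins are
// then locked to their peak so the relative phase structure of each partial
// is preserved (essentially the peak-locking idea of Laroche/Dolson).
// a_a and a_s are the analysis and synthesis hop sizes of the current frame.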
template<int num_channels>
void TimeAndPitch::_time_stretch(float a_a, float a_s)
{
auto alpha = a_s / a_a; // this is the real stretch factor based on integer hop sizes
// Create a norm array
const auto* norms = d->norm.getPtr(0); // for stereo, just use the mid-channel
const auto* norms_last = d->last_norm.getPtr(0);
d->peak_index.clear();
d->trough_index.clear();
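  // Find spectral peaks (local maxima of the norms) and the troughs between
  // them; non-peak bins will have their phases locked to their governing peak.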
float lowest = norms[0];
int trough = 0;
if (norms_last[0] >= norms[1]) {
d->peak_index.emplace_back(0);
d->trough_index.emplace_back(0);
}
for (int i = 1; i < _numBins - 1; ++i) {
if (norms_last[i] >= norms[i - 1] && norms_last[i] >= norms[i + 1]) {
d->peak_index.emplace_back(i);
d->trough_index.emplace_back(trough);
trough = i;
lowest = norms[i];
} else if (norms[i] < lowest) {
lowest = norms[i];
trough = i;
}
}
if (norms_last[_numBins - 1] > norms[_numBins - 2]) {
d->peak_index.emplace_back(_numBins - 1);
d->trough_index.emplace_back(trough);
}
if (d->peak_index.size() == 0) {
// use max norm of last frame
int max_idx = 0;
float max_norm = 0.f;
vo::findMaxElement(norms_last, _numBins, max_idx, max_norm);
d->peak_index.emplace_back(max_idx);
}
const float** p = const_cast<const float**>(d->phase.getPtrs());
const float** p_l = const_cast<const float**>(d->last_phase.getPtrs());
float** acc = d->phase_accum.getPtrs();
float expChange_a = a_a * float(_expectedPhaseChangePerBinPerSample);
float expChange_s = a_s * float(_expectedPhaseChangePerBinPerSample);
int num_peaks = (int)d->peak_index.size();
// time integration for all peaks
for (int i = 0; i < num_peaks; ++i) {
const int n = d->peak_index[i];
auto fn = float(n);
// determine phase from last frame
float fn_expChange_a = fn * expChange_a;
float fn_expChange_s = fn * expChange_s;
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n] - p_l[ch][n] - fn_expChange_a) + fn_expChange_s;
}
}
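  // Propagate the accumulated phase outwards from each peak to the bins it
  // governs (bounded by the troughs), scaled by alpha, so the phase relations
  // around every peak stay intact.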
// go from first peak to 0
for (int n = d->peak_index[0]; n > 0; --n) {
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]);
}
}
// 'grow' from pairs of peaks to the lowest norm in between
for (int i = 0; i < num_peaks - 1; ++i) {
const int mid = d->trough_index[i + 1];
for (int n = d->peak_index[i]; n < mid; ++n) {
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]);
}
}
for (int n = d->peak_index[i + 1]; n > mid + 1; --n) {
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]);
}
}
}
// last peak to the end
for (int n = d->peak_index[num_peaks - 1]; n < _numBins - 1; ++n) {
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]);
}
}
d->last_norm.assignSamples(d->norm);
d->last_phase.assignSamples(d->phase);
}
void TimeAndPitch::_applyImagingReduction()
{
  // Upsampling brings spectral components down, including those that were
  // beyond Nyquist. From `imagingBeginBin` upwards the spectrum is a mirror
  // image which, if not lowpass-filtered by the resampler, can sound like
  // aliasing when the mirrored content is a harmonic series. A simple
  // countermeasure is to scramble the phase values in that region: natural
  // energy fluctuations are preserved, but the artifact becomes much more
  // noise-like and therefore far less noticeable, noise being what one
  // typically gets at those frequencies anyway.
constexpr auto alignment = 64 / sizeof(float);
  const int imagingBeginBin =
    std::ceil((fftSize / 2 * _pitchFactor + 1) / alignment) * alignment;
if (imagingBeginBin >= _numBins) {
return;
}
const auto n = _numBins - imagingBeginBin;
auto pSpec = d->spectrum.getPtr(0);
auto pRand = d->random_phases.getPtr(0);
vo::rotate(nullptr, pRand, pSpec + imagingBeginBin, n);
  // Rotate the random-phase table by a random offset so that successive
  // frames see a different pseudo-random sequence of phases.
std::uniform_int_distribution<size_t> dis(0, n - 1);
const auto middle = dis(d->randomGenerator);
std::rotate(pRand, pRand + middle, pRand + n);
}
/// Process one hop/chunk of d->fft_timeseries and add the result to the output circular buffer.
void TimeAndPitch::_process_hop(int hop_a, int hop_s)
{
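  // One frame: window -> FFT -> norms/phases -> optional timbre shift and
  // imaging reduction -> peak-based phase propagation -> rebuild spectrum ->
  // inverse FFT -> window -> overlap-add. When analysis and synthesis hops
  // are equal the spectral path is skipped and only the window is applied.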
if (d->exact_hop_a != d->exact_hop_s) {
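    // Stereo is converted to mid/side here; peak picking uses the mid-channel
    // norms only, so both channels get consistent phase treatment.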
if (_numChannels == 2) {
_lr_to_ms(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize);
}
for (int ch = 0; ch < _numChannels; ++ch) {
vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
_fft_shift(d->fft_timeseries.getPtr(ch), fftSize);
}
// determine norm/phase
d->fft.forwardReal(d->fft_timeseries, d->spectrum);
// norms of the mid channel only (or sole channel) are needed in
// _time_stretch
vo::calcNorms(d->spectrum.getPtr(0), d->norm.getPtr(0), d->spectrum.getNumSamples());
for (int ch = 0; ch < _numChannels; ++ch) {
vo::calcPhases(d->spectrum.getPtr(ch), d->phase.getPtr(ch), d->spectrum.getNumSamples());
}
if (_shiftTimbreCb) {
_shiftTimbreCb(
1 / _pitchFactor, d->spectrum.getPtr(0), d->norm.getPtr(0));
}
if (_reduceImaging && _pitchFactor < 1.) {
_applyImagingReduction();
}
if (_numChannels == 1) {
_time_stretch<1>((float)hop_a, (float)hop_s);
} else if (_numChannels == 2) {
_time_stretch<2>((float)hop_a, (float)hop_s);
}
for (int ch = 0; ch < _numChannels; ++ch) {
_unwrapPhaseVec(d->phase_accum.getPtr(ch), _numBins);
}
for (int ch = 0; ch < _numChannels; ++ch) {
vo::rotate(d->phase.getPtr(ch), d->phase_accum.getPtr(ch), d->spectrum.getPtr(ch),
d->spectrum.getNumSamples());
}
d->fft.inverseReal(d->spectrum, d->fft_timeseries);
for (int ch = 0; ch < _numChannels; ++ch) {
vo::constantMultiply(d->fft_timeseries.getPtr(ch), 1.f / fftSize, d->fft_timeseries.getPtr(ch),
d->fft_timeseries.getNumSamples());
}
if (_numChannels == 2) {
_ms_to_lr(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize);
}
for (int ch = 0; ch < _numChannels; ++ch) {
_fft_shift(d->fft_timeseries.getPtr(ch), fftSize);
vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
}
} else { // stretch factor == 1.0 => just apply window
for (int ch = 0; ch < _numChannels; ++ch) {
vo::multiply(d->fft_timeseries.getPtr(ch), d->sqWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
}
}
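  // Overlap-add the processed frame into the output ring buffer. If
  // normalize_window is set, also accumulate the squared window so
  // retrieveAudio() can divide out the actual window overlap.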
{
float gainFact = float(_timeStretch * ((8.f / 3.f) / _overlap_a)); // overlap add normalization factor
for (int ch = 0; ch < _numChannels; ++ch) {
d->outCircularBuffer[ch].writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->fft_timeseries.getPtr(ch),
gainFact);
}
if (normalize_window) {
d->normalizationBuffer.writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->sqWindow.getPtr(0), gainFact);
}
}
_outBufferWriteOffset += hop_s;
_availableOutputSamples += hop_s;
}
void TimeAndPitch::setTimeStretchAndPitchFactor(double timeScale, double pitchFactor)
{
assert(timeScale > 0.0);
assert(pitchFactor > 0.0);
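  // Pitch shifting is done by resampling the input by pitchFactor in
  // feedAudio(), so the internal stretch factor has to compensate for the
  // duration change caused by that resampling.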
_pitchFactor = pitchFactor;
_timeStretch = timeScale * pitchFactor;
_overlap_a = overlap;
double overlap_s = overlap;
if (_timeStretch > 1.0 || !modulate_synthesis_hop) {
_overlap_a *= _timeStretch;
} else {
overlap_s /= _timeStretch;
}
d->exact_hop_a = double(fftSize) / _overlap_a;
if (!modulate_synthesis_hop) {
d->exact_hop_s = double(fftSize) / overlap_s;
} else {
// switch after processing next block, unless it's the first one
d->next_exact_hop_s = double(fftSize) / overlap_s;
if (d->exact_hop_s == 0.0) { // on first chunk set it immediately
d->exact_hop_s = d->next_exact_hop_s;
}
}
}
void TimeAndPitch::feedAudio(const float* const* input_smp, int numSamples)
{
assert(numSamples <= _maxBlockSize);
for (int ch = 0; ch < _numChannels; ++ch) {
d->inResampleInputBuffer[ch].writeBlock(0, numSamples, input_smp[ch]);
d->inResampleInputBuffer[ch].advance(numSamples);
}
_resampleReadPos -= numSamples;
  // Determine the integer hop sizes for the next hop. The error accumulators
  // make the values fluctuate by one sample so that arbitrary (non-integer)
  // scale ratios are reached on average.
if (d->exact_hop_s == 0.0) { // this happens if feedAudio is called without setting up stretch factors
d->exact_hop_s = d->next_exact_hop_s;
}
const int hop_s = int(d->exact_hop_s + d->hop_s_err);
const int hop_a = int(d->exact_hop_a + d->hop_a_err);
double step = 0.0;
double read_pos = _resampleReadPos;
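  // Resample the input by _pitchFactor with the 6-point Lagrange
  // interpolator; whenever hop_a resampled samples have accumulated, read an
  // fftSize-long frame from the input ring buffer and process one hop.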
while (read_pos < 0.0)
{
auto int_pos = int(std::floor(read_pos));
float frac_pos = float(read_pos - int_pos);
for (int ch = 0; ch < _numChannels; ++ch) {
float smp[6];
d->inResampleInputBuffer[ch].readBlock(int_pos - 6, 6, smp);
float s = (frac_pos == 0) ? smp[2] : lagrange6(smp, frac_pos);
d->inCircularBuffer[ch].writeOffset0(s);
d->inCircularBuffer[ch].advance(1);
}
_analysis_hop_counter++;
step++;
if (_analysis_hop_counter >= hop_a) {
_analysis_hop_counter -= hop_a;
d->hop_s_err += d->exact_hop_s - hop_s;
d->hop_a_err += d->exact_hop_a - hop_a;
for (int ch = 0; ch < _numChannels; ++ch) {
d->inCircularBuffer[ch].readBlock(-fftSize, fftSize, d->fft_timeseries.getPtr(ch));
}
_process_hop(hop_a, hop_s);
}
read_pos = _resampleReadPos + step * _pitchFactor;
}
_resampleReadPos = read_pos;
}
void TimeAndPitch::retrieveAudio(float* const* out_smp, int numSamples)
{
assert(numSamples <= _maxBlockSize);
for (int ch = 0; ch < _numChannels; ++ch) {
d->outCircularBuffer[ch].readAndClearBlock(0, numSamples, out_smp[ch]);
if (normalize_window) {
      constexpr float curve = 4.f * 4.f;
      // x / (x^2 + 1/curve) approximates 1/x for x near 1 and above, but
      // fades to 0 towards x = 0 to avoid fade-in clicks
for (int i = 0; i < numSamples; ++i) {
float x = d->normalizationBuffer.read(i);
out_smp[ch][i] *= x / (x * x + 1 / curve);
}
}
d->outCircularBuffer[ch].advance(numSamples);
}
d->normalizationBuffer.clearBlock(0, numSamples);
d->normalizationBuffer.advance(numSamples);
_outBufferWriteOffset -= numSamples;
_availableOutputSamples -= numSamples;
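  // When the synthesis hop is modulated, a newly requested hop size takes
  // effect only after a block has been delivered (see
  // setTimeStretchAndPitchFactor()).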
if (modulate_synthesis_hop) {
d->exact_hop_s = d->next_exact_hop_s;
}
}
void TimeAndPitch::processPitchShift(float* const* smp, int numSamples, double pitchFactor)
{
setTimeStretchAndPitchFactor(1.0, pitchFactor);
feedAudio(smp, numSamples);
retrieveAudio(smp, numSamples);
}
int TimeAndPitch::getLatencySamples() const
{
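  // Roughly one analysis frame minus one nominal hop, plus a few samples of
  // history for the resampling interpolator.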
return fftSize - fftSize / overlap + 3; // 3 for resampling
}
int TimeAndPitch::getLatencySamplesForStretchRatio(float timeStretch) const
{
const float coeff = (timeStretch < 1.f) ? (1.f / 3.f) : (2.f / 3.f);
return int(getLatencySamples() * (timeStretch * coeff + (1 - coeff)));
}
} // namespace staffpad