// ardour/libs/staffpad/TimeAndPitch.cpp
#include "staffpad/TimeAndPitch.h"
#include <algorithm>
#include <array>
#include <cassert>
#include <cmath>
#include <random>
#include <stdlib.h>
#include <utility>
#include <vector>
#include "CircularSampleBuffer.h"
#include "FourierTransform_pffft.h"
#include "SamplesFloat.h"
#include "SimdTypes.h"
using namespace staffpad::audio;
namespace staffpad {
namespace {
constexpr double twoPi = 6.28318530717958647692;
/**
 * 6-point Lagrange interpolator, SNR ~ -35.2 dB.
 * Interpolates between samples 2 and 3 using t.
 *
 * smp          pointer to a memory region containing 6 samples
 * t            fractional sample position to interpolate, t >= 0 && t < 1
 * return value the interpolated sample
 */
inline float lagrange6(const float (& smp)[6], float t)
{
auto* y = &smp[2];
auto ym1_y1 = y[-1] + y[1];
auto twentyfourth_ym2_y2 = 1.f / 24.f * (y[-2] + y[2]);
auto c0 = y[0];
auto c1 = 0.05f * y[-2] - 0.5f * y[-1] - 1.f / 3.f * y[0] + y[1] - 0.25f * y[2] + 1.f / 30.f * y[3];
auto c2 = 2.f / 3.f * ym1_y1 - 1.25f * y[0] - twentyfourth_ym2_y2;
auto c3 = 5.f / 12.f * y[0] - 7.f / 12.f * y[1] + 7.f / 24.f * y[2] - 1.f / 24.f * (y[-2] + y[-1] + y[3]);
auto c4 = 0.25f * y[0] - 1.f / 6.f * ym1_y1 + twentyfourth_ym2_y2;
auto c5 = 1.f / 120.f * (y[3] - y[-2]) + 1.f / 24.f * (y[-1] - y[2]) + 1.f / 12.f * (y[1] - y[0]);
// Estrin's scheme
auto t2 = t * t;
return (c0 + c1 * t) + (c2 + c3 * t) * t2 + (c4 + c5 * t) * t2 * t2;
}
void generateRandomPhaseVector(float* dst, size_t size, std::mt19937& gen)
{
  // Fill dst with uniformly distributed random phases in [-pi, pi).
  std::uniform_real_distribution<float> dis(
    -3.14159265358979323846f, 3.14159265358979323846f);
  std::for_each(dst, dst + size, [&dis, &gen](auto& a) { a = dis(gen); });
}
} // namespace
struct TimeAndPitch::impl
{
impl(int fft_size)
: fft(fft_size)
{
randomGenerator.seed(0);
}
FourierTransform fft;
std::mt19937 randomGenerator;
CircularSampleBuffer<float> inResampleInputBuffer[2];
CircularSampleBuffer<float> inCircularBuffer[2];
CircularSampleBuffer<float> outCircularBuffer[2];
CircularSampleBuffer<float> normalizationBuffer;
SamplesReal fft_timeseries;
SamplesComplex spectrum;
SamplesReal norm;
SamplesReal phase;
SamplesReal last_phase;
SamplesReal phase_accum;
SamplesReal cosWindow;
SamplesReal sqWindow;
SamplesReal last_norm;
SamplesReal random_phases;
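  // Fractional analysis/synthesis hop sizes and their error accumulators; the
  // integer hop used for each frame fluctuates by one sample so the exact
  // (non-integer) hop sizes are met on average.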
double exact_hop_a = 512.0, hop_a_err = 0.0;
double exact_hop_s = 0.0;
double next_exact_hop_s = 512.0;
double hop_s_err = 0.0;
std::vector<int> peak_index, trough_index;
};
TimeAndPitch::TimeAndPitch(
int fftSize, bool reduceImaging, ShiftTimbreCb shiftTimbreCb)
: fftSize(fftSize)
, _reduceImaging(reduceImaging)
, _shiftTimbreCb(std::move(shiftTimbreCb))
{
}
TimeAndPitch::~TimeAndPitch()
{
// Defined out of line so the smart-pointer destructor of `d` sees the complete type of impl.
}
void TimeAndPitch::setup(int numChannels, int maxBlockSize)
{
assert(numChannels == 1 || numChannels == 2);
_numChannels = numChannels;
d = std::make_unique<impl>(fftSize);
_maxBlockSize = maxBlockSize;
_numBins = fftSize / 2 + 1;
// audio sample buffers
d->fft_timeseries.setSize(_numChannels, fftSize);
int outBufferSize = fftSize + 2 * _maxBlockSize; // 1 block extra for safety
for (int ch = 0; ch < _numChannels; ++ch) {
    d->inResampleInputBuffer[ch].setSize(
      _maxBlockSize + 6); // + 6: history for the 6-point resampling interpolator; may grow in the future
d->inCircularBuffer[ch].setSize(fftSize);
d->outCircularBuffer[ch].setSize(outBufferSize);
}
d->normalizationBuffer.setSize(outBufferSize);
// fft coefficient buffers
d->spectrum.setSize(_numChannels, _numBins);
d->norm.setSize(1, _numBins);
d->last_norm.setSize(1, _numBins);
d->phase.setSize(_numChannels, _numBins);
d->last_phase.setSize(_numChannels, _numBins);
d->phase_accum.setSize(_numChannels, _numBins);
d->random_phases.setSize(1, _numBins);
generateRandomPhaseVector(
d->random_phases.getPtr(0), _numBins, d->randomGenerator);
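  // A bin's center frequency advances its phase by 2*pi*bin/fftSize per
  // sample; store the per-bin, per-sample factor.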
_expectedPhaseChangePerBinPerSample = twoPi / double(fftSize);
// Cos window
// For perfect int-factor overlaps the first sample needs to be 0 and the last non-zero.
d->cosWindow.setSize(1, fftSize);
d->sqWindow.setSize(1, fftSize);
auto* w = d->cosWindow.getPtr(0);
auto* sqw = d->sqWindow.getPtr(0);
for (int i = 0; i < fftSize; ++i) {
w[i] = -0.5f * std::cos(float(twoPi * (float)i / (float)fftSize)) + 0.5f;
sqw[i] = w[i] * w[i];
}
d->peak_index.reserve(_numBins);
d->trough_index.reserve(_numBins);
// force fft to allocate all buffers now. Currently there is no other way in the fft class
d->fft.forwardReal(d->fft_timeseries, d->spectrum);
reset();
}
int TimeAndPitch::getSamplesToNextHop() const
{
return std::max(0, int(std::ceil(d->exact_hop_a)) - _analysis_hop_counter
+ 1); // 1 extra to counter pitch factor error accumulation
}
int TimeAndPitch::getNumAvailableOutputSamples() const
{
return _availableOutputSamples;
}
void TimeAndPitch::reset()
{
_analysis_hop_counter = 0;
_availableOutputSamples = 0;
for (int ch = 0; ch < _numChannels; ++ch) {
d->inResampleInputBuffer[ch].reset();
d->inCircularBuffer[ch].reset();
d->outCircularBuffer[ch].reset();
}
d->normalizationBuffer.reset();
d->last_norm.zeroOut();
d->last_phase.zeroOut();
d->phase_accum.zeroOut();
_outBufferWriteOffset = 0;
d->hop_a_err = 0.0;
d->hop_s_err = 0.0;
d->exact_hop_s = 0.0;
_resampleReadPos = 0.0;
}
namespace {
// wrap a phase value into -PI..PI
inline float _unwrapPhase(float arg)
{
using namespace audio::simd;
return arg - rint(arg * 0.15915494309f) * 6.283185307f;
}
void _unwrapPhaseVec(float* v, int n)
{
audio::simd::perform_parallel_simd_aligned(v, n, [](auto& a) { a = a - rint(a * 0.15915494309f) * 6.283185307f; });
}
/// rotate even-sized array by half its size to align fft phase at the center
void _fft_shift(float* v, int n)
{
assert((n & 1) == 0);
int n2 = n >> 1;
audio::simd::perform_parallel_simd_aligned(v, v + n2, n2, [](auto& a, auto& b) {
auto tmp = a;
a = b;
b = tmp;
});
}
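/// convert a left/right sample pair into mid/side representation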
void _lr_to_ms(float* ch1, float* ch2, int n)
{
audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) {
auto l = a, r = b;
a = 0.5f * (l + r);
b = 0.5f * (l - r);
});
}
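/// convert a mid/side sample pair back into left/right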
void _ms_to_lr(float* ch1, float* ch2, int n)
{
audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) {
auto m = a, s = b;
a = m + s;
b = m - s;
});
}
} // namespace
// ----------------------------------------------------------------------------
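// Phase-vocoder time stretching with peak-based phase locking: phases are
// integrated over time only at spectral peaks, and the surrounding bins are
// then locked to their peak so the relative phase structure of each partial
// is preserved (essentially the peak-locking idea of Laroche/Dolson).
// a_a and a_s are the analysis and synthesis hop sizes of the current frame.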
template<int num_channels>
void TimeAndPitch::_time_stretch(float a_a, float a_s)
{
auto alpha = a_s / a_a; // this is the real stretch factor based on integer hop sizes
// Create a norm array
const auto* norms = d->norm.getPtr(0); // for stereo, just use the mid-channel
const auto* norms_last = d->last_norm.getPtr(0);
d->peak_index.clear();
d->trough_index.clear();
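  // Find spectral peaks (local maxima of the norms) and the troughs between
  // them; non-peak bins will have their phases locked to their governing peak.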
float lowest = norms[0];
int trough = 0;
if (norms_last[0] >= norms[1]) {
d->peak_index.emplace_back(0);
d->trough_index.emplace_back(0);
}
for (int i = 1; i < _numBins - 1; ++i) {
if (norms_last[i] >= norms[i - 1] && norms_last[i] >= norms[i + 1]) {
d->peak_index.emplace_back(i);
d->trough_index.emplace_back(trough);
trough = i;
lowest = norms[i];
} else if (norms[i] < lowest) {
lowest = norms[i];
trough = i;
}
}
if (norms_last[_numBins - 1] > norms[_numBins - 2]) {
d->peak_index.emplace_back(_numBins - 1);
d->trough_index.emplace_back(trough);
}
if (d->peak_index.size() == 0) {
// use max norm of last frame
int max_idx = 0;
float max_norm = 0.f;
vo::findMaxElement(norms_last, _numBins, max_idx, max_norm);
d->peak_index.emplace_back(max_idx);
}
const float** p = const_cast<const float**>(d->phase.getPtrs());
const float** p_l = const_cast<const float**>(d->last_phase.getPtrs());
float** acc = d->phase_accum.getPtrs();
float expChange_a = a_a * float(_expectedPhaseChangePerBinPerSample);
float expChange_s = a_s * float(_expectedPhaseChangePerBinPerSample);
int num_peaks = (int)d->peak_index.size();
// time integration for all peaks
for (int i = 0; i < num_peaks; ++i) {
const int n = d->peak_index[i];
auto fn = float(n);
// determine phase from last frame
float fn_expChange_a = fn * expChange_a;
float fn_expChange_s = fn * expChange_s;
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n] - p_l[ch][n] - fn_expChange_a) + fn_expChange_s;
}
}
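  // Propagate the accumulated phase outwards from each peak to the bins it
  // governs (bounded by the troughs), scaled by alpha, so the phase relations
  // around every peak stay intact.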
// go from first peak to 0
for (int n = d->peak_index[0]; n > 0; --n) {
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]);
}
}
// 'grow' from pairs of peaks to the lowest norm in between
for (int i = 0; i < num_peaks - 1; ++i) {
const int mid = d->trough_index[i + 1];
for (int n = d->peak_index[i]; n < mid; ++n) {
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]);
}
}
for (int n = d->peak_index[i + 1]; n > mid + 1; --n) {
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]);
}
}
}
// last peak to the end
for (int n = d->peak_index[num_peaks - 1]; n < _numBins - 1; ++n) {
for (int ch = 0; ch < num_channels; ++ch) {
acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]);
}
}
d->last_norm.assignSamples(d->norm);
d->last_phase.assignSamples(d->phase);
}
void TimeAndPitch::_applyImagingReduction()
{
  // Upsampling brings spectral components down, including those that were
  // beyond Nyquist. From `imagingBeginBin` upwards the spectrum is a mirror
  // image which, if not lowpass-filtered by the resampler, can sound like
  // aliasing when the mirrored content is a harmonic series. A simple
  // countermeasure is to scramble the phase values in that region: natural
  // energy fluctuations are preserved, but the artifact becomes much more
  // noise-like and therefore far less noticeable, noise being what one
  // typically gets at those frequencies anyway.
constexpr auto alignment = 64 / sizeof(float);
  const int imagingBeginBin =
    std::ceil((fftSize / 2 * _pitchFactor + 1) / alignment) * alignment;
if (imagingBeginBin >= _numBins) {
return;
}
const auto n = _numBins - imagingBeginBin;
auto pSpec = d->spectrum.getPtr(0);
auto pRand = d->random_phases.getPtr(0);
vo::rotate(nullptr, pRand, pSpec + imagingBeginBin, n);
  // Rotate the random-phase table by a random offset so that successive
  // frames see a different pseudo-random sequence of phases.
std::uniform_int_distribution<size_t> dis(0, n - 1);
const auto middle = dis(d->randomGenerator);
std::rotate(pRand, pRand + middle, pRand + n);
}
/// Process one hop/chunk of d->fft_timeseries and add the result to the output circular buffer.
void TimeAndPitch::_process_hop(int hop_a, int hop_s)
{
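  // One frame: window -> FFT -> norms/phases -> optional timbre shift and
  // imaging reduction -> peak-based phase propagation -> rebuild spectrum ->
  // inverse FFT -> window -> overlap-add. When analysis and synthesis hops
  // are equal the spectral path is skipped and only the window is applied.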
if (d->exact_hop_a != d->exact_hop_s) {
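    // Stereo is converted to mid/side here; peak picking uses the mid-channel
    // norms only, so both channels get consistent phase treatment.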
if (_numChannels == 2) {
_lr_to_ms(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize);
}
for (int ch = 0; ch < _numChannels; ++ch) {
vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
_fft_shift(d->fft_timeseries.getPtr(ch), fftSize);
}
// determine norm/phase
d->fft.forwardReal(d->fft_timeseries, d->spectrum);
// norms of the mid channel only (or sole channel) are needed in
// _time_stretch
vo::calcNorms(d->spectrum.getPtr(0), d->norm.getPtr(0), d->spectrum.getNumSamples());
for (int ch = 0; ch < _numChannels; ++ch) {
vo::calcPhases(d->spectrum.getPtr(ch), d->phase.getPtr(ch), d->spectrum.getNumSamples());
}
if (_shiftTimbreCb) {
_shiftTimbreCb(
1 / _pitchFactor, d->spectrum.getPtr(0), d->norm.getPtr(0));
}
if (_reduceImaging && _pitchFactor < 1.) {
_applyImagingReduction();
}
if (_numChannels == 1) {
_time_stretch<1>((float)hop_a, (float)hop_s);
} else if (_numChannels == 2) {
_time_stretch<2>((float)hop_a, (float)hop_s);
}
for (int ch = 0; ch < _numChannels; ++ch) {
_unwrapPhaseVec(d->phase_accum.getPtr(ch), _numBins);
}
for (int ch = 0; ch < _numChannels; ++ch) {
vo::rotate(d->phase.getPtr(ch), d->phase_accum.getPtr(ch), d->spectrum.getPtr(ch),
d->spectrum.getNumSamples());
}
d->fft.inverseReal(d->spectrum, d->fft_timeseries);
for (int ch = 0; ch < _numChannels; ++ch) {
vo::constantMultiply(d->fft_timeseries.getPtr(ch), 1.f / fftSize, d->fft_timeseries.getPtr(ch),
d->fft_timeseries.getNumSamples());
}
if (_numChannels == 2) {
_ms_to_lr(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize);
}
for (int ch = 0; ch < _numChannels; ++ch) {
_fft_shift(d->fft_timeseries.getPtr(ch), fftSize);
vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
}
} else { // stretch factor == 1.0 => just apply window
for (int ch = 0; ch < _numChannels; ++ch) {
vo::multiply(d->fft_timeseries.getPtr(ch), d->sqWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
}
}
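  // Overlap-add the processed frame into the output ring buffer. If
  // normalize_window is set, also accumulate the squared window so
  // retrieveAudio() can divide out the actual window overlap.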
{
float gainFact = float(_timeStretch * ((8.f / 3.f) / _overlap_a)); // overlap add normalization factor
for (int ch = 0; ch < _numChannels; ++ch) {
d->outCircularBuffer[ch].writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->fft_timeseries.getPtr(ch),
gainFact);
}
if (normalize_window) {
d->normalizationBuffer.writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->sqWindow.getPtr(0), gainFact);
}
}
_outBufferWriteOffset += hop_s;
_availableOutputSamples += hop_s;
}
void TimeAndPitch::setTimeStretchAndPitchFactor(double timeScale, double pitchFactor)
{
assert(timeScale > 0.0);
assert(pitchFactor > 0.0);
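  // Pitch shifting is done by resampling the input by pitchFactor in
  // feedAudio(), so the internal stretch factor has to compensate for the
  // duration change caused by that resampling.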
_pitchFactor = pitchFactor;
_timeStretch = timeScale * pitchFactor;
_overlap_a = overlap;
double overlap_s = overlap;
if (_timeStretch > 1.0 || !modulate_synthesis_hop) {
_overlap_a *= _timeStretch;
} else {
overlap_s /= _timeStretch;
}
d->exact_hop_a = double(fftSize) / _overlap_a;
if (!modulate_synthesis_hop) {
d->exact_hop_s = double(fftSize) / overlap_s;
} else {
// switch after processing next block, unless it's the first one
d->next_exact_hop_s = double(fftSize) / overlap_s;
if (d->exact_hop_s == 0.0) { // on first chunk set it immediately
d->exact_hop_s = d->next_exact_hop_s;
}
}
}
void TimeAndPitch::feedAudio(const float* const* input_smp, int numSamples)
{
assert(numSamples <= _maxBlockSize);
for (int ch = 0; ch < _numChannels; ++ch) {
d->inResampleInputBuffer[ch].writeBlock(0, numSamples, input_smp[ch]);
d->inResampleInputBuffer[ch].advance(numSamples);
}
_resampleReadPos -= numSamples;
  // Determine the integer hop sizes for the next hop. The error accumulators
  // make the values fluctuate by one sample so that arbitrary (non-integer)
  // scale ratios are reached on average.
if (d->exact_hop_s == 0.0) { // this happens if feedAudio is called without setting up stretch factors
d->exact_hop_s = d->next_exact_hop_s;
}
const int hop_s = int(d->exact_hop_s + d->hop_s_err);
const int hop_a = int(d->exact_hop_a + d->hop_a_err);
double step = 0.0;
double read_pos = _resampleReadPos;
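  // Resample the input by _pitchFactor with the 6-point Lagrange
  // interpolator; whenever hop_a resampled samples have accumulated, read an
  // fftSize-long frame from the input ring buffer and process one hop.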
while (read_pos < 0.0)
{
auto int_pos = int(std::floor(read_pos));
float frac_pos = float(read_pos - int_pos);
for (int ch = 0; ch < _numChannels; ++ch) {
float smp[6];
d->inResampleInputBuffer[ch].readBlock(int_pos - 6, 6, smp);
float s = (frac_pos == 0) ? smp[2] : lagrange6(smp, frac_pos);
d->inCircularBuffer[ch].writeOffset0(s);
d->inCircularBuffer[ch].advance(1);
}
_analysis_hop_counter++;
step++;
if (_analysis_hop_counter >= hop_a) {
_analysis_hop_counter -= hop_a;
d->hop_s_err += d->exact_hop_s - hop_s;
d->hop_a_err += d->exact_hop_a - hop_a;
for (int ch = 0; ch < _numChannels; ++ch) {
d->inCircularBuffer[ch].readBlock(-fftSize, fftSize, d->fft_timeseries.getPtr(ch));
}
_process_hop(hop_a, hop_s);
}
read_pos = _resampleReadPos + step * _pitchFactor;
}
_resampleReadPos = read_pos;
}
void TimeAndPitch::retrieveAudio(float* const* out_smp, int numSamples)
{
assert(numSamples <= _maxBlockSize);
for (int ch = 0; ch < _numChannels; ++ch) {
d->outCircularBuffer[ch].readAndClearBlock(0, numSamples, out_smp[ch]);
if (normalize_window) {
      constexpr float curve = 4.f * 4.f;
      // x / (x^2 + 1/curve) approximates 1/x for x near 1 and above, but
      // fades to 0 towards x = 0 to avoid fade-in clicks
for (int i = 0; i < numSamples; ++i) {
float x = d->normalizationBuffer.read(i);
out_smp[ch][i] *= x / (x * x + 1 / curve);
}
}
d->outCircularBuffer[ch].advance(numSamples);
}
d->normalizationBuffer.clearBlock(0, numSamples);
d->normalizationBuffer.advance(numSamples);
_outBufferWriteOffset -= numSamples;
_availableOutputSamples -= numSamples;
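  // When the synthesis hop is modulated, a newly requested hop size takes
  // effect only after a block has been delivered (see
  // setTimeStretchAndPitchFactor()).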
if (modulate_synthesis_hop) {
d->exact_hop_s = d->next_exact_hop_s;
}
}
void TimeAndPitch::processPitchShift(float* const* smp, int numSamples, double pitchFactor)
{
setTimeStretchAndPitchFactor(1.0, pitchFactor);
feedAudio(smp, numSamples);
retrieveAudio(smp, numSamples);
}
int TimeAndPitch::getLatencySamples() const
{
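  // Roughly one analysis frame minus one nominal hop, plus a few samples of
  // history for the resampling interpolator.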
return fftSize - fftSize / overlap + 3; // 3 for resampling
}
int TimeAndPitch::getLatencySamplesForStretchRatio(float timeStretch) const
{
const float coeff = (timeStretch < 1.f) ? (1.f / 3.f) : (2.f / 3.f);
return int(getLatencySamples() * (timeStretch * coeff + (1 - coeff)));
}
} // namespace staffpad