Use soundtouch for vocal audio time stretching (1/2)

This commit is contained in:
mx 2020-08-23 16:37:56 +08:00 committed by Robin Gareus
parent fb2d33c6a3
commit 262281bc1f
No known key found for this signature in database
GPG key ID: A090BCE02CF57F04
5 changed files with 227 additions and 100 deletions

View file

@ -27,7 +27,6 @@
#include "ardour/filter.h"
#include "ardour/timefx_request.h"
#ifdef USE_RUBBERBAND
#include "ardour/rb_effect.h"
@ -41,8 +40,7 @@ class LIBARDOUR_API RBStretch : public RBEffect {
} /* namespace */
#else
#ifdef HAVE_SOUNDTOUCH
#include <soundtouch/SoundTouch.h>
namespace ARDOUR {
@ -52,16 +50,14 @@ class LIBARDOUR_API STStretch : public Filter {
STStretch (ARDOUR::Session&, TimeFXRequest&);
~STStretch ();
int run (boost::shared_ptr<ARDOUR::Region>);
int run (boost::shared_ptr<ARDOUR::Region>, Progress* progress = 0);
private:
TimeFXRequest& tsr;
soundtouch::SoundTouch st;
};
} /* namespace */
#endif
#endif /* __ardour_stretch_h__ */

View file

@ -26,10 +26,11 @@ namespace ARDOUR {
struct TimeFXRequest : public InterThreadInfo {
TimeFXRequest()
: time_fraction(0), pitch_fraction(0),
quick_seek(false), antialias(false), opts(0) {}
use_soundtouch(false), quick_seek(false), antialias(false), opts(0) {}
float time_fraction;
float pitch_fraction;
/* SoundTouch */
bool use_soundtouch;
bool quick_seek;
bool antialias;
/* RubberBand */

View file

@ -28,6 +28,7 @@
#include "ardour/audiofilesource.h"
#include "ardour/session.h"
#include "ardour/audioregion.h"
#include "ardour/progress.h"
#include "pbd/i18n.h"
@ -40,23 +41,6 @@ STStretch::STStretch (Session& s, TimeFXRequest& req)
: Filter (s)
, tsr (req)
{
float percentage;
/* the soundtouch code wants a *tempo* change percentage, which is
of opposite sign to the length change.
*/
percentage = -tsr.time_fraction;
st.setSampleRate (s.sample_rate());
st.setChannels (1);
st.setTempoChange (percentage);
st.setPitchSemiTones (0);
st.setRateChange (0);
st.setSetting(SETTING_USE_QUICKSEEK, tsr.quick_seek);
st.setSetting(SETTING_USE_AA_FILTER, tsr.antialias);
}
STStretch::~STStretch ()
@ -64,99 +48,232 @@ STStretch::~STStretch ()
}
int
STStretch::run (boost::shared_ptr<Region> a_region, Progress* progress)
STStretch::run (boost::shared_ptr<Region> r, Progress* progress)
{
boost::shared_ptr<AudioRegion> region = boost::dynamic_pointer_cast<AudioRegion> (r);
if (!region) {
error << "STStretch::run() passed a non-audio region! WTF?" << endmsg;
return -1;
}
SourceList nsrcs;
samplecnt_t total_samples;
samplecnt_t done;
int ret = -1;
const samplecnt_t bufsize = 16384;
const samplecnt_t bufsize = 8192;
gain_t* gain_buffer = 0;
Sample *buffer = 0;
Sample** buffers = 0;
char suffix[32];
string new_name;
string::size_type at;
#ifndef NDEBUG
cerr << "STStretch: source region: position = " << region->position ()
<< ", start = " << region->start ()
<< ", length = " << region->length ()
<< ", ancestral_start = " << region->ancestral_start ()
<< ", ancestral_length = " << region->ancestral_length ()
<< ", stretch " << region->stretch ()
<< ", shift " << region->shift () << endl;
#endif
/*
* We have two cases to consider:
*
* 1. The region has not been stretched before.
*
* In this case, we just want to read region->length() samples
* from region->start().
*
* We will create a new region of region->length() *
* tsr.time_fraction samples. The new region will have its
* start set to 0 (because it has a new audio file that begins
* at the start of the stretched area) and its ancestral_start
* set to region->start() (so that we know where to begin
* reading if we want to stretch it again).
*
* 2. The region has been stretched before.
*
* The region starts at region->start() samples into its
* (possibly previously stretched) source file. But we don't
* want to read from its source file; we want to read from the
* file it was originally stretched from.
*
* The region's source begins at region->ancestral_start()
* samples into its master source file. Thus, we need to start
* reading at region->ancestral_start() + (region->start() /
* region->stretch()) samples into the master source. This
* value will also become the ancestral_start for the new
* region.
*
* We cannot use region->ancestral_length() to establish how
* many samples to read, because it won't be up to date if the
* region has been trimmed since it was last stretched. We
* must read region->length() / region->stretch() samples and
* stretch them by tsr.time_fraction * region->stretch(), for
* a new region of region->length() * tsr.time_fraction
* samples.
*
* Case 1 is of course a special case of 2, where
* region->ancestral_start() == 0 and region->stretch() == 1.
*
* When we ask to read from a region, we supply a position on
* the global timeline. The read function calculates the
* offset into the source as (position - region->position()) +
* region->start(). This calculation is used regardless of
* whether we are reading from a master or
* previously-stretched region. In order to read from a point
* n samples into the master source, we need to provide n -
* region->start() + region->position() as our position
* argument to master_read_at().
*
* Note that region->ancestral_length() is not used.
*
* I hope this is clear.
*/
double stretch = region->stretch () * tsr.time_fraction;
stretch = std::min(20.0, std::max(0.02, stretch));
samplecnt_t read_start = region->ancestral_start () +
samplecnt_t (region->start () / (double)region->stretch ());
samplecnt_t read_duration =
samplecnt_t (region->length () / (double)region->stretch ());
uint32_t channels = region->n_channels ();
#ifndef NDEBUG
cerr << "RBStretcher: input-len = " << read_duration
<< ", rate = " << session.sample_rate ()
<< ", channels = " << channels
<< ", opts = " << tsr.opts
<< ", stretch = " << stretch << endl;
#endif
soundtouch::SoundTouch st[channels];
for (uint32_t i = 0; i < channels; ++i) {
st[i].setSampleRate(session.sample_rate());
st[i].setChannels(1);
st[i].setTempo(1.0 / stretch);
st[i].setSetting(SETTING_USE_QUICKSEEK, tsr.quick_seek);
st[i].setSetting(SETTING_USE_AA_FILTER, tsr.antialias);
st[i].setSetting(SETTING_SEQUENCE_MS, 40);
st[i].setSetting(SETTING_SEEKWINDOW_MS, 15);
st[i].setSetting(SETTING_OVERLAP_MS, 8);
}
progress->set_progress (0);
tsr.done = false;
boost::shared_ptr<AudioRegion> region = boost::dynamic_pointer_cast<AudioRegion>(a_region);
total_samples = region->length() * region->n_channels();
done = 0;
/* the name doesn't need to be super-precise, but allow for 2 fractional
digits just to disambiguate close but not identical stretches.
* digits just to disambiguate close but not identical FX
*/
snprintf (suffix, sizeof (suffix), "@%d", (int) floor (tsr.time_fraction * 100.0f));
snprintf (suffix, sizeof (suffix), "@%d", (int)floor (stretch * 100.0f));
/* create new sources */
samplepos_t pos = 0;
if (make_new_sources (region, nsrcs, suffix)) {
goto out;
}
gain_buffer = new gain_t[bufsize];
buffer = new Sample[bufsize];
buffers = new float*[channels];
// soundtouch throws runtime_error on error
for (uint32_t i = 0; i < channels; ++i) {
buffers[i] = new float[bufsize];
}
/* we read from the master (original) sources for the region,
* not the ones currently in use, in case it's already been
* subject to timefx. */
try {
for (uint32_t i = 0; i < nsrcs.size(); ++i) {
/* start process */
pos = 0;
boost::shared_ptr<AudioSource> asrc
= boost::dynamic_pointer_cast<AudioSource>(nsrcs[i]);
samplepos_t pos = 0;
while (pos < read_duration && !tsr.cancel) {
samplecnt_t this_read = 0;
st.clear();
for (uint32_t i = 0; i < channels; ++i) {
samplepos_t this_time;
this_time = min (bufsize, read_duration - pos);
while (!tsr.cancel && pos < region->length()) {
samplecnt_t this_time;
samplepos_t this_position;
this_position = read_start + pos -
region->start () + region->position ();
this_time = min (bufsize, region->length() - pos);
this_read = region->master_read_at (buffers[i],
buffers[i],
gain_buffer,
this_position,
this_time,
i);
/* read from the master (original) sources for the region,
not the ones currently in use, in case it's already been
subject to timefx.
*/
if ((this_read = region->master_read_at (buffer, buffer, gain_buffer, pos + region->position(), this_time)) != this_time) {
error << string_compose (_("tempoize: error reading data from %1"), asrc->name()) << endmsg;
if (this_read != this_time) {
error << string_compose (_("tempoize: error reading data from %1 at %2 (wanted %3, got %4)"),
region->name (), pos + region->position (), this_time, this_read)
<< endmsg;
goto out;
}
st[i].putSamples (buffers[i], this_read);
}
pos += this_read;
done += this_read;
progress->set_progress (0.25 + ((float)pos / read_duration) * 0.75);
progress->set_progress ((float) done / total_samples);
for (uint32_t i = 0; i < channels; ++i) {
samplecnt_t avail = 0;
while ((avail = st[i].numSamples ()) > 0) {
this_read = min (bufsize, avail);
st.putSamples (buffer, this_read);
this_read = st[i].receiveSamples(buffers[i], this_read);
boost::shared_ptr<AudioSource> asrc = boost::dynamic_pointer_cast<AudioSource> (nsrcs[i]);
if (!asrc) {
continue;
}
while ((this_read = st.receiveSamples (buffer, bufsize)) > 0 && !tsr.cancel) {
if (asrc->write (buffer, this_read) != this_read) {
error << string_compose (_("error writing tempo-adjusted data to %1"), asrc->name()) << endmsg;
if (asrc->write (buffers[i], this_read) != this_read) {
error << string_compose (_("error writing tempo-adjusted data to %1"), nsrcs[i]->name ()) << endmsg;
goto out;
}
}
}
}
if (!tsr.cancel) {
st.flush ();
for (uint32_t i = 0; i < channels; ++i) {
st[i].flush ();
}
}
while (!tsr.cancel && (this_read = st.receiveSamples (buffer, bufsize)) > 0) {
if (asrc->write (buffer, this_read) != this_read) {
error << string_compose (_("error writing tempo-adjusted data to %1"), asrc->name()) << endmsg;
/* completing */
for (uint32_t i = 0; i < channels; ++i) {
samplecnt_t avail = 0;
samplecnt_t this_read = 0;
while ((avail = st[i].numSamples ()) > 0) {
this_read = min (bufsize, avail);
this_read = st[i].receiveSamples(buffers[i], this_read);
boost::shared_ptr<AudioSource> asrc = boost::dynamic_pointer_cast<AudioSource> (nsrcs[i]);
if (!asrc) {
continue;
}
if (asrc->write (buffers[i], this_read) != this_read) {
error << string_compose (_("error writing tempo-adjusted data to %1"), nsrcs[i]->name ()) << endmsg;
goto out;
}
}
}
} catch (runtime_error& err) {
error << _("timefx code failure. please notify ardour-developers.") << endmsg;
error << string_compose (_("programming error: %1"), X_("timefx code failure")) << endmsg;
error << err.what () << endmsg;
goto out;
}
@ -164,7 +281,7 @@ STStretch::run (boost::shared_ptr<Region> a_region, Progress* progress)
new_name = region->name ();
at = new_name.find ('@');
// remove any existing stretch indicator
/* remove any existing stretch indicator */
if (at != string::npos && at > 2) {
new_name = new_name.substr (0, at - 1);
@ -179,26 +296,36 @@ STStretch::run (boost::shared_ptr<Region> a_region, Progress* progress)
/* now reset ancestral data for each new region */
for (vector<boost::shared_ptr<Region> >::iterator x = results.begin (); x != results.end (); ++x) {
samplepos_t astart = (*x)->ancestral_start();
samplepos_t alength = (*x)->ancestral_length();
samplepos_t start;
samplecnt_t length;
(*x)->set_ancestral_data (read_start,
read_duration,
stretch,
1.0);
(*x)->set_master_sources (region->master_sources ());
/* Multiply the old (possibly previously stretched) region length by the
* extra stretch applied this time around to get its new length. This is
* currently a non-music-based edit.
*/
(*x)->set_length ((*x)->length () * tsr.time_fraction, 0);
}
// note: tsr.fraction is a percentage of original length. 100 = no change,
// 50 is half as long, 200 is twice as long, etc.
/* stretch region gain envelope */
/* XXX: assuming we've only processed one input region into one result here */
float stretch = (*x)->stretch() * (tsr.time_fraction/100.0);
start = (samplepos_t) floor (astart + ((astart - (*x)->start()) / stretch));
length = (samplecnt_t) floor (alength / stretch);
(*x)->set_ancestral_data (start, length, stretch, (*x)->shift());
if (ret == 0 && tsr.time_fraction != 1) {
boost::shared_ptr<AudioRegion> result = boost::dynamic_pointer_cast<AudioRegion> (results.front ());
assert (result);
result->envelope ()->x_scale (tsr.time_fraction);
}
out:
delete[] gain_buffer;
delete [] buffer;
if (buffers) {
for (uint32_t i = 0; i < channels; ++i) {
delete[] buffers[i];
}
delete[] buffers;
}
if (ret || tsr.cancel) {
for (SourceList::iterator si = nsrcs.begin (); si != nsrcs.end (); ++si) {

View file

@ -327,8 +327,8 @@ def configure(conf):
if conf.is_defined ('HAVE_LV2_1_10_0'):
conf.define ('LV2_EXTENDED', 1)
# autowaf.check_pkg(conf, 'soundtouch-1.0', uselib_store='SOUNDTOUCH',
# mandatory=False)
autowaf.check_pkg(conf, 'soundtouch', uselib_store='SOUNDTOUCH',
atleast_version='1.8.0', mandatory=False)
autowaf.check_pkg(conf, 'cppunit', uselib_store='CPPUNIT',
atleast_version='1.12.0', mandatory=False)
autowaf.check_pkg(conf, 'ogg', uselib_store='OGG', atleast_version='1.1.2')
@ -428,8 +428,10 @@ def build(bld):
'LIBARDOUR="' + bld.env['lwrcase_dirname'] + '"'
]
if bld.is_defined('HAVE_SOUNDTOUCH'):
obj.source += ['st_stretch.cc']
#obj.source += ' st_stretch.cc st_pitch.cc '
#obj.uselib += ' SOUNDTOUCH '
obj.uselib += ['SOUNDTOUCH']
#obj.add_objects = 'default/libs/surfaces/control_protocol/smpte_1.o'
if bld.is_defined('HAVE_LILV') :

View file

@ -28,6 +28,7 @@
#ifdef COMPILER_MINGW
#include <io.h> // For W_OK
#include <windows.h>
#endif
#include <glibmm/fileutils.h>