Use soundtouch for vocal audio time stretching (1/2)

2025-12-06 14:54:56 +01:00 · 2020-08-23 16:37:56 +08:00 · 2020-08-23 16:37:56 +08:00 · 262281bc1f
commit 262281bc1f
parent fb2d33c6a3
5 changed files with 227 additions and 100 deletions
--- a/libs/ardour/ardour/stretch.h
+++ b/libs/ardour/ardour/stretch.h
@@ -27,7 +27,6 @@
 #include "ardour/filter.h"
 #include "ardour/timefx_request.h"

-#ifdef USE_RUBBERBAND

 #include "ardour/rb_effect.h"

@@ -41,8 +40,7 @@ class LIBARDOUR_API RBStretch : public RBEffect {

 } /* namespace */

-#else
-
+#ifdef HAVE_SOUNDTOUCH
 #include <soundtouch/SoundTouch.h>

 namespace ARDOUR {
@@ -52,16 +50,14 @@ class LIBARDOUR_API STStretch : public Filter {
 	STStretch (ARDOUR::Session&, TimeFXRequest&);
 	~STStretch ();

-	int run (boost::shared_ptr<ARDOUR::Region>);
+	int run (boost::shared_ptr<ARDOUR::Region>, Progress* progress = 0);

  private:
 	TimeFXRequest& tsr;
-
-	soundtouch::SoundTouch st;
 };

 } /* namespace */
-
 #endif

+
 #endif /* __ardour_stretch_h__ */
--- a/libs/ardour/ardour/timefx_request.h
+++ b/libs/ardour/ardour/timefx_request.h
@@ -26,10 +26,11 @@ namespace ARDOUR {
 	struct TimeFXRequest : public InterThreadInfo {
 		TimeFXRequest()
 			: time_fraction(0), pitch_fraction(0),
-			quick_seek(false), antialias(false),  opts(0) {}
+			use_soundtouch(false), quick_seek(false), antialias(false),  opts(0) {}
 		float time_fraction;
 		float pitch_fraction;
 		/* SoundTouch */
+		bool use_soundtouch;
 		bool  quick_seek;
 		bool  antialias;
 		/* RubberBand */
--- a/libs/ardour/st_stretch.cc
+++ b/libs/ardour/st_stretch.cc
@@ -28,6 +28,7 @@
 #include "ardour/audiofilesource.h"
 #include "ardour/session.h"
 #include "ardour/audioregion.h"
+#include "ardour/progress.h"

 #include "pbd/i18n.h"

@@ -40,23 +41,6 @@ STStretch::STStretch (Session& s, TimeFXRequest& req)
 	: Filter (s)
 	, tsr (req)
 {
-	float percentage;
-
-	/* the soundtouch code wants a *tempo* change percentage, which is
-	   of opposite sign to the length change.
-	*/
-
-	percentage = -tsr.time_fraction;
-
-	st.setSampleRate (s.sample_rate());
-	st.setChannels (1);
-	st.setTempoChange (percentage);
-	st.setPitchSemiTones (0);
-	st.setRateChange (0);
-
-	st.setSetting(SETTING_USE_QUICKSEEK, tsr.quick_seek);
-	st.setSetting(SETTING_USE_AA_FILTER, tsr.antialias);
-
 }

 STStretch::~STStretch ()
@@ -64,99 +48,232 @@ STStretch::~STStretch ()
 }

 int
-STStretch::run (boost::shared_ptr<Region> a_region, Progress* progress)
+STStretch::run (boost::shared_ptr<Region> r, Progress* progress)
 {
+	boost::shared_ptr<AudioRegion> region = boost::dynamic_pointer_cast<AudioRegion> (r);
+
+	if (!region) {
+		error << "STStretch::run() passed a non-audio region! WTF?" << endmsg;
+		return -1;
+	}
+
 	SourceList        nsrcs;
-	samplecnt_t total_samples;
-	samplecnt_t done;
 	int               ret         = -1;
-	const samplecnt_t bufsize = 16384;
+	const samplecnt_t bufsize     = 8192;
 	gain_t*           gain_buffer = 0;
-	Sample *buffer = 0;
+	Sample**          buffers     = 0;
 	char              suffix[32];
 	string            new_name;
 	string::size_type at;

+#ifndef NDEBUG
+	cerr << "STStretch: source region: position = " << region->position ()
+	     << ", start = " << region->start ()
+	     << ", length = " << region->length ()
+	     << ", ancestral_start = " << region->ancestral_start ()
+	     << ", ancestral_length = " << region->ancestral_length ()
+	     << ", stretch " << region->stretch ()
+	     << ", shift " << region->shift () << endl;
+#endif
+
+	/*
+	 * We have two cases to consider:
+	 *
+	 * 1. The region has not been stretched before.
+	 *
+	 * In this case, we just want to read region->length() samples
+	 * from region->start().
+	 *
+	 * We will create a new region of region->length() *
+	 * tsr.time_fraction samples.  The new region will have its
+	 * start set to 0 (because it has a new audio file that begins
+	 * at the start of the stretched area) and its ancestral_start
+	 * set to region->start() (so that we know where to begin
+	 * reading if we want to stretch it again).
+	 *
+	 * 2. The region has been stretched before.
+	 *
+	 * The region starts at region->start() samples into its
+	 * (possibly previously stretched) source file.  But we don't
+	 * want to read from its source file; we want to read from the
+	 * file it was originally stretched from.
+	 *
+	 * The region's source begins at region->ancestral_start()
+	 * samples into its master source file.  Thus, we need to start
+	 * reading at region->ancestral_start() + (region->start() /
+	 * region->stretch()) samples into the master source.  This
+	 * value will also become the ancestral_start for the new
+	 * region.
+	 *
+	 * We cannot use region->ancestral_length() to establish how
+	 * many samples to read, because it won't be up to date if the
+	 * region has been trimmed since it was last stretched.  We
+	 * must read region->length() / region->stretch() samples and
+	 * stretch them by tsr.time_fraction * region->stretch(), for
+	 * a new region of region->length() * tsr.time_fraction
+	 * samples.
+	 *
+	 * Case 1 is of course a special case of 2, where
+	 * region->ancestral_start() == 0 and region->stretch() == 1.
+	 *
+	 * When we ask to read from a region, we supply a position on
+	 * the global timeline.  The read function calculates the
+	 * offset into the source as (position - region->position()) +
+	 * region->start().  This calculation is used regardless of
+	 * whether we are reading from a master or
+	 * previously-stretched region.  In order to read from a point
+	 * n samples into the master source, we need to provide n -
+	 * region->start() + region->position() as our position
+	 * argument to master_read_at().
+	 *
+	 * Note that region->ancestral_length() is not used.
+	 *
+	 * I hope this is clear.
+	 */
+
+	double stretch = region->stretch () * tsr.time_fraction;
+	stretch = std::min(20.0, std::max(0.02, stretch));
+	samplecnt_t read_start = region->ancestral_start () +
+	                         samplecnt_t (region->start () / (double)region->stretch ());
+
+	samplecnt_t read_duration =
+	    samplecnt_t (region->length () / (double)region->stretch ());
+
+	uint32_t channels = region->n_channels ();
+
+#ifndef NDEBUG
+	cerr << "RBStretcher: input-len = " << read_duration
+	     << ", rate = " << session.sample_rate ()
+	     << ", channels = " << channels
+	     << ", opts = " << tsr.opts
+	     << ", stretch = " << stretch << endl;
+#endif
+
+
+	soundtouch::SoundTouch st[channels];
+	for (uint32_t i = 0; i < channels; ++i) {
+		st[i].setSampleRate(session.sample_rate());
+		st[i].setChannels(1);
+		st[i].setTempo(1.0 / stretch);
+
+		st[i].setSetting(SETTING_USE_QUICKSEEK, tsr.quick_seek);
+		st[i].setSetting(SETTING_USE_AA_FILTER, tsr.antialias);
+        st[i].setSetting(SETTING_SEQUENCE_MS, 40);
+        st[i].setSetting(SETTING_SEEKWINDOW_MS, 15);
+        st[i].setSetting(SETTING_OVERLAP_MS, 8);
+	}
+
 	progress->set_progress (0);
 	tsr.done = false;

-	boost::shared_ptr<AudioRegion> region = boost::dynamic_pointer_cast<AudioRegion>(a_region);
-
-	total_samples = region->length() * region->n_channels();
-	done = 0;

 	/* the name doesn't need to be super-precise, but allow for 2 fractional
-	   digits just to disambiguate close but not identical stretches.
+	 * digits just to disambiguate close but not identical FX
 	 */

-	snprintf (suffix, sizeof (suffix), "@%d", (int) floor (tsr.time_fraction * 100.0f));
+	snprintf (suffix, sizeof (suffix), "@%d", (int)floor (stretch * 100.0f));

 	/* create new sources */

+	samplepos_t pos = 0;
+
 	if (make_new_sources (region, nsrcs, suffix)) {
 		goto out;
 	}

 	gain_buffer = new gain_t[bufsize];
-	buffer = new Sample[bufsize];
+	buffers     = new float*[channels];

-	// soundtouch throws runtime_error on error
+	for (uint32_t i = 0; i < channels; ++i) {
+		buffers[i] = new float[bufsize];
+	}
+
+	/* we read from the master (original) sources for the region,
+	 * not the ones currently in use, in case it's already been
+	 * subject to timefx. */

 	try {
-		for (uint32_t i = 0; i < nsrcs.size(); ++i) {
+		/* start process */
+		pos = 0;

-			boost::shared_ptr<AudioSource> asrc
-				= boost::dynamic_pointer_cast<AudioSource>(nsrcs[i]);
-
-			samplepos_t pos = 0;
+		while (pos < read_duration && !tsr.cancel) {
 			samplecnt_t this_read = 0;

-			st.clear();
+			for (uint32_t i = 0; i < channels; ++i) {
+				samplepos_t this_time;
+				this_time = min (bufsize, read_duration - pos);

-			while (!tsr.cancel && pos < region->length()) {
-				samplecnt_t this_time;
+				samplepos_t this_position;
+				this_position = read_start + pos -
+				                region->start () + region->position ();

-				this_time = min (bufsize, region->length() - pos);
+				this_read = region->master_read_at (buffers[i],
+				                                    buffers[i],
+				                                    gain_buffer,
+				                                    this_position,
+				                                    this_time,
+				                                    i);

-				/* read from the master (original) sources for the region,
-				   not the ones currently in use, in case it's already been
-				   subject to timefx.
-				*/
-
-				if ((this_read = region->master_read_at (buffer, buffer, gain_buffer, pos + region->position(), this_time)) != this_time) {
-					error << string_compose (_("tempoize: error reading data from %1"), asrc->name()) << endmsg;
+				if (this_read != this_time) {
+					error << string_compose (_("tempoize: error reading data from %1 at %2 (wanted %3, got %4)"),
+					                         region->name (), pos + region->position (), this_time, this_read)
+					      << endmsg;
 					goto out;
 				}

+				st[i].putSamples (buffers[i], this_read);
+			}
 			pos += this_read;
-				done += this_read;
+			progress->set_progress (0.25 + ((float)pos / read_duration) * 0.75);

-				progress->set_progress ((float) done / total_samples);
+			for (uint32_t i = 0; i < channels; ++i) {
+				samplecnt_t avail = 0;
+				while ((avail = st[i].numSamples ()) > 0) {
+					this_read = min (bufsize, avail);

-				st.putSamples (buffer, this_read);
+					this_read = st[i].receiveSamples(buffers[i], this_read);
+					boost::shared_ptr<AudioSource> asrc = boost::dynamic_pointer_cast<AudioSource> (nsrcs[i]);
+					if (!asrc) {
+						continue;
+					}

-				while ((this_read = st.receiveSamples (buffer, bufsize)) > 0 && !tsr.cancel) {
-					if (asrc->write (buffer, this_read) != this_read) {
-						error << string_compose (_("error writing tempo-adjusted data to %1"), asrc->name()) << endmsg;
+					if (asrc->write (buffers[i], this_read) != this_read) {
+						error << string_compose (_("error writing tempo-adjusted data to %1"), nsrcs[i]->name ()) << endmsg;
 						goto out;
 					}
 				}
 			}
+		}

 		if (!tsr.cancel) {
-				st.flush ();
+			for (uint32_t i = 0; i < channels; ++i) {
+				st[i].flush ();
+			}
 		}

-			while (!tsr.cancel && (this_read = st.receiveSamples (buffer, bufsize)) > 0) {
-				if (asrc->write (buffer, this_read) != this_read) {
-					error << string_compose (_("error writing tempo-adjusted data to %1"), asrc->name()) << endmsg;
+		/* completing */
+		for (uint32_t i = 0; i < channels; ++i) {
+			samplecnt_t avail = 0;
+			samplecnt_t this_read = 0;
+			while ((avail = st[i].numSamples ()) > 0) {
+				this_read = min (bufsize, avail);
+
+				this_read = st[i].receiveSamples(buffers[i], this_read);
+
+				boost::shared_ptr<AudioSource> asrc = boost::dynamic_pointer_cast<AudioSource> (nsrcs[i]);
+				if (!asrc) {
+					continue;
+				}
+
+				if (asrc->write (buffers[i], this_read) != this_read) {
+					error << string_compose (_("error writing tempo-adjusted data to %1"), nsrcs[i]->name ()) << endmsg;
 					goto out;
 				}
 			}
 		}

 	} catch (runtime_error& err) {
-		error << _("timefx code failure. please notify ardour-developers.") << endmsg;
+		error << string_compose (_("programming error: %1"), X_("timefx code failure")) << endmsg;
 		error << err.what () << endmsg;
 		goto out;
 	}
@@ -164,7 +281,7 @@ STStretch::run (boost::shared_ptr<Region> a_region, Progress* progress)
 	new_name = region->name ();
 	at       = new_name.find ('@');

-	// remove any existing stretch indicator
+	/* remove any existing stretch indicator */

 	if (at != string::npos && at > 2) {
 		new_name = new_name.substr (0, at - 1);
@@ -179,26 +296,36 @@ STStretch::run (boost::shared_ptr<Region> a_region, Progress* progress)
 	/* now reset ancestral data for each new region */

 	for (vector<boost::shared_ptr<Region> >::iterator x = results.begin (); x != results.end (); ++x) {
-		samplepos_t astart = (*x)->ancestral_start();
-		samplepos_t alength = (*x)->ancestral_length();
-		samplepos_t start;
-		samplecnt_t length;
+		(*x)->set_ancestral_data (read_start,
+		                          read_duration,
+		                          stretch,
+		                          1.0);
+		(*x)->set_master_sources (region->master_sources ());
+		/* Multiply the old (possibly previously stretched) region length by the extra
+		 * stretch applied this time to get its new length. For now this is a non-music-based edit.
+		 */
+		(*x)->set_length ((*x)->length () * tsr.time_fraction, 0);
+	}

-		// note: tsr.fraction is a percentage of original length. 100 = no change,
-		// 50 is half as long, 200 is twice as long, etc.
+	/* stretch region gain envelope */
+	/* XXX: assuming we've only processed one input region into one result here */

-		float stretch = (*x)->stretch() * (tsr.time_fraction/100.0);
-
-		start = (samplepos_t) floor (astart + ((astart - (*x)->start()) / stretch));
-		length = (samplecnt_t) floor (alength / stretch);
-
-		(*x)->set_ancestral_data (start, length, stretch, (*x)->shift());
+	if (ret == 0 && tsr.time_fraction != 1) {
+		boost::shared_ptr<AudioRegion> result = boost::dynamic_pointer_cast<AudioRegion> (results.front ());
+		assert (result);
+		result->envelope ()->x_scale (tsr.time_fraction);
 	}

 out:

 	delete[] gain_buffer;
-	delete [] buffer;
+
+	if (buffers) {
+		for (uint32_t i = 0; i < channels; ++i) {
+			delete[] buffers[i];
+		}
+		delete[] buffers;
+	}

 	if (ret || tsr.cancel) {
 		for (SourceList::iterator si = nsrcs.begin (); si != nsrcs.end (); ++si) {
--- a/libs/ardour/wscript
+++ b/libs/ardour/wscript
@@ -327,8 +327,8 @@ def configure(conf):
    if conf.is_defined ('HAVE_LV2_1_10_0'):
        conf.define ('LV2_EXTENDED', 1)

-#    autowaf.check_pkg(conf, 'soundtouch-1.0', uselib_store='SOUNDTOUCH',
-#                      mandatory=False)
+    autowaf.check_pkg(conf, 'soundtouch', uselib_store='SOUNDTOUCH',
+                      atleast_version='1.8.0', mandatory=False)
    autowaf.check_pkg(conf, 'cppunit', uselib_store='CPPUNIT',
                      atleast_version='1.12.0', mandatory=False)
    autowaf.check_pkg(conf, 'ogg', uselib_store='OGG', atleast_version='1.1.2')
@@ -428,8 +428,10 @@ def build(bld):
        'LIBARDOUR="' + bld.env['lwrcase_dirname'] + '"'
        ]

+    if bld.is_defined('HAVE_SOUNDTOUCH'):
+        obj.source += ['st_stretch.cc']
        #obj.source += ' st_stretch.cc st_pitch.cc '
-    #obj.uselib += ' SOUNDTOUCH '
+        obj.uselib += ['SOUNDTOUCH']
    #obj.add_objects = 'default/libs/surfaces/control_protocol/smpte_1.o'

    if bld.is_defined('HAVE_LILV') :
--- a/libs/pbd/file_utils.cc
+++ b/libs/pbd/file_utils.cc
@@ -28,6 +28,7 @@

 #ifdef COMPILER_MINGW
 #include <io.h> // For W_OK
+#include <windows.h>
 #endif

 #include <glibmm/fileutils.h>