From 2881bc36983b071ff7ce954ebddd91148576506a Mon Sep 17 00:00:00 2001 From: GZharun Date: Thu, 26 Mar 2015 15:04:46 +0200 Subject: [PATCH 01/13] [Summary] Disabled call for the function which is not used in TracksLive. This function prepares label with connection strings for strips. [Reviewed by] YPozdnyakov --- gtk2_ardour/mixer_strip.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/gtk2_ardour/mixer_strip.cc b/gtk2_ardour/mixer_strip.cc index acccd211b4..7592c3aa0e 100644 --- a/gtk2_ardour/mixer_strip.cc +++ b/gtk2_ardour/mixer_strip.cc @@ -1198,8 +1198,11 @@ void MixerStrip::update_input_display () { // update_io_button (_route, _width, true); - update_io_button (_route, true); - panners.setup_pan (); + + // Not used by TracksLive + //update_io_button (_route, true); + + panners.setup_pan (); if (has_audio_outputs ()) { panners.show_all (); @@ -1212,7 +1215,9 @@ MixerStrip::update_input_display () void MixerStrip::update_output_display () { - update_io_button (_route, false); + // Not used by TracksLive + //update_io_button (_route, false); + gpm.setup_meters (); panners.setup_pan (); From 6a509344c83b4ae671fc6c5a715adad683d7a490 Mon Sep 17 00:00:00 2001 From: GZharun Date: Fri, 27 Mar 2015 17:01:38 +0200 Subject: [PATCH 02/13] [Summary] Blocked mixer/meter bringe selection change notifications during tracks deletion [Reviewed by] YPozdnyakov --- gtk2_ardour/ardour_ui.cc | 9 +++++++++ gtk2_ardour/editor.h | 3 +++ gtk2_ardour/public_editor.h | 4 ++++ 3 files changed, 16 insertions(+) diff --git a/gtk2_ardour/ardour_ui.cc b/gtk2_ardour/ardour_ui.cc index 80a2a87ca3..fa1a343d50 100644 --- a/gtk2_ardour/ardour_ui.cc +++ b/gtk2_ardour/ardour_ui.cc @@ -133,6 +133,7 @@ typedef uint64_t microseconds_t; #include "splash.h" #include "theme_manager.h" #include "time_axis_view_item.h" +#include "mixer_bridge_view.h" #include "utils.h" #include "video_server_dialog.h" #include "add_video_dialog.h" @@ -3807,6 +3808,12 @@ ARDOUR_UI::delete_selected_tracks() TrackSelection& track_selection = editor->get_selection().tracks; editor->get_selection().block_tracks_changed (true); + MixerBridgeView& mixer_view = editor->get_mixer_bridge (); + mixer_view.selection().block_routes_changed(true); + + MixerBridgeView& meter_view = editor->get_meter_bridge (); + meter_view.selection().block_routes_changed(true); + boost::shared_ptr routes_to_remove(new RouteList); for (list::iterator i = track_selection.begin(); i != track_selection.end(); ++i) { RouteUI* t = dynamic_cast (*i); @@ -3833,6 +3840,8 @@ ARDOUR_UI::delete_selected_tracks() /* restore selection notifications and update the selection */ editor->get_selection().block_tracks_changed (false); + mixer_view.selection().block_routes_changed(false); + meter_view.selection().block_routes_changed(false); editor->get_selection().TracksChanged(); _progress_dialog.hide_pd (); diff --git a/gtk2_ardour/editor.h b/gtk2_ardour/editor.h index f8e2b246b1..599aba3f76 100644 --- a/gtk2_ardour/editor.h +++ b/gtk2_ardour/editor.h @@ -259,6 +259,9 @@ class Editor : public PublicEditor, public PBD::ScopedConnectionList, public ARD Selection& get_cut_buffer() const { return *cut_buffer; } void track_mixer_selection (); + MixerBridgeView& get_mixer_bridge () { return _mixer_bridge_view; } + MixerBridgeView& get_meter_bridge () { return _meter_bridge_view; } + bool extend_selection_to_track (TimeAxisView&); bool extend_time_selection_to_track (TimeAxisView&); diff --git a/gtk2_ardour/public_editor.h b/gtk2_ardour/public_editor.h index 
785a0d1f70..0459b12f26 100644 --- a/gtk2_ardour/public_editor.h +++ b/gtk2_ardour/public_editor.h @@ -65,6 +65,7 @@ namespace Gtkmm2ext { class TearOff; } +class MixerBridgeView; class AudioRegionView; class AutomationLine; class AutomationTimeAxisView; @@ -222,6 +223,9 @@ class PublicEditor : public Gtk::Window, public PBD::StatefulDestructible, publi virtual void set_show_measures (bool yn) = 0; virtual bool show_measures () const = 0; + virtual MixerBridgeView& get_mixer_bridge () = 0; + virtual MixerBridgeView& get_meter_bridge () = 0; + virtual Editing::MouseMode effective_mouse_mode () const = 0; /** Import existing media */ From c1af68b7f61a2ffee224614e7193e1ca0c9d6223 Mon Sep 17 00:00:00 2001 From: GZharun Date: Tue, 31 Mar 2015 15:57:02 +0300 Subject: [PATCH 03/13] [Summary] Added cleanup for GUI properties when route is removed. [Details] This issue caused serious overhead when adding new routes after previous were removed. Also it resulted in garbage info saved in session file. [Reviewed by] PDavis, YPozdnyakov --- gtk2_ardour/automation_time_axis.cc | 1 + gtk2_ardour/axis_view.h | 8 ++++++ gtk2_ardour/gui_object.cc | 40 +++++++++++++++++++++++++++++ gtk2_ardour/gui_object.h | 6 ++++- gtk2_ardour/route_time_axis.cc | 3 +++ gtk2_ardour/route_ui.cc | 6 ++++- 6 files changed, 62 insertions(+), 2 deletions(-) diff --git a/gtk2_ardour/automation_time_axis.cc b/gtk2_ardour/automation_time_axis.cc index 1a99c0bc98..79f7c4bfcf 100644 --- a/gtk2_ardour/automation_time_axis.cc +++ b/gtk2_ardour/automation_time_axis.cc @@ -231,6 +231,7 @@ AutomationTimeAxisView::AutomationTimeAxisView ( AutomationTimeAxisView::~AutomationTimeAxisView () { + cleanup_gui_properties (); delete _view; } diff --git a/gtk2_ardour/axis_view.h b/gtk2_ardour/axis_view.h index 4ce76c92ce..51a48d3150 100644 --- a/gtk2_ardour/axis_view.h +++ b/gtk2_ardour/axis_view.h @@ -70,6 +70,14 @@ class AxisView : public virtual Selectable, public PBD::ScopedConnectionList, pu property_hashtable.emplace(property_name, s.str()); gui_object_state().set_property (state_id(), property_name, value); } + + void cleanup_gui_properties () + { + // remove related property node from the GUI state + gui_object_state().remove_node(state_id() ); + property_hashtable.clear (); + } + bool marked_for_display () const; virtual bool set_marked_for_display (bool); diff --git a/gtk2_ardour/gui_object.cc b/gtk2_ardour/gui_object.cc index 3e21f82fa4..bcc2fcdfa1 100644 --- a/gtk2_ardour/gui_object.cc +++ b/gtk2_ardour/gui_object.cc @@ -71,6 +71,46 @@ GUIObjectState::get_or_add_node (const string& id) return get_or_add_node (&_state, id); } +/** Remove property from the node with provided id. + * If there is no properties except the node id - remove the node. + * @param id property of Object node to look for. + * @param prop_name name of the Object property to remove. + * @return value of true if property is found, or false if not. + */ + +bool +GUIObjectState::remove_property (const std::string& id, const std::string& prop_name) +{ + XMLNode* child = get_node (&_state, id); + + if (!child) { + return false; + } + + XMLProperty* p = child->property (prop_name ); + if (!p) { + return false; + } + + child->remove_property (prop_name ); + + if (child->children().empty() && child->properties().size() == 1 && child->property (X_("id")) ) { + remove_node (id); + } + + return true; +} + +/** Remove node with provided id. + * @param id property of Object node to look for. 
+*/ + +void +GUIObjectState::remove_node (const std::string& id) +{ + _state.remove_nodes_and_delete(X_("id"), id ); +} + /** Get a string from our state. * @param id property of Object node to look for. * @param prop_name name of the Object property to return. diff --git a/gtk2_ardour/gui_object.h b/gtk2_ardour/gui_object.h index ee6d1cdf4c..9868ef9971 100644 --- a/gtk2_ardour/gui_object.h +++ b/gtk2_ardour/gui_object.h @@ -47,12 +47,16 @@ public: s << val; child->add_property (prop_name.c_str(), s.str()); } - + + bool remove_property (const std::string& id, const std::string& prop_name); + std::list all_ids () const; static XMLNode* get_node (const XMLNode *, const std::string &); XMLNode* get_or_add_node (const std::string &); static XMLNode* get_or_add_node (XMLNode *, const std::string &); + + void remove_node (const std::string& id); private: XMLNode _state; diff --git a/gtk2_ardour/route_time_axis.cc b/gtk2_ardour/route_time_axis.cc index e0ce6cde3f..d8cfd87e63 100644 --- a/gtk2_ardour/route_time_axis.cc +++ b/gtk2_ardour/route_time_axis.cc @@ -228,6 +228,9 @@ RouteTimeAxisView::set_route (boost::shared_ptr rt) RouteTimeAxisView::~RouteTimeAxisView () { + // must be handled before CatchDeletion (this) + cleanup_gui_properties (); + CatchDeletion (this); for (list::iterator i = processor_automation.begin(); i != processor_automation.end(); ++i) { diff --git a/gtk2_ardour/route_ui.cc b/gtk2_ardour/route_ui.cc index 4631dca820..e9c3b022fb 100644 --- a/gtk2_ardour/route_ui.cc +++ b/gtk2_ardour/route_ui.cc @@ -102,9 +102,13 @@ RouteUI::RouteUI (ARDOUR::Session* sess, const std::string& layout_script_file) RouteUI::~RouteUI() { + // remove RouteUI property node from the GUI state + // must be handled before resetting _route + gui_object_state().remove_node(route_state_id() ); + _route.reset (); /* drop reference to route, so that it can be cleaned up */ route_connections.drop_connections (); - + delete solo_menu; delete mute_menu; delete sends_menu; From 5545ec29acd38867d485282a205c59fae1073092 Mon Sep 17 00:00:00 2001 From: GZharun Date: Wed, 1 Apr 2015 16:32:23 +0300 Subject: [PATCH 04/13] [Summary] When removing multiple tracks, made the inspector switch to the first non-selected track (as happens when we deselect all tracks). Previously each removed track used to be set (one by one) into the inspector.
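The editor.cc hunk that follows implements this with a small free-function predicate and std::find_if over track_views. A minimal standalone sketch of that selection rule, using hypothetical stand-in types for TimeAxisView and the track list, is:

    #include <algorithm>
    #include <list>

    struct TrackView {                       // stand-in for TimeAxisView (assumption)
        bool selected;
        bool get_selected () const { return selected; }
    };

    // mirrors the tv_not_selected() predicate introduced by this patch
    static bool tv_not_selected (TrackView* tv)
    {
        return !tv->get_selected ();
    }

    // return the first non-selected track, or 0 when every remaining track is selected
    static TrackView* first_non_selected (std::list<TrackView*>& track_views)
    {
        std::list<TrackView*>::iterator i =
            std::find_if (track_views.begin (), track_views.end (), tv_not_selected);
        return (i != track_views.end ()) ? *i : 0;
    }

When the search comes back empty, the caller hides the editor mixer strip instead of selecting anything, as the hunk below does.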
--- gtk2_ardour/editor.cc | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/gtk2_ardour/editor.cc b/gtk2_ardour/editor.cc index f94e559d61..6a5e6dacb1 100644 --- a/gtk2_ardour/editor.cc +++ b/gtk2_ardour/editor.cc @@ -5197,6 +5197,12 @@ Editor::add_routes (RouteList& routes) connect_routes_and_update_global_rec_button (routes); } +namespace { + bool tv_not_selected (TimeAxisView *tv) { + return !tv->get_selected (); + } +} + void Editor::timeaxisview_deleted (TimeAxisView *tv) { @@ -5240,19 +5246,20 @@ Editor::timeaxisview_deleted (TimeAxisView *tv) if (current_mixer_strip && current_mixer_strip->route() == route) { - TimeAxisView* next_tv; - - if (track_views.empty()) { - next_tv = 0; - } else if (i == track_views.end()) { - next_tv = track_views.front(); - } else { - next_tv = (*i); + // find first non selected track + TimeAxisView* first_non_selected_tv = 0; + + if (!track_views.empty() ) { + + i = std::find_if (track_views.begin(), track_views.end(), tv_not_selected); + + if (i != track_views.end() ) { + first_non_selected_tv = (*i); + } } - - - if (next_tv ) { - set_selected_mixer_strip (*next_tv); + + if (first_non_selected_tv ) { + set_selected_mixer_strip (*first_non_selected_tv); } else { /* make the editor mixer strip go away setting the * button to inactive (which also unticks the menu option) From 8612cc1c0aeaba9d7025d5b11a37d968097a401f Mon Sep 17 00:00:00 2001 From: Greg Zharun Date: Wed, 8 Apr 2015 16:29:33 +0300 Subject: [PATCH 05/13] [Summary] Added SSE sound processing functions support for Windows. Version 1. --- libs/ardour/mix.cc | 6 +- libs/ardour/sse_functions_64bit_win.s | 679 ++++++++++++++++++++++++++ libs/ardour/wscript | 7 + libs/pbd/fpu.cc | 6 +- libs/pbd/msvc/fpu.cc | 105 ++-- libs/pbd/wscript | 11 +- wscript | 22 +- 7 files changed, 762 insertions(+), 74 deletions(-) create mode 100644 libs/ardour/sse_functions_64bit_win.s diff --git a/libs/ardour/mix.cc b/libs/ardour/mix.cc index 220cd0660c..adae68ae7f 100644 --- a/libs/ardour/mix.cc +++ b/libs/ardour/mix.cc @@ -32,7 +32,7 @@ using namespace ARDOUR; // Debug wrappers float -debug_compute_peak (ARDOUR::Sample *buf, pframes_t nsamples, float current) +debug_compute_peak (const ARDOUR::Sample *buf, pframes_t nsamples, float current) { if ( ((intptr_t)buf % 16) != 0) { std::cerr << "compute_peak(): buffer unaligned!" << std::endl; @@ -52,7 +52,7 @@ debug_apply_gain_to_buffer (ARDOUR::Sample *buf, pframes_t nframes, float gain) } void -debug_mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t nframes, float gain) +debug_mix_buffers_with_gain (ARDOUR::Sample *dst, const ARDOUR::Sample *src, pframes_t nframes, float gain) { if ( ((intptr_t)dst & 15) != 0) { std::cerr << "mix_buffers_with_gain(): dst unaligned!" << std::endl; @@ -67,7 +67,7 @@ debug_mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t } void -debug_mix_buffers_no_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t nframes) +debug_mix_buffers_no_gain (ARDOUR::Sample *dst, const ARDOUR::Sample *src, pframes_t nframes) { if ( ((intptr_t)dst & 15) != 0) { std::cerr << "mix_buffers_no_gain(): dst unaligned!" 
<< std::endl; diff --git a/libs/ardour/sse_functions_64bit_win.s b/libs/ardour/sse_functions_64bit_win.s new file mode 100644 index 0000000000..7a50c9aef5 --- /dev/null +++ b/libs/ardour/sse_functions_64bit_win.s @@ -0,0 +1,679 @@ +/* + Copyright (C) 2005-2006 Paul Davis, John Rigg + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Author: Sampo Savolainen + 64-bit conversion: John Rigg + + $Id$ +*/ + +#; Microsoft version of SSE sample processing functions + +#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain); + +.globl x86_sse_mix_buffers_with_gain + .def x86_sse_mix_buffers_with_gain; .scl 2; .type 32; +.endef + +x86_sse_mix_buffers_with_gain: + +#; due to Microsoft calling convention +#; %rcx float *dst +#; %rdx float *src +#; %r8 unsigned int nframes +#; %xmm3 float gain + +#; due to System V AMD64 (Linux) calling convention +#; %rdi float *dst +#; %rsi float *src +#; %rdx unsigned int nframes +#; %xmm0 float gain + + pushq %rbp + movq %rsp, %rbp + + #; save the registers + pushq %rbx #; must be preserved + pushq %rcx + pushq %rdx + pushq %rdi #; must be preserved + pushq %rsi #; must be preserved + + #; to keep algorithms universal - move input params into Linux specific registers + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movss %xmm3, %xmm0 + + #; if nframes == 0, go to end + cmp $0, %rdx + je .MBWG_END + + #; Check for alignment + + movq %rdi, %rax + andq $12, %rax #; mask alignment offset + + movq %rsi, %rbx + andq $12, %rbx #; mask alignment offset + + cmp %rax, %rbx + jne .MBWG_NONALIGN #; if not aligned, calculate manually + + #; if we are aligned + cmp $0, %rbx + jz .MBWG_SSE + + #; Pre-loop, we need to run 1-3 frames "manually" without + #; SSE instructions + +.MBWG_PRELOOP: + + #; gain is already in %xmm0 + movss (%rsi), %xmm1 + mulss %xmm0, %xmm1 + addss (%rdi), %xmm1 + movss %xmm1, (%rdi) + + addq $4, %rdi #; dst++ + addq $4, %rsi #; src++ + decq %rdx #; nframes-- + jz .MBWG_END + + addq $4, %rbx + + cmp $16, %rbx #; test if we've reached 16 byte alignment + jne .MBWG_PRELOOP + + +.MBWG_SSE: + + cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then + jnge .MBWG_NONALIGN #; we jump straight to the "normal" code + + #; gain is already in %xmm0 + shufps $0x00, %xmm0, %xmm0 + + +.MBWG_SSELOOP: + + movaps (%rsi), %xmm1 #; source => xmm0 + mulps %xmm0, %xmm1 #; apply gain to source + addps (%rdi), %xmm1 #; mix with destination + movaps %xmm1, (%rdi) #; copy result to destination + + addq $16, %rdi #; dst+=4 + addq $16, %rsi #; src+=4 + + subq $4, %rdx #; nframes-=4 + cmp $4, %rdx + jge .MBWG_SSELOOP + + cmp $0, %rdx + je .MBWG_END + + #; if there are remaining frames, the nonalign code will do nicely + #; for the rest 1-3 frames. + +.MBWG_NONALIGN: + #; not aligned! 
+ + #; gain is already in %xmm0 + +.MBWG_NONALIGNLOOP: + + movss (%rsi), %xmm1 + mulss %xmm0, %xmm1 + addss (%rdi), %xmm1 + movss %xmm1, (%rdi) + + addq $4, %rdi + addq $4, %rsi + + decq %rdx + jnz .MBWG_NONALIGNLOOP + +.MBWG_END: + + popq %rsi + popq %rdi + popq %rdx + popq %rcx + popq %rbx + + #; return + leave + ret + + +#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes); + +.globl x86_sse_mix_buffers_no_gain + .def x86_sse_mix_buffers_no_gain; .scl 2; .type 32; +.endef + +x86_sse_mix_buffers_no_gain: + +#; due to Microsoft calling convention +#; %rcx float *dst +#; %rdx float *src +#; %r8 unsigned int nframes + +#; due to System V AMD64 (Linux) calling convention +#; %rdi float *dst +#; %rsi float *src +#; %rdx unsigned int nframes + + pushq %rbp + movq %rsp, %rbp + + #; save the registers + pushq %rbx #; must be preserved + pushq %rcx + pushq %rdx + pushq %rdi #; must be preserved + pushq %rsi #; must be preserved + + #; to keep algorithms universal - move input params into Linux specific registers + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + + #; the real function + + #; if nframes == 0, go to end + cmp $0, %r8 + je .MBNG_END + + #; Check for alignment + + movq %rdi, %rax + andq $12, %rax #; mask alignment offset + + movq %rsi, %rbx + andq $12, %rbx #; mask alignment offset + + cmp %rax, %rbx + jne .MBNG_NONALIGN #; if not aligned, calculate manually + + cmp $0, %rbx + je .MBNG_SSE + + #; Pre-loop, we need to run 1-3 frames "manually" without + #; SSE instructions + +.MBNG_PRELOOP: + + movss (%rsi), %xmm0 + addss (%rdi), %xmm0 + movss %xmm0, (%rdi) + + addq $4, %rdi #; dst++ + addq $4, %rsi #; src++ + decq %rdx #; nframes-- + jz .MBNG_END + addq $4, %rbx + + cmp $16, %rbx #; test if we've reached 16 byte alignment + jne .MBNG_PRELOOP + +.MBNG_SSE: + + cmp $4, %rdx #; if there are frames left, but less than 4 + jnge .MBNG_NONALIGN #; we can't run SSE + +.MBNG_SSELOOP: + + movaps (%rsi), %xmm0 #; source => xmm0 + addps (%rdi), %xmm0 #; mix with destination + movaps %xmm0, (%rdi) #; copy result to destination + + addq $16, %rdi #; dst+=4 + addq $16, %rsi #; src+=4 + + subq $4, %rdx #; nframes-=4 + cmp $4, %rdx + jge .MBNG_SSELOOP + + cmp $0, %rdx + je .MBNG_END + + #; if there are remaining frames, the nonalign code will do nicely + #; for the rest 1-3 frames. + +.MBNG_NONALIGN: + #; not aligned! 
+ + movss (%rsi), %xmm0 #; src => xmm0 + addss (%rdi), %xmm0 #; xmm0 += dst + movss %xmm0, (%rdi) #; xmm0 => dst + + addq $4, %rdi + addq $4, %rsi + + decq %rdx + jnz .MBNG_NONALIGN + +.MBNG_END: + + popq %rsi + popq %rdi + popq %rdx + popq %rcx + popq %rbx + + #; return + leave + ret + + +#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain); + +.globl x86_sse_apply_gain_to_buffer + .def x86_sse_apply_gain_to_buffer; .scl 2; .type 32; +.endef + +x86_sse_apply_gain_to_buffer: + +#; due to Microsoft calling convention +#; %rcx float *buf 32(%rbp) +#; %rdx unsigned int nframes +#; %xmm2 float gain +#; %xmm1 float buf[0] + +#; due to System V AMD64 (Linux) calling convention +#; %rdi float *buf 32(%rbp) +#; %rsi unsigned int nframes +#; %xmm0 float gain +#; %xmm1 float buf[0] + + pushq %rbp + movq %rsp, %rbp + + #; save the registers + pushq %rcx + pushq %rdi #; must be preserved + pushq %rsi #; must be preserved + + #; to keep algorithms universal - move input params into Linux specific registers + movq %rcx, %rdi + movq %rdx, %rsi + movss %xmm2, %xmm0 + + #; the real function + + #; if nframes == 0, go to end + movq %rsi, %rcx #; nframes + cmp $0, %rcx + je .AG_END + + #; set up the gain buffer (gain is already in %xmm0) + shufps $0x00, %xmm0, %xmm0 + + #; Check for alignment + + movq %rdi, %rdx #; buf => %rdx + andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12 + jz .AG_SSE #; if buffer IS aligned + + #; PRE-LOOP + #; we iterate 1-3 times, doing normal x87 float comparison + #; so we reach a 16 byte aligned "buf" (=%rdi) value + +.AGLP_START: + + #; Load next value from the buffer into %xmm1 + movss (%rdi), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, (%rdi) + + #; increment buffer, decrement counter + addq $4, %rdi #; buf++; + + decq %rcx #; nframes-- + jz .AG_END #; if we run out of frames, we go to the end + + addq $4, %rdx #; one non-aligned byte less + cmp $16, %rdx + jne .AGLP_START #; if more non-aligned frames exist, we do a do-over + +.AG_SSE: + + #; We have reached the 16 byte aligned "buf" ("rdi") value + + #; Figure out how many loops we should do + movq %rcx, %rax #; copy remaining nframes to %rax for division + + shr $2,%rax #; unsigned divide by 4 + + #; %rax = SSE iterations + cmp $0, %rax + je .AGPOST_START + +.AGLP_SSE: + + movaps (%rdi), %xmm1 + mulps %xmm0, %xmm1 + movaps %xmm1, (%rdi) + + addq $16, %rdi #; buf + 4 + subq $4, %rcx #; nframes-=4 + + decq %rax + jnz .AGLP_SSE + + #; Next we need to post-process all remaining frames + #; the remaining frame count is in %rcx + + andq $3, %rcx #; nframes % 4 + jz .AG_END + +.AGPOST_START: + + movss (%rdi), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, (%rdi) + + #; increment buffer, decrement counter + addq $4, %rdi #; buf++; + + decq %rcx #; nframes-- + jnz .AGPOST_START #; if we run out of frames, we go to the end + +.AG_END: + + popq %rsi + popq %rdi + popq %rcx + + #; return + leave + ret + +#; end proc + + +#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes) + +.globl x86_sse_apply_gain_vector + .def x86_sse_apply_gain_vector; .scl 2; .type 32; +.endef + + +x86_sse_apply_gain_vector: + +#; due to Microsoft calling convention +#; %rcx float *buf +#; %rdx float *gain_vector +#; %r8 unsigned int nframes + +#; due to System V AMD64 (Linux) calling convention +#; %rdi float *buf +#; %rsi float *gain_vector +#; %rdx unsigned int nframes + + pushq %rbp + movq %rsp, %rbp + + #; save the registers + pushq %rbx #; must be preserved + pushq %rcx + pushq %rdx + pushq %rdi 
#; must be preserved + pushq %rsi #; must be preserved + + #; to keep algorithms universal - move input params into Linux specific registers + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + + #; if nframes == 0 go to end + cmp $0, %rdx + je .AGA_END + + #; Check alignment + movq %rdi, %rax + andq $12, %rax + + movq %rsi, %rbx + andq $12, %rbx + + cmp %rax,%rbx + jne .AGA_ENDLOOP + + cmp $0, %rax + jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop + +#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount +.AGA_ALIGNLOOP: + + movss (%rdi), %xmm0 #; buf => xmm0 + movss (%rsi), %xmm1 #; gain value => xmm1 + mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0 + movss %xmm0, (%rdi) #; signal with gain => buf + + decq %rdx + jz .AGA_END + + addq $4, %rdi #; buf++ + addq $4, %rsi #; gab++ + + addq $4, %rax + cmp $16, %rax + jne .AGA_ALIGNLOOP + +#; There are frames left for sure, as that is checked in the beginning +#; and within the previous loop. BUT, there might be less than 4 frames +#; to process + +.AGA_SSE: + movq %rdx, %rax #; nframes => %rax + shr $2, %rax #; unsigned divide by 4 + + cmp $0, %rax + je .AGA_ENDLOOP + +.AGA_SSELOOP: + movaps (%rdi), %xmm0 + movaps (%rsi), %xmm1 + mulps %xmm1, %xmm0 + movaps %xmm0, (%rdi) + + addq $16, %rdi + addq $16, %rsi + + decq %rax + jnz .AGA_SSELOOP + + andq $3, %rdx #; Remaining frames are nframes & 3 + jz .AGA_END + + +#; Inside this loop, we know there are frames left to process +#; but because either there are < 4 frames left, or the buffers +#; are not aligned, we can't use the parallel SSE ops +.AGA_ENDLOOP: + movss (%rdi), %xmm0 #; buf => xmm0 + movss (%rsi), %xmm1 #; gain value => xmm1 + mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0 + movss %xmm0, (%rdi) #; signal with gain => buf + + addq $4,%rdi + addq $4,%rsi + decq %rdx #; nframes-- + jnz .AGA_ENDLOOP + +.AGA_END: + + popq %rsi + popq %rdi + popq %rdx + popq %rcx + popq %rbx + + leave + ret + +#; end proc + + +#; float x86_sse_compute_peak(float *buf, long nframes, float current); + +.globl x86_sse_compute_peak + .def x86_sse_compute_peak; .scl 2; .type 32; +.endef + + +x86_sse_compute_peak: + +#; due to Microsoft calling convention +#; %rcx float* buf 32(%rbp) +#; %rdx unsigned int nframes +#; %xmm2 float current +#; %xmm1 float buf[0] + +#; due to System V AMD64 (Linux) calling convention +#; %rdi float* buf 32(%rbp) +#; %rsi unsigned int nframes +#; %xmm0 float current +#; %xmm1 float buf[0] + + pushq %rbp + movq %rsp, %rbp + + #; save registers + pushq %rcx + pushq %rdi #; must be preserved + pushq %rsi #; must be preserved + + #; to keep algorithms universal - move input params into Linux specific registers + movq %rcx, %rdi + movq %rdx, %rsi + movss %xmm2, %xmm0 + + #; if nframes == 0, go to end + movq %rsi, %rcx #; nframes + cmp $0, %rcx + je .CP_END + + #; create the "abs" mask in %xmm2 + pushq $2147483647 + movss (%rsp), %xmm2 + addq $8, %rsp + shufps $0x00, %xmm2, %xmm2 + + #; Check for alignment + + #;movq 8(%rbp), %rdi #; buf + movq %rdi, %rdx #; buf => %rdx + andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12 + jz .CP_SSE #; if buffer IS aligned + + #; PRE-LOOP + #; we iterate 1-3 times, doing normal x87 float comparison + #; so we reach a 16 byte aligned "buf" (=%rdi) value + +.LP_START: + + #; Load next value from the buffer + movss (%rdi), %xmm1 + andps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + + #; increment buffer, decrement counter + addq $4, %rdi #; buf++; + + decq %rcx #; nframes-- + jz .CP_END #; if we run out of frames, we go to the end + + 
addq $4, %rdx #; one non-aligned byte less + cmp $16, %rdx + jne .LP_START #; if more non-aligned frames exist, we do a do-over + +.CP_SSE: + + #; We have reached the 16 byte aligned "buf" ("rdi") value + + #; Figure out how many loops we should do + movq %rcx, %rax #; copy remaining nframes to %rax for division + + shr $2,%rax #; unsigned divide by 4 + jz .POST_START + + #; %rax = SSE iterations + + #; current maximum is at %xmm0, but we need to .. + shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's + + #;prefetcht0 16(%rdi) + +.LP_SSE: + + movaps (%rdi), %xmm1 + andps %xmm2, %xmm1 + maxps %xmm1, %xmm0 + + addq $16, %rdi + + subq $4, %rdx #; nframes-=4 + + decq %rax + jnz .LP_SSE + + #; Calculate the maximum value contained in the 4 FP's in %xmm0 + movaps %xmm0, %xmm1 + shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412) + maxps %xmm1, %xmm0 #; maximums of the two pairs + movaps %xmm0, %xmm1 + shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143) + maxps %xmm1, %xmm0 + + #; now every float in %xmm0 is the same value, current maximum value + + #; Next we need to post-process all remaining frames + #; the remaining frame count is in %rcx + + #; if no remaining frames, jump to the end + + andq $3, %rcx #; nframes % 4 + jz .CP_END + +.POST_START: + + movss (%rdi), %xmm1 + andps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + + addq $4, %rdi #; buf++; + + decq %rcx #; nframes--; + jnz .POST_START + +.CP_END: + + #; restore registers + popq %rsi + popq %rdi + popq %rcx + + #; return value is in xmm0 + + #; return + leave + ret + +#; end proc \ No newline at end of file diff --git a/libs/ardour/wscript b/libs/ardour/wscript index 95ffe42465..eb263e6e78 100644 --- a/libs/ardour/wscript +++ b/libs/ardour/wscript @@ -408,6 +408,13 @@ def build(bld): obj.source += [ 'sse_functions_xmm.cc', 'sse_functions.s' ] elif bld.env['build_target'] == 'x86_64': obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit.s' ] + + if bld.env['build_target'] == 'mingw': + import platform as PLATFORM + u = PLATFORM.uname () + cpu = u[4] + if re.search ("(x86_64|AMD64)", cpu) != None: + obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit_win.s' ] # i18n if bld.is_defined('ENABLE_NLS'): diff --git a/libs/pbd/fpu.cc b/libs/pbd/fpu.cc index b12d341366..0998f43bdc 100644 --- a/libs/pbd/fpu.cc +++ b/libs/pbd/fpu.cc @@ -16,7 +16,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#ifndef COMPILER_MSVC +#if !(defined (COMPILER_MSVC) || defined (COMPILER_MINGW)) #include "libpbd-config.h" #define _XOPEN_SOURCE 600 @@ -39,10 +39,6 @@ FPU::FPU () _flags = Flags (0); -#if defined(__MINGW64__) // Vkamyshniy: under __MINGW64__ the assembler code below is not compiled - return; -#endif - #if !( (defined __x86_64__) || (defined __i386__) ) // !ARCH_X86 return; #else diff --git a/libs/pbd/msvc/fpu.cc b/libs/pbd/msvc/fpu.cc index 6997405928..2ade2ad511 100644 --- a/libs/pbd/msvc/fpu.cc +++ b/libs/pbd/msvc/fpu.cc @@ -1,10 +1,14 @@ -#ifdef COMPILER_MSVC // Added by JE - 05-12-2009. Inline assembler instructions - // have been changed to Intel format and (in the case of - // cpuid) was replaced by the equivalent VC++ system call). +// Added by JE - 05-12-2009. Inline assembler instructions +// have been changed to Intel format and (in the case of +// cpuid) was replaced by the equivalent VC++ system call). 
+ +#if defined (COMPILER_MSVC) || defined (COMPILER_MINGW) + #define _XOPEN_SOURCE 600 #include #include #include // Added by JE - 05-12-2009 +#include #include #include @@ -20,47 +24,19 @@ FPU::FPU () _flags = (Flags)0; -#ifndef ARCH_X86 - return; - -#else - #ifndef USE_X86_64_ASM -int cpuInfo[4]; + return; +#endif + // Get CPU lfags using Microsof function + // It works for both 64 and 32 bit systems + // no need to use assembler for getting info from register, this function does this for us + int cpuInfo[4]; __cpuid (cpuInfo, 1); cpuflags = cpuInfo[3]; -/* - __asm { // This is how the original section would look if converted to Intel syntax. - // However, I have grave doubts about whether it's doing the right thing. - // It seems as if the intention was to retrieve feature information from - // the processor. However, feature information is returned in the ebx register - // (if you believe Wikipedia) or in edx (if you believe Microsoft). Unfortunately, - // both registers get ignored in the original code!! Confused?? Join the club!! - mov eax, 1 - push ebx - cpuid - mov edx, 0 - pop ebx - mov cpuflags, ecx // This can't be right, surely??? - }; */ -#else -// Note that this syntax is currently still in AT&T format ! - asm volatile ( - "pushq %%rbx\n" - "movq $1, %%rax\n" - "cpuid\n" - "movq %%rdx, %0\n" - "popq %%rbx\n" - : "=r" (cpuflags) - : - : "%rax", "%rcx", "%rdx", "memory" - ); - -#endif /* USE_X86_64_ASM */ if (cpuflags & (1<<25)) { - _flags = Flags (_flags | (HasSSE|HasFlushToZero)); + _flags = Flags (_flags | (HasSSE|HasFlushToZero) ); } if (cpuflags & (1<<26)) { @@ -68,32 +44,46 @@ int cpuInfo[4]; } if (cpuflags & (1 << 24)) { - bool aligned_malloc = false; // Added by JE - 05-12-2009 - char* fxbuf = 0; -// This section changed by JE - 05-12-2009 -#ifdef NO_POSIX_MEMALIGN -#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) // All of these support '_aligned_malloc()' - fxbuf = (char *) _aligned_malloc(512, 16); // (note that they all need at least MSVC runtime 7.0) - aligned_malloc = true; -#else - fxbuf = (char *) malloc(512); -#endif -#else - fxbuf = posix_memalign ((void**)&fxbuf, 16, 512); -#endif + char** fxbuf = 0; + + // allocate alligned buffer + fxbuf = (char **) malloc (sizeof (char *)); + assert (fxbuf); + *fxbuf = (char *) malloc (512); + assert (*fxbuf); + // Verify that fxbuf is correctly aligned - unsigned long buf_addr = (unsigned long)(void*)fxbuf; + unsigned long long buf_addr = (unsigned long long)(void*)fxbuf; if ((0 == buf_addr) || (buf_addr % 16)) error << _("cannot allocate 16 byte aligned buffer for h/w feature detection") << endmsg; else { - memset(fxbuf, 0, 512); // Initialize the buffer !!! Added by JE - 12-12-2009 + memset(*fxbuf, 0, 512); // Initialize the buffer !!! 
Added by JE - 12-12-2009 +#if defined (COMPILER_MINGW) + asm volatile ( + "fxsave (%0)" + : + : "r" (*fxbuf) + : "memory" + ); +/* + asm( ".intel_syntax noprefix\n" ); + + asm volatile ( + "mov eax, fxbuf\n" + "fxsave [eax]\n" + ); + + asm( ".att_syntax prefix\n" ); +*/ + +#elif defined (COMPILER_MSVC) __asm { mov eax, fxbuf fxsave [eax] }; - +#endif uint32_t mxcsr_mask = *((uint32_t*) &fxbuf[28]); /* if the mask is zero, set its default value (from intel specs) */ @@ -106,13 +96,10 @@ int cpuInfo[4]; _flags = Flags (_flags | HasDenormalsAreZero); } - if (aligned_malloc) - _aligned_free (fxbuf); - else - free (fxbuf); + free (*fxbuf); + free (fxbuf); } } -#endif // ARCH_X86 } FPU::~FPU () diff --git a/libs/pbd/wscript b/libs/pbd/wscript index c4f08b7474..8554c39491 100644 --- a/libs/pbd/wscript +++ b/libs/pbd/wscript @@ -48,7 +48,6 @@ libpbd_sources = [ 'ffs.cc', 'file_manager.cc', 'file_utils.cc', - 'fpu.cc', 'glib_semaphore.cc', 'id.cc', 'locale_guard.cc', @@ -150,8 +149,18 @@ def build(bld): if bld.env['build_target'] == 'x86_64': obj.defines += [ 'USE_X86_64_ASM' ] if bld.env['build_target'] == 'mingw': + import re + import platform as PLATFORM + u = PLATFORM.uname () + cpu = u[4] + if re.search ("(x86_64|AMD64)", cpu) != None: + obj.defines += [ 'USE_X86_64_ASM' ] + obj.defines += ['NO_POSIX_MEMALIGN' ] obj.source += [ 'windows_special_dirs.cc' ] + obj.source += [ 'msvc/fpu.cc' ] obj.uselib += ' OLE' + else: + obj.source += [ 'fpu.cc' ] if bld.env['BUILD_TESTS'] and bld.is_defined('HAVE_CPPUNIT'): # Unit tests diff --git a/wscript b/wscript index 94483b41b4..2f552b6e56 100755 --- a/wscript +++ b/wscript @@ -221,7 +221,7 @@ def set_compiler_flags (conf,opt): # compiler_flags.append ('-U__STRICT_ANSI__') - if ((re.search ("i[0-9]86", cpu) != None) or (re.search ("x86_64", cpu) != None)) and conf.env['build_target'] != 'none': + if (re.search ("(i[0-9]86|x86_64|AMD64)", cpu) != None) and conf.env['build_target'] != 'none': # @@ -229,9 +229,8 @@ def set_compiler_flags (conf,opt): # the compile-time presence of the macro _LP64 is used to # distingush 32 and 64 bit assembler # - - if (re.search ("(i[0-9]86|x86_64)", cpu) != None): - compiler_flags.append ("-DARCH_X86") + + compiler_flags.append ("-DARCH_X86") if platform == 'linux' : @@ -258,6 +257,17 @@ def set_compiler_flags (conf,opt): if not is_clang and ((conf.env['build_target'] == 'i686') or (conf.env['build_target'] == 'x86_64')) and build_host_supports_sse: compiler_flags.extend (["-msse", "-mfpmath=sse", "-DUSE_XMMINTRIN"]) + + if (conf.env['build_target'] == 'mingw'): + if (re.search ("(x86_64|AMD64)", cpu) != None): + # on Windows sse is supported by 64 bit platforms only + build_host_supports_sse = True + + # mingw GCC compiler to uses at&t (Unix specific) assembler dialect by default + # compiler_flags.append (["--mmnemonic=att", "msyntax=att") + + compiler_flags.extend (["-msse", "-mfpmath=sse", "-DUSE_XMMINTRIN", "-masm=att"]) + # end of processor-specific section @@ -266,7 +276,7 @@ def set_compiler_flags (conf,opt): if sys.platform == 'darwin': compiler_flags.append("-DBUILD_VECLIB_OPTIMIZATIONS") conf.env.append_value('LINKFLAGS_OSX', ['-framework', 'Accelerate']) - elif conf.env['build_target'] == 'i686' or conf.env['build_target'] == 'x86_64': + elif conf.env['build_target'] == 'i686' or conf.env['build_target'] == 'x86_64' or (conf.env['build_target'] == 'mingw' and build_host_supports_sse): compiler_flags.append ("-DBUILD_SSE_OPTIMIZATIONS") if not build_host_supports_sse: print("\nWarning: you are building Ardour 
with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)") @@ -695,7 +705,7 @@ def configure(conf): autowaf.check_pkg(conf, 'rubberband', uselib_store='RUBBERBAND', mandatory=True) if Options.options.dist_target == 'mingw': - Options.options.fpu_optimization = False + Options.options.fpu_optimization = True conf.env.append_value('CFLAGS', '-DPLATFORM_WINDOWS') conf.env.append_value('CFLAGS', '-DCOMPILER_MINGW') conf.env.append_value('CXXFLAGS', '-DPLATFORM_WINDOWS') From 438bdbfba5497b6e8373185000521855de9ed7af Mon Sep 17 00:00:00 2001 From: Greg Zharun Date: Wed, 8 Apr 2015 16:53:49 +0300 Subject: [PATCH 06/13] [Summary] Used SSE optimized function to mix port buffer data --- libs/backends/wavesaudio/waves_audioport.cc | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/libs/backends/wavesaudio/waves_audioport.cc b/libs/backends/wavesaudio/waves_audioport.cc index 4ded37d906..48de38d794 100644 --- a/libs/backends/wavesaudio/waves_audioport.cc +++ b/libs/backends/wavesaudio/waves_audioport.cc @@ -18,6 +18,7 @@ */ #include "waves_audioport.h" +#include "ardour/runtime_functions.h" using namespace ARDOUR; @@ -40,14 +41,18 @@ void* WavesAudioPort::get_buffer (pframes_t nframes) * Base class WavesDataPort takes is supposed to provide enough consistentcy * of the connections. */ - for (memcpy (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes * sizeof (Sample)), ++it; - it != get_connections ().end (); - ++it) { + + // get first buffer data + memcpy (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes * sizeof (Sample)); + ++it; + + // mix the rest + for (; it != get_connections ().end (); ++it) { Sample* tgt = buffer (); const Sample* src = ((const WavesAudioPort*)*it)->const_buffer (); - for (uint32_t frame = 0; frame < nframes; ++frame, ++tgt, ++src) { - *tgt += *src; - } + + // use otimized function to mix the buffers + ARDOUR::mix_buffers_no_gain (tgt, src, nframes); } } } From d1da81e798d31042e4589d09768410774dee1325 Mon Sep 17 00:00:00 2001 From: Greg Zharun Date: Thu, 9 Apr 2015 13:27:57 +0300 Subject: [PATCH 07/13] [Summary] Added correct memory alignment for Windows in ../pbd/malign.h [Reviewed by] YPozdnyakov --- libs/ardour/audio_buffer.cc | 4 ++-- libs/ardour/midi_buffer.cc | 4 ++-- libs/pbd/malign.cc | 19 +++++++++++++++++++ libs/pbd/pbd/malign.h | 3 ++- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/libs/ardour/audio_buffer.cc b/libs/ardour/audio_buffer.cc index de2c1ddf00..6d8b2aa55f 100644 --- a/libs/ardour/audio_buffer.cc +++ b/libs/ardour/audio_buffer.cc @@ -43,7 +43,7 @@ AudioBuffer::AudioBuffer(size_t capacity) AudioBuffer::~AudioBuffer() { if (_owns_data) - free(_data); + cache_aligned_free(_data); } void @@ -60,7 +60,7 @@ AudioBuffer::resize (size_t size) return; } - free (_data); + cache_aligned_free (_data); cache_aligned_malloc ((void**) &_data, sizeof (Sample) * size); diff --git a/libs/ardour/midi_buffer.cc b/libs/ardour/midi_buffer.cc index 4715be928c..a2253f3d30 100644 --- a/libs/ardour/midi_buffer.cc +++ b/libs/ardour/midi_buffer.cc @@ -44,7 +44,7 @@ MidiBuffer::MidiBuffer(size_t capacity) MidiBuffer::~MidiBuffer() { - free(_data); + cache_aligned_free(_data); } void @@ -60,7 +60,7 @@ MidiBuffer::resize(size_t size) return; } - free (_data); + cache_aligned_free (_data); cache_aligned_malloc ((void**) &_data, size); diff --git a/libs/pbd/malign.cc b/libs/pbd/malign.cc index 978ca00451..8c09f85831 
100644 --- a/libs/pbd/malign.cc +++ b/libs/pbd/malign.cc @@ -38,6 +38,15 @@ static const int CPU_CACHE_ALIGN = 16; /* arguably 32 on most arches, but it mat int cache_aligned_malloc (void** memptr, size_t size) { #ifndef HAVE_POSIX_MEMALIGN +#ifdef PLATFORM_WINDOWS + if (((*memptr) = _aligned_malloc (size, CPU_CACHE_ALIGN)) == 0) { + fatal << string_compose (_("Memory allocation error: malloc (%1 * %2) failed (%3)"), + CPU_CACHE_ALIGN, size, strerror (errno)) << endmsg; + return errno; + } else { + return 0; + } +#else if (((*memptr) = malloc (size)) == 0) { fatal << string_compose (_("Memory allocation error: malloc (%1 * %2) failed (%3)"), CPU_CACHE_ALIGN, size, strerror (errno)) << endmsg; @@ -45,6 +54,7 @@ int cache_aligned_malloc (void** memptr, size_t size) } else { return 0; } +#endif #else if (posix_memalign (memptr, CPU_CACHE_ALIGN, size)) { fatal << string_compose (_("Memory allocation error: posix_memalign (%1 * %2) failed (%3)"), @@ -54,3 +64,12 @@ int cache_aligned_malloc (void** memptr, size_t size) return 0; #endif } + +void cache_aligned_free (void* memptr) +{ +#ifdef PLATFORM_WINDOWS + _aligned_free (memptr); +#else + free (memptr); +#endif +} \ No newline at end of file diff --git a/libs/pbd/pbd/malign.h b/libs/pbd/pbd/malign.h index 07f42f586f..ecee47c4e6 100644 --- a/libs/pbd/pbd/malign.h +++ b/libs/pbd/pbd/malign.h @@ -24,6 +24,7 @@ #include "pbd/libpbd_visibility.h" -LIBPBD_API int cache_aligned_malloc (void** memptr, size_t size); +LIBPBD_API int cache_aligned_malloc (void** memptr, size_t size); +LIBPBD_API void cache_aligned_free (void* memptr); #endif /* __pbd_malign_h__ */ From ef1fd2b67e8caf9a610533158754d61e0c7ae2ce Mon Sep 17 00:00:00 2001 From: Greg Zharun Date: Thu, 9 Apr 2015 13:47:35 +0300 Subject: [PATCH 08/13] [Summary] Added user defined extended alignment for WavesAudioPort. 
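The allocation this builds on is the cache_aligned_malloc()/cache_aligned_free() pair added to libs/pbd above: _aligned_malloc()/_aligned_free() on Windows, posix_memalign()/free() elsewhere. A condensed sketch of that split, with simplified error handling and standalone names rather than the exact PBD code, is:

    #include <stdlib.h>
    #ifdef PLATFORM_WINDOWS            /* assumption: the same macro the patch keys on */
    #include <malloc.h>                /* _aligned_malloc / _aligned_free */
    #endif

    /* Returns 0 on success, non-zero on failure (mirroring the patch's convention).
       'alignment' must be a power of two and a multiple of sizeof(void*). */
    inline int example_aligned_malloc (void** memptr, size_t size, size_t alignment)
    {
    #ifdef PLATFORM_WINDOWS
        *memptr = _aligned_malloc (size, alignment);
        return (*memptr == 0) ? -1 : 0;
    #else
        return posix_memalign (memptr, alignment, size);
    #endif
    }

    inline void example_aligned_free (void* memptr)
    {
    #ifdef PLATFORM_WINDOWS
        _aligned_free (memptr);        /* memory from _aligned_malloc must not be passed to free() */
    #else
        free (memptr);
    #endif
    }

The sketch leaves out the plain-malloc fallback the real code keeps for platforms without posix_memalign.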
[Reviewed by] YPozdnyakov --- libs/backends/wavesaudio/waves_audioport.cc | 9 ++++++++- libs/backends/wavesaudio/waves_audioport.h | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/libs/backends/wavesaudio/waves_audioport.cc b/libs/backends/wavesaudio/waves_audioport.cc index 48de38d794..0892a2d679 100644 --- a/libs/backends/wavesaudio/waves_audioport.cc +++ b/libs/backends/wavesaudio/waves_audioport.cc @@ -19,13 +19,20 @@ #include "waves_audioport.h" #include "ardour/runtime_functions.h" +#include "pbd/malign.h" using namespace ARDOUR; WavesAudioPort::WavesAudioPort (const std::string& port_name, PortFlags flags) : WavesDataPort (port_name, flags) { - memset (_buffer, 0, sizeof (_buffer)); + cache_aligned_malloc ((void**)&_buffer, MAX_BUFFER_SIZE_BYTES); + memset (_buffer, 0, MAX_BUFFER_SIZE_BYTES); +} + +WavesAudioPort::~WavesAudioPort () +{ + cache_aligned_free (_buffer); +} diff --git a/libs/backends/wavesaudio/waves_audioport.h b/libs/backends/wavesaudio/waves_audioport.h index 5b4ab52449..d658dba868 100644 --- a/libs/backends/wavesaudio/waves_audioport.h +++ b/libs/backends/wavesaudio/waves_audioport.h @@ -35,7 +35,7 @@ public: WavesAudioPort (const std::string& port_name, PortFlags flags); - virtual ~WavesAudioPort () { }; + virtual ~WavesAudioPort (); virtual DataType type () const { return DataType::AUDIO; }; @@ -49,7 +49,7 @@ protected: private: - Sample _buffer[MAX_BUFFER_SIZE_SAMPLES]; + Sample *_buffer; }; } // namespace From 7cc7b15070d59e53bfd805730845d06195855d97 Mon Sep 17 00:00:00 2001 From: Greg Zharun Date: Fri, 10 Apr 2015 18:08:13 +0300 Subject: [PATCH 09/13] [Summary] AudioPort buffer does not need the 64 byte alignment which cache_aligned_malloc provides. Added a new function which accepts an argument to specify the required alignment.
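A caller-side sketch of that interface, matching the aligned_malloc (void**, size_t, size_t) declaration added below but with a hypothetical buffer size in place of MAX_BUFFER_SIZE_BYTES, is:

    #include <cassert>
    #include <stdint.h>

    // declared in pbd/malign.h by this patch
    extern int  aligned_malloc (void** memptr, size_t size, size_t alignment);
    extern void aligned_free (void* memptr);

    enum { EXAMPLE_BUFFER_BYTES = 8192 };    // hypothetical size, not the real constant

    float* allocate_port_buffer ()
    {
        float* buf = 0;
        // 32-byte alignment, presumably to satisfy the 256-bit AVX loads added later in the series
        aligned_malloc ((void**) &buf, EXAMPLE_BUFFER_BYTES, 32);
        assert ((reinterpret_cast<uintptr_t> (buf) % 32) == 0);
        return buf;
    }

    void release_port_buffer (float* buf)
    {
        aligned_free (buf);
    }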
AudioPort buffer requires 32 byte alignment [Review Required] YPosdnyakov --- libs/backends/wavesaudio/waves_audioport.cc | 4 +- libs/pbd/malign.cc | 41 +++++++++++++++++++++ libs/pbd/pbd/malign.h | 3 ++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/libs/backends/wavesaudio/waves_audioport.cc b/libs/backends/wavesaudio/waves_audioport.cc index 0892a2d679..84f09c0680 100644 --- a/libs/backends/wavesaudio/waves_audioport.cc +++ b/libs/backends/wavesaudio/waves_audioport.cc @@ -26,13 +26,13 @@ using namespace ARDOUR; WavesAudioPort::WavesAudioPort (const std::string& port_name, PortFlags flags) : WavesDataPort (port_name, flags) { - cache_aligned_malloc ((void**)&_buffer, MAX_BUFFER_SIZE_BYTES); + aligned_malloc ((void**)&_buffer, MAX_BUFFER_SIZE_BYTES, 32 /*32 byte alignment*/); memset (_buffer, 0, MAX_BUFFER_SIZE_BYTES); } WavesAudioPort::~WavesAudioPort () { - cache_aligned_free (_buffer); + aligned_free (_buffer); } diff --git a/libs/pbd/malign.cc b/libs/pbd/malign.cc index 8c09f85831..b4b1445528 100644 --- a/libs/pbd/malign.cc +++ b/libs/pbd/malign.cc @@ -47,6 +47,8 @@ int cache_aligned_malloc (void** memptr, size_t size) return 0; } #else + std::string << string_compose (_("Memory allocation error: malloc (%1 * %2)"), + CPU_CACHE_ALIGN, size) << endmsg; if (((*memptr) = malloc (size)) == 0) { fatal << string_compose (_("Memory allocation error: malloc (%1 * %2) failed (%3)"), CPU_CACHE_ALIGN, size, strerror (errno)) << endmsg; @@ -72,4 +74,43 @@ void cache_aligned_free (void* memptr) #else free (memptr); #endif +} + +int aligned_malloc (void** memptr, size_t size, size_t alignment) +{ +#ifndef HAVE_POSIX_MEMALIGN +#ifdef PLATFORM_WINDOWS + if (((*memptr) = _aligned_malloc (size, alignment)) == 0) { + fatal << string_compose (_("Memory allocation error: malloc (%1 * %2) failed (%3)"), + alignment, size, strerror (errno)) << endmsg; + return errno; + } else { + return 0; + } +#else + if (((*memptr) = malloc (size)) == 0) { + fatal << string_compose (_("Memory allocation error: malloc (%1 * %2) failed (%3)"), + alignment, size, strerror (errno)) << endmsg; + return errno; + } else { + return 0; + } +#endif +#else + if (posix_memalign (memptr, alignment, size)) { + fatal << string_compose (_("Memory allocation error: posix_memalign (%1 * %2) failed (%3)"), + alignment, size, strerror (errno)) << endmsg; + } + + return 0; +#endif +} + +void aligned_free (void* memptr) +{ +#ifdef PLATFORM_WINDOWS + _aligned_free (memptr); +#else + free (memptr); +#endif } \ No newline at end of file diff --git a/libs/pbd/pbd/malign.h b/libs/pbd/pbd/malign.h index ecee47c4e6..09d182fa40 100644 --- a/libs/pbd/pbd/malign.h +++ b/libs/pbd/pbd/malign.h @@ -27,4 +27,7 @@ LIBPBD_API int cache_aligned_malloc (void** memptr, size_t size); LIBPBD_API void cache_aligned_free (void* memptr); +LIBPBD_API int aligned_malloc (void** memptr, size_t size, size_t alignment); +LIBPBD_API void aligned_free (void* memptr); + #endif /* __pbd_malign_h__ */ From 394ee5e575aca8e8490794f06ab4b4cc49870951 Mon Sep 17 00:00:00 2001 From: Greg Zharun Date: Tue, 14 Apr 2015 12:46:02 +0300 Subject: [PATCH 10/13] [Summary] Review fixes for 7cc7b15070d59e53bfd805730845d06195855d97 [Reviewed by] YPozdnyakov --- libs/pbd/malign.cc | 50 ++++++++++------------------------------------ 1 file changed, 10 insertions(+), 40 deletions(-) diff --git a/libs/pbd/malign.cc b/libs/pbd/malign.cc index b4b1445528..711d138e06 100644 --- a/libs/pbd/malign.cc +++ b/libs/pbd/malign.cc @@ -35,46 +35,6 @@ static const int CPU_CACHE_ALIGN = 64; 
static const int CPU_CACHE_ALIGN = 16; /* arguably 32 on most arches, but it matters less */ #endif -int cache_aligned_malloc (void** memptr, size_t size) -{ -#ifndef HAVE_POSIX_MEMALIGN -#ifdef PLATFORM_WINDOWS - if (((*memptr) = _aligned_malloc (size, CPU_CACHE_ALIGN)) == 0) { - fatal << string_compose (_("Memory allocation error: malloc (%1 * %2) failed (%3)"), - CPU_CACHE_ALIGN, size, strerror (errno)) << endmsg; - return errno; - } else { - return 0; - } -#else - std::string << string_compose (_("Memory allocation error: malloc (%1 * %2)"), - CPU_CACHE_ALIGN, size) << endmsg; - if (((*memptr) = malloc (size)) == 0) { - fatal << string_compose (_("Memory allocation error: malloc (%1 * %2) failed (%3)"), - CPU_CACHE_ALIGN, size, strerror (errno)) << endmsg; - return errno; - } else { - return 0; - } -#endif -#else - if (posix_memalign (memptr, CPU_CACHE_ALIGN, size)) { - fatal << string_compose (_("Memory allocation error: posix_memalign (%1 * %2) failed (%3)"), - CPU_CACHE_ALIGN, size, strerror (errno)) << endmsg; - } - - return 0; -#endif -} - -void cache_aligned_free (void* memptr) -{ -#ifdef PLATFORM_WINDOWS - _aligned_free (memptr); -#else - free (memptr); -#endif -} int aligned_malloc (void** memptr, size_t size, size_t alignment) { @@ -113,4 +73,14 @@ void aligned_free (void* memptr) #else free (memptr); #endif +} + +int cache_aligned_malloc (void** memptr, size_t size) +{ + return aligned_malloc (memptr, size, CPU_CACHE_ALIGN); +} + +void cache_aligned_free (void* memptr) +{ + aligned_free (memptr); } \ No newline at end of file From 81dbdc4c3ff071f9ca2aea2959d3c671825aee91 Mon Sep 17 00:00:00 2001 From: Greg Zharun Date: Thu, 16 Apr 2015 12:18:31 +0300 Subject: [PATCH 11/13] [Summary] Fixed a bug in an Ardour SSE assembler function. The wrong register was decremented in the LP_SSE block of the x86_sse_compute_peak function [Details] This bug does not show up when buffers are 16 byte aligned, but when they are not, it leads to a crash. I tested all these functions with non-aligned buffers as well, and they do handle the situation with unaligned buffers. [To be reviewed by] Paul Davis --- libs/ardour/sse_functions_64bit_win.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/ardour/sse_functions_64bit_win.s b/libs/ardour/sse_functions_64bit_win.s index 7a50c9aef5..78c50930c2 100644 --- a/libs/ardour/sse_functions_64bit_win.s +++ b/libs/ardour/sse_functions_64bit_win.s @@ -629,7 +629,7 @@ x86_sse_compute_peak: addq $16, %rdi - subq $4, %rdx #; nframes-=4 + subq $4, %rcx #; nframes-=4 decq %rax jnz .LP_SSE From 2e763add6023bef674e97ad2c85c9a2416ad3872 Mon Sep 17 00:00:00 2001 From: Greg Zharun Date: Thu, 16 Apr 2015 15:52:04 +0300 Subject: [PATCH 12/13] [Summary] Added optimized AVX functions for sample processing [Details] Added AVX versions of the existing 5 SSE functions. Added a 6th AVX function to copy vectors, which is 1.5 times faster than memcpy. Data consistency and validity are fully tested after processing with the new AVX functions on aligned and non-aligned buffers.
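The new routines are wired into the existing runtime-dispatch scheme (see the globals.cc hunk further below): each DSP primitive is a function pointer bound once at startup according to the detected CPU features, AVX first, then SSE, then the generic C fallbacks. A reduced sketch of that pattern, with a stub standing in for the hand-written x86_sse_avx_copy_vector, is:

    #include <string.h>

    typedef void (*copy_vector_t) (float* dst, const float* src, unsigned int nframes);

    // portable fallback, equivalent to default_copy_vector in mix.cc
    static void generic_copy_vector (float* dst, const float* src, unsigned int nframes)
    {
        memcpy (dst, src, nframes * sizeof (float));
    }

    // placeholder body so the sketch is self-contained; the real routine is the
    // assembly added in sse_avx_functions_64bit_win.s
    static void avx_copy_vector_stub (float* dst, const float* src, unsigned int nframes)
    {
        memcpy (dst, src, nframes * sizeof (float));
    }

    // bound once at startup; compare ARDOUR::copy_vector in globals.cc
    copy_vector_t copy_vector = generic_copy_vector;

    void setup_dispatch_sketch (bool cpu_has_avx)
    {
        copy_vector = cpu_has_avx ? avx_copy_vector_stub : generic_copy_vector;
    }

Hot paths then simply call copy_vector (dst, src, n) with no per-call feature checks.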
--- libs/ardour/ardour/mix.h | 12 + libs/ardour/ardour/runtime_functions.h | 6 +- libs/ardour/globals.cc | 20 +- libs/ardour/mix.cc | 6 + libs/ardour/sse_avx_functions_64bit_win.s | 587 ++++++++++++++++++ libs/ardour/sse_functions_avx.cc | 120 ++++ libs/ardour/windows/libardour.vcxproj | 2 + libs/ardour/windows/libardour.vcxproj.filters | 2 + libs/ardour/wscript | 3 +- .../backends/wavesaudio/waves_audiobackend.cc | 5 +- libs/backends/wavesaudio/waves_audioport.cc | 3 +- libs/pbd/msvc/fpu.cc | 40 +- libs/pbd/pbd/fpu.h | 4 +- libs/pbd/wscript | 2 +- wscript | 5 +- 15 files changed, 781 insertions(+), 36 deletions(-) create mode 100644 libs/ardour/sse_avx_functions_64bit_win.s create mode 100644 libs/ardour/sse_functions_avx.cc diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h index 3cd9a3e60f..2db444d02b 100644 --- a/libs/ardour/ardour/mix.h +++ b/libs/ardour/ardour/mix.h @@ -33,7 +33,17 @@ extern "C" { LIBARDOUR_API void x86_sse_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes); } +extern "C" { +/* AVX functions */ + LIBARDOUR_API float x86_sse_avx_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current); + LIBARDOUR_API void x86_sse_avx_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain); + LIBARDOUR_API void x86_sse_avx_mix_buffers_with_gain(ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain); + LIBARDOUR_API void x86_sse_avx_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes); + LIBARDOUR_API void x86_sse_avx_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes); +} + LIBARDOUR_API void x86_sse_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max); +LIBARDOUR_API void x86_sse_avx_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max); /* debug wrappers for SSE functions */ @@ -41,6 +51,7 @@ LIBARDOUR_API float debug_compute_peak (const ARDOUR::Sample * buf LIBARDOUR_API void debug_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain); LIBARDOUR_API void debug_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain); LIBARDOUR_API void debug_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes); +LIBARDOUR_API void debug_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes); #endif @@ -61,5 +72,6 @@ LIBARDOUR_API void default_find_peaks (const ARDOUR::Sample * bu LIBARDOUR_API void default_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain); LIBARDOUR_API void default_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain); LIBARDOUR_API void default_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes); +LIBARDOUR_API void default_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes); #endif /* __ardour_mix_h__ */ diff --git a/libs/ardour/ardour/runtime_functions.h b/libs/ardour/ardour/runtime_functions.h index e1d6b99f61..45d6ec7015 100644 --- a/libs/ardour/ardour/runtime_functions.h +++ b/libs/ardour/ardour/runtime_functions.h @@ -25,17 +25,19 @@ namespace ARDOUR { - typedef float (*compute_peak_t) (const ARDOUR::Sample *, pframes_t, float); - 
typedef void (*find_peaks_t) (const ARDOUR::Sample *, pframes_t, float *, float*); + typedef float (*compute_peak_t) (const ARDOUR::Sample *, pframes_t, float); + typedef void (*find_peaks_t) (const ARDOUR::Sample *, pframes_t, float *, float*); typedef void (*apply_gain_to_buffer_t) (ARDOUR::Sample *, pframes_t, float); typedef void (*mix_buffers_with_gain_t) (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t, float); typedef void (*mix_buffers_no_gain_t) (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t); + typedef void (*copy_vector_t) (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t); LIBARDOUR_API extern compute_peak_t compute_peak; LIBARDOUR_API extern find_peaks_t find_peaks; LIBARDOUR_API extern apply_gain_to_buffer_t apply_gain_to_buffer; LIBARDOUR_API extern mix_buffers_with_gain_t mix_buffers_with_gain; LIBARDOUR_API extern mix_buffers_no_gain_t mix_buffers_no_gain; + LIBARDOUR_API extern copy_vector_t copy_vector; } #endif /* __ardour_runtime_functions_h__ */ diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc index cf8f8dc03f..188796023a 100644 --- a/libs/ardour/globals.cc +++ b/libs/ardour/globals.cc @@ -127,6 +127,7 @@ find_peaks_t ARDOUR::find_peaks = 0; apply_gain_to_buffer_t ARDOUR::apply_gain_to_buffer = 0; mix_buffers_with_gain_t ARDOUR::mix_buffers_with_gain = 0; mix_buffers_no_gain_t ARDOUR::mix_buffers_no_gain = 0; +copy_vector_t ARDOUR::copy_vector = 0; PBD::Signal1 ARDOUR::BootMessage; PBD::Signal3 ARDOUR::PluginScanMessage; @@ -153,7 +154,21 @@ setup_hardware_optimization (bool try_optimization) #if defined (ARCH_X86) && defined (BUILD_SSE_OPTIMIZATIONS) - if (fpu.has_sse()) { + if (fpu.has_avx()) { + + info << "Using AVX optimized routines" << endmsg; + + // AVX SET + compute_peak = x86_sse_avx_compute_peak; + find_peaks = x86_sse_avx_find_peaks; + apply_gain_to_buffer = x86_sse_avx_apply_gain_to_buffer; + mix_buffers_with_gain = x86_sse_avx_mix_buffers_with_gain; + mix_buffers_no_gain = x86_sse_avx_mix_buffers_no_gain; + copy_vector = x86_sse_avx_copy_vector; + + generic_mix_functions = false; + + } else if (fpu.has_sse()) { info << "Using SSE optimized routines" << endmsg; @@ -163,6 +178,7 @@ setup_hardware_optimization (bool try_optimization) apply_gain_to_buffer = x86_sse_apply_gain_to_buffer; mix_buffers_with_gain = x86_sse_mix_buffers_with_gain; mix_buffers_no_gain = x86_sse_mix_buffers_no_gain; + copy_vector = default_copy_vector; generic_mix_functions = false; @@ -180,6 +196,7 @@ setup_hardware_optimization (bool try_optimization) apply_gain_to_buffer = veclib_apply_gain_to_buffer; mix_buffers_with_gain = veclib_mix_buffers_with_gain; mix_buffers_no_gain = veclib_mix_buffers_no_gain; + copy_vector = default_copy_vector; generic_mix_functions = false; @@ -199,6 +216,7 @@ setup_hardware_optimization (bool try_optimization) apply_gain_to_buffer = default_apply_gain_to_buffer; mix_buffers_with_gain = default_mix_buffers_with_gain; mix_buffers_no_gain = default_mix_buffers_no_gain; + copy_vector = default_copy_vector; info << "No H/W specific optimizations in use" << endmsg; } diff --git a/libs/ardour/mix.cc b/libs/ardour/mix.cc index adae68ae7f..96ae624487 100644 --- a/libs/ardour/mix.cc +++ b/libs/ardour/mix.cc @@ -136,6 +136,12 @@ default_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, p } } +void +default_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, pframes_t nframes) +{ + memcpy(dst, src, nframes*sizeof(ARDOUR::Sample)); +} + #if defined (__APPLE__) && defined (BUILD_VECLIB_OPTIMIZATIONS) #include 
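The assembly file added next keeps a fast path for 32-byte-aligned data and scalar pre/post loops for everything else, which is what the consistency testing mentioned in the commit message has to exercise. An illustrative harness for that kind of check (not the project's actual test code; the function-pointer type mirrors the mix_buffers_with_gain signature above) might look like:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    typedef void (*mix_fn) (float* dst, const float* src, unsigned int nframes, float gain);

    // scalar reference, same arithmetic as default_mix_buffers_with_gain
    static void ref_mix_with_gain (float* dst, const float* src, unsigned int n, float gain)
    {
        for (unsigned int i = 0; i < n; ++i) {
            dst[i] += src[i] * gain;
        }
    }

    // Run the optimised routine and the reference at several start offsets so that
    // both the aligned vector path and the unaligned pre/post loops get executed,
    // then compare the results element by element.
    static bool check_mix (mix_fn optimised, unsigned int nframes)
    {
        std::vector<float> src (nframes + 8), a (nframes + 8), b (nframes + 8);
        for (unsigned int i = 0; i < src.size (); ++i) {
            src[i] = 0.001f * i;
            a[i] = b[i] = 1.0f - 0.0005f * i;
        }
        for (unsigned int off = 0; off < 8; ++off) {     // shifting the start breaks alignment
            optimised (&a[off], &src[off], nframes, 0.5f);
            ref_mix_with_gain (&b[off], &src[off], nframes, 0.5f);
        }
        for (unsigned int i = 0; i < a.size (); ++i) {
            if (std::fabs (a[i] - b[i]) > 1e-6f) {
                std::printf ("mismatch at sample %u\n", i);
                return false;
            }
        }
        return true;
    }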
diff --git a/libs/ardour/sse_avx_functions_64bit_win.s b/libs/ardour/sse_avx_functions_64bit_win.s new file mode 100644 index 0000000000..22829db98a --- /dev/null +++ b/libs/ardour/sse_avx_functions_64bit_win.s @@ -0,0 +1,587 @@ +/* + Copyright (C) 2005-2006 Paul Davis, John Rigg + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Author: Sampo Savolainen + 64-bit conversion: John Rigg + + $Id$ +*/ + +#; Microsoft version of AVX sample processing functions + +#; void x86_sse_avx_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain); + +.globl x86_sse_avx_mix_buffers_with_gain + .def x86_sse_avx_mix_buffers_with_gain; .scl 2; .type 32; +.endef + +x86_sse_avx_mix_buffers_with_gain: + +#; due to Microsoft calling convention +#; %rcx float *dst +#; %rdx float *src +#; %r8 unsigned int nframes +#; %xmm3 float gain + + pushq %rbp + movq %rsp, %rbp + + #; save the registers + pushq %rbx #; must be preserved + + #; move current max to %xmm0 for convenience + movss %xmm3, %xmm0 + + #; if nframes == 0, go to end + cmp $0, %r8 + je .MBWG_END + + #; Check for alignment + + movq %rcx, %rax + andq $28, %rax #; mask alignment offset + + movq %rdx, %rbx + andq $28, %rbx #; mask alignment offset + + cmp %rax, %rbx + jne .MBWG_NONALIGN #; if buffer are not aligned between each other, calculate manually + + #; if we are aligned + cmp $0, %rbx + jz .MBWG_AVX + + #; Pre-loop, we need to run 1-7 frames "manually" without + #; SSE instructions + +.MBWG_PRELOOP: + + #; gain is already in %xmm0 + movss (%rdx), %xmm1 + mulss %xmm0, %xmm1 + addss (%rcx), %xmm1 + movss %xmm1, (%rcx) + + addq $4, %rcx #; dst++ + addq $4, %rdx #; src++ + decq %r8 #; nframes-- + jz .MBWG_END + + addq $4, %rbx + + cmp $32, %rbx #; test if we've reached 32 byte alignment + jne .MBWG_PRELOOP + +.MBWG_AVX: + + cmp $8, %r8 #; we know it's not zero, but if it's not >=4, then + jl .MBWG_NONALIGN #; we jump straight to the "normal" code + + #; set up the gain buffer (gain is already in %xmm0) + vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread single float value to the first 128 bits of ymm0 register + vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; extend the first 128 bits of ymm0 register to higher 128 bits + +.MBWG_AVXLOOP: + + vmovaps (%rdx), %ymm1 #; source => xmm0 + vmulps %ymm0, %ymm1, %ymm2 #; apply gain to source + vaddps (%rcx), %ymm2, %ymm1 #; mix with destination + vmovaps %ymm1, (%rcx) #; copy result to destination + + addq $32, %rcx #; dst+=8 + addq $32, %rdx #; src+=8 + + subq $8, %r8 #; nframes-=8 + cmp $8, %r8 + jge .MBWG_AVXLOOP + + #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties + vzeroupper + + cmp $0, %r8 + je .MBWG_END + + #; if there are remaining frames, the nonalign code will do nicely + #; for the rest 1-7 frames. + +.MBWG_NONALIGN: + #; not aligned! 
+ + #; gain is already in %xmm0 + +.MBWG_NONALIGNLOOP: + + movss (%rdx), %xmm1 + mulss %xmm0, %xmm1 + addss (%rcx), %xmm1 + movss %xmm1, (%rcx) + + addq $4, %rcx + addq $4, %rdx + + decq %r8 + jnz .MBWG_NONALIGNLOOP + +.MBWG_END: + + popq %rbx + + #; return + leave + ret + + +#; void x86_sse_avx_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes); + +.globl x86_sse_avx_mix_buffers_no_gain + .def x86_sse_avx_mix_buffers_no_gain; .scl 2; .type 32; +.endef + +x86_sse_avx_mix_buffers_no_gain: + +#; due to Microsoft calling convention +#; %rcx float *dst +#; %rdx float *src +#; %r8 unsigned int nframes + + pushq %rbp + movq %rsp, %rbp + + #; save the registers + pushq %rbx #; must be preserved + + #; the real function + + #; if nframes == 0, go to end + cmp $0, %r8 + je .MBNG_END + + #; Check for alignment + + movq %rcx, %rax + andq $28, %rax #; mask alignment offset + + movq %rdx, %rbx + andq $28, %rbx #; mask alignment offset + + cmp %rax, %rbx + jne .MBNG_NONALIGN #; if not buffers are not aligned btween each other, calculate manually + + cmp $0, %rbx + je .MBNG_AVX #; aligned at 32, rpoceed to AVX + + #; Pre-loop, we need to run 1-7 frames "manually" without + #; AVX instructions + +.MBNG_PRELOOP: + + movss (%rdx), %xmm0 + addss (%rcx), %xmm0 + movss %xmm0, (%rcx) + + addq $4, %rcx #; dst++ + addq $4, %rdx #; src++ + + decq %r8 #; nframes-- + jz .MBNG_END + + addq $4, %rbx #; one non-aligned byte less + + cmp $32, %rbx #; test if we've reached 32 byte alignment + jne .MBNG_PRELOOP + +.MBNG_AVX: + + cmp $8, %r8 #; if there are frames left, but less than 8 + jl .MBNG_NONALIGN #; we can't run AVX + +.MBNG_AVXLOOP: + + vmovaps (%rdx), %ymm0 #; source => xmm0 + vaddps (%rcx), %ymm0, %ymm1 #; mix with destination + vmovaps %ymm1, (%rcx) #; copy result to destination + + addq $32, %rcx #; dst+=8 + addq $32, %rdx #; src+=8 + + subq $8, %r8 #; nframes-=8 + cmp $8, %r8 + jge .MBNG_AVXLOOP + + #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties + vzeroupper + + cmp $0, %r8 + je .MBNG_END + + #; if there are remaining frames, the nonalign code will do nicely + #; for the rest 1-7 frames. + +.MBNG_NONALIGN: + #; not aligned! 
+ #; + + movss (%rdx), %xmm0 #; src => xmm0 + addss (%rcx), %xmm0 #; xmm0 += dst + movss %xmm0, (%rcx) #; xmm0 => dst + + addq $4, %rcx + addq $4, %rdx + + decq %r8 + jnz .MBNG_NONALIGN + +.MBNG_END: + + popq %rbx + + #; return + leave + ret + + +#; void x86_sse_avx_copy_vector (float *dst, float *src, unsigned int nframes); + +.globl x86_sse_avx_copy_vector + .def x86_sse_avx_copy_vector; .scl 2; .type 32; +.endef + +x86_sse_avx_copy_vector: + +#; due to Microsoft calling convention +#; %rcx float *dst +#; %rdx float *src +#; %r8 unsigned int nframes + + pushq %rbp + movq %rsp, %rbp + + #; save the registers + pushq %rbx #; must be preserved + + #; the real function + + #; if nframes == 0, go to end + cmp $0, %r8 + je .CB_END + + #; Check for alignment + + movq %rcx, %rax + andq $28, %rax #; mask alignment offset + + movq %rdx, %rbx + andq $28, %rbx #; mask alignment offset + + cmp %rax, %rbx + jne .CB_NONALIGN #; if not buffers are not aligned btween each other, calculate manually + + cmp $0, %rbx + je .CB_AVX #; aligned at 32, rpoceed to AVX + + #; Pre-loop, we need to run 1-7 frames "manually" without + #; AVX instructions + +.CB_PRELOOP: + + movss (%rdx), %xmm0 + movss %xmm0, (%rcx) + + addq $4, %rcx #; dst++ + addq $4, %rdx #; src++ + + decq %r8 #; nframes-- + jz .CB_END + + addq $4, %rbx #; one non-aligned byte less + + cmp $32, %rbx #; test if we've reached 32 byte alignment + jne .CB_PRELOOP + +.CB_AVX: + + cmp $8, %r8 #; if there are frames left, but less than 8 + jl .CB_NONALIGN #; we can't run AVX + +.CB_AVXLOOP: + + vmovaps (%rdx), %ymm0 #; source => xmm0 + vmovaps %ymm0, (%rcx) #; copy result to destination + + addq $32, %rcx #; dst+=8 + addq $32, %rdx #; src+=8 + + subq $8, %r8 #; nframes-=8 + cmp $8, %r8 + jge .CB_AVXLOOP + + #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties + vzeroupper + + cmp $0, %r8 + je .CB_END + + #; if there are remaining frames, the nonalign code will do nicely + #; for the rest 1-7 frames. + +.CB_NONALIGN: + #; not aligned! 
+ #; + + movss (%rdx), %xmm0 #; src => xmm0 + movss %xmm0, (%rcx) #; xmm0 => dst + + addq $4, %rcx + addq $4, %rdx + + decq %r8 + jnz .CB_NONALIGN + +.CB_END: + + popq %rbx + + #; return + leave + ret + + +#; void x86_sse_avx_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain); + +.globl x86_sse_avx_apply_gain_to_buffer + .def x86_sse_avx_apply_gain_to_buffer; .scl 2; .type 32; +.endef + +x86_sse_avx_apply_gain_to_buffer: + +#; due to Microsoft calling convention +#; %rcx float *buf 32(%rbp) +#; %rdx unsigned int nframes +#; %xmm2 float gain avx specific register + + pushq %rbp + movq %rsp, %rbp + + #; move current max to %xmm0 for convenience + movss %xmm2, %xmm0 + + #; the real function + + #; if nframes == 0, go to end + cmp $0, %rdx + je .AG_END + + #; Check for alignment + + movq %rcx, %r8 #; buf => %rdx + andq $28, %r8 #; check alignment with mask 11100 + jz .AG_AVX #; if buffer IS aligned + + #; PRE-LOOP + #; we iterate 1-7 times, doing normal x87 float comparison + #; so we reach a 32 byte aligned "buf" (=%rdi) value + +.AGLP_START: + + #; Load next value from the buffer into %xmm1 + movss (%rcx), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, (%rcx) + + #; increment buffer, decrement counter + addq $4, %rcx #; buf++; + + decq %rdx #; nframes-- + jz .AG_END #; if we run out of frames, we go to the end + + addq $4, %r8 #; one non-aligned byte less + cmp $16, %r8 + jne .AGLP_START #; if more non-aligned frames exist, we do a do-over + +.AG_AVX: + + #; We have reached the 32 byte aligned "buf" ("rcx") value + #; use AVX instructions + + #; Figure out how many loops we should do + movq %rdx, %rax #; copy remaining nframes to %rax for division + + shr $3, %rax #; unsigned divide by 8 + + #; %rax = AVX iterations + cmp $0, %rax + je .AGPOST_START + + #; set up the gain buffer (gain is already in %xmm0) + vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread single float value to the first 128 bits of ymm0 register + vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; extend the first 128 bits of ymm0 register to higher 128 bits + +.AGLP_AVX: + + vmovaps (%rcx), %ymm1 + vmulps %ymm0, %ymm1, %ymm2 + vmovaps %ymm2, (%rcx) + + addq $32, %rcx #; buf + 8 + subq $8, %rdx #; nframes-=8 + + decq %rax + jnz .AGLP_AVX + + #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties + vzeroupper + + #; Next we need to post-process all remaining frames + #; the remaining frame count is in %rcx + cmpq $0, %rdx #; + jz .AG_END + +.AGPOST_START: + + movss (%rcx), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, (%rcx) + + #; increment buffer, decrement counter + addq $4, %rcx #; buf++; + + decq %rdx #; nframes-- + jnz .AGPOST_START #; if we run out of frames, we go to the end + +.AG_END: + + #; return + leave + ret + +#; end proc + + +#; float x86_sse_avx_compute_peak(float *buf, long nframes, float current); + +.globl x86_sse_avx_compute_peak + .def x86_sse_avx_compute_peak; .scl 2; .type 32; +.endef + +x86_sse_avx_compute_peak: + +#; due to Microsoft calling convention +#; %rcx float* buf 32(%rbp) +#; %rdx unsigned int nframes +#; %xmm2 float current + + pushq %rbp + movq %rsp, %rbp + + #; move current max to %xmm0 for convenience + movss %xmm2, %xmm0 + + #; if nframes == 0, go to end + cmp $0, %rdx + je .CP_END + + #; Check for alignment + movq %rcx, %r8 #; buf => %rdx + andq $28, %r8 #; mask bits 1 & 2 + jz .CP_AVX #; if buffer IS aligned + + #; PRE-LOOP + #; we iterate 1-7 times, doing normal x87 float comparison + #; so we reach a 32 byte aligned "buf" (=%rcx) value + +.LP_START: + 
+ #; Load next value from the buffer + movss (%rcx), %xmm1 + maxss %xmm1, %xmm0 + + #; increment buffer, decrement counter + addq $4, %rcx #; buf++; + + decq %rdx #; nframes-- + jz .CP_END #; if we run out of frames, we go to the end + + addq $4, %r8 #; one non-aligned byte less + cmp $32, %r8 + jne .LP_START #; if more non-aligned frames exist, we do a do-over + +.CP_AVX: + + #; We have reached the 32 byte aligned "buf" ("rdi") value + + #; Figure out how many loops we should do + movq %rdx, %rax #; copy remaining nframes to %rax for division + + shr $3, %rax #; unsigned divide by 8 + jz .POST_START + + #; %rax = AVX iterations + + #; current maximum is at %xmm0, but we need to broadcast it to the whole ymm0 register.. + vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread single float value to the all 128 bits of xmm0 register + vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; extend the first 128 bits of ymm0 register to higher 128 bits + +.LP_AVX: + + vmovaps (%rcx), %ymm1 + vmaxps %ymm1, %ymm0, %ymm0 + + addq $32, %rcx #; buf+=8 + subq $8, %rdx #; nframes-=8 + + decq %rax + jnz .LP_AVX + + #; Calculate the maximum value contained in the 4 FP's in %ymm0 + vshufps $0x4e, %ymm0, %ymm0, %ymm1 #; shuffle left & right pairs (1234 => 3412) in each 128 bit half + vmaxps %ymm1, %ymm0, %ymm0 #; maximums of the four pairs, if each of 8 elements was unique, 4 unique elements left now + vshufps $0xb1, %ymm0, %ymm0, %ymm1 #; shuffle the floats inside pairs (1234 => 2143) in each 128 bit half + vmaxps %ymm1, %ymm0, %ymm0 #; maximums of the four pairs, we had up to 4 unique elements was unique, 2 unique elements left now + vperm2f128 $0x01, %ymm0, %ymm0, %ymm1 #; swap 128 bit halfs + vmaxps %ymm1, %ymm0, %ymm0 #; the result will be - all 8 elemens are maximums + + #; now every float in %ymm0 is the same value, current maximum value + + #; Next we need to post-process all remaining frames + #; the remaining frame count is in %rcx + + #; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties + vzeroupper + + #; if no remaining frames, jump to the end + cmp $0, %rdx + je .CP_END + +.POST_START: + + movss (%rcx), %xmm1 + maxss %xmm1, %xmm0 + + addq $4, %rcx #; buf++; + + decq %rdx #; nframes--; + jnz .POST_START + +.CP_END: + + #; return value is in xmm0 + + #; return + leave + ret + +#; end proc \ No newline at end of file diff --git a/libs/ardour/sse_functions_avx.cc b/libs/ardour/sse_functions_avx.cc new file mode 100644 index 0000000000..8c076aacb5 --- /dev/null +++ b/libs/ardour/sse_functions_avx.cc @@ -0,0 +1,120 @@ +/* + Copyright (C) 2007 Paul sDavis + Written by Sampo Savolainen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+
+*/
+
+#include <xmmintrin.h>
+#include <immintrin.h>
+#include "ardour/types.h"
+
+
+void
+x86_sse_avx_find_peaks(const ARDOUR::Sample* buf, ARDOUR::pframes_t nframes, float *min, float *max)
+{
+    __m256 current_max, current_min, work;
+
+    // Load max and min values into all eight slots of the YMM registers
+    current_min = _mm256_set1_ps(*min);
+    current_max = _mm256_set1_ps(*max);
+
+    // Work input until "buf" reaches 32 byte alignment
+    while ( ((intptr_t)buf) % 32 != 0 && nframes > 0) {
+
+        // Load the next float into the work buffer
+        work = _mm256_set1_ps(*buf);
+
+        current_min = _mm256_min_ps(current_min, work);
+        current_max = _mm256_max_ps(current_max, work);
+
+        buf++;
+        nframes--;
+    }
+
+    // use 64 byte prefetch for quadruple quads:
+    // load each 64 bytes into cache before processing
+    while (nframes >= 16) {
+#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
+        _mm_prefetch(((char*)buf+64), _mm_hint(0) ); // A total guess! Assumed to be equivalent to
+#else                                                // the line below but waiting to be tested !!
+        __builtin_prefetch(buf+64,0,0);
+#endif
+        work = _mm256_load_ps(buf);
+        current_min = _mm256_min_ps(current_min, work);
+        current_max = _mm256_max_ps(current_max, work);
+        buf+=8;
+        work = _mm256_load_ps(buf);
+        current_min = _mm256_min_ps(current_min, work);
+        current_max = _mm256_max_ps(current_max, work);
+        buf+=8;
+
+        nframes-=16;
+    }
+
+    // work through 32 bytes aligned buffers
+    while (nframes >= 8) {
+
+        work = _mm256_load_ps(buf);
+
+        current_min = _mm256_min_ps(current_min, work);
+        current_max = _mm256_max_ps(current_max, work);
+
+        buf+=8;
+        nframes-=8;
+    }
+
+    // work through the rest, < 8 samples
+    while ( nframes > 0) {
+
+        // Load the next float into the work buffer
+        work = _mm256_set1_ps(*buf);
+
+        current_min = _mm256_min_ps(current_min, work);
+        current_max = _mm256_max_ps(current_max, work);
+
+        buf++;
+        nframes--;
+    }
+
+    // Find the min & max across the 8 lanes of current_min/current_max via shuffles, then swap 128 bit halves
+    work = current_min;
+    work = _mm256_shuffle_ps (current_min, current_min, _MM_SHUFFLE(2, 3, 0, 1));
+    current_min = _mm256_min_ps (work, current_min);
+    work = _mm256_shuffle_ps (current_min, current_min, _MM_SHUFFLE(1, 0, 3, 2));
+    current_min = _mm256_min_ps (work, current_min);
+    work = _mm256_permute2f128_ps( current_min, current_min, 1);
+    current_min = _mm256_min_ps (work, current_min);
+
+    *min = current_min[0];
+
+    work = current_max;
+    work = _mm256_shuffle_ps(current_max, current_max, _MM_SHUFFLE(2, 3, 0, 1));
+    current_max = _mm256_max_ps (work, current_max);
+    work = _mm256_shuffle_ps(current_max, current_max, _MM_SHUFFLE(1, 0, 3, 2));
+    current_max = _mm256_max_ps (work, current_max);
+    work = _mm256_permute2f128_ps( current_max, current_max, 1);
+    current_max = _mm256_max_ps (work, current_max);
+
+    *max = current_max[0];
+
+    // zero the upper 128 bits of the 256 bit ymm registers to avoid penalties when using non-AVX instructions
+    _mm256_zeroupper ();
+}
+
+
diff --git a/libs/ardour/windows/libardour.vcxproj b/libs/ardour/windows/libardour.vcxproj
index 5d842cccf9..4952e05b62 100644
--- a/libs/ardour/windows/libardour.vcxproj
+++ b/libs/ardour/windows/libardour.vcxproj
@@ -14,6 +14,8 @@
+
+
diff --git a/libs/ardour/windows/libardour.vcxproj.filters b/libs/ardour/windows/libardour.vcxproj.filters
index 3dc241cfbe..3149130fbf 100644
--- a/libs/ardour/windows/libardour.vcxproj.filters
+++ b/libs/ardour/windows/libardour.vcxproj.filters
@@ -33,6 +33,8 @@
       scripts
+
+
diff --git a/libs/ardour/wscript b/libs/ardour/wscript
index eb263e6e78..5d9a09d37e 100644
--- a/libs/ardour/wscript
+++
b/libs/ardour/wscript @@ -414,7 +414,8 @@ def build(bld): u = PLATFORM.uname () cpu = u[4] if re.search ("(x86_64|AMD64)", cpu) != None: - obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit_win.s' ] + obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_avx.cc' ] + obj.source += [ 'sse_functions_64bit_win.s', 'sse_avx_functions_64bit_win.s' ] # i18n if bld.is_defined('ENABLE_NLS'): diff --git a/libs/backends/wavesaudio/waves_audiobackend.cc b/libs/backends/wavesaudio/waves_audiobackend.cc index 407dc180db..f78c528228 100644 --- a/libs/backends/wavesaudio/waves_audiobackend.cc +++ b/libs/backends/wavesaudio/waves_audiobackend.cc @@ -21,6 +21,8 @@ #include "waves_audioport.h" #include "waves_midiport.h" +#include "ardour/runtime_functions.h" + using namespace ARDOUR; #ifdef __MINGW64__ @@ -1169,13 +1171,12 @@ WavesAudioBackend::_read_audio_data_from_device (const float* input_buffer, pfra { #if defined(PLATFORM_WINDOWS) const float **buffer = (const float**)input_buffer; - size_t copied_bytes = nframes*sizeof(float); for(std::vector::iterator it = _physical_audio_inputs.begin (); it != _physical_audio_inputs.end(); ++it) { - memcpy((*it)->buffer(), *buffer, copied_bytes); + ARDOUR::copy_vector ((*it)->buffer(), *buffer, nframes); ++buffer; } #else diff --git a/libs/backends/wavesaudio/waves_audioport.cc b/libs/backends/wavesaudio/waves_audioport.cc index 84f09c0680..6d72dd5501 100644 --- a/libs/backends/wavesaudio/waves_audioport.cc +++ b/libs/backends/wavesaudio/waves_audioport.cc @@ -50,7 +50,8 @@ void* WavesAudioPort::get_buffer (pframes_t nframes) */ // get first buffer data - memcpy (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes * sizeof (Sample)); + // use optimized function to fill the buffer intialy + ARDOUR::copy_vector (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes); ++it; // mix the rest diff --git a/libs/pbd/msvc/fpu.cc b/libs/pbd/msvc/fpu.cc index 2ade2ad511..0c46ac3c94 100644 --- a/libs/pbd/msvc/fpu.cc +++ b/libs/pbd/msvc/fpu.cc @@ -20,7 +20,8 @@ using namespace std; FPU::FPU () { - unsigned long cpuflags = 0; + unsigned long cpuflags_ECX = 0; + unsigned long cpuflags_EDX = 0; _flags = (Flags)0; @@ -33,24 +34,26 @@ FPU::FPU () // no need to use assembler for getting info from register, this function does this for us int cpuInfo[4]; __cpuid (cpuInfo, 1); - cpuflags = cpuInfo[3]; + cpuflags_ECX = cpuInfo[2]; // flags from ECX register + cpuflags_EDX = cpuInfo[3]; // flags from EDX register - if (cpuflags & (1<<25)) { + if (cpuflags_ECX & (1<<28)) { + _flags = Flags (_flags | (HasAVX) ); + } + + if (cpuflags_EDX & (1<<25)) { _flags = Flags (_flags | (HasSSE|HasFlushToZero) ); } - if (cpuflags & (1<<26)) { + if (cpuflags_EDX & (1<<26)) { _flags = Flags (_flags | HasSSE2); } - if (cpuflags & (1 << 24)) { - char** fxbuf = 0; + if (cpuflags_EDX & (1 << 24)) { + char* fxbuf = 0; // allocate alligned buffer - fxbuf = (char **) malloc (sizeof (char *)); - assert (fxbuf); - *fxbuf = (char *) malloc (512); - assert (*fxbuf); + fxbuf = (char*)_aligned_malloc(512, 16); // Verify that fxbuf is correctly aligned unsigned long long buf_addr = (unsigned long long)(void*)fxbuf; @@ -58,25 +61,15 @@ FPU::FPU () error << _("cannot allocate 16 byte aligned buffer for h/w feature detection") << endmsg; else { - memset(*fxbuf, 0, 512); // Initialize the buffer !!! Added by JE - 12-12-2009 + memset(fxbuf, 0, 512); // Initialize the buffer !!! 
Added by JE - 12-12-2009 #if defined (COMPILER_MINGW) asm volatile ( "fxsave (%0)" : - : "r" (*fxbuf) + : "r" (fxbuf) : "memory" ); -/* - asm( ".intel_syntax noprefix\n" ); - - asm volatile ( - "mov eax, fxbuf\n" - "fxsave [eax]\n" - ); - - asm( ".att_syntax prefix\n" ); -*/ #elif defined (COMPILER_MSVC) __asm { @@ -96,8 +89,7 @@ FPU::FPU () _flags = Flags (_flags | HasDenormalsAreZero); } - free (*fxbuf); - free (fxbuf); + _aligned_free (fxbuf); } } } diff --git a/libs/pbd/pbd/fpu.h b/libs/pbd/pbd/fpu.h index 6627951e9f..260cf4db85 100644 --- a/libs/pbd/pbd/fpu.h +++ b/libs/pbd/pbd/fpu.h @@ -30,7 +30,8 @@ class LIBPBD_API FPU { HasFlushToZero = 0x1, HasDenormalsAreZero = 0x2, HasSSE = 0x4, - HasSSE2 = 0x8 + HasSSE2 = 0x8, + HasAVX = 0x10 }; public: @@ -41,6 +42,7 @@ class LIBPBD_API FPU { bool has_denormals_are_zero () const { return _flags & HasDenormalsAreZero; } bool has_sse () const { return _flags & HasSSE; } bool has_sse2 () const { return _flags & HasSSE2; } + bool has_avx () const { return _flags & HasAVX; } private: Flags _flags; diff --git a/libs/pbd/wscript b/libs/pbd/wscript index 8554c39491..93d83cd9d9 100644 --- a/libs/pbd/wscript +++ b/libs/pbd/wscript @@ -155,7 +155,7 @@ def build(bld): cpu = u[4] if re.search ("(x86_64|AMD64)", cpu) != None: obj.defines += [ 'USE_X86_64_ASM' ] - obj.defines += ['NO_POSIX_MEMALIGN' ] + obj.defines += ['NO_POSIX_MEMALIGN' ] obj.source += [ 'windows_special_dirs.cc' ] obj.source += [ 'msvc/fpu.cc' ] obj.uselib += ' OLE' diff --git a/wscript b/wscript index 2f552b6e56..dee423afce 100755 --- a/wscript +++ b/wscript @@ -262,12 +262,11 @@ def set_compiler_flags (conf,opt): if (re.search ("(x86_64|AMD64)", cpu) != None): # on Windows sse is supported by 64 bit platforms only build_host_supports_sse = True - + # mingw GCC compiler to uses at&t (Unix specific) assembler dialect by default # compiler_flags.append (["--mmnemonic=att", "msyntax=att") - compiler_flags.extend (["-msse", "-mfpmath=sse", "-DUSE_XMMINTRIN", "-masm=att"]) - + compiler_flags.extend (["-mavx", "-mvzeroupper", "-DUSE_XMMINTRIN"]) # end of processor-specific section From 976f2b2945c47469f20d1025d61d1ba20e25828d Mon Sep 17 00:00:00 2001 From: Greg Zharun Date: Tue, 21 Apr 2015 13:37:08 +0300 Subject: [PATCH 13/13] Revert "[Summary] Added cleanup for GUI properties when route is removed." This reverts commit c1af68b7f61a2ffee224614e7193e1ca0c9d6223. 
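A note on the FPU feature probe that the AVX patch above changes in libs/pbd/msvc/fpu.cc and libs/pbd/pbd/fpu.h: __cpuid(cpuInfo, 1) fills EAX/EBX/ECX/EDX, the AVX flag is ECX bit 28 (hence the new HasAVX value and has_avx() accessor), SSE and SSE2 stay in EDX bits 25 and 26, and EDX bit 24 (FXSR) gates the fxsave-based denormal checks, whose buffer is now obtained with _aligned_malloc/_aligned_free instead of the old double malloc. The sketch below restates that probe in isolation; cpu_has_usable_avx is an invented name, and the OSXSAVE/XGETBV step at the end is an extra safeguard the patch itself does not perform (ECX bit 28 only says the CPU implements AVX, while ECX bit 27 plus XGETBV(0) confirms the OS saves YMM state).

// Standalone restatement of the MSVC-side CPUID probe, for illustration only.
#include <intrin.h>      // __cpuid
#include <immintrin.h>   // _xgetbv

bool
cpu_has_usable_avx ()
{
    int info[4];                                      // EAX, EBX, ECX, EDX for leaf 1
    __cpuid (info, 1);

    const bool avx     = (info[2] & (1 << 28)) != 0;  // ECX bit 28: AVX instructions
    const bool osxsave = (info[2] & (1 << 27)) != 0;  // ECX bit 27: OS uses XSAVE/XRSTOR

    if (!avx || !osxsave) {
        return false;
    }

    // beyond what the patch checks: XMM (bit 1) and YMM (bit 2) state must be
    // enabled in XCR0, otherwise AVX instructions fault even if bit 28 is set
    const unsigned long long xcr0 = _xgetbv (0);
    return (xcr0 & 0x6) == 0x6;
}

On the build side, the top-level wscript hunk above swaps -msse/-mfpmath=sse/-masm=att for -mavx and -mvzeroupper on 64-bit Windows, which allows the compiler itself to emit AVX anywhere in the code it generates, independent of the runtime has_avx() dispatch.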
--- gtk2_ardour/automation_time_axis.cc | 1 - gtk2_ardour/axis_view.h | 8 ------ gtk2_ardour/gui_object.cc | 40 ----------------------------- gtk2_ardour/gui_object.h | 6 +---- gtk2_ardour/route_time_axis.cc | 3 --- gtk2_ardour/route_ui.cc | 6 +---- 6 files changed, 2 insertions(+), 62 deletions(-) diff --git a/gtk2_ardour/automation_time_axis.cc b/gtk2_ardour/automation_time_axis.cc index 79f7c4bfcf..1a99c0bc98 100644 --- a/gtk2_ardour/automation_time_axis.cc +++ b/gtk2_ardour/automation_time_axis.cc @@ -231,7 +231,6 @@ AutomationTimeAxisView::AutomationTimeAxisView ( AutomationTimeAxisView::~AutomationTimeAxisView () { - cleanup_gui_properties (); delete _view; } diff --git a/gtk2_ardour/axis_view.h b/gtk2_ardour/axis_view.h index 51a48d3150..4ce76c92ce 100644 --- a/gtk2_ardour/axis_view.h +++ b/gtk2_ardour/axis_view.h @@ -70,14 +70,6 @@ class AxisView : public virtual Selectable, public PBD::ScopedConnectionList, pu property_hashtable.emplace(property_name, s.str()); gui_object_state().set_property (state_id(), property_name, value); } - - void cleanup_gui_properties () - { - // remove related property node from the GUI state - gui_object_state().remove_node(state_id() ); - property_hashtable.clear (); - } - bool marked_for_display () const; virtual bool set_marked_for_display (bool); diff --git a/gtk2_ardour/gui_object.cc b/gtk2_ardour/gui_object.cc index bcc2fcdfa1..3e21f82fa4 100644 --- a/gtk2_ardour/gui_object.cc +++ b/gtk2_ardour/gui_object.cc @@ -71,46 +71,6 @@ GUIObjectState::get_or_add_node (const string& id) return get_or_add_node (&_state, id); } -/** Remove property from the node with provided id. - * If there is no properties except the node id - remove the node. - * @param id property of Object node to look for. - * @param prop_name name of the Object property to remove. - * @return value of true if property is found, or false if not. - */ - -bool -GUIObjectState::remove_property (const std::string& id, const std::string& prop_name) -{ - XMLNode* child = get_node (&_state, id); - - if (!child) { - return false; - } - - XMLProperty* p = child->property (prop_name ); - if (!p) { - return false; - } - - child->remove_property (prop_name ); - - if (child->children().empty() && child->properties().size() == 1 && child->property (X_("id")) ) { - remove_node (id); - } - - return true; -} - -/** Remove node with provided id. - * @param id property of Object node to look for. -*/ - -void -GUIObjectState::remove_node (const std::string& id) -{ - _state.remove_nodes_and_delete(X_("id"), id ); -} - /** Get a string from our state. * @param id property of Object node to look for. * @param prop_name name of the Object property to return. 
diff --git a/gtk2_ardour/gui_object.h b/gtk2_ardour/gui_object.h index 9868ef9971..ee6d1cdf4c 100644 --- a/gtk2_ardour/gui_object.h +++ b/gtk2_ardour/gui_object.h @@ -47,16 +47,12 @@ public: s << val; child->add_property (prop_name.c_str(), s.str()); } - - bool remove_property (const std::string& id, const std::string& prop_name); - + std::list all_ids () const; static XMLNode* get_node (const XMLNode *, const std::string &); XMLNode* get_or_add_node (const std::string &); static XMLNode* get_or_add_node (XMLNode *, const std::string &); - - void remove_node (const std::string& id); private: XMLNode _state; diff --git a/gtk2_ardour/route_time_axis.cc b/gtk2_ardour/route_time_axis.cc index d8cfd87e63..e0ce6cde3f 100644 --- a/gtk2_ardour/route_time_axis.cc +++ b/gtk2_ardour/route_time_axis.cc @@ -228,9 +228,6 @@ RouteTimeAxisView::set_route (boost::shared_ptr rt) RouteTimeAxisView::~RouteTimeAxisView () { - // must be handled before CatchDeletion (this) - cleanup_gui_properties (); - CatchDeletion (this); for (list::iterator i = processor_automation.begin(); i != processor_automation.end(); ++i) { diff --git a/gtk2_ardour/route_ui.cc b/gtk2_ardour/route_ui.cc index e9c3b022fb..4631dca820 100644 --- a/gtk2_ardour/route_ui.cc +++ b/gtk2_ardour/route_ui.cc @@ -102,13 +102,9 @@ RouteUI::RouteUI (ARDOUR::Session* sess, const std::string& layout_script_file) RouteUI::~RouteUI() { - // remove RouteUI property node from the GUI state - // must be handled before reseting _route - gui_object_state().remove_node(route_state_id() ); - _route.reset (); /* drop reference to route, so that it can be cleaned up */ route_connections.drop_connections (); - + delete solo_menu; delete mute_menu; delete sends_menu;