Merge branch 'performance'

2026-01-24 14:17:21 +01:00 · 2015-04-21 13:57:41 +03:00 · 2015-04-21 13:57:41 +03:00 · 2a84e1d0c5
commit 2a84e1d0c5
parent 869b456352 976f2b2945
23 changed files with 1600 additions and 114 deletions
--- a/gtk2_ardour/editor.cc
+++ b/gtk2_ardour/editor.cc
@ -5202,6 +5202,12 @@ Editor::add_routes (RouteList& routes)
 	connect_routes_and_update_global_rec_button (routes);
 }

+namespace {
+    bool tv_not_selected (TimeAxisView *tv) {
+        return !tv->get_selected ();
+    }
+}
+
 void
 Editor::timeaxisview_deleted (TimeAxisView *tv)
 {
@ -5245,19 +5251,20 @@ Editor::timeaxisview_deleted (TimeAxisView *tv)

 	if (current_mixer_strip && current_mixer_strip->route() == route) {

-		TimeAxisView* next_tv;
-
-		if (track_views.empty()) {
-			next_tv = 0;
-		} else if (i == track_views.end()) {
-			next_tv = track_views.front();
-		} else {
-			next_tv = (*i);
+        // find first non selected track
+        TimeAxisView* first_non_selected_tv = 0;
+        
+        if (!track_views.empty() ) {
+            
+            i = std::find_if (track_views.begin(), track_views.end(), tv_not_selected);
+            
+            if (i != track_views.end() ) {
+                first_non_selected_tv = (*i);
+            }
 		}
-
-
-		if (next_tv ) {
-			set_selected_mixer_strip (*next_tv);
+        
+        if (first_non_selected_tv ) {
+			set_selected_mixer_strip (*first_non_selected_tv);
 		} else {
 			/* make the editor mixer strip go away setting the
 			 * button to inactive (which also unticks the menu option)
--- a/libs/ardour/ardour/mix.h
+++ b/libs/ardour/ardour/mix.h
@ -33,7 +33,17 @@ extern "C" {
 	LIBARDOUR_API void  x86_sse_mix_buffers_no_gain  (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
 }

+extern "C" {
+/* AVX functions */
+	LIBARDOUR_API float x86_sse_avx_compute_peak         (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
+	LIBARDOUR_API void  x86_sse_avx_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
+	LIBARDOUR_API void  x86_sse_avx_mix_buffers_with_gain(ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
+	LIBARDOUR_API void  x86_sse_avx_mix_buffers_no_gain  (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+	LIBARDOUR_API void  x86_sse_avx_copy_vector          (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+}
+
 LIBARDOUR_API void  x86_sse_find_peaks               (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
+LIBARDOUR_API void  x86_sse_avx_find_peaks               (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);

 /* debug wrappers for SSE functions */

@ -41,6 +51,7 @@ LIBARDOUR_API float debug_compute_peak               (const ARDOUR::Sample * buf
 LIBARDOUR_API void  debug_apply_gain_to_buffer       (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void  debug_mix_buffers_with_gain      (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void  debug_mix_buffers_no_gain        (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+LIBARDOUR_API void  debug_copy_vector                (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);

 #endif

@ -61,5 +72,6 @@ LIBARDOUR_API void  default_find_peaks                (const ARDOUR::Sample * bu
 LIBARDOUR_API void  default_apply_gain_to_buffer      (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void  default_mix_buffers_with_gain     (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void  default_mix_buffers_no_gain       (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+LIBARDOUR_API void  default_copy_vector				  (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);

 #endif /* __ardour_mix_h__ */
--- a/libs/ardour/ardour/runtime_functions.h
+++ b/libs/ardour/ardour/runtime_functions.h
@ -25,17 +25,19 @@

 namespace ARDOUR {

-	typedef float (*compute_peak_t)			(const ARDOUR::Sample *, pframes_t, float);
-	typedef void  (*find_peaks_t)                   (const ARDOUR::Sample *, pframes_t, float *, float*);
+	typedef float (*compute_peak_t)			    (const ARDOUR::Sample *, pframes_t, float);
+	typedef void  (*find_peaks_t)               (const ARDOUR::Sample *, pframes_t, float *, float*);
 	typedef void  (*apply_gain_to_buffer_t)		(ARDOUR::Sample *, pframes_t, float);
 	typedef void  (*mix_buffers_with_gain_t)	(ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t, float);
 	typedef void  (*mix_buffers_no_gain_t)		(ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t);
+	typedef void  (*copy_vector_t)			    (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t);

 	LIBARDOUR_API extern compute_peak_t		compute_peak;
 	LIBARDOUR_API extern find_peaks_t               find_peaks;
 	LIBARDOUR_API extern apply_gain_to_buffer_t	apply_gain_to_buffer;
 	LIBARDOUR_API extern mix_buffers_with_gain_t	mix_buffers_with_gain;
 	LIBARDOUR_API extern mix_buffers_no_gain_t	mix_buffers_no_gain;
+	LIBARDOUR_API extern copy_vector_t			copy_vector;
 }

 #endif /* __ardour_runtime_functions_h__ */
--- a/libs/ardour/audio_buffer.cc
+++ b/libs/ardour/audio_buffer.cc
@ -43,7 +43,7 @@ AudioBuffer::AudioBuffer(size_t capacity)
 AudioBuffer::~AudioBuffer()
 {
 	if (_owns_data)
-		free(_data);
+		cache_aligned_free(_data);
 }

 void
@ -60,7 +60,7 @@ AudioBuffer::resize (size_t size)
 		return;
 	}

-	free (_data);
+	cache_aligned_free (_data);

 	cache_aligned_malloc ((void**) &_data, sizeof (Sample) * size);

--- a/libs/ardour/globals.cc
+++ b/libs/ardour/globals.cc
@ -127,6 +127,7 @@ find_peaks_t            ARDOUR::find_peaks = 0;
 apply_gain_to_buffer_t  ARDOUR::apply_gain_to_buffer = 0;
 mix_buffers_with_gain_t ARDOUR::mix_buffers_with_gain = 0;
 mix_buffers_no_gain_t   ARDOUR::mix_buffers_no_gain = 0;
+copy_vector_t			ARDOUR::copy_vector = 0;

 PBD::Signal1<void,std::string> ARDOUR::BootMessage;
 PBD::Signal3<void,std::string,std::string,bool> ARDOUR::PluginScanMessage;
@ -153,7 +154,21 @@ setup_hardware_optimization (bool try_optimization)

 #if defined (ARCH_X86) && defined (BUILD_SSE_OPTIMIZATIONS)

-		if (fpu.has_sse()) {
+		if (fpu.has_avx()) {
+
+			info << "Using AVX optimized routines" << endmsg;
+
+			// AVX SET
+			compute_peak          = x86_sse_avx_compute_peak;
+			find_peaks            = x86_sse_avx_find_peaks;
+			apply_gain_to_buffer  = x86_sse_avx_apply_gain_to_buffer;
+			mix_buffers_with_gain = x86_sse_avx_mix_buffers_with_gain;
+			mix_buffers_no_gain   = x86_sse_avx_mix_buffers_no_gain;
+			copy_vector           = x86_sse_avx_copy_vector;
+
+			generic_mix_functions = false;
+
+		} else if (fpu.has_sse()) {

 			info << "Using SSE optimized routines" << endmsg;

@ -163,6 +178,7 @@ setup_hardware_optimization (bool try_optimization)
 			apply_gain_to_buffer  = x86_sse_apply_gain_to_buffer;
 			mix_buffers_with_gain = x86_sse_mix_buffers_with_gain;
 			mix_buffers_no_gain   = x86_sse_mix_buffers_no_gain;
+			copy_vector           = default_copy_vector;

 			generic_mix_functions = false;

@ -180,6 +196,7 @@ setup_hardware_optimization (bool try_optimization)
 			apply_gain_to_buffer   = veclib_apply_gain_to_buffer;
 			mix_buffers_with_gain  = veclib_mix_buffers_with_gain;
 			mix_buffers_no_gain    = veclib_mix_buffers_no_gain;
+			copy_vector            = default_copy_vector;

 			generic_mix_functions = false;

@ -199,6 +216,7 @@ setup_hardware_optimization (bool try_optimization)
 		apply_gain_to_buffer  = default_apply_gain_to_buffer;
 		mix_buffers_with_gain = default_mix_buffers_with_gain;
 		mix_buffers_no_gain   = default_mix_buffers_no_gain;
+		copy_vector           = default_copy_vector;

 		info << "No H/W specific optimizations in use" << endmsg;
 	}
--- a/libs/ardour/midi_buffer.cc
+++ b/libs/ardour/midi_buffer.cc
@ -44,7 +44,7 @@ MidiBuffer::MidiBuffer(size_t capacity)

 MidiBuffer::~MidiBuffer()
 {
-	free(_data);
+	cache_aligned_free(_data);
 }

 void
@ -60,7 +60,7 @@ MidiBuffer::resize(size_t size)
 		return;
 	}

-	free (_data);
+	cache_aligned_free (_data);

 	cache_aligned_malloc ((void**) &_data, size);

--- a/libs/ardour/mix.cc
+++ b/libs/ardour/mix.cc
@ -32,7 +32,7 @@ using namespace ARDOUR;
 // Debug wrappers

 float
-debug_compute_peak (ARDOUR::Sample *buf, pframes_t nsamples, float current)
+debug_compute_peak (const ARDOUR::Sample *buf, pframes_t nsamples, float current)
 {
 	if ( ((intptr_t)buf % 16) != 0) {
 		std::cerr << "compute_peak(): buffer unaligned!" << std::endl;
@ -52,7 +52,7 @@ debug_apply_gain_to_buffer (ARDOUR::Sample *buf, pframes_t nframes, float gain)
 }

 void
-debug_mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t nframes, float gain)
+debug_mix_buffers_with_gain (ARDOUR::Sample *dst, const ARDOUR::Sample *src, pframes_t nframes, float gain)
 {
 	if ( ((intptr_t)dst & 15) != 0) {
 		std::cerr << "mix_buffers_with_gain(): dst unaligned!" << std::endl;
@ -67,7 +67,7 @@ debug_mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t
 }

 void
-debug_mix_buffers_no_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t nframes)
+debug_mix_buffers_no_gain (ARDOUR::Sample *dst, const ARDOUR::Sample *src, pframes_t nframes)
 {
 	if ( ((intptr_t)dst & 15) != 0) {
 		std::cerr << "mix_buffers_no_gain(): dst unaligned!" << std::endl;
@ -136,6 +136,12 @@ default_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, p
 	}
 }

+void
+default_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, pframes_t nframes)
+{
+	memcpy(dst, src, nframes*sizeof(ARDOUR::Sample));
+}
+
 #if defined (__APPLE__) && defined (BUILD_VECLIB_OPTIMIZATIONS)
 #include <Accelerate/Accelerate.h>

--- a/libs/ardour/sse_avx_functions_64bit_win.s
+++ b/libs/ardour/sse_avx_functions_64bit_win.s
@ -0,0 +1,587 @@
+/*
+    Copyright (C) 2005-2006 Paul Davis, John Rigg
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+	Author: Sampo Savolainen
+	64-bit conversion: John Rigg
+
+    $Id$
+*/
+
+#; Microsoft version of AVX sample processing functions
+
+#; void x86_sse_avx_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
+
+.globl x86_sse_avx_mix_buffers_with_gain
+	.def    x86_sse_avx_mix_buffers_with_gain; .scl    2;      .type   32;     
+.endef
+
+x86_sse_avx_mix_buffers_with_gain:
+
+#; due to Microsoft calling convention
+#; %rcx float *dst
+#; %rdx float *src
+#; %r8 unsigned int nframes
+#; %xmm3 float	gain
+
+	pushq %rbp
+	movq %rsp, %rbp
+
+	#; save the registers
+	pushq %rbx #; must be preserved
+	
+	#; move current max to %xmm0 for convenience
+	movss %xmm3, %xmm0
+
+	#; if nframes == 0, go to end
+	cmp	$0, %r8
+	je	.MBWG_END
+
+	#; Check for alignment
+
+	movq %rcx, %rax
+	andq $28, %rax #; mask alignment offset
+
+	movq %rdx, %rbx
+	andq $28, %rbx #; mask alignment offset
+
+	cmp %rax, %rbx
+	jne .MBWG_NONALIGN #; if buffer are not aligned between each other, calculate manually
+
+	#; if we are aligned
+	cmp $0, %rbx
+	jz .MBWG_AVX
+	
+	#; Pre-loop, we need to run 1-7 frames "manually" without
+	#; SSE instructions
+
+.MBWG_PRELOOP:
+	
+	#; gain is already in %xmm0
+	movss (%rdx), %xmm1
+	mulss %xmm0, %xmm1
+	addss (%rcx), %xmm1
+	movss %xmm1, (%rcx)
+
+	addq $4, %rcx #; dst++
+	addq $4, %rdx #; src++
+	decq %r8 	  #; nframes--
+	jz .MBWG_END
+
+	addq $4, %rbx
+	
+	cmp $32, %rbx #; test if we've reached 32 byte alignment
+	jne .MBWG_PRELOOP
+
+.MBWG_AVX:
+
+	cmp $8, %r8 #; we know it's not zero, but if it's not >=4, then
+	jl .MBWG_NONALIGN #; we jump straight to the "normal" code
+
+	#; set up the gain buffer (gain is already in %xmm0)
+	vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread single float value to the first 128 bits of ymm0 register
+	vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; extend the first 128 bits of ymm0 register to higher 128 bits
+
+.MBWG_AVXLOOP:
+
+	vmovaps	(%rdx), %ymm1        #; source => xmm0
+	vmulps	%ymm0,  %ymm1, %ymm2 #; apply gain to source
+	vaddps	(%rcx), %ymm2, %ymm1 #; mix with destination
+	vmovaps  %ymm1, (%rcx)        #; copy result to destination
+	
+	addq $32, %rcx #; dst+=8
+	addq $32, %rdx #; src+=8
+
+	subq $8, %r8 #; nframes-=8
+	cmp $8, %r8
+	jge .MBWG_AVXLOOP
+
+	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
+	vzeroupper
+
+	cmp $0, %r8
+	je .MBWG_END
+
+	#; if there are remaining frames, the nonalign code will do nicely
+	#; for the rest 1-7 frames.
+	
+.MBWG_NONALIGN:
+	#; not aligned!
+
+	#; gain is already in %xmm0
+
+.MBWG_NONALIGNLOOP:
+
+	movss (%rdx), %xmm1
+	mulss %xmm0, %xmm1
+	addss (%rcx), %xmm1
+	movss %xmm1, (%rcx)
+	
+	addq $4, %rcx
+	addq $4, %rdx
+	
+	decq %r8
+	jnz .MBWG_NONALIGNLOOP
+
+.MBWG_END:
+
+	popq %rbx
+
+	#; return
+	leave
+	ret
+
+
+#; void x86_sse_avx_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
+
+.globl x86_sse_avx_mix_buffers_no_gain
+	.def	x86_sse_avx_mix_buffers_no_gain; .scl    2;   .type   32;
+.endef
+
+x86_sse_avx_mix_buffers_no_gain:
+
+#; due to Microsoft calling convention
+#; %rcx float *dst
+#; %rdx float *src
+#; %r8 unsigned int nframes
+
+	pushq %rbp
+	movq %rsp, %rbp
+
+	#; save the registers
+	pushq %rbx #; must be preserved
+
+	#; the real function
+
+	#; if nframes == 0, go to end
+	cmp	$0, %r8
+	je	.MBNG_END
+
+	#; Check for alignment
+
+	movq %rcx, %rax
+	andq $28, %rax #; mask alignment offset
+
+	movq %rdx, %rbx
+	andq $28, %rbx #; mask alignment offset
+
+	cmp %rax, %rbx
+	jne .MBNG_NONALIGN #; if not buffers are not aligned btween each other, calculate manually
+
+	cmp $0, %rbx
+	je .MBNG_AVX #; aligned at 32, rpoceed to AVX
+
+	#; Pre-loop, we need to run 1-7 frames "manually" without
+	#; AVX instructions
+
+.MBNG_PRELOOP:
+
+	movss (%rdx), %xmm0
+	addss (%rcx), %xmm0
+	movss %xmm0, (%rcx)
+
+	addq $4, %rcx #; dst++
+	addq $4, %rdx #; src++
+
+	decq %r8 	  #; nframes--
+	jz	.MBNG_END
+	
+	addq $4, %rbx #; one non-aligned byte less
+	
+	cmp $32, %rbx #; test if we've reached 32 byte alignment
+	jne .MBNG_PRELOOP
+
+.MBNG_AVX:
+
+	cmp $8, %r8 #; if there are frames left, but less than 8
+	jl .MBNG_NONALIGN #; we can't run AVX
+
+.MBNG_AVXLOOP:
+
+	vmovaps	(%rdx), %ymm0        #; source => xmm0
+	vaddps	(%rcx), %ymm0, %ymm1 #; mix with destination
+	vmovaps  %ymm1, (%rcx)       #; copy result to destination
+	
+	addq $32, %rcx #; dst+=8
+	addq $32, %rdx #; src+=8
+
+	subq $8, %r8 #; nframes-=8
+	cmp $8, %r8
+	jge .MBNG_AVXLOOP
+
+	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
+	vzeroupper
+
+	cmp $0, %r8
+	je .MBNG_END
+
+	#; if there are remaining frames, the nonalign code will do nicely
+	#; for the rest 1-7 frames.
+	
+.MBNG_NONALIGN:
+	#; not aligned!
+	#; 
+
+	movss (%rdx), %xmm0 #; src => xmm0
+	addss (%rcx), %xmm0 #; xmm0 += dst
+	movss %xmm0, (%rcx) #; xmm0 => dst
+	
+	addq $4, %rcx
+	addq $4, %rdx
+	
+	decq %r8
+	jnz .MBNG_NONALIGN
+
+.MBNG_END:
+
+	popq %rbx
+
+	#; return
+	leave
+	ret
+
+
+#; void x86_sse_avx_copy_vector (float *dst, float *src, unsigned int nframes);
+
+.globl x86_sse_avx_copy_vector
+	.def	x86_sse_avx_copy_vector; .scl    2;   .type   32;
+.endef
+
+x86_sse_avx_copy_vector:
+
+#; due to Microsoft calling convention
+#; %rcx float *dst
+#; %rdx float *src
+#; %r8 unsigned int nframes
+
+	pushq %rbp
+	movq %rsp, %rbp
+
+	#; save the registers
+	pushq %rbx #; must be preserved
+
+	#; the real function
+
+	#; if nframes == 0, go to end
+	cmp	$0, %r8
+	je	.CB_END
+
+	#; Check for alignment
+
+	movq %rcx, %rax
+	andq $28, %rax #; mask alignment offset
+
+	movq %rdx, %rbx
+	andq $28, %rbx #; mask alignment offset
+
+	cmp %rax, %rbx
+	jne .CB_NONALIGN #; if not buffers are not aligned btween each other, calculate manually
+
+	cmp $0, %rbx
+	je .CB_AVX #; aligned at 32, rpoceed to AVX
+
+	#; Pre-loop, we need to run 1-7 frames "manually" without
+	#; AVX instructions
+
+.CB_PRELOOP:
+
+	movss (%rdx), %xmm0
+	movss %xmm0, (%rcx)
+
+	addq $4, %rcx #; dst++
+	addq $4, %rdx #; src++
+
+	decq %r8 	  #; nframes--
+	jz	.CB_END
+	
+	addq $4, %rbx #; one non-aligned byte less
+	
+	cmp $32, %rbx #; test if we've reached 32 byte alignment
+	jne .CB_PRELOOP
+
+.CB_AVX:
+
+	cmp $8, %r8 #; if there are frames left, but less than 8
+	jl .CB_NONALIGN #; we can't run AVX
+
+.CB_AVXLOOP:
+
+	vmovaps	(%rdx), %ymm0        #; source => xmm0
+	vmovaps  %ymm0, (%rcx)       #; copy result to destination
+	
+	addq $32, %rcx #; dst+=8
+	addq $32, %rdx #; src+=8
+
+	subq $8, %r8 #; nframes-=8
+	cmp $8, %r8
+	jge .CB_AVXLOOP
+
+	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
+	vzeroupper
+
+	cmp $0, %r8
+	je .CB_END
+
+	#; if there are remaining frames, the nonalign code will do nicely
+	#; for the rest 1-7 frames.
+	
+.CB_NONALIGN:
+	#; not aligned!
+	#; 
+
+	movss (%rdx), %xmm0 #; src => xmm0
+	movss %xmm0, (%rcx) #; xmm0 => dst
+	
+	addq $4, %rcx
+	addq $4, %rdx
+	
+	decq %r8
+	jnz .CB_NONALIGN
+
+.CB_END:
+
+	popq %rbx
+
+	#; return
+	leave
+	ret
+
+
+#; void x86_sse_avx_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
+
+.globl x86_sse_avx_apply_gain_to_buffer
+	.def	x86_sse_avx_apply_gain_to_buffer; .scl    2;   .type   32;
+.endef
+
+x86_sse_avx_apply_gain_to_buffer:
+
+#; due to Microsoft calling convention
+#; %rcx float 			*buf	32(%rbp)
+#; %rdx unsigned int 	nframes
+#; %xmm2 float			gain			avx specific register
+
+	pushq %rbp
+	movq %rsp, %rbp
+	
+	#; move current max to %xmm0 for convenience
+	movss %xmm2, %xmm0
+
+	#; the real function	
+
+	#; if nframes == 0, go to end
+	cmp	$0, %rdx
+	je	.AG_END
+	
+	#; Check for alignment
+
+	movq %rcx, %r8 #; buf => %rdx
+	andq $28, %r8 #; check alignment with mask 11100
+	jz	.AG_AVX #; if buffer IS aligned
+
+	#; PRE-LOOP
+	#; we iterate 1-7 times, doing normal x87 float comparison
+	#; so we reach a 32 byte aligned "buf" (=%rdi) value
+
+.AGLP_START:
+
+	#; Load next value from the buffer into %xmm1
+	movss (%rcx), %xmm1
+	mulss %xmm0, %xmm1
+	movss %xmm1, (%rcx)
+
+	#; increment buffer, decrement counter
+	addq $4, %rcx #; buf++;
+	
+	decq %rdx   #; nframes--
+	jz	.AG_END #; if we run out of frames, we go to the end
+
+	addq $4, %r8 #; one non-aligned byte less
+	cmp $16, %r8
+	jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
+
+.AG_AVX:
+
+	#; We have reached the 32 byte aligned "buf" ("rcx") value
+	#; use AVX instructions
+
+	#; Figure out how many loops we should do
+	movq %rdx, %rax #; copy remaining nframes to %rax for division
+
+	shr $3, %rax #; unsigned divide by 8
+
+	#; %rax = AVX iterations
+	cmp $0, %rax
+	je .AGPOST_START
+
+	#; set up the gain buffer (gain is already in %xmm0)
+	vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread single float value to the first 128 bits of ymm0 register
+	vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; extend the first 128 bits of ymm0 register to higher 128 bits
+
+.AGLP_AVX:
+
+	vmovaps (%rcx), %ymm1
+	vmulps %ymm0, %ymm1, %ymm2
+	vmovaps %ymm2, (%rcx)
+
+	addq $32, %rcx  #; buf + 8
+	subq $8, %rdx   #; nframes-=8
+
+	decq %rax
+	jnz .AGLP_AVX
+
+	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
+	vzeroupper
+
+	#; Next we need to post-process all remaining frames
+	#; the remaining frame count is in %rcx
+	cmpq $0, %rdx #;
+	jz .AG_END
+
+.AGPOST_START:
+
+	movss (%rcx), %xmm1
+	mulss %xmm0, %xmm1
+	movss %xmm1, (%rcx)
+
+	#; increment buffer, decrement counter
+	addq $4, %rcx #; buf++;
+	
+	decq %rdx   #; nframes--
+	jnz	.AGPOST_START #; if we run out of frames, we go to the end
+	
+.AG_END:
+
+	#; return
+	leave
+	ret
+
+#; end proc
+
+
+#; float x86_sse_avx_compute_peak(float *buf, long nframes, float current);
+
+.globl x86_sse_avx_compute_peak
+	.def	x86_sse_avx_compute_peak; .scl    2;   .type   32;
+.endef
+
+x86_sse_avx_compute_peak:
+
+#; due to Microsoft calling convention
+#; %rcx float*          buf	32(%rbp)
+#; %rdx unsigned int 	nframes
+#; %xmm2 float			current
+
+	pushq %rbp
+	movq %rsp, %rbp
+
+	#; move current max to %xmm0 for convenience
+	movss %xmm2, %xmm0
+
+	#; if nframes == 0, go to end
+	cmp	$0, %rdx
+	je	.CP_END
+
+	#; Check for alignment 
+	movq %rcx, %r8 #; buf => %rdx
+	andq $28, %r8 #; mask bits 1 & 2
+	jz	.CP_AVX #; if buffer IS aligned
+
+	#; PRE-LOOP
+	#; we iterate 1-7 times, doing normal x87 float comparison
+	#; so we reach a 32 byte aligned "buf" (=%rcx) value
+
+.LP_START:
+
+	#; Load next value from the buffer
+	movss (%rcx), %xmm1
+	maxss %xmm1, %xmm0
+
+	#; increment buffer, decrement counter
+	addq $4, %rcx #; buf++;
+
+	decq %rdx   #; nframes--
+	jz	.CP_END #; if we run out of frames, we go to the end
+
+	addq $4, %r8 #; one non-aligned byte less
+	cmp $32, %r8
+	jne .LP_START #; if more non-aligned frames exist, we do a do-over
+
+.CP_AVX:
+
+	#; We have reached the 32 byte aligned "buf" ("rdi") value
+
+	#; Figure out how many loops we should do
+	movq %rdx, %rax #; copy remaining nframes to %rax for division
+
+	shr $3, %rax #; unsigned divide by 8
+	jz .POST_START
+
+	#; %rax = AVX iterations
+
+	#; current maximum is at %xmm0, but we need to broadcast it to the whole ymm0 register..
+	vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread single float value to the all 128 bits of xmm0 register
+	vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; extend the first 128 bits of ymm0 register to higher 128 bits
+
+.LP_AVX:
+
+	vmovaps (%rcx), %ymm1
+	vmaxps %ymm1, %ymm0, %ymm0
+
+	addq $32, %rcx #; buf+=8
+	subq $8, %rdx #; nframes-=8
+
+	decq %rax
+	jnz .LP_AVX
+
+	#; Calculate the maximum value contained in the 4 FP's in %ymm0
+	vshufps $0x4e, %ymm0, %ymm0, %ymm1     #; shuffle left & right pairs (1234 => 3412) in each 128 bit half
+	vmaxps  %ymm1, %ymm0, %ymm0            #; maximums of the four pairs, if each of 8 elements was unique, 4 unique elements left now
+	vshufps $0xb1, %ymm0, %ymm0, %ymm1     #; shuffle the floats inside pairs (1234 => 2143) in each 128 bit half
+	vmaxps  %ymm1, %ymm0, %ymm0			   #; maximums of the four pairs, we had up to 4 unique elements was unique, 2 unique elements left now
+	vperm2f128 $0x01, %ymm0, %ymm0, %ymm1  #; swap 128 bit halfs
+	vmaxps  %ymm1, %ymm0, %ymm0			   #; the result will be - all 8 elemens are maximums
+
+	#; now every float in %ymm0 is the same value, current maximum value
+
+	#; Next we need to post-process all remaining frames
+	#; the remaining frame count is in %rcx
+	
+	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
+	vzeroupper
+
+	#; if no remaining frames, jump to the end
+	cmp $0, %rdx
+	je .CP_END
+
+.POST_START:
+
+	movss (%rcx), %xmm1
+	maxss %xmm1, %xmm0
+	
+	addq $4, %rcx 	#; buf++;
+	
+	decq %rdx		#; nframes--;
+	jnz .POST_START
+
+.CP_END:
+
+	#; return value is in xmm0
+
+	#; return
+	leave
+	ret
+
+#; end proc
--- a/libs/ardour/sse_functions_64bit_win.s
+++ b/libs/ardour/sse_functions_64bit_win.s
@ -0,0 +1,679 @@
+/*
+    Copyright (C) 2005-2006 Paul Davis, John Rigg
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+	Author: Sampo Savolainen
+	64-bit conversion: John Rigg
+
+    $Id$
+*/
+
+#; Microsoft version of SSE sample processing functions
+
+#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
+
+.globl x86_sse_mix_buffers_with_gain
+	.def    x86_sse_mix_buffers_with_gain; .scl    2;      .type   32;     
+.endef
+
+x86_sse_mix_buffers_with_gain:
+
+#; due to Microsoft calling convention
+#; %rcx float *dst
+#; %rdx float *src
+#; %r8 unsigned int nframes
+#; %xmm3 float	gain
+
+#; due to System V AMD64 (Linux) calling convention
+#; %rdi float	*dst
+#; %rsi float	*src	
+#; %rdx unsigned int nframes
+#; %xmm0 float	gain
+
+	pushq %rbp
+	movq %rsp, %rbp
+
+	#; save the registers
+	pushq %rbx #; must be preserved
+	pushq %rcx
+	pushq %rdx 
+	pushq %rdi #; must be preserved
+	pushq %rsi #; must be preserved
+	
+	#; to keep algorithms universal - move input params into Linux specific registers
+	movq %rcx, %rdi
+	movq %rdx, %rsi
+	movq %r8, %rdx
+	movss %xmm3, %xmm0
+
+	#; if nframes == 0, go to end
+	cmp	$0, %rdx
+	je	.MBWG_END
+
+	#; Check for alignment
+
+	movq %rdi, %rax
+	andq $12, %rax #; mask alignment offset
+
+	movq %rsi, %rbx
+	andq $12, %rbx #; mask alignment offset
+
+	cmp %rax, %rbx
+	jne .MBWG_NONALIGN #; if not aligned, calculate manually
+
+	#; if we are aligned
+	cmp $0, %rbx
+	jz .MBWG_SSE
+	
+	#; Pre-loop, we need to run 1-3 frames "manually" without
+	#; SSE instructions
+
+.MBWG_PRELOOP:
+	
+	#; gain is already in %xmm0
+	movss (%rsi), %xmm1
+	mulss %xmm0, %xmm1
+	addss (%rdi), %xmm1
+	movss %xmm1, (%rdi)
+
+	addq $4, %rdi #; dst++
+	addq $4, %rsi #; src++
+	decq %rdx 	  #; nframes--
+	jz .MBWG_END
+
+	addq $4, %rbx
+	
+	cmp $16, %rbx #; test if we've reached 16 byte alignment
+	jne .MBWG_PRELOOP
+
+
+.MBWG_SSE:
+
+	cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
+	jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
+
+	#; gain is already in %xmm0
+	shufps  $0x00, %xmm0, %xmm0
+
+
+.MBWG_SSELOOP:
+
+	movaps	(%rsi), %xmm1 #; source => xmm0
+	mulps	%xmm0,  %xmm1 #; apply gain to source
+	addps	(%rdi), %xmm1 #; mix with destination
+	movaps  %xmm1, (%rdi) #; copy result to destination
+	
+	addq $16, %rdi #; dst+=4
+	addq $16, %rsi #; src+=4
+
+	subq $4, %rdx #; nframes-=4
+	cmp $4, %rdx
+	jge .MBWG_SSELOOP
+
+	cmp $0, %rdx
+	je .MBWG_END
+
+	#; if there are remaining frames, the nonalign code will do nicely
+	#; for the rest 1-3 frames.
+	
+.MBWG_NONALIGN:
+	#; not aligned!
+
+	#; gain is already in %xmm0
+
+.MBWG_NONALIGNLOOP:
+
+	movss (%rsi), %xmm1
+	mulss %xmm0, %xmm1
+	addss (%rdi), %xmm1
+	movss %xmm1, (%rdi)
+	
+	addq $4, %rdi
+	addq $4, %rsi
+	
+	decq %rdx
+	jnz .MBWG_NONALIGNLOOP
+
+.MBWG_END:
+
+	popq %rsi
+	popq %rdi
+	popq %rdx
+	popq %rcx
+	popq %rbx
+
+	#; return
+	leave
+	ret
+
+
+#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
+
+.globl x86_sse_mix_buffers_no_gain
+	.def	x86_sse_mix_buffers_no_gain; .scl    2;   .type   32;
+.endef
+
+x86_sse_mix_buffers_no_gain:
+
+#; due to Microsoft calling convention
+#; %rcx float *dst
+#; %rdx float *src
+#; %r8 unsigned int nframes
+
+#; due to System V AMD64 (Linux) calling convention
+#; %rdi float *dst
+#; %rsi float *src
+#; %rdx unsigned int nframes
+
+	pushq %rbp
+	movq %rsp, %rbp
+
+	#; save the registers
+	pushq %rbx #; must be preserved
+	pushq %rcx
+	pushq %rdx 
+	pushq %rdi #; must be preserved
+	pushq %rsi #; must be preserved
+	
+	#; to keep algorithms universal - move input params into Linux specific registers
+	movq %rcx, %rdi
+	movq %rdx, %rsi
+	movq %r8, %rdx
+
+	#; the real function
+
+	#; if nframes == 0, go to end
+	cmp	$0, %r8
+	je	.MBNG_END
+
+	#; Check for alignment
+
+	movq %rdi, %rax
+	andq $12, %rax #; mask alignment offset
+
+	movq %rsi, %rbx
+	andq $12, %rbx #; mask alignment offset
+
+	cmp %rax, %rbx
+	jne .MBNG_NONALIGN #; if not aligned, calculate manually
+
+	cmp $0, %rbx
+	je .MBNG_SSE
+
+	#; Pre-loop, we need to run 1-3 frames "manually" without
+	#; SSE instructions
+
+.MBNG_PRELOOP:
+		
+	movss (%rsi), %xmm0
+	addss (%rdi), %xmm0
+	movss %xmm0, (%rdi)
+
+	addq $4, %rdi #; dst++
+	addq $4, %rsi #; src++
+	decq %rdx 	  #; nframes--
+	jz	.MBNG_END
+	addq $4, %rbx
+	
+	cmp $16, %rbx #; test if we've reached 16 byte alignment
+	jne .MBNG_PRELOOP
+
+.MBNG_SSE:
+
+	cmp $4, %rdx #; if there are frames left, but less than 4
+	jnge .MBNG_NONALIGN #; we can't run SSE
+
+.MBNG_SSELOOP:
+
+	movaps	(%rsi), %xmm0 #; source => xmm0
+	addps	(%rdi), %xmm0 #; mix with destination
+	movaps  %xmm0, (%rdi) #; copy result to destination
+	
+	addq $16, %rdi #; dst+=4
+	addq $16, %rsi #; src+=4
+
+	subq $4, %rdx #; nframes-=4
+	cmp $4, %rdx
+	jge .MBNG_SSELOOP
+
+	cmp $0, %rdx
+	je .MBNG_END
+
+	#; if there are remaining frames, the nonalign code will do nicely
+	#; for the rest 1-3 frames.
+	
+.MBNG_NONALIGN:
+	#; not aligned!
+
+	movss (%rsi), %xmm0 #; src => xmm0
+	addss (%rdi), %xmm0 #; xmm0 += dst
+	movss %xmm0, (%rdi) #; xmm0 => dst
+	
+	addq $4, %rdi
+	addq $4, %rsi
+	
+	decq %rdx
+	jnz .MBNG_NONALIGN
+
+.MBNG_END:
+
+	popq %rsi
+	popq %rdi
+	popq %rdx
+	popq %rcx
+	popq %rbx
+
+	#; return
+	leave
+	ret
+
+
+#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
+
+.globl x86_sse_apply_gain_to_buffer
+	.def	x86_sse_apply_gain_to_buffer; .scl    2;   .type   32;
+.endef
+
+x86_sse_apply_gain_to_buffer:
+
+#; due to Microsoft calling convention
+#; %rcx float 			*buf	32(%rbp)
+#; %rdx unsigned int 	nframes
+#; %xmm2 float			gain
+#; %xmm1 float			buf[0]
+
+#; due to System V AMD64 (Linux) calling convention
+#; %rdi	 float 			*buf	32(%rbp)
+#; %rsi  unsigned int 	nframes
+#; %xmm0 float 			gain
+#; %xmm1 float			buf[0]
+
+	pushq %rbp
+	movq %rsp, %rbp
+
+	#; save the registers
+	pushq %rcx
+	pushq %rdi #; must be preserved
+	pushq %rsi #; must be preserved
+
+	#; to keep algorithms universal - move input params into Linux specific registers
+	movq %rcx, %rdi
+	movq %rdx, %rsi
+	movss %xmm2, %xmm0
+
+	#; the real function	
+
+	#; if nframes == 0, go to end
+	movq %rsi, %rcx #; nframes
+	cmp	$0, %rcx
+	je	.AG_END
+
+	#; set up the gain buffer (gain is already in %xmm0)
+	shufps	$0x00, %xmm0, %xmm0
+	
+	#; Check for alignment
+
+	movq %rdi, %rdx #; buf => %rdx
+	andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
+	jz	.AG_SSE #; if buffer IS aligned
+
+	#; PRE-LOOP
+	#; we iterate 1-3 times, doing normal x87 float comparison
+	#; so we reach a 16 byte aligned "buf" (=%rdi) value
+
+.AGLP_START:
+
+	#; Load next value from the buffer into %xmm1
+	movss (%rdi), %xmm1
+	mulss %xmm0, %xmm1
+	movss %xmm1, (%rdi)
+
+	#; increment buffer, decrement counter
+	addq $4, %rdi #; buf++;
+	
+	decq %rcx   #; nframes--
+	jz	.AG_END #; if we run out of frames, we go to the end
+
+	addq $4, %rdx #; one non-aligned byte less
+	cmp $16, %rdx
+	jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
+
+.AG_SSE:
+
+	#; We have reached the 16 byte aligned "buf" ("rdi") value
+
+	#; Figure out how many loops we should do
+	movq %rcx, %rax #; copy remaining nframes to %rax for division
+
+	shr $2,%rax #; unsigned divide by 4
+
+	#; %rax = SSE iterations
+	cmp $0, %rax
+	je .AGPOST_START
+
+.AGLP_SSE:
+
+	movaps (%rdi), %xmm1
+	mulps %xmm0, %xmm1
+	movaps %xmm1, (%rdi)
+
+	addq $16, %rdi  #; buf + 4
+	subq $4, %rcx   #; nframes-=4
+
+	decq %rax
+	jnz .AGLP_SSE
+
+	#; Next we need to post-process all remaining frames
+	#; the remaining frame count is in %rcx
+	
+	andq $3, %rcx #; nframes % 4
+	jz .AG_END
+
+.AGPOST_START:
+
+	movss (%rdi), %xmm1
+	mulss %xmm0, %xmm1
+	movss %xmm1, (%rdi)
+
+	#; increment buffer, decrement counter
+	addq $4, %rdi #; buf++;
+	
+	decq %rcx   #; nframes--
+	jnz	.AGPOST_START #; if we run out of frames, we go to the end
+	
+.AG_END:
+
+	popq %rsi
+	popq %rdi
+	popq %rcx
+
+	#; return
+	leave
+	ret
+
+#; end proc
+
+
+#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
+
+.globl x86_sse_apply_gain_vector
+	.def	x86_sse_apply_gain_vector; .scl    2;   .type   32;
+.endef
+
+
+x86_sse_apply_gain_vector:
+
+#; due to Microsoft calling convention
+#; %rcx float *buf
+#; %rdx float *gain_vector
+#; %r8	unsigned int nframes
+
+#; due to System V AMD64 (Linux) calling convention
+#; %rdi float *buf
+#; %rsi float *gain_vector
+#; %rdx unsigned int nframes
+
+	pushq %rbp
+	movq %rsp, %rbp
+
+	#; save the registers
+	pushq %rbx #; must be preserved
+	pushq %rcx 
+	pushq %rdx
+	pushq %rdi #; must be preserved
+	pushq %rsi #; must be preserved
+	
+	#; to keep algorithms universal - move input params into Linux specific registers
+	movq %rcx, %rdi
+	movq %rdx, %rsi
+	movq %r8, %rdx
+
+	#; if nframes == 0 go to end
+	cmp $0, %rdx
+	je .AGA_END
+		
+	#; Check alignment
+	movq %rdi, %rax
+	andq $12, %rax
+		
+	movq %rsi, %rbx
+	andq $12, %rbx
+
+	cmp %rax,%rbx
+	jne .AGA_ENDLOOP
+
+	cmp $0, %rax
+	jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop
+
+#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
+.AGA_ALIGNLOOP:
+		
+	movss (%rdi), %xmm0 #; buf => xmm0
+	movss (%rsi), %xmm1 #; gain value => xmm1
+	mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
+	movss %xmm0, (%rdi) #; signal with gain => buf
+
+	decq %rdx
+	jz .AGA_END
+
+	addq $4, %rdi #; buf++
+	addq $4, %rsi #; gab++
+	
+	addq $4, %rax
+	cmp $16, %rax
+	jne .AGA_ALIGNLOOP
+	
+#; There are frames left for sure, as that is checked in the beginning
+#; and within the previous loop. BUT, there might be less than 4 frames
+#; to process
+
+.AGA_SSE:
+	movq %rdx, %rax #; nframes => %rax
+	shr $2, %rax #; unsigned divide by 4
+
+	cmp $0, %rax
+	je .AGA_ENDLOOP
+
+.AGA_SSELOOP:
+	movaps (%rdi), %xmm0
+	movaps (%rsi), %xmm1
+	mulps %xmm1, %xmm0
+	movaps %xmm0, (%rdi)
+
+	addq $16, %rdi
+	addq $16, %rsi
+
+	decq %rax
+	jnz .AGA_SSELOOP
+
+	andq $3, %rdx #; Remaining frames are nframes & 3
+	jz .AGA_END
+
+
+#; Inside this loop, we know there are frames left to process
+#; but because either there are < 4 frames left, or the buffers
+#; are not aligned, we can't use the parallel SSE ops
+.AGA_ENDLOOP:
+	movss (%rdi), %xmm0 #; buf => xmm0
+	movss (%rsi), %xmm1 #; gain value => xmm1
+	mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
+	movss %xmm0, (%rdi) #; signal with gain => buf
+
+	addq $4,%rdi
+	addq $4,%rsi
+	decq %rdx #; nframes--
+	jnz .AGA_ENDLOOP
+
+.AGA_END:
+
+	popq %rsi
+	popq %rdi
+	popq %rdx
+	popq %rcx
+	popq %rbx
+
+	leave
+	ret
+
+#; end proc
+
+
+#; float x86_sse_compute_peak(float *buf, long nframes, float current);
+
+.globl x86_sse_compute_peak
+	.def	x86_sse_compute_peak; .scl    2;   .type   32;
+.endef
+
+	
+x86_sse_compute_peak:
+
+#; due to Microsoft calling convention
+#; %rcx float*          buf	32(%rbp)
+#; %rdx unsigned int 	nframes
+#; %xmm2 float			current
+#; %xmm1 float			buf[0]
+
+#; due to System V AMD64 (Linux) calling convention
+#; %rdi	 float*         buf	32(%rbp)
+#; %rsi	 unsigned int 	nframes
+#; %xmm0 float			current
+#; %xmm1 float			buf[0]
+
+	pushq %rbp
+	movq %rsp, %rbp
+
+	#; save registers
+	pushq %rcx
+	pushq %rdi #; must be preserved
+	pushq %rsi #; must be preserved
+
+	#; to keep algorithms universal - move input params into Linux specific registers
+	movq %rcx, %rdi
+	movq %rdx, %rsi
+	movss %xmm2, %xmm0
+
+	#; if nframes == 0, go to end
+	movq %rsi, %rcx #; nframes
+	cmp	$0, %rcx
+	je	.CP_END
+
+	#; create the "abs" mask in %xmm2
+	pushq   $2147483647
+	movss	(%rsp), %xmm2
+	addq    $8, %rsp
+	shufps	$0x00, %xmm2, %xmm2
+
+	#; Check for alignment
+
+	#;movq 8(%rbp), %rdi #; buf 
+	movq %rdi, %rdx #; buf => %rdx
+	andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
+	jz	.CP_SSE #; if buffer IS aligned
+
+	#; PRE-LOOP
+	#; we iterate 1-3 times, doing normal x87 float comparison
+	#; so we reach a 16 byte aligned "buf" (=%rdi) value
+
+.LP_START:
+
+	#; Load next value from the buffer
+	movss (%rdi), %xmm1
+	andps %xmm2, %xmm1
+	maxss %xmm1, %xmm0
+
+	#; increment buffer, decrement counter
+	addq $4, %rdi #; buf++;
+	
+	decq %rcx   #; nframes--
+	jz	.CP_END #; if we run out of frames, we go to the end
+	
+	addq $4, %rdx #; one non-aligned byte less
+	cmp $16, %rdx
+	jne .LP_START #; if more non-aligned frames exist, we do a do-over
+
+.CP_SSE:
+
+	#; We have reached the 16 byte aligned "buf" ("rdi") value
+
+	#; Figure out how many loops we should do
+	movq %rcx, %rax #; copy remaining nframes to %rax for division
+
+	shr $2,%rax #; unsigned divide by 4
+	jz .POST_START
+
+	#; %rax = SSE iterations
+
+	#; current maximum is at %xmm0, but we need to ..
+	shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
+
+	#;prefetcht0 16(%rdi)
+
+.LP_SSE:
+
+	movaps (%rdi), %xmm1
+	andps %xmm2, %xmm1
+	maxps %xmm1, %xmm0
+
+	addq $16, %rdi
+
+	subq $4, %rcx #; nframes-=4
+
+	decq %rax
+	jnz .LP_SSE
+
+	#; Calculate the maximum value contained in the 4 FP's in %xmm0
+	movaps %xmm0, %xmm1
+	shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
+	maxps  %xmm1, %xmm0 #; maximums of the two pairs
+	movaps %xmm0, %xmm1
+	shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
+	maxps  %xmm1, %xmm0 
+
+	#; now every float in %xmm0 is the same value, current maximum value
+	
+	#; Next we need to post-process all remaining frames
+	#; the remaining frame count is in %rcx
+	
+	#; if no remaining frames, jump to the end
+
+	andq $3, %rcx #; nframes % 4
+	jz .CP_END
+
+.POST_START:
+
+	movss (%rdi), %xmm1
+	andps %xmm2, %xmm1
+	maxss %xmm1, %xmm0
+	
+	addq $4, %rdi 	#; buf++;
+	
+	decq %rcx		#; nframes--;
+	jnz .POST_START
+
+.CP_END:
+
+	#; restore registers
+	popq %rsi
+	popq %rdi
+	popq %rcx
+
+	#; return value is in xmm0
+
+	#; return
+	leave
+	ret
+
+#; end proc
--- a/libs/ardour/sse_functions_avx.cc
+++ b/libs/ardour/sse_functions_avx.cc
@ -0,0 +1,120 @@
+/*
+    Copyright (C) 2007 Paul sDavis
+    Written by Sampo Savolainen
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <xmmintrin.h>
+#include <immintrin.h>
+#include "ardour/types.h"
+
+
+void
+x86_sse_avx_find_peaks(const ARDOUR::Sample* buf, ARDOUR::pframes_t nframes, float *min, float *max)
+{
+	__m256 current_max, current_min, work;
+
+	// Load max and min values into all four slots of the XMM registers
+	current_min = _mm256_set1_ps(*min);
+	current_max = _mm256_set1_ps(*max);
+
+	// Work input until "buf" reaches 16 byte alignment
+	while ( ((intptr_t)buf) % 32 != 0 && nframes > 0) {
+		
+		// Load the next float into the work buffer
+		work = _mm256_set1_ps(*buf);
+
+		current_min = _mm256_min_ps(current_min, work);
+		current_max = _mm256_max_ps(current_max, work);
+
+		buf++;
+		nframes--;
+	}
+
+        // use 64 byte prefetch for quadruple quads:
+		// load each 64 bytes into cash before processing
+        while (nframes >= 16) {
+#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)                                   
+				_mm_prefetch(((char*)buf+64), _mm_hint(0) );  // A total guess! Assumed to be eqivalent to
+#else                                              // the line below but waiting to be tested !!
+                __builtin_prefetch(buf+64,0,0);
+#endif
+                work = _mm256_load_ps(buf);
+                current_min = _mm256_min_ps(current_min, work);
+                current_max = _mm256_max_ps(current_max, work);
+                buf+=8;
+                work = _mm256_load_ps(buf);
+                current_min = _mm256_min_ps(current_min, work);
+                current_max = _mm256_max_ps(current_max, work);
+                buf+=8;
+
+                nframes-=16;
+        }
+
+	// work through 32 bytes aligned buffers
+	while (nframes >= 8) {
+
+		work = _mm256_load_ps(buf);
+
+		current_min = _mm256_min_ps(current_min, work);
+		current_max = _mm256_max_ps(current_max, work);
+
+		buf+=8;
+		nframes-=8;
+	}
+
+	// work through the rest < 4 samples
+	while ( nframes > 0) {
+
+		// Load the next float into the work buffer
+		work = _mm256_set1_ps(*buf);
+
+		current_min = _mm256_min_ps(current_min, work);
+		current_max = _mm256_max_ps(current_max, work);
+
+		buf++;
+		nframes--;
+	}
+
+	// Find min & max value in current_max through shuffle tricks
+
+	work = current_min;
+	work =        _mm256_shuffle_ps (current_min, current_min, _MM_SHUFFLE(2, 3, 0, 1));
+	current_min = _mm256_min_ps (work, current_min);
+	work =        _mm256_shuffle_ps (current_min, current_min, _MM_SHUFFLE(1, 0, 3, 2));
+	current_min = _mm256_min_ps (work, current_min);
+	work =        _mm256_permute2f128_ps( current_min, current_min, 1);
+	current_min = _mm256_min_ps (work, current_min);
+
+	*min = current_min[0];
+
+	work = current_max;
+	work =        _mm256_shuffle_ps(current_max, current_max, _MM_SHUFFLE(2, 3, 0, 1));
+	current_max = _mm256_max_ps (work, current_max);
+	work =        _mm256_shuffle_ps(current_max, current_max, _MM_SHUFFLE(1, 0, 3, 2));
+	current_max = _mm256_max_ps (work, current_max);
+	work =        _mm256_permute2f128_ps( current_max, current_max, 1);
+	current_max = _mm256_max_ps (work, current_max);
+
+	*max = current_max[0];
+
+	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non AVX instructions
+	_mm256_zeroupper ();
+}
+
+
+
--- a/libs/ardour/windows/libardour.vcxproj
+++ b/libs/ardour/windows/libardour.vcxproj
@ -14,6 +14,8 @@
    <None Include="..\run-profiling.sh" />
    <None Include="..\run-session-tests.sh" />
    <None Include="..\run-tests.sh" />
+    <None Include="..\sse_avx_functions_64bit_win.s" />
+    <None Include="..\sse_functions_64bit_win.s" />
    <None Include="..\test-env.sh" />
    <None Include="..\wscript" />
  </ItemGroup>
--- a/libs/ardour/windows/libardour.vcxproj.filters
+++ b/libs/ardour/windows/libardour.vcxproj.filters
@ -33,6 +33,8 @@
    <None Include="..\wscript">
      <Filter>scripts</Filter>
    </None>
+    <None Include="..\sse_functions_64bit_win.s" />
+    <None Include="..\sse_avx_functions_64bit_win.s" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\ardour\vestige\aeffectx.h">
--- a/libs/ardour/wscript
+++ b/libs/ardour/wscript
@ -408,6 +408,14 @@ def build(bld):
            obj.source += [ 'sse_functions_xmm.cc', 'sse_functions.s' ]
        elif bld.env['build_target'] == 'x86_64':
            obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit.s' ]
+        
+        if bld.env['build_target'] == 'mingw':
+                import platform as PLATFORM
+                u = PLATFORM.uname ()
+                cpu = u[4]
+                if re.search ("(x86_64|AMD64)", cpu) != None:
+                        obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_avx.cc' ]
+                        obj.source += [ 'sse_functions_64bit_win.s', 'sse_avx_functions_64bit_win.s' ]

    # i18n
    if bld.is_defined('ENABLE_NLS'):
--- a/libs/backends/wavesaudio/waves_audiobackend.cc
+++ b/libs/backends/wavesaudio/waves_audiobackend.cc
@ -21,6 +21,8 @@
 #include "waves_audioport.h"
 #include "waves_midiport.h"

+#include "ardour/runtime_functions.h"
+
 using namespace ARDOUR;

 #ifdef __MINGW64__
@ -1169,13 +1171,12 @@ WavesAudioBackend::_read_audio_data_from_device (const float* input_buffer, pfra
 {
 #if defined(PLATFORM_WINDOWS)
    const float **buffer = (const float**)input_buffer;
-    size_t copied_bytes = nframes*sizeof(float);

    for(std::vector<WavesAudioPort*>::iterator it = _physical_audio_inputs.begin ();
        it != _physical_audio_inputs.end();
        ++it)
    {
-        memcpy((*it)->buffer(), *buffer, copied_bytes);
+		ARDOUR::copy_vector ((*it)->buffer(), *buffer, nframes);
        ++buffer;
    }
 #else
--- a/libs/backends/wavesaudio/waves_audioport.cc
+++ b/libs/backends/wavesaudio/waves_audioport.cc
@ -18,13 +18,21 @@
 */

 #include "waves_audioport.h"
+#include "ardour/runtime_functions.h"
+#include "pbd/malign.h"

 using namespace ARDOUR;

 WavesAudioPort::WavesAudioPort (const std::string& port_name, PortFlags flags)
    : WavesDataPort (port_name, flags)    
 {
-    memset (_buffer, 0, sizeof (_buffer));
+	aligned_malloc ((void**)&_buffer, MAX_BUFFER_SIZE_BYTES, 32 /*32 byte alignment*/);
+    memset (_buffer, 0, MAX_BUFFER_SIZE_BYTES);
+}
+
+WavesAudioPort::~WavesAudioPort ()
+{
+	aligned_free (_buffer);
 }


@ -40,14 +48,19 @@ void* WavesAudioPort::get_buffer (pframes_t nframes)
             * Base class WavesDataPort takes is supposed to provide enough consistentcy
             * of the connections.
             */
-            for (memcpy (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes * sizeof (Sample)), ++it;
-				 it != get_connections ().end ();
-				 ++it) {
+
+			// get first buffer data
+			// use optimized function to fill the buffer intialy
+			ARDOUR::copy_vector (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes);
+			++it;
+            
+			// mix the rest
+			for (; it != get_connections ().end (); ++it) {
                Sample* tgt = buffer ();
                const Sample* src = ((const WavesAudioPort*)*it)->const_buffer ();
-                for (uint32_t frame = 0; frame < nframes; ++frame, ++tgt, ++src)    {
-                    *tgt += *src;
-                }
+
+				// use otimized function to mix the buffers
+				ARDOUR::mix_buffers_no_gain (tgt, src, nframes);
            }
        }
    }
--- a/libs/backends/wavesaudio/waves_audioport.h
+++ b/libs/backends/wavesaudio/waves_audioport.h
@ -35,7 +35,7 @@ public:

    WavesAudioPort (const std::string& port_name, PortFlags flags);

-    virtual ~WavesAudioPort () { };
+    virtual ~WavesAudioPort ();

    virtual DataType type () const {    return DataType::AUDIO; };

@ -49,7 +49,7 @@ protected:

 private:

-    Sample _buffer[MAX_BUFFER_SIZE_SAMPLES];
+    Sample *_buffer;
 };

 } // namespace
--- a/libs/pbd/fpu.cc
+++ b/libs/pbd/fpu.cc
@ -16,7 +16,7 @@
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 */
-#ifndef  COMPILER_MSVC
+#if !(defined (COMPILER_MSVC) || defined (COMPILER_MINGW))
 #include "libpbd-config.h"

 #define _XOPEN_SOURCE 600
@ -39,10 +39,6 @@ FPU::FPU ()

 	_flags = Flags (0);

-#if defined(__MINGW64__) // Vkamyshniy: under __MINGW64__ the assembler code below is not compiled
-	return;
-#endif
-
 #if !( (defined __x86_64__) || (defined __i386__) ) // !ARCH_X86
 	return;
 #else
--- a/libs/pbd/malign.cc
+++ b/libs/pbd/malign.cc
@ -35,22 +35,52 @@ static const int CPU_CACHE_ALIGN = 64;
 static const int CPU_CACHE_ALIGN = 16; /* arguably 32 on most arches, but it matters less */
 #endif

-int cache_aligned_malloc (void** memptr, size_t size)
+
+int  aligned_malloc (void** memptr, size_t size, size_t alignment)
 {
 #ifndef HAVE_POSIX_MEMALIGN
-	if (((*memptr) = malloc (size)) == 0) {
+#ifdef PLATFORM_WINDOWS
+	if (((*memptr) = _aligned_malloc (size, alignment)) == 0) {
 		fatal << string_compose (_("Memory allocation error: malloc (%1 * %2) failed (%3)"),
-					 CPU_CACHE_ALIGN, size, strerror (errno)) << endmsg;
+					 alignment, size, strerror (errno)) << endmsg;
 		return errno;
 	} else {
 		return 0;
 	}
 #else
-        if (posix_memalign (memptr, CPU_CACHE_ALIGN, size)) {
+	if (((*memptr) = malloc (size)) == 0) {
+		fatal << string_compose (_("Memory allocation error: malloc (%1 * %2) failed (%3)"),
+					 alignment, size, strerror (errno)) << endmsg;
+		return errno;
+	} else {
+		return 0;
+	}
+#endif
+#else
+        if (posix_memalign (memptr, alignment, size)) {
 		fatal << string_compose (_("Memory allocation error: posix_memalign (%1 * %2) failed (%3)"),
-					 CPU_CACHE_ALIGN, size, strerror (errno)) << endmsg;
+					 alignment, size, strerror (errno)) << endmsg;
 	}

 	return 0;
 #endif	
 }
+
+void aligned_free (void* memptr)
+{
+#ifdef PLATFORM_WINDOWS
+	_aligned_free (memptr);
+#else
+	free (memptr);
+#endif
+}
+
+int cache_aligned_malloc (void** memptr, size_t size)
+{
+	return aligned_malloc (memptr, size, CPU_CACHE_ALIGN);
+}
+
+void cache_aligned_free (void* memptr)
+{
+	aligned_free (memptr);
+}
--- a/libs/pbd/msvc/fpu.cc
+++ b/libs/pbd/msvc/fpu.cc
@ -1,10 +1,14 @@
-#ifdef COMPILER_MSVC  // Added by JE - 05-12-2009. Inline assembler instructions
-                      // have been changed to Intel format and (in the case of
-                      // cpuid) was replaced by the equivalent VC++ system call).
+// Added by JE - 05-12-2009. Inline assembler instructions
+// have been changed to Intel format and (in the case of
+// cpuid) was replaced by the equivalent VC++ system call).
+
+#if defined (COMPILER_MSVC) || defined (COMPILER_MINGW)
+
 #define _XOPEN_SOURCE 600
 #include <cstdlib>
 #include <stdint.h>
 #include <intrin.h>  // Added by JE - 05-12-2009
+#include <assert.h>

 #include <pbd/fpu.h>
 #include <pbd/error.h>
@ -16,84 +20,63 @@ using namespace std;

 FPU::FPU ()
 {
-	unsigned long cpuflags = 0;
+	unsigned long cpuflags_ECX = 0;
+	unsigned long cpuflags_EDX = 0;

 	_flags = (Flags)0;

-#ifndef ARCH_X86
-	return;
-
-#else
-
 #ifndef USE_X86_64_ASM
-int cpuInfo[4];
+	return;
+#endif

+	// Get CPU lfags using Microsof function
+	// It works for both 64 and 32 bit systems
+	// no need to use assembler for getting info from register, this function does this for us
+	int cpuInfo[4];
 	__cpuid (cpuInfo, 1);
-	cpuflags = cpuInfo[3];
-/*
-	__asm {  // This is how the original section would look if converted to Intel syntax.
-             // However, I have grave doubts about whether it's doing the right thing.
-             // It seems as if the intention was to retrieve feature information from
-             // the processor. However, feature information is returned in the ebx register
-             // (if you believe Wikipedia) or in edx (if you believe Microsoft). Unfortunately,
-             // both registers get ignored in the original code!! Confused?? Join the club!!
-		mov   eax, 1
-		push  ebx
-		cpuid
-		mov   edx, 0
-		pop   ebx
-		mov   cpuflags, ecx // This can't be right, surely???
-	}; */
-#else
-// Note that this syntax is currently still in AT&T format !
-	asm volatile (
-		"pushq %%rbx\n"
-		"movq $1, %%rax\n"
-		"cpuid\n"
-		"movq %%rdx, %0\n"
-		"popq %%rbx\n"
-		: "=r" (cpuflags)
-		:
-		: "%rax", "%rcx", "%rdx", "memory"
-		);
+	cpuflags_ECX = cpuInfo[2]; // flags from ECX register
+	cpuflags_EDX = cpuInfo[3]; // flags from EDX register

-#endif /* USE_X86_64_ASM */
-
-	if (cpuflags & (1<<25)) {
-		_flags = Flags (_flags | (HasSSE|HasFlushToZero));
+	if (cpuflags_ECX & (1<<28)) {
+		_flags = Flags (_flags | (HasAVX) );
 	}

-	if (cpuflags & (1<<26)) {
+	if (cpuflags_EDX & (1<<25)) {
+		_flags = Flags (_flags | (HasSSE|HasFlushToZero) );
+	}
+
+	if (cpuflags_EDX & (1<<26)) {
 		_flags = Flags (_flags | HasSSE2);
 	}

-	if (cpuflags & (1 << 24)) {
-		bool  aligned_malloc = false; // Added by JE - 05-12-2009
+	if (cpuflags_EDX & (1 << 24)) {
 		char* fxbuf = 0;
-// This section changed by JE - 05-12-2009
-#ifdef NO_POSIX_MEMALIGN
-#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)       // All of these support '_aligned_malloc()'
-		fxbuf = (char *) _aligned_malloc(512, 16);  // (note that they all need at least MSVC runtime 7.0)
-		aligned_malloc = true;
-#else
-		fxbuf = (char *) malloc(512);
-#endif
-#else
-		fxbuf = posix_memalign ((void**)&fxbuf, 16, 512);
-#endif
+
+		// allocate alligned buffer
+		fxbuf = (char*)_aligned_malloc(512, 16);
+
 		// Verify that fxbuf is correctly aligned
-		unsigned long buf_addr = (unsigned long)(void*)fxbuf;
+		unsigned long long buf_addr = (unsigned long long)(void*)fxbuf;
 		if ((0 == buf_addr) || (buf_addr % 16))
 			error << _("cannot allocate 16 byte aligned buffer for h/w feature detection") << endmsg;
 		else
 		{
 			memset(fxbuf, 0, 512); // Initialize the buffer !!! Added by JE - 12-12-2009

+#if defined (COMPILER_MINGW)
+			asm volatile (
+				"fxsave (%0)"
+				:
+				: "r" (fxbuf)
+				: "memory"
+				);
+
+#elif defined (COMPILER_MSVC)
 			__asm {
 				mov eax, fxbuf
 				fxsave   [eax]
 			};
-
+#endif
 			uint32_t mxcsr_mask = *((uint32_t*) &fxbuf[28]);

 			/* if the mask is zero, set its default value (from intel specs) */
@ -106,13 +89,9 @@ int cpuInfo[4];
 				_flags = Flags (_flags | HasDenormalsAreZero);
 			}

-			if (aligned_malloc)
-				_aligned_free (fxbuf);
-			else
-				free (fxbuf);
+			_aligned_free (fxbuf);
 		}
 	}
-#endif  // ARCH_X86
 }

 FPU::~FPU ()
--- a/libs/pbd/pbd/fpu.h
+++ b/libs/pbd/pbd/fpu.h
@ -30,7 +30,8 @@ class LIBPBD_API FPU {
 		HasFlushToZero = 0x1,
 		HasDenormalsAreZero = 0x2,
 		HasSSE = 0x4,
-		HasSSE2 = 0x8
+		HasSSE2 = 0x8,
+		HasAVX = 0x10
 	};

  public:
@ -41,6 +42,7 @@ class LIBPBD_API FPU {
 	bool has_denormals_are_zero () const { return _flags & HasDenormalsAreZero; }
 	bool has_sse () const { return _flags & HasSSE; }
 	bool has_sse2 () const { return _flags & HasSSE2; }
+	bool has_avx () const { return _flags & HasAVX; }
 	
  private:
 	Flags _flags;
--- a/libs/pbd/pbd/malign.h
+++ b/libs/pbd/pbd/malign.h
@ -24,6 +24,10 @@

 #include "pbd/libpbd_visibility.h"

-LIBPBD_API int cache_aligned_malloc (void** memptr, size_t size);
+LIBPBD_API int  cache_aligned_malloc (void** memptr, size_t size);
+LIBPBD_API void cache_aligned_free (void* memptr);
+
+LIBPBD_API int  aligned_malloc (void** memptr, size_t size, size_t alignment);
+LIBPBD_API void aligned_free (void* memptr);

 #endif /* __pbd_malign_h__ */
--- a/libs/pbd/wscript
+++ b/libs/pbd/wscript
@ -48,7 +48,6 @@ libpbd_sources = [
    'ffs.cc',
    'file_manager.cc',
    'file_utils.cc',
-    'fpu.cc',
    'glib_semaphore.cc',
    'id.cc',
    'locale_guard.cc',
@ -150,8 +149,18 @@ def build(bld):
    if bld.env['build_target'] == 'x86_64':
        obj.defines += [ 'USE_X86_64_ASM' ]
    if bld.env['build_target'] == 'mingw':
+        import re
+        import platform as PLATFORM
+        u = PLATFORM.uname ()
+        cpu = u[4]
+        if re.search ("(x86_64|AMD64)", cpu) != None:
+            obj.defines += [ 'USE_X86_64_ASM' ]
+        obj.defines += ['NO_POSIX_MEMALIGN' ]
        obj.source += [ 'windows_special_dirs.cc' ]
+        obj.source += [ 'msvc/fpu.cc' ]
        obj.uselib += ' OLE'
+    else:
+        obj.source += [ 'fpu.cc' ]

    if bld.env['BUILD_TESTS'] and bld.is_defined('HAVE_CPPUNIT'):
        # Unit tests
--- a/21
+++ b/21
@ -221,7 +221,7 @@ def set_compiler_flags (conf,opt):
        # 
        compiler_flags.append ('-U__STRICT_ANSI__')

-    if ((re.search ("i[0-9]86", cpu) != None) or (re.search ("x86_64", cpu) != None)) and conf.env['build_target'] != 'none':
+    if (re.search ("(i[0-9]86|x86_64|AMD64)", cpu) != None) and conf.env['build_target'] != 'none':


        #
@ -229,9 +229,8 @@ def set_compiler_flags (conf,opt):
        # the compile-time presence of the macro _LP64 is used to 
        # distingush 32 and 64 bit assembler
        #
-
-        if (re.search ("(i[0-9]86|x86_64)", cpu) != None):
-            compiler_flags.append ("-DARCH_X86")
+ 
+        compiler_flags.append ("-DARCH_X86")

        if platform == 'linux' :

@ -258,6 +257,16 @@ def set_compiler_flags (conf,opt):

        if not is_clang and ((conf.env['build_target'] == 'i686') or (conf.env['build_target'] == 'x86_64')) and build_host_supports_sse:
            compiler_flags.extend (["-msse", "-mfpmath=sse", "-DUSE_XMMINTRIN"])
+            
+        if (conf.env['build_target'] == 'mingw'):
+            if (re.search ("(x86_64|AMD64)", cpu) != None):
+                # on Windows sse is supported by 64 bit platforms only
+                build_host_supports_sse = True
+
+                # mingw GCC compiler to uses at&t (Unix specific) assembler dialect by default
+                # compiler_flags.append (["--mmnemonic=att", "msyntax=att")
+                
+                compiler_flags.extend (["-mavx", "-mvzeroupper", "-DUSE_XMMINTRIN"])  

    # end of processor-specific section

@ -266,7 +275,7 @@ def set_compiler_flags (conf,opt):
        if sys.platform == 'darwin':
            compiler_flags.append("-DBUILD_VECLIB_OPTIMIZATIONS")
            conf.env.append_value('LINKFLAGS_OSX', ['-framework', 'Accelerate'])
-        elif conf.env['build_target'] == 'i686' or conf.env['build_target'] == 'x86_64':
+        elif conf.env['build_target'] == 'i686' or conf.env['build_target'] == 'x86_64' or (conf.env['build_target'] == 'mingw' and build_host_supports_sse):
            compiler_flags.append ("-DBUILD_SSE_OPTIMIZATIONS")
        if not build_host_supports_sse:
            print("\nWarning: you are building Ardour with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)")
@ -695,7 +704,7 @@ def configure(conf):
    autowaf.check_pkg(conf, 'rubberband', uselib_store='RUBBERBAND', mandatory=True)

    if Options.options.dist_target == 'mingw':
-        Options.options.fpu_optimization = False
+        Options.options.fpu_optimization = True
        conf.env.append_value('CFLAGS', '-DPLATFORM_WINDOWS')
        conf.env.append_value('CFLAGS', '-DCOMPILER_MINGW')
        conf.env.append_value('CXXFLAGS', '-DPLATFORM_WINDOWS')