mirror of https://github.com/Ardour/ardour.git

commit 2a84e1d0c5
Merge branch 'performance'

23 changed files with 1600 additions and 114 deletions
@@ -5202,6 +5202,12 @@ Editor::add_routes (RouteList& routes)
 	connect_routes_and_update_global_rec_button (routes);
 }
 
+namespace {
+	bool tv_not_selected (TimeAxisView *tv) {
+		return !tv->get_selected ();
+	}
+}
+
 void
 Editor::timeaxisview_deleted (TimeAxisView *tv)
 {
@@ -5245,19 +5251,20 @@ Editor::timeaxisview_deleted (TimeAxisView *tv)
 
 	if (current_mixer_strip && current_mixer_strip->route() == route) {
 
-		TimeAxisView* next_tv;
-
-		if (track_views.empty()) {
-			next_tv = 0;
-		} else if (i == track_views.end()) {
-			next_tv = track_views.front();
-		} else {
-			next_tv = (*i);
+		// find the first non-selected track
+		TimeAxisView* first_non_selected_tv = 0;
+
+		if (!track_views.empty() ) {
+
+			i = std::find_if (track_views.begin(), track_views.end(), tv_not_selected);
+
+			if (i != track_views.end() ) {
+				first_non_selected_tv = (*i);
+			}
 		}
 
-		if (next_tv ) {
-			set_selected_mixer_strip (*next_tv);
+		if (first_non_selected_tv ) {
+			set_selected_mixer_strip (*first_non_selected_tv);
 		} else {
 			/* make the editor mixer strip go away setting the
 			 * button to inactive (which also unticks the menu option)
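The new helper sits in an anonymous namespace so it gets internal linkage and can be handed to std::find_if as a plain predicate. A minimal, self-contained sketch of the same pattern; Track and its selected flag here are hypothetical stand-ins for TimeAxisView and get_selected():

#include <algorithm>
#include <iostream>
#include <vector>

namespace {
	struct Track { bool selected; };

	// same shape as tv_not_selected(): a predicate for std::find_if
	bool track_not_selected (const Track* t) {
		return !t->selected;
	}
}

int main ()
{
	Track a { true }, b { true }, c { false };
	std::vector<Track*> tracks = { &a, &b, &c };

	// find_if returns an iterator to the first element satisfying the
	// predicate, or end() when every track is selected
	std::vector<Track*>::iterator i =
		std::find_if (tracks.begin (), tracks.end (), track_not_selected);

	std::cout << (i != tracks.end () ? "found a non-selected track\n"
	                                 : "all tracks selected\n");
}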
@@ -33,7 +33,17 @@ extern "C" {
 	LIBARDOUR_API void x86_sse_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
 }
 
+extern "C" {
+	/* AVX functions */
+	LIBARDOUR_API float x86_sse_avx_compute_peak (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float current);
+	LIBARDOUR_API void x86_sse_avx_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
+	LIBARDOUR_API void x86_sse_avx_mix_buffers_with_gain(ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
+	LIBARDOUR_API void x86_sse_avx_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+	LIBARDOUR_API void x86_sse_avx_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+}
+
 LIBARDOUR_API void x86_sse_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
+LIBARDOUR_API void x86_sse_avx_find_peaks (const ARDOUR::Sample * buf, ARDOUR::pframes_t nsamples, float *min, float *max);
 
 /* debug wrappers for SSE functions */
@@ -41,6 +51,7 @@ LIBARDOUR_API float debug_compute_peak (const ARDOUR::Sample * buf
 LIBARDOUR_API void debug_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void debug_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void debug_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+LIBARDOUR_API void debug_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
 
 #endif
@@ -61,5 +72,6 @@ LIBARDOUR_API void default_find_peaks (const ARDOUR::Sample * bu
 LIBARDOUR_API void default_apply_gain_to_buffer (ARDOUR::Sample * buf, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void default_mix_buffers_with_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes, float gain);
 LIBARDOUR_API void default_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
+LIBARDOUR_API void default_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, ARDOUR::pframes_t nframes);
 
 #endif /* __ardour_mix_h__ */
@@ -25,17 +25,19 @@
 
 namespace ARDOUR {
 
 	typedef float (*compute_peak_t)          (const ARDOUR::Sample *, pframes_t, float);
 	typedef void  (*find_peaks_t)            (const ARDOUR::Sample *, pframes_t, float *, float*);
 	typedef void  (*apply_gain_to_buffer_t)  (ARDOUR::Sample *, pframes_t, float);
 	typedef void  (*mix_buffers_with_gain_t) (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t, float);
 	typedef void  (*mix_buffers_no_gain_t)   (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t);
+	typedef void  (*copy_vector_t)           (ARDOUR::Sample *, const ARDOUR::Sample *, pframes_t);
 
 	LIBARDOUR_API extern compute_peak_t          compute_peak;
 	LIBARDOUR_API extern find_peaks_t            find_peaks;
 	LIBARDOUR_API extern apply_gain_to_buffer_t  apply_gain_to_buffer;
 	LIBARDOUR_API extern mix_buffers_with_gain_t mix_buffers_with_gain;
 	LIBARDOUR_API extern mix_buffers_no_gain_t   mix_buffers_no_gain;
+	LIBARDOUR_API extern copy_vector_t           copy_vector;
 }
 
 #endif /* __ardour_runtime_functions_h__ */
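These typedefs exist so every hot routine is called through a pointer that is bound once at startup; callers never need to know which SIMD flavour sits behind it. A rough sketch of the idea, using hypothetical names rather than the real libardour symbols:

#include <cstddef>
#include <cstdio>

typedef float Sample;

// one pointer type per routine, mirroring apply_gain_to_buffer_t
typedef void (*apply_gain_fn) (Sample*, size_t, float);

static void apply_gain_generic (Sample* buf, size_t n, float g)
{
	for (size_t i = 0; i < n; ++i) buf[i] *= g;
}

// the global pointer every caller goes through; a SIMD-capable build
// would point this at an SSE/AVX implementation at startup instead
static apply_gain_fn apply_gain = apply_gain_generic;

int main ()
{
	Sample buf[4] = { 1, 2, 3, 4 };
	apply_gain (buf, 4, 0.5f); // indirect call, resolved once at startup
	std::printf ("%f %f %f %f\n", buf[0], buf[1], buf[2], buf[3]);
}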
@@ -43,7 +43,7 @@ AudioBuffer::AudioBuffer(size_t capacity)
 AudioBuffer::~AudioBuffer()
 {
 	if (_owns_data)
-		free(_data);
+		cache_aligned_free(_data);
 }
 
 void
@@ -60,7 +60,7 @@ AudioBuffer::resize (size_t size)
 		return;
 	}
 
-	free (_data);
+	cache_aligned_free (_data);
 
 	cache_aligned_malloc ((void**) &_data, sizeof (Sample) * size);
 
@@ -127,6 +127,7 @@ find_peaks_t ARDOUR::find_peaks = 0;
 apply_gain_to_buffer_t  ARDOUR::apply_gain_to_buffer = 0;
 mix_buffers_with_gain_t ARDOUR::mix_buffers_with_gain = 0;
 mix_buffers_no_gain_t   ARDOUR::mix_buffers_no_gain = 0;
+copy_vector_t           ARDOUR::copy_vector = 0;
 
 PBD::Signal1<void,std::string> ARDOUR::BootMessage;
 PBD::Signal3<void,std::string,std::string,bool> ARDOUR::PluginScanMessage;
@@ -153,7 +154,21 @@ setup_hardware_optimization (bool try_optimization)
 
 #if defined (ARCH_X86) && defined (BUILD_SSE_OPTIMIZATIONS)
 
-	if (fpu.has_sse()) {
+	if (fpu.has_avx()) {
+
+		info << "Using AVX optimized routines" << endmsg;
+
+		// AVX SET
+		compute_peak          = x86_sse_avx_compute_peak;
+		find_peaks            = x86_sse_avx_find_peaks;
+		apply_gain_to_buffer  = x86_sse_avx_apply_gain_to_buffer;
+		mix_buffers_with_gain = x86_sse_avx_mix_buffers_with_gain;
+		mix_buffers_no_gain   = x86_sse_avx_mix_buffers_no_gain;
+		copy_vector           = x86_sse_avx_copy_vector;
+
+		generic_mix_functions = false;
+
+	} else if (fpu.has_sse()) {
 
 		info << "Using SSE optimized routines" << endmsg;
 
@@ -163,6 +178,7 @@ setup_hardware_optimization (bool try_optimization)
 		apply_gain_to_buffer  = x86_sse_apply_gain_to_buffer;
 		mix_buffers_with_gain = x86_sse_mix_buffers_with_gain;
 		mix_buffers_no_gain   = x86_sse_mix_buffers_no_gain;
+		copy_vector           = default_copy_vector;
 
 		generic_mix_functions = false;
 
@@ -180,6 +196,7 @@ setup_hardware_optimization (bool try_optimization)
 		apply_gain_to_buffer  = veclib_apply_gain_to_buffer;
 		mix_buffers_with_gain = veclib_mix_buffers_with_gain;
 		mix_buffers_no_gain   = veclib_mix_buffers_no_gain;
+		copy_vector           = default_copy_vector;
 
 		generic_mix_functions = false;
 
@@ -199,6 +216,7 @@ setup_hardware_optimization (bool try_optimization)
 		apply_gain_to_buffer  = default_apply_gain_to_buffer;
 		mix_buffers_with_gain = default_mix_buffers_with_gain;
 		mix_buffers_no_gain   = default_mix_buffers_no_gain;
+		copy_vector           = default_copy_vector;
 
 		info << "No H/W specific optimizations in use" << endmsg;
 	}
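For comparison, a recent GCC or Clang can answer the same has-SSE/has-AVX questions without any cpuid assembly. This is not what Ardour does here (it uses its own PBD::FPU class); it is just a compact illustration of runtime feature probing:

#include <cstdio>

int main ()
{
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
	__builtin_cpu_init ();
	std::printf ("sse: %d\n", __builtin_cpu_supports ("sse"));
	std::printf ("avx: %d\n", __builtin_cpu_supports ("avx"));
#else
	std::printf ("feature probing not available on this compiler/arch\n");
#endif
}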
@@ -44,7 +44,7 @@ MidiBuffer::MidiBuffer(size_t capacity)
 
 MidiBuffer::~MidiBuffer()
 {
-	free(_data);
+	cache_aligned_free(_data);
 }
 
 void
@@ -60,7 +60,7 @@ MidiBuffer::resize(size_t size)
 		return;
 	}
 
-	free (_data);
+	cache_aligned_free (_data);
 
 	cache_aligned_malloc ((void**) &_data, size);
 
@@ -32,7 +32,7 @@ using namespace ARDOUR;
 // Debug wrappers
 
 float
-debug_compute_peak (ARDOUR::Sample *buf, pframes_t nsamples, float current)
+debug_compute_peak (const ARDOUR::Sample *buf, pframes_t nsamples, float current)
 {
 	if ( ((intptr_t)buf % 16) != 0) {
 		std::cerr << "compute_peak(): buffer unaligned!" << std::endl;
@@ -52,7 +52,7 @@ debug_apply_gain_to_buffer (ARDOUR::Sample *buf, pframes_t nframes, float gain)
 }
 
 void
-debug_mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t nframes, float gain)
+debug_mix_buffers_with_gain (ARDOUR::Sample *dst, const ARDOUR::Sample *src, pframes_t nframes, float gain)
 {
 	if ( ((intptr_t)dst & 15) != 0) {
 		std::cerr << "mix_buffers_with_gain(): dst unaligned!" << std::endl;
@@ -67,7 +67,7 @@ debug_mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t
 }
 
 void
-debug_mix_buffers_no_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, pframes_t nframes)
+debug_mix_buffers_no_gain (ARDOUR::Sample *dst, const ARDOUR::Sample *src, pframes_t nframes)
 {
 	if ( ((intptr_t)dst & 15) != 0) {
 		std::cerr << "mix_buffers_no_gain(): dst unaligned!" << std::endl;
@@ -136,6 +136,12 @@ default_mix_buffers_no_gain (ARDOUR::Sample * dst, const ARDOUR::Sample * src, p
 	}
 }
 
+void
+default_copy_vector (ARDOUR::Sample * dst, const ARDOUR::Sample * src, pframes_t nframes)
+{
+	memcpy(dst, src, nframes*sizeof(ARDOUR::Sample));
+}
+
 #if defined (__APPLE__) && defined (BUILD_VECLIB_OPTIMIZATIONS)
 #include <Accelerate/Accelerate.h>
 
libs/ardour/sse_avx_functions_64bit_win.s (new file, 587 lines)

@@ -0,0 +1,587 @@
/*
    Copyright (C) 2005-2006 Paul Davis, John Rigg

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

    Author: Sampo Savolainen
    64-bit conversion: John Rigg

    $Id$
*/

#; Microsoft version of AVX sample processing functions
#; void x86_sse_avx_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);

.globl x86_sse_avx_mix_buffers_with_gain
	.def	x86_sse_avx_mix_buffers_with_gain; .scl 2; .type 32;
.endef

x86_sse_avx_mix_buffers_with_gain:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes
#; %xmm3 float gain

	pushq %rbp
	movq %rsp, %rbp

	#; save the registers
	pushq %rbx #; must be preserved

	#; move gain to %xmm0 for convenience
	movss %xmm3, %xmm0

	#; if nframes == 0, go to end
	cmp $0, %r8
	je .MBWG_END

	#; Check for alignment

	movq %rcx, %rax
	andq $28, %rax #; mask alignment offset

	movq %rdx, %rbx
	andq $28, %rbx #; mask alignment offset

	cmp %rax, %rbx
	jne .MBWG_NONALIGN #; if the buffers are not aligned relative to each other, calculate manually

	#; if we are aligned
	cmp $0, %rbx
	jz .MBWG_AVX

	#; Pre-loop, we need to run 1-7 frames "manually" without
	#; AVX instructions

.MBWG_PRELOOP:

	#; gain is already in %xmm0
	movss (%rdx), %xmm1
	mulss %xmm0, %xmm1
	addss (%rcx), %xmm1
	movss %xmm1, (%rcx)

	addq $4, %rcx #; dst++
	addq $4, %rdx #; src++
	decq %r8      #; nframes--
	jz .MBWG_END

	addq $4, %rbx

	cmp $32, %rbx #; test if we've reached 32 byte alignment
	jne .MBWG_PRELOOP

.MBWG_AVX:

	cmp $8, %r8 #; we know it's not zero, but if it's not >= 8, then
	jl .MBWG_NONALIGN #; we jump straight to the "normal" code

	#; set up the gain buffer (gain is already in %xmm0)
	vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread the single float value across the lower 128 bits of ymm0
	vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the lower 128 bits of ymm0 into its upper 128 bits

.MBWG_AVXLOOP:

	vmovaps (%rdx), %ymm1       #; source => ymm1
	vmulps %ymm0, %ymm1, %ymm2  #; apply gain to source
	vaddps (%rcx), %ymm2, %ymm1 #; mix with destination
	vmovaps %ymm1, (%rcx)       #; copy result to destination

	addq $32, %rcx #; dst+=8
	addq $32, %rdx #; src+=8

	subq $8, %r8 #; nframes-=8
	cmp $8, %r8
	jge .MBWG_AVXLOOP

	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
	vzeroupper

	cmp $0, %r8
	je .MBWG_END

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the last 1-7 frames.

.MBWG_NONALIGN:
	#; not aligned!

	#; gain is already in %xmm0

.MBWG_NONALIGNLOOP:

	movss (%rdx), %xmm1
	mulss %xmm0, %xmm1
	addss (%rcx), %xmm1
	movss %xmm1, (%rcx)

	addq $4, %rcx
	addq $4, %rdx

	decq %r8
	jnz .MBWG_NONALIGNLOOP

.MBWG_END:

	popq %rbx

	#; return
	leave
	ret
#; void x86_sse_avx_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);

.globl x86_sse_avx_mix_buffers_no_gain
	.def	x86_sse_avx_mix_buffers_no_gain; .scl 2; .type 32;
.endef

x86_sse_avx_mix_buffers_no_gain:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes

	pushq %rbp
	movq %rsp, %rbp

	#; save the registers
	pushq %rbx #; must be preserved

	#; the real function

	#; if nframes == 0, go to end
	cmp $0, %r8
	je .MBNG_END

	#; Check for alignment

	movq %rcx, %rax
	andq $28, %rax #; mask alignment offset

	movq %rdx, %rbx
	andq $28, %rbx #; mask alignment offset

	cmp %rax, %rbx
	jne .MBNG_NONALIGN #; if the buffers are not aligned relative to each other, calculate manually

	cmp $0, %rbx
	je .MBNG_AVX #; aligned at 32 bytes, proceed to AVX

	#; Pre-loop, we need to run 1-7 frames "manually" without
	#; AVX instructions

.MBNG_PRELOOP:

	movss (%rdx), %xmm0
	addss (%rcx), %xmm0
	movss %xmm0, (%rcx)

	addq $4, %rcx #; dst++
	addq $4, %rdx #; src++

	decq %r8 #; nframes--
	jz .MBNG_END

	addq $4, %rbx #; one non-aligned frame less

	cmp $32, %rbx #; test if we've reached 32 byte alignment
	jne .MBNG_PRELOOP

.MBNG_AVX:

	cmp $8, %r8 #; if there are frames left, but less than 8
	jl .MBNG_NONALIGN #; we can't run AVX

.MBNG_AVXLOOP:

	vmovaps (%rdx), %ymm0       #; source => ymm0
	vaddps (%rcx), %ymm0, %ymm1 #; mix with destination
	vmovaps %ymm1, (%rcx)       #; copy result to destination

	addq $32, %rcx #; dst+=8
	addq $32, %rdx #; src+=8

	subq $8, %r8 #; nframes-=8
	cmp $8, %r8
	jge .MBNG_AVXLOOP

	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
	vzeroupper

	cmp $0, %r8
	je .MBNG_END

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the last 1-7 frames.

.MBNG_NONALIGN:
	#; not aligned!

	movss (%rdx), %xmm0 #; src => xmm0
	addss (%rcx), %xmm0 #; xmm0 += dst
	movss %xmm0, (%rcx) #; xmm0 => dst

	addq $4, %rcx
	addq $4, %rdx

	decq %r8
	jnz .MBNG_NONALIGN

.MBNG_END:

	popq %rbx

	#; return
	leave
	ret
#; void x86_sse_avx_copy_vector (float *dst, float *src, unsigned int nframes);

.globl x86_sse_avx_copy_vector
	.def	x86_sse_avx_copy_vector; .scl 2; .type 32;
.endef

x86_sse_avx_copy_vector:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes

	pushq %rbp
	movq %rsp, %rbp

	#; save the registers
	pushq %rbx #; must be preserved

	#; the real function

	#; if nframes == 0, go to end
	cmp $0, %r8
	je .CB_END

	#; Check for alignment

	movq %rcx, %rax
	andq $28, %rax #; mask alignment offset

	movq %rdx, %rbx
	andq $28, %rbx #; mask alignment offset

	cmp %rax, %rbx
	jne .CB_NONALIGN #; if the buffers are not aligned relative to each other, copy manually

	cmp $0, %rbx
	je .CB_AVX #; aligned at 32 bytes, proceed to AVX

	#; Pre-loop, we need to run 1-7 frames "manually" without
	#; AVX instructions

.CB_PRELOOP:

	movss (%rdx), %xmm0
	movss %xmm0, (%rcx)

	addq $4, %rcx #; dst++
	addq $4, %rdx #; src++

	decq %r8 #; nframes--
	jz .CB_END

	addq $4, %rbx #; one non-aligned frame less

	cmp $32, %rbx #; test if we've reached 32 byte alignment
	jne .CB_PRELOOP

.CB_AVX:

	cmp $8, %r8 #; if there are frames left, but less than 8
	jl .CB_NONALIGN #; we can't run AVX

.CB_AVXLOOP:

	vmovaps (%rdx), %ymm0 #; source => ymm0
	vmovaps %ymm0, (%rcx) #; copy to destination

	addq $32, %rcx #; dst+=8
	addq $32, %rdx #; src+=8

	subq $8, %r8 #; nframes-=8
	cmp $8, %r8
	jge .CB_AVXLOOP

	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
	vzeroupper

	cmp $0, %r8
	je .CB_END

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the last 1-7 frames.

.CB_NONALIGN:
	#; not aligned!

	movss (%rdx), %xmm0 #; src => xmm0
	movss %xmm0, (%rcx) #; xmm0 => dst

	addq $4, %rcx
	addq $4, %rdx

	decq %r8
	jnz .CB_NONALIGN

.CB_END:

	popq %rbx

	#; return
	leave
	ret
#; void x86_sse_avx_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);

.globl x86_sse_avx_apply_gain_to_buffer
	.def	x86_sse_avx_apply_gain_to_buffer; .scl 2; .type 32;
.endef

x86_sse_avx_apply_gain_to_buffer:

#; due to Microsoft calling convention
#; %rcx float *buf 32(%rbp)
#; %rdx unsigned int nframes
#; %xmm2 float gain

	pushq %rbp
	movq %rsp, %rbp

	#; move gain to %xmm0 for convenience
	movss %xmm2, %xmm0

	#; the real function

	#; if nframes == 0, go to end
	cmp $0, %rdx
	je .AG_END

	#; Check for alignment

	movq %rcx, %r8 #; buf => %r8
	andq $28, %r8  #; check alignment with mask 11100
	jz .AG_AVX     #; if buffer IS aligned

	#; PRE-LOOP
	#; we iterate 1-7 times, doing normal scalar float processing
	#; so we reach a 32 byte aligned "buf" (=%rcx) value

.AGLP_START:

	#; Load next value from the buffer into %xmm1
	movss (%rcx), %xmm1
	mulss %xmm0, %xmm1
	movss %xmm1, (%rcx)

	#; increment buffer, decrement counter
	addq $4, %rcx #; buf++;

	decq %rdx #; nframes--
	jz .AG_END #; if we run out of frames, we go to the end

	addq $4, %r8 #; one non-aligned frame less
	cmp $32, %r8 #; 32, not 16: vmovaps below needs 32 byte alignment
	jne .AGLP_START #; if more non-aligned frames exist, we do a do-over

.AG_AVX:

	#; We have reached the 32 byte aligned "buf" (%rcx) value
	#; use AVX instructions

	#; Figure out how many loops we should do
	movq %rdx, %rax #; copy remaining nframes to %rax for division

	shr $3, %rax #; unsigned divide by 8

	#; %rax = AVX iterations
	cmp $0, %rax
	je .AGPOST_START

	#; set up the gain buffer (gain is already in %xmm0)
	vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread the single float value across the lower 128 bits of ymm0
	vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the lower 128 bits of ymm0 into its upper 128 bits

.AGLP_AVX:

	vmovaps (%rcx), %ymm1
	vmulps %ymm0, %ymm1, %ymm2
	vmovaps %ymm2, (%rcx)

	addq $32, %rcx #; buf += 8
	subq $8, %rdx  #; nframes -= 8

	decq %rax
	jnz .AGLP_AVX

	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
	vzeroupper

	#; Next we need to post-process all remaining frames
	#; the remaining frame count is in %rdx
	cmpq $0, %rdx
	jz .AG_END

.AGPOST_START:

	movss (%rcx), %xmm1
	mulss %xmm0, %xmm1
	movss %xmm1, (%rcx)

	#; increment buffer, decrement counter
	addq $4, %rcx #; buf++;

	decq %rdx #; nframes--
	jnz .AGPOST_START #; loop until no frames remain

.AG_END:

	#; return
	leave
	ret

#; end proc
#; float x86_sse_avx_compute_peak(float *buf, long nframes, float current);

.globl x86_sse_avx_compute_peak
	.def	x86_sse_avx_compute_peak; .scl 2; .type 32;
.endef

x86_sse_avx_compute_peak:

#; due to Microsoft calling convention
#; %rcx float* buf 32(%rbp)
#; %rdx unsigned int nframes
#; %xmm2 float current

	pushq %rbp
	movq %rsp, %rbp

	#; move current max to %xmm0 for convenience
	movss %xmm2, %xmm0

	#; if nframes == 0, go to end
	cmp $0, %rdx
	je .CP_END

	#; Check for alignment
	movq %rcx, %r8 #; buf => %r8
	andq $28, %r8  #; mask alignment offset
	jz .CP_AVX     #; if buffer IS aligned

	#; PRE-LOOP
	#; we iterate 1-7 times, doing normal scalar float comparison
	#; so we reach a 32 byte aligned "buf" (=%rcx) value

.LP_START:

	#; Load next value from the buffer
	movss (%rcx), %xmm1
	maxss %xmm1, %xmm0

	#; increment buffer, decrement counter
	addq $4, %rcx #; buf++;

	decq %rdx #; nframes--
	jz .CP_END #; if we run out of frames, we go to the end

	addq $4, %r8 #; one non-aligned frame less
	cmp $32, %r8
	jne .LP_START #; if more non-aligned frames exist, we do a do-over

.CP_AVX:

	#; We have reached the 32 byte aligned "buf" (%rcx) value

	#; Figure out how many loops we should do
	movq %rdx, %rax #; copy remaining nframes to %rax for division

	shr $3, %rax #; unsigned divide by 8
	jz .POST_START

	#; %rax = AVX iterations

	#; the current maximum is in %xmm0, but we need to broadcast it to the whole ymm0 register..
	vshufps $0x00, %ymm0, %ymm0, %ymm0 #; spread the single float value across the lower 128 bits of ymm0
	vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the lower 128 bits of ymm0 into its upper 128 bits

.LP_AVX:

	vmovaps (%rcx), %ymm1
	vmaxps %ymm1, %ymm0, %ymm0

	addq $32, %rcx #; buf+=8
	subq $8, %rdx  #; nframes-=8

	decq %rax
	jnz .LP_AVX

	#; Calculate the maximum value contained in the 8 FP's in %ymm0
	vshufps $0x4e, %ymm0, %ymm0, %ymm1    #; shuffle left & right pairs (1234 => 3412) in each 128 bit half
	vmaxps %ymm1, %ymm0, %ymm0            #; pairwise maximums; at most 4 distinct values remain
	vshufps $0xb1, %ymm0, %ymm0, %ymm1    #; shuffle the floats inside the pairs (1234 => 2143) in each 128 bit half
	vmaxps %ymm1, %ymm0, %ymm0            #; pairwise maximums; at most 2 distinct values remain
	vperm2f128 $0x01, %ymm0, %ymm0, %ymm1 #; swap the 128 bit halves
	vmaxps %ymm1, %ymm0, %ymm0            #; now all 8 elements hold the maximum

	#; every float in %ymm0 is now the current maximum value

	#; Next we need to post-process all remaining frames
	#; the remaining frame count is in %rdx

	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
	vzeroupper

	#; if no remaining frames, jump to the end
	cmp $0, %rdx
	je .CP_END

.POST_START:

	movss (%rcx), %xmm1
	maxss %xmm1, %xmm0

	addq $4, %rcx #; buf++;

	decq %rdx #; nframes--;
	jnz .POST_START

.CP_END:

	#; return value is in xmm0

	#; return
	leave
	ret

#; end proc
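The vshufps/vperm2f128 pair used above is a broadcast, and the 8-wide loops map one-to-one onto AVX intrinsics. A hedged C++ sketch of the apply-gain core (compile with -mavx; assumes the pointer is 32-byte aligned when the vector loop starts, just as the assembly's pre-loop guarantees):

#include <immintrin.h>
#include <cstddef>

// Sketch of x86_sse_avx_apply_gain_to_buffer's core loop using intrinsics.
void apply_gain_avx_sketch (float* buf, size_t nframes, float gain)
{
	const __m256 vgain = _mm256_set1_ps (gain); // same effect as the vshufps+vperm2f128 broadcast

	size_t i = 0;
	// assumption: buf is 32-byte aligned here; the asm version runs a
	// scalar pre-loop until that holds
	for (; i + 8 <= nframes; i += 8) {
		__m256 v = _mm256_load_ps (buf + i);
		_mm256_store_ps (buf + i, _mm256_mul_ps (v, vgain));
	}
	for (; i < nframes; ++i) { // scalar tail, like the .AGPOST loop
		buf[i] *= gain;
	}
	_mm256_zeroupper (); // avoid SSE/AVX transition penalties afterwards
}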
libs/ardour/sse_functions_64bit_win.s (new file, 679 lines)

@@ -0,0 +1,679 @@
/*
    Copyright (C) 2005-2006 Paul Davis, John Rigg

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

    Author: Sampo Savolainen
    64-bit conversion: John Rigg

    $Id$
*/

#; Microsoft version of SSE sample processing functions
#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);

.globl x86_sse_mix_buffers_with_gain
	.def	x86_sse_mix_buffers_with_gain; .scl 2; .type 32;
.endef

x86_sse_mix_buffers_with_gain:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes
#; %xmm3 float gain

#; due to System V AMD64 (Linux) calling convention
#; %rdi float *dst
#; %rsi float *src
#; %rdx unsigned int nframes
#; %xmm0 float gain

	pushq %rbp
	movq %rsp, %rbp

	#; save the registers
	pushq %rbx #; must be preserved
	pushq %rcx
	pushq %rdx
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved

	#; to keep the algorithms universal - move the input params into the Linux-specific registers
	movq %rcx, %rdi
	movq %rdx, %rsi
	movq %r8, %rdx
	movss %xmm3, %xmm0

	#; if nframes == 0, go to end
	cmp $0, %rdx
	je .MBWG_END

	#; Check for alignment

	movq %rdi, %rax
	andq $12, %rax #; mask alignment offset

	movq %rsi, %rbx
	andq $12, %rbx #; mask alignment offset

	cmp %rax, %rbx
	jne .MBWG_NONALIGN #; if not aligned, calculate manually

	#; if we are aligned
	cmp $0, %rbx
	jz .MBWG_SSE

	#; Pre-loop, we need to run 1-3 frames "manually" without
	#; SSE instructions

.MBWG_PRELOOP:

	#; gain is already in %xmm0
	movss (%rsi), %xmm1
	mulss %xmm0, %xmm1
	addss (%rdi), %xmm1
	movss %xmm1, (%rdi)

	addq $4, %rdi #; dst++
	addq $4, %rsi #; src++
	decq %rdx     #; nframes--
	jz .MBWG_END

	addq $4, %rbx

	cmp $16, %rbx #; test if we've reached 16 byte alignment
	jne .MBWG_PRELOOP

.MBWG_SSE:

	cmp $4, %rdx #; we know it's not zero, but if it's not >= 4, then
	jnge .MBWG_NONALIGN #; we jump straight to the "normal" code

	#; gain is already in %xmm0
	shufps $0x00, %xmm0, %xmm0

.MBWG_SSELOOP:

	movaps (%rsi), %xmm1 #; source => xmm1
	mulps %xmm0, %xmm1   #; apply gain to source
	addps (%rdi), %xmm1  #; mix with destination
	movaps %xmm1, (%rdi) #; copy result to destination

	addq $16, %rdi #; dst+=4
	addq $16, %rsi #; src+=4

	subq $4, %rdx #; nframes-=4
	cmp $4, %rdx
	jge .MBWG_SSELOOP

	cmp $0, %rdx
	je .MBWG_END

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the last 1-3 frames.

.MBWG_NONALIGN:
	#; not aligned!

	#; gain is already in %xmm0

.MBWG_NONALIGNLOOP:

	movss (%rsi), %xmm1
	mulss %xmm0, %xmm1
	addss (%rdi), %xmm1
	movss %xmm1, (%rdi)

	addq $4, %rdi
	addq $4, %rsi

	decq %rdx
	jnz .MBWG_NONALIGNLOOP

.MBWG_END:

	popq %rsi
	popq %rdi
	popq %rdx
	popq %rcx
	popq %rbx

	#; return
	leave
	ret
#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);

.globl x86_sse_mix_buffers_no_gain
	.def	x86_sse_mix_buffers_no_gain; .scl 2; .type 32;
.endef

x86_sse_mix_buffers_no_gain:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes

#; due to System V AMD64 (Linux) calling convention
#; %rdi float *dst
#; %rsi float *src
#; %rdx unsigned int nframes

	pushq %rbp
	movq %rsp, %rbp

	#; save the registers
	pushq %rbx #; must be preserved
	pushq %rcx
	pushq %rdx
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved

	#; to keep the algorithms universal - move the input params into the Linux-specific registers
	movq %rcx, %rdi
	movq %rdx, %rsi
	movq %r8, %rdx

	#; the real function

	#; if nframes == 0, go to end
	cmp $0, %r8
	je .MBNG_END

	#; Check for alignment

	movq %rdi, %rax
	andq $12, %rax #; mask alignment offset

	movq %rsi, %rbx
	andq $12, %rbx #; mask alignment offset

	cmp %rax, %rbx
	jne .MBNG_NONALIGN #; if not aligned, calculate manually

	cmp $0, %rbx
	je .MBNG_SSE

	#; Pre-loop, we need to run 1-3 frames "manually" without
	#; SSE instructions

.MBNG_PRELOOP:

	movss (%rsi), %xmm0
	addss (%rdi), %xmm0
	movss %xmm0, (%rdi)

	addq $4, %rdi #; dst++
	addq $4, %rsi #; src++
	decq %rdx     #; nframes--
	jz .MBNG_END
	addq $4, %rbx

	cmp $16, %rbx #; test if we've reached 16 byte alignment
	jne .MBNG_PRELOOP

.MBNG_SSE:

	cmp $4, %rdx #; if there are frames left, but less than 4
	jnge .MBNG_NONALIGN #; we can't run SSE

.MBNG_SSELOOP:

	movaps (%rsi), %xmm0 #; source => xmm0
	addps (%rdi), %xmm0  #; mix with destination
	movaps %xmm0, (%rdi) #; copy result to destination

	addq $16, %rdi #; dst+=4
	addq $16, %rsi #; src+=4

	subq $4, %rdx #; nframes-=4
	cmp $4, %rdx
	jge .MBNG_SSELOOP

	cmp $0, %rdx
	je .MBNG_END

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the last 1-3 frames.

.MBNG_NONALIGN:
	#; not aligned!

	movss (%rsi), %xmm0 #; src => xmm0
	addss (%rdi), %xmm0 #; xmm0 += dst
	movss %xmm0, (%rdi) #; xmm0 => dst

	addq $4, %rdi
	addq $4, %rsi

	decq %rdx
	jnz .MBNG_NONALIGN

.MBNG_END:

	popq %rsi
	popq %rdi
	popq %rdx
	popq %rcx
	popq %rbx

	#; return
	leave
	ret
#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);

.globl x86_sse_apply_gain_to_buffer
	.def	x86_sse_apply_gain_to_buffer; .scl 2; .type 32;
.endef

x86_sse_apply_gain_to_buffer:

#; due to Microsoft calling convention
#; %rcx float *buf 32(%rbp)
#; %rdx unsigned int nframes
#; %xmm2 float gain
#; %xmm1 float buf[0]

#; due to System V AMD64 (Linux) calling convention
#; %rdi float *buf 32(%rbp)
#; %rsi unsigned int nframes
#; %xmm0 float gain
#; %xmm1 float buf[0]

	pushq %rbp
	movq %rsp, %rbp

	#; save the registers
	pushq %rcx
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved

	#; to keep the algorithms universal - move the input params into the Linux-specific registers
	movq %rcx, %rdi
	movq %rdx, %rsi
	movss %xmm2, %xmm0

	#; the real function

	#; if nframes == 0, go to end
	movq %rsi, %rcx #; nframes
	cmp $0, %rcx
	je .AG_END

	#; set up the gain buffer (gain is already in %xmm0)
	shufps $0x00, %xmm0, %xmm0

	#; Check for alignment

	movq %rdi, %rdx #; buf => %rdx
	andq $12, %rdx  #; mask bits 1 & 2, result = 0, 4, 8 or 12
	jz .AG_SSE      #; if buffer IS aligned

	#; PRE-LOOP
	#; we iterate 1-3 times, doing normal scalar float processing
	#; so we reach a 16 byte aligned "buf" (=%rdi) value

.AGLP_START:

	#; Load next value from the buffer into %xmm1
	movss (%rdi), %xmm1
	mulss %xmm0, %xmm1
	movss %xmm1, (%rdi)

	#; increment buffer, decrement counter
	addq $4, %rdi #; buf++;

	decq %rcx #; nframes--
	jz .AG_END #; if we run out of frames, we go to the end

	addq $4, %rdx #; one non-aligned frame less
	cmp $16, %rdx
	jne .AGLP_START #; if more non-aligned frames exist, we do a do-over

.AG_SSE:

	#; We have reached the 16 byte aligned "buf" (%rdi) value

	#; Figure out how many loops we should do
	movq %rcx, %rax #; copy remaining nframes to %rax for division

	shr $2, %rax #; unsigned divide by 4

	#; %rax = SSE iterations
	cmp $0, %rax
	je .AGPOST_START

.AGLP_SSE:

	movaps (%rdi), %xmm1
	mulps %xmm0, %xmm1
	movaps %xmm1, (%rdi)

	addq $16, %rdi #; buf += 4
	subq $4, %rcx  #; nframes -= 4

	decq %rax
	jnz .AGLP_SSE

	#; Next we need to post-process all remaining frames
	#; the remaining frame count is in %rcx

	andq $3, %rcx #; nframes % 4
	jz .AG_END

.AGPOST_START:

	movss (%rdi), %xmm1
	mulss %xmm0, %xmm1
	movss %xmm1, (%rdi)

	#; increment buffer, decrement counter
	addq $4, %rdi #; buf++;

	decq %rcx #; nframes--
	jnz .AGPOST_START #; loop until no frames remain

.AG_END:

	popq %rsi
	popq %rdi
	popq %rcx

	#; return
	leave
	ret

#; end proc
#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)

.globl x86_sse_apply_gain_vector
	.def	x86_sse_apply_gain_vector; .scl 2; .type 32;
.endef

x86_sse_apply_gain_vector:

#; due to Microsoft calling convention
#; %rcx float *buf
#; %rdx float *gain_vector
#; %r8 unsigned int nframes

#; due to System V AMD64 (Linux) calling convention
#; %rdi float *buf
#; %rsi float *gain_vector
#; %rdx unsigned int nframes

	pushq %rbp
	movq %rsp, %rbp

	#; save the registers
	pushq %rbx #; must be preserved
	pushq %rcx
	pushq %rdx
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved

	#; to keep the algorithms universal - move the input params into the Linux-specific registers
	movq %rcx, %rdi
	movq %rdx, %rsi
	movq %r8, %rdx

	#; if nframes == 0 go to end
	cmp $0, %rdx
	je .AGA_END

	#; Check alignment
	movq %rdi, %rax
	andq $12, %rax

	movq %rsi, %rbx
	andq $12, %rbx

	cmp %rax, %rbx
	jne .AGA_ENDLOOP

	cmp $0, %rax
	jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop

	#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
.AGA_ALIGNLOOP:

	movss (%rdi), %xmm0 #; buf => xmm0
	movss (%rsi), %xmm1 #; gain value => xmm1
	mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
	movss %xmm0, (%rdi) #; signal with gain => buf

	decq %rdx
	jz .AGA_END

	addq $4, %rdi #; buf++
	addq $4, %rsi #; gain_vector++

	addq $4, %rax
	cmp $16, %rax
	jne .AGA_ALIGNLOOP

	#; There are frames left for sure, as that is checked in the beginning
	#; and within the previous loop. BUT, there might be fewer than 4 frames
	#; to process

.AGA_SSE:
	movq %rdx, %rax #; nframes => %rax
	shr $2, %rax    #; unsigned divide by 4

	cmp $0, %rax
	je .AGA_ENDLOOP

.AGA_SSELOOP:
	movaps (%rdi), %xmm0
	movaps (%rsi), %xmm1
	mulps %xmm1, %xmm0
	movaps %xmm0, (%rdi)

	addq $16, %rdi
	addq $16, %rsi

	decq %rax
	jnz .AGA_SSELOOP

	andq $3, %rdx #; remaining frames are nframes & 3
	jz .AGA_END

	#; Inside this loop, we know there are frames left to process
	#; but because either there are < 4 frames left, or the buffers
	#; are not aligned, we can't use the parallel SSE ops
.AGA_ENDLOOP:
	movss (%rdi), %xmm0 #; buf => xmm0
	movss (%rsi), %xmm1 #; gain value => xmm1
	mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
	movss %xmm0, (%rdi) #; signal with gain => buf

	addq $4, %rdi
	addq $4, %rsi
	decq %rdx #; nframes--
	jnz .AGA_ENDLOOP

.AGA_END:

	popq %rsi
	popq %rdi
	popq %rdx
	popq %rcx
	popq %rbx

	leave
	ret

#; end proc
#; float x86_sse_compute_peak(float *buf, long nframes, float current);

.globl x86_sse_compute_peak
	.def	x86_sse_compute_peak; .scl 2; .type 32;
.endef

x86_sse_compute_peak:

#; due to Microsoft calling convention
#; %rcx float* buf 32(%rbp)
#; %rdx unsigned int nframes
#; %xmm2 float current
#; %xmm1 float buf[0]

#; due to System V AMD64 (Linux) calling convention
#; %rdi float* buf 32(%rbp)
#; %rsi unsigned int nframes
#; %xmm0 float current
#; %xmm1 float buf[0]

	pushq %rbp
	movq %rsp, %rbp

	#; save registers
	pushq %rcx
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved

	#; to keep the algorithms universal - move the input params into the Linux-specific registers
	movq %rcx, %rdi
	movq %rdx, %rsi
	movss %xmm2, %xmm0

	#; if nframes == 0, go to end
	movq %rsi, %rcx #; nframes
	cmp $0, %rcx
	je .CP_END

	#; create the "abs" mask in %xmm2
	pushq $2147483647
	movss (%rsp), %xmm2
	addq $8, %rsp
	shufps $0x00, %xmm2, %xmm2

	#; Check for alignment

	#;movq 8(%rbp), %rdi #; buf
	movq %rdi, %rdx #; buf => %rdx
	andq $12, %rdx  #; mask bits 1 & 2, result = 0, 4, 8 or 12
	jz .CP_SSE      #; if buffer IS aligned

	#; PRE-LOOP
	#; we iterate 1-3 times, doing normal scalar float comparison
	#; so we reach a 16 byte aligned "buf" (=%rdi) value

.LP_START:

	#; Load next value from the buffer
	movss (%rdi), %xmm1
	andps %xmm2, %xmm1
	maxss %xmm1, %xmm0

	#; increment buffer, decrement counter
	addq $4, %rdi #; buf++;

	decq %rcx #; nframes--
	jz .CP_END #; if we run out of frames, we go to the end

	addq $4, %rdx #; one non-aligned frame less
	cmp $16, %rdx
	jne .LP_START #; if more non-aligned frames exist, we do a do-over

.CP_SSE:

	#; We have reached the 16 byte aligned "buf" (%rdi) value

	#; Figure out how many loops we should do
	movq %rcx, %rax #; copy remaining nframes to %rax for division

	shr $2, %rax #; unsigned divide by 4
	jz .POST_START

	#; %rax = SSE iterations

	#; current maximum is at %xmm0, but we need to ..
	shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's

	#;prefetcht0 16(%rdi)

.LP_SSE:

	movaps (%rdi), %xmm1
	andps %xmm2, %xmm1
	maxps %xmm1, %xmm0

	addq $16, %rdi

	subq $4, %rcx #; nframes-=4

	decq %rax
	jnz .LP_SSE

	#; Calculate the maximum value contained in the 4 FP's in %xmm0
	movaps %xmm0, %xmm1
	shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
	maxps %xmm1, %xmm0         #; maximums of the two pairs
	movaps %xmm0, %xmm1
	shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
	maxps %xmm1, %xmm0

	#; now every float in %xmm0 is the same value, the current maximum value

	#; Next we need to post-process all remaining frames
	#; the remaining frame count is in %rcx

	#; if no remaining frames, jump to the end

	andq $3, %rcx #; nframes % 4
	jz .CP_END

.POST_START:

	movss (%rdi), %xmm1
	andps %xmm2, %xmm1
	maxss %xmm1, %xmm0

	addq $4, %rdi #; buf++;

	decq %rcx #; nframes--;
	jnz .POST_START

.CP_END:

	#; restore registers
	popq %rsi
	popq %rdi
	popq %rcx

	#; return value is in xmm0

	#; return
	leave
	ret

#; end proc
libs/ardour/sse_functions_avx.cc (new file, 120 lines)

@@ -0,0 +1,120 @@
/*
    Copyright (C) 2007 Paul Davis
    Written by Sampo Savolainen

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

*/

#include <xmmintrin.h>
#include <immintrin.h>
#include "ardour/types.h"


void
x86_sse_avx_find_peaks(const ARDOUR::Sample* buf, ARDOUR::pframes_t nframes, float *min, float *max)
{
	__m256 current_max, current_min, work;

	// Load the min and max values into all eight slots of the YMM registers
	current_min = _mm256_set1_ps(*min);
	current_max = _mm256_set1_ps(*max);

	// Work input until "buf" reaches 32 byte alignment
	while ( ((intptr_t)buf) % 32 != 0 && nframes > 0) {

		// Load the next float into the work buffer
		work = _mm256_set1_ps(*buf);

		current_min = _mm256_min_ps(current_min, work);
		current_max = _mm256_max_ps(current_max, work);

		buf++;
		nframes--;
	}

	// use 64 byte prefetch for quadruple quads:
	// load each 64 bytes into the cache before processing
	while (nframes >= 16) {
#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW)
		_mm_prefetch(((char*)buf+64), _mm_hint(0)); // A total guess! Assumed to be equivalent to
#else                                               // the line below, but waiting to be tested !!
		__builtin_prefetch(buf+64,0,0);
#endif
		work = _mm256_load_ps(buf);
		current_min = _mm256_min_ps(current_min, work);
		current_max = _mm256_max_ps(current_max, work);
		buf+=8;
		work = _mm256_load_ps(buf);
		current_min = _mm256_min_ps(current_min, work);
		current_max = _mm256_max_ps(current_max, work);
		buf+=8;

		nframes-=16;
	}

	// work through the remaining 32-byte-aligned buffers
	while (nframes >= 8) {

		work = _mm256_load_ps(buf);

		current_min = _mm256_min_ps(current_min, work);
		current_max = _mm256_max_ps(current_max, work);

		buf+=8;
		nframes-=8;
	}

	// work through the rest (< 8 samples)
	while ( nframes > 0) {

		// Load the next float into the work buffer
		work = _mm256_set1_ps(*buf);

		current_min = _mm256_min_ps(current_min, work);
		current_max = _mm256_max_ps(current_max, work);

		buf++;
		nframes--;
	}

	// Find the min & max values in current_min/current_max through shuffle tricks

	work = _mm256_shuffle_ps (current_min, current_min, _MM_SHUFFLE(2, 3, 0, 1));
	current_min = _mm256_min_ps (work, current_min);
	work = _mm256_shuffle_ps (current_min, current_min, _MM_SHUFFLE(1, 0, 3, 2));
	current_min = _mm256_min_ps (work, current_min);
	work = _mm256_permute2f128_ps( current_min, current_min, 1);
	current_min = _mm256_min_ps (work, current_min);

	*min = current_min[0];

	work = _mm256_shuffle_ps(current_max, current_max, _MM_SHUFFLE(2, 3, 0, 1));
	current_max = _mm256_max_ps (work, current_max);
	work = _mm256_shuffle_ps(current_max, current_max, _MM_SHUFFLE(1, 0, 3, 2));
	current_max = _mm256_max_ps (work, current_max);
	work = _mm256_permute2f128_ps( current_max, current_max, 1);
	current_max = _mm256_max_ps (work, current_max);

	*max = current_max[0];

	// zero the upper 128 bits of the 256 bit ymm registers to avoid penalties when using non-AVX instructions
	_mm256_zeroupper ();
}
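Since the AVX path only kicks in after the scalar pre-loop and falls back to scalar for the tail, a scalar reference is the natural way to sanity-check it. A hypothetical test harness; find_peaks_ref mirrors the in/out min-max contract of x86_sse_avx_find_peaks:

#include <algorithm>
#include <cstdio>

// scalar reference with the same contract: *min and *max are both inputs and outputs
static void find_peaks_ref (const float* buf, unsigned n, float* min, float* max)
{
	for (unsigned i = 0; i < n; ++i) {
		*min = std::min (*min, buf[i]);
		*max = std::max (*max, buf[i]);
	}
}

int main ()
{
	// 32-byte aligned buffer, as the AVX inner loops expect once the pre-loop is done
	alignas (32) float buf[37];
	for (unsigned i = 0; i < 37; ++i) buf[i] = (i % 7) - 3.0f;

	float mn = 0.0f, mx = 0.0f;
	find_peaks_ref (buf, 37, &mn, &mx);
	std::printf ("min=%f max=%f\n", mn, mx);
	// x86_sse_avx_find_peaks (buf, 37, &mn2, &mx2) should agree with these values
}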
@@ -14,6 +14,8 @@
     <None Include="..\run-profiling.sh" />
     <None Include="..\run-session-tests.sh" />
     <None Include="..\run-tests.sh" />
+    <None Include="..\sse_avx_functions_64bit_win.s" />
+    <None Include="..\sse_functions_64bit_win.s" />
     <None Include="..\test-env.sh" />
     <None Include="..\wscript" />
   </ItemGroup>
@@ -33,6 +33,8 @@
     <None Include="..\wscript">
       <Filter>scripts</Filter>
     </None>
+    <None Include="..\sse_functions_64bit_win.s" />
+    <None Include="..\sse_avx_functions_64bit_win.s" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\ardour\vestige\aeffectx.h">
@@ -408,6 +408,14 @@ def build(bld):
             obj.source += [ 'sse_functions_xmm.cc', 'sse_functions.s' ]
         elif bld.env['build_target'] == 'x86_64':
             obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_64bit.s' ]
+
+        if bld.env['build_target'] == 'mingw':
+            import platform as PLATFORM
+            u = PLATFORM.uname ()
+            cpu = u[4]
+            if re.search ("(x86_64|AMD64)", cpu) != None:
+                obj.source += [ 'sse_functions_xmm.cc', 'sse_functions_avx.cc' ]
+                obj.source += [ 'sse_functions_64bit_win.s', 'sse_avx_functions_64bit_win.s' ]
 
     # i18n
     if bld.is_defined('ENABLE_NLS'):
@@ -21,6 +21,8 @@
 #include "waves_audioport.h"
 #include "waves_midiport.h"
 
+#include "ardour/runtime_functions.h"
+
 using namespace ARDOUR;
 
 #ifdef __MINGW64__
@@ -1169,13 +1171,12 @@ WavesAudioBackend::_read_audio_data_from_device (const float* input_buffer, pfra
 {
 #if defined(PLATFORM_WINDOWS)
 	const float **buffer = (const float**)input_buffer;
-	size_t copied_bytes = nframes*sizeof(float);
 
 	for(std::vector<WavesAudioPort*>::iterator it = _physical_audio_inputs.begin ();
 		it != _physical_audio_inputs.end();
 		++it)
 	{
-		memcpy((*it)->buffer(), *buffer, copied_bytes);
+		ARDOUR::copy_vector ((*it)->buffer(), *buffer, nframes);
 		++buffer;
 	}
 #else
@@ -18,13 +18,21 @@
 */
 
 #include "waves_audioport.h"
+#include "ardour/runtime_functions.h"
+#include "pbd/malign.h"
+
 using namespace ARDOUR;
 
 WavesAudioPort::WavesAudioPort (const std::string& port_name, PortFlags flags)
 	: WavesDataPort (port_name, flags)
 {
-	memset (_buffer, 0, sizeof (_buffer));
+	aligned_malloc ((void**)&_buffer, MAX_BUFFER_SIZE_BYTES, 32 /* 32 byte alignment */);
+	memset (_buffer, 0, MAX_BUFFER_SIZE_BYTES);
 }
 
+WavesAudioPort::~WavesAudioPort ()
+{
+	aligned_free (_buffer);
+}
+
@@ -40,14 +48,19 @@ void* WavesAudioPort::get_buffer (pframes_t nframes)
 	 * Base class WavesDataPort is supposed to provide enough consistency
 	 * of the connections.
 	 */
-	for (memcpy (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes * sizeof (Sample)), ++it;
-	     it != get_connections ().end ();
-	     ++it) {
+
+	// get the first buffer's data:
+	// use the optimized function to fill the buffer initially
+	ARDOUR::copy_vector (_buffer, ((const WavesAudioPort*)*it)->const_buffer (), nframes);
+	++it;
+
+	// mix the rest
+	for (; it != get_connections ().end (); ++it) {
 		Sample* tgt = buffer ();
 		const Sample* src = ((const WavesAudioPort*)*it)->const_buffer ();
-		for (uint32_t frame = 0; frame < nframes; ++frame, ++tgt, ++src) {
-			*tgt += *src;
-		}
+
+		// use the optimized function to mix the buffers
+		ARDOUR::mix_buffers_no_gain (tgt, src, nframes);
 	}
 }
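The rewritten loop splits the mixdown into one copy plus N-1 accumulations so both halves can go through the runtime-dispatched routines. A standalone sketch of that copy-then-mix shape, with plain scalar stand-ins for copy_vector and mix_buffers_no_gain:

#include <cstddef>
#include <vector>

typedef float Sample;

static void copy_vector_ref (Sample* dst, const Sample* src, size_t n)
{ for (size_t i = 0; i < n; ++i) dst[i] = src[i]; }

static void mix_no_gain_ref (Sample* dst, const Sample* src, size_t n)
{ for (size_t i = 0; i < n; ++i) dst[i] += src[i]; }

// mix every connected source into dst: the first source is copied, the rest added
void mix_connections (Sample* dst, const std::vector<const Sample*>& srcs, size_t n)
{
	if (srcs.empty ()) return;
	copy_vector_ref (dst, srcs[0], n);     // fill, don't accumulate into stale data
	for (size_t s = 1; s < srcs.size (); ++s)
		mix_no_gain_ref (dst, srcs[s], n); // accumulate the remaining sources
}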
@@ -35,7 +35,7 @@ public:
 
 	WavesAudioPort (const std::string& port_name, PortFlags flags);
 
-	virtual ~WavesAudioPort () { };
+	virtual ~WavesAudioPort ();
 
 	virtual DataType type () const { return DataType::AUDIO; };
 
@@ -49,7 +49,7 @@ protected:
 
 private:
 
-	Sample _buffer[MAX_BUFFER_SIZE_SAMPLES];
+	Sample *_buffer;
 };
 
 } // namespace
@@ -16,7 +16,7 @@
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 
 */
-#ifndef COMPILER_MSVC
+#if !(defined (COMPILER_MSVC) || defined (COMPILER_MINGW))
 #include "libpbd-config.h"
 
 #define _XOPEN_SOURCE 600
@@ -39,10 +39,6 @@ FPU::FPU ()
 
 	_flags = Flags (0);
 
-#if defined(__MINGW64__) // Vkamyshniy: under __MINGW64__ the assembler code below is not compiled
-	return;
-#endif
-
 #if !( (defined __x86_64__) || (defined __i386__) ) // !ARCH_X86
 	return;
 #else
@@ -35,22 +35,52 @@ static const int CPU_CACHE_ALIGN = 64;
 static const int CPU_CACHE_ALIGN = 16; /* arguably 32 on most arches, but it matters less */
 #endif
 
-int cache_aligned_malloc (void** memptr, size_t size)
+int aligned_malloc (void** memptr, size_t size, size_t alignment)
 {
 #ifndef HAVE_POSIX_MEMALIGN
-	if (((*memptr) = malloc (size)) == 0) {
+#ifdef PLATFORM_WINDOWS
+	if (((*memptr) = _aligned_malloc (size, alignment)) == 0) {
 		fatal << string_compose (_("Memory allocation error: malloc (%1 * %2) failed (%3)"),
-		                         CPU_CACHE_ALIGN, size, strerror (errno)) << endmsg;
+		                         alignment, size, strerror (errno)) << endmsg;
 		return errno;
 	} else {
 		return 0;
 	}
 #else
-	if (posix_memalign (memptr, CPU_CACHE_ALIGN, size)) {
+	if (((*memptr) = malloc (size)) == 0) {
+		fatal << string_compose (_("Memory allocation error: malloc (%1 * %2) failed (%3)"),
+		                         alignment, size, strerror (errno)) << endmsg;
+		return errno;
+	} else {
+		return 0;
+	}
+#endif
+#else
+	if (posix_memalign (memptr, alignment, size)) {
 		fatal << string_compose (_("Memory allocation error: posix_memalign (%1 * %2) failed (%3)"),
-		                         CPU_CACHE_ALIGN, size, strerror (errno)) << endmsg;
+		                         alignment, size, strerror (errno)) << endmsg;
 	}
 
 	return 0;
 #endif
 }
 
+void aligned_free (void* memptr)
+{
+#ifdef PLATFORM_WINDOWS
+	_aligned_free (memptr);
+#else
+	free (memptr);
+#endif
+}
+
+int cache_aligned_malloc (void** memptr, size_t size)
+{
+	return aligned_malloc (memptr, size, CPU_CACHE_ALIGN);
+}
+
+void cache_aligned_free (void* memptr)
+{
+	aligned_free (memptr);
+}
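With the alignment now a parameter, callers such as WavesAudioPort can ask for the 32-byte alignment AVX loads require, while cache_aligned_malloc keeps its old contract. A small usage sketch of the underlying primitives the #ifdef split above wraps (_aligned_malloc on Windows, posix_memalign elsewhere):

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#ifdef _WIN32
#include <malloc.h>
#endif

int main ()
{
	void* p = 0;

	// request 32-byte alignment, enough for _mm256_load_ps
#ifdef _WIN32
	p = _aligned_malloc (4096, 32);
#else
	if (posix_memalign (&p, 32, 4096) != 0) p = 0;
#endif
	if (!p) return 1;

	std::printf ("aligned to 32: %d\n", (int)(((uintptr_t)p % 32) == 0));

#ifdef _WIN32
	_aligned_free (p); // _aligned_malloc memory must not be passed to free()
#else
	free (p);
#endif
	return 0;
}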
@@ -1,10 +1,14 @@
-#ifdef COMPILER_MSVC // Added by JE - 05-12-2009. Inline assembler instructions
-                     // have been changed to Intel format and (in the case of
-                     // cpuid) was replaced by the equivalent VC++ system call).
+// Added by JE - 05-12-2009. Inline assembler instructions
+// have been changed to Intel format and (in the case of
+// cpuid) were replaced by the equivalent VC++ system call.
+
+#if defined (COMPILER_MSVC) || defined (COMPILER_MINGW)
 
 #define _XOPEN_SOURCE 600
 #include <cstdlib>
 #include <stdint.h>
 #include <intrin.h> // Added by JE - 05-12-2009
+#include <assert.h>
 
 #include <pbd/fpu.h>
 #include <pbd/error.h>
@ -16,84 +20,63 @@ using namespace std;

FPU::FPU ()
{
-	unsigned long cpuflags = 0;
+	unsigned long cpuflags_ECX = 0;
+	unsigned long cpuflags_EDX = 0;

	_flags = (Flags)0;

#ifndef ARCH_X86
	return;

#else

-#ifndef USE_X86_64_ASM
-	int cpuInfo[4];
-	return;
-#endif

+	// Get CPU flags using the Microsoft function.
+	// It works for both 64 and 32 bit systems;
+	// no need to use assembler for getting info from the registers, this function does it for us.
+	int cpuInfo[4];
	__cpuid (cpuInfo, 1);
-	cpuflags = cpuInfo[3];
-/*
-	__asm { // This is how the original section would look if converted to Intel syntax.
-	        // However, I have grave doubts about whether it's doing the right thing.
-	        // It seems as if the intention was to retrieve feature information from
-	        // the processor. However, feature information is returned in the ebx register
-	        // (if you believe Wikipedia) or in edx (if you believe Microsoft). Unfortunately,
-	        // both registers get ignored in the original code!! Confused?? Join the club!!
-		mov eax, 1
-		push ebx
-		cpuid
-		mov edx, 0
-		pop ebx
-		mov cpuflags, ecx // This can't be right, surely???
-	}; */
-#else
-	// Note that this syntax is currently still in AT&T format !
-	asm volatile (
-		"pushq %%rbx\n"
-		"movq $1, %%rax\n"
-		"cpuid\n"
-		"movq %%rdx, %0\n"
-		"popq %%rbx\n"
-		: "=r" (cpuflags)
-		:
-		: "%rax", "%rcx", "%rdx", "memory"
-	);
+	cpuflags_ECX = cpuInfo[2]; // flags from ECX register
+	cpuflags_EDX = cpuInfo[3]; // flags from EDX register

-#endif /* USE_X86_64_ASM */

-	if (cpuflags & (1<<25)) {
-		_flags = Flags (_flags | (HasSSE|HasFlushToZero));
+	if (cpuflags_ECX & (1<<28)) {
+		_flags = Flags (_flags | (HasAVX) );
	}

-	if (cpuflags & (1<<26)) {
+	if (cpuflags_EDX & (1<<25)) {
+		_flags = Flags (_flags | (HasSSE|HasFlushToZero) );
+	}
+
+	if (cpuflags_EDX & (1<<26)) {
		_flags = Flags (_flags | HasSSE2);
	}
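For reference, the bits tested above are the CPUID leaf-1 feature flags from the Intel manuals: EDX bit 25 = SSE, EDX bit 26 = SSE2, EDX bit 24 = FXSR, and ECX bit 28 = AVX. A self-contained restatement of the same checks using GCC's <cpuid.h> interface (illustrative only; the file itself stays on the __cpuid intrinsic path shown above). Strictly speaking, usable AVX also requires the OS to save YMM state (OSXSAVE, ECX bit 27, plus an XGETBV check), which this detection does not perform:

    #include <cpuid.h>
    #include <cstdio>

    int main ()
    {
        unsigned int eax, ebx, ecx, edx;
        if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)) {
            std::printf ("sse=%u sse2=%u fxsr=%u avx=%u\n",
                         (edx >> 25) & 1,   // SSE
                         (edx >> 26) & 1,   // SSE2
                         (edx >> 24) & 1,   // FXSAVE/FXRSTOR
                         (ecx >> 28) & 1);  // AVX
        }
        return 0;
    }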
-	if (cpuflags & (1 << 24)) {
-		bool aligned_malloc = false; // Added by JE - 05-12-2009
+	if (cpuflags_EDX & (1 << 24)) {
		char* fxbuf = 0;
-		// This section changed by JE - 05-12-2009
-#ifdef NO_POSIX_MEMALIGN
-#if defined(COMPILER_MSVC) || defined(COMPILER_MINGW) // All of these support '_aligned_malloc()'
-		fxbuf = (char *) _aligned_malloc(512, 16); // (note that they all need at least MSVC runtime 7.0)
-		aligned_malloc = true;
-#else
-		fxbuf = (char *) malloc(512);
-#endif
-#else
-		fxbuf = posix_memalign ((void**)&fxbuf, 16, 512);
-#endif

+		// allocate aligned buffer
+		fxbuf = (char*)_aligned_malloc(512, 16);

		// Verify that fxbuf is correctly aligned
-		unsigned long buf_addr = (unsigned long)(void*)fxbuf;
+		unsigned long long buf_addr = (unsigned long long)(void*)fxbuf;
		if ((0 == buf_addr) || (buf_addr % 16))
			error << _("cannot allocate 16 byte aligned buffer for h/w feature detection") << endmsg;
		else
		{
			memset(fxbuf, 0, 512); // Initialize the buffer !!! Added by JE - 12-12-2009

+#if defined (COMPILER_MINGW)
+			asm volatile (
+				"fxsave (%0)"
+				:
+				: "r" (fxbuf)
+				: "memory"
+				);
+
+#elif defined (COMPILER_MSVC)
			__asm {
				mov eax, fxbuf
				fxsave [eax]
			};
+#endif

			uint32_t mxcsr_mask = *((uint32_t*) &fxbuf[28]);

			/* if the mask is zero, set its default value (from intel specs) */
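The FXSAVE image is 512 bytes and must be 16-byte aligned, and the MXCSR_MASK field sits at byte offset 28 of that image; bit 6 of the mask advertises denormals-are-zero (DAZ) support, which is what the code following this point tests. Once detection succeeds, an audio thread would typically switch those MXCSR modes on. A hedged sketch of that follow-on step (this exact helper is illustrative, not a function from this commit):

    #include <xmmintrin.h>

    void enable_denormal_protection (bool hw_supports_daz)
    {
        unsigned int mxcsr = _mm_getcsr ();
        mxcsr |= 0x8000;        // FTZ (bit 15): flush denormal results to zero
        if (hw_supports_daz) {
            mxcsr |= 0x0040;    // DAZ (bit 6): treat denormal inputs as zero
        }
        _mm_setcsr (mxcsr);
    }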
@ -106,13 +89,9 @@ int cpuInfo[4];

				_flags = Flags (_flags | HasDenormalsAreZero);
			}

-			if (aligned_malloc)
-				_aligned_free (fxbuf);
-			else
-				free (fxbuf);
+			_aligned_free (fxbuf);
		}
	}
#endif // ARCH_X86
}

FPU::~FPU ()
@ -30,7 +30,8 @@ class LIBPBD_API FPU {

		HasFlushToZero = 0x1,
		HasDenormalsAreZero = 0x2,
		HasSSE = 0x4,
-		HasSSE2 = 0x8
+		HasSSE2 = 0x8,
+		HasAVX = 0x10
	};

  public:
@ -41,6 +42,7 @@ class LIBPBD_API FPU {

	bool has_denormals_are_zero () const { return _flags & HasDenormalsAreZero; }
	bool has_sse () const { return _flags & HasSSE; }
	bool has_sse2 () const { return _flags & HasSSE2; }
+	bool has_avx () const { return _flags & HasAVX; }

  private:
	Flags _flags;
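Exposing has_avx() lets startup code pick the widest vector kernels the host actually supports. A minimal sketch of that dispatch pattern (the kernel names, bodies, and signature below are illustrative placeholders, not the exact libardour globals):

    #include "pbd/fpu.h"

    typedef float (*peak_fn) (const float* buf, unsigned nframes, float current);

    // placeholder kernel bodies; the real ones would be vectorized
    static float generic_peak (const float*, unsigned, float c) { return c; }
    static float sse_peak     (const float*, unsigned, float c) { return c; }
    static float avx_peak     (const float*, unsigned, float c) { return c; }

    peak_fn choose_peak_kernel ()
    {
        PBD::FPU fpu; // the constructor probes CPUID, as in the hunks above
        if (fpu.has_avx ()) {
            return avx_peak;     // widest kernel: 8 floats per iteration
        } else if (fpu.has_sse ()) {
            return sse_peak;     // 4 floats per iteration
        }
        return generic_peak;
    }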
@ -24,6 +24,10 @@

#include "pbd/libpbd_visibility.h"

LIBPBD_API int cache_aligned_malloc (void** memptr, size_t size);
LIBPBD_API void cache_aligned_free (void* memptr);

+LIBPBD_API int aligned_malloc (void** memptr, size_t size, size_t alignment);
+LIBPBD_API void aligned_free (void* memptr);

#endif /* __pbd_malign_h__ */
@ -48,7 +48,6 @@ libpbd_sources = [

	'ffs.cc',
	'file_manager.cc',
	'file_utils.cc',
-	'fpu.cc',
	'glib_semaphore.cc',
	'id.cc',
	'locale_guard.cc',
@ -150,8 +149,18 @@ def build(bld):

	if bld.env['build_target'] == 'x86_64':
		obj.defines += [ 'USE_X86_64_ASM' ]
	if bld.env['build_target'] == 'mingw':
+		import re
+		import platform as PLATFORM
+		u = PLATFORM.uname ()
+		cpu = u[4]
+		if re.search ("(x86_64|AMD64)", cpu) != None:
+			obj.defines += [ 'USE_X86_64_ASM' ]
+		obj.defines += ['NO_POSIX_MEMALIGN' ]
		obj.source += [ 'windows_special_dirs.cc' ]
+		obj.source += [ 'msvc/fpu.cc' ]
		obj.uselib += ' OLE'
+	else:
+		obj.source += [ 'fpu.cc' ]

	if bld.env['BUILD_TESTS'] and bld.is_defined('HAVE_CPPUNIT'):
		# Unit tests
wscript
@ -221,7 +221,7 @@ def set_compiler_flags (conf,opt):

	#
	compiler_flags.append ('-U__STRICT_ANSI__')

-	if ((re.search ("i[0-9]86", cpu) != None) or (re.search ("x86_64", cpu) != None)) and conf.env['build_target'] != 'none':
+	if (re.search ("(i[0-9]86|x86_64|AMD64)", cpu) != None) and conf.env['build_target'] != 'none':

	#
@ -229,9 +229,8 @@ def set_compiler_flags (conf,opt):

	# the compile-time presence of the macro _LP64 is used to
	# distinguish 32 and 64 bit assembler
	#

-	if (re.search ("(i[0-9]86|x86_64)", cpu) != None):
-		compiler_flags.append ("-DARCH_X86")
+	compiler_flags.append ("-DARCH_X86")

	if platform == 'linux' :
@ -258,6 +257,16 @@ def set_compiler_flags (conf,opt):

	if not is_clang and ((conf.env['build_target'] == 'i686') or (conf.env['build_target'] == 'x86_64')) and build_host_supports_sse:
		compiler_flags.extend (["-msse", "-mfpmath=sse", "-DUSE_XMMINTRIN"])

+	if (conf.env['build_target'] == 'mingw'):
+		if (re.search ("(x86_64|AMD64)", cpu) != None):
+			# on Windows, SSE is supported by 64 bit platforms only
+			build_host_supports_sse = True
+
+			# the mingw GCC compiler uses the at&t (Unix-specific) assembler dialect by default
+			# compiler_flags.append (["--mmnemonic=att", "msyntax=att")
+
+			compiler_flags.extend (["-mavx", "-mvzeroupper", "-DUSE_XMMINTRIN"])

	# end of processor-specific section
@ -266,7 +275,7 @@ def set_compiler_flags (conf,opt):

	if sys.platform == 'darwin':
		compiler_flags.append("-DBUILD_VECLIB_OPTIMIZATIONS")
		conf.env.append_value('LINKFLAGS_OSX', ['-framework', 'Accelerate'])
-	elif conf.env['build_target'] == 'i686' or conf.env['build_target'] == 'x86_64':
+	elif conf.env['build_target'] == 'i686' or conf.env['build_target'] == 'x86_64' or (conf.env['build_target'] == 'mingw' and build_host_supports_sse):
		compiler_flags.append ("-DBUILD_SSE_OPTIMIZATIONS")
		if not build_host_supports_sse:
			print("\nWarning: you are building Ardour with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)")
@ -695,7 +704,7 @@ def configure(conf):

	autowaf.check_pkg(conf, 'rubberband', uselib_store='RUBBERBAND', mandatory=True)

	if Options.options.dist_target == 'mingw':
-		Options.options.fpu_optimization = False
+		Options.options.fpu_optimization = True
		conf.env.append_value('CFLAGS', '-DPLATFORM_WINDOWS')
		conf.env.append_value('CFLAGS', '-DCOMPILER_MINGW')
		conf.env.append_value('CXXFLAGS', '-DPLATFORM_WINDOWS')