Mirror of https://github.com/Ardour/ardour.git
Removing _mm256_zeroupper()
This is probably not needed in 2021, as compilers will insert vzeroupper automatically. See Stack Overflow reference: https://stackoverflow.com/a/68738289
parent 5fc3ae79ae
commit 25fac546d5
1 changed file with 0 additions and 48 deletions
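To illustrate the rationale in the commit message, here is a small stand-alone sketch (my own example, not part of this commit or the Ardour tree): when a function that uses 256-bit ymm registers is compiled with AVX enabled, current compilers such as GCC and Clang emit a vzeroupper instruction on their own before the function returns or calls potentially non-AVX code, which is what makes an explicit _mm256_zeroupper() redundant.

// Hypothetical demo file, e.g. vzeroupper_demo.c; compiling with
//   gcc -O2 -mavx -S vzeroupper_demo.c
// shows a vzeroupper in the generated assembly before the ret, even though
// the source never calls _mm256_zeroupper().
#include <immintrin.h>

float sum8(const float *p)
{
	__m256 v  = _mm256_loadu_ps(p);          // dirties the upper halves of ymm
	__m128 lo = _mm256_castps256_ps128(v);
	__m128 hi = _mm256_extractf128_ps(v, 1);
	__m128 s  = _mm_add_ps(lo, hi);          // 128-bit ops, VEX-encoded under -mavx
	s = _mm_hadd_ps(s, s);
	s = _mm_hadd_ps(s, s);
	return _mm_cvtss_f32(s);                 // compiler inserts vzeroupper before ret
}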
@@ -157,14 +157,6 @@ x86_sse_avx_compute_peak(const float *src, uint32_t nframes, float current)
 
 	vmax = avx_getmax_ps(vmax);
 
-	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX
-	// instructions.
-
-	// _mm256_zeroupper();
-	// This is probably not needed in 2021 as compilers will insert them
-	// automatically. See stackoverflow reference:
-	// https://stackoverflow.com/questions/68736527/do-i-need-to-use-mm256-zeroupper-in-2021
-
 #if defined(__GNUC__) && (__GNUC__ < 5)
 	return *((float *)&vmax);
 #elif defined(__GNUC__) && (__GNUC__ < 8)
@@ -255,13 +247,6 @@ x86_sse_avx_find_peaks(const float *src, uint32_t nframes, float *minf, float *m
 	vmin = avx_getmin_ps(vmin);
 	vmax = avx_getmax_ps(vmax);
 
-	// There's a penalty going away from AVX mode to SSE mode. This can
-	// be avoided by ensuring to the CPU that rest of the routine is no
-	// longer interested in the upper portion of the YMM register.
-
-	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
-	_mm256_zeroupper();
-
 	_mm_store_ss(minf, _mm256_castps256_ps128(vmin));
 	_mm_store_ss(maxf, _mm256_castps256_ps128(vmax));
 }
@@ -318,13 +303,6 @@ x86_sse_avx_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain)
 		frames -= 8;
 	}
 
-
-	// There's a penalty going away from AVX mode to SSE mode. This can
-	// be avoided by ensuring to the CPU that rest of the routine is no
-	// longer interested in the upper portion of the YMM register.
-
-	_mm256_zeroupper(); // zeros the upper portion of YMM register
-
 	// Process the remaining samples
 	do {
 		__m128 g0 = _mm256_castps256_ps128(vgain);
@@ -486,13 +464,6 @@ x86_sse_avx_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32
 		nframes -= 8;
 	}
 
-
-	// There's a penalty going away from AVX mode to SSE mode. This can
-	// be avoided by ensuring the CPU that rest of the routine is no
-	// longer interested in the upper portion of the YMM register.
-
-	_mm256_zeroupper(); // zeros the upper portion of YMM register
-
 	// Process the remaining samples
 	do {
 		__m128 g0 = _mm_set_ss(gain);
@@ -586,13 +557,6 @@ x86_sse_avx_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t
 		nframes -= 8;
 	}
 
-
-	// There's a penalty going from AVX mode to SSE mode. This can
-	// be avoided by ensuring the CPU that rest of the routine is no
-	// longer interested in the upper portion of the YMM register.
-
-	_mm256_zeroupper(); // zeros the upper portion of YMM register
-
 	// Process the remaining samples, one sample at a time.
 	do {
 		__m128 g0 = _mm256_castps256_ps128(vgain); // use the same register
@@ -676,12 +640,6 @@ x86_sse_avx_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t
 		nframes -= 8;
 	}
 
-	// There's a penalty going away from AVX mode to SSE mode. This can
-	// be avoided by ensuring the CPU that rest of the routine is no
-	// longer interested in the upper portion of the YMM register.
-
-	_mm256_zeroupper(); // zeros the upper portion of YMM register
-
 	// Process the remaining samples
 	do {
 		while (nframes > 0) {
@@ -807,12 +765,6 @@ x86_sse_avx_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t n
 		nframes -= 8;
 	}
 
-	// There's a penalty going from AVX mode to SSE mode. This can
-	// be avoided by ensuring the CPU that rest of the routine is no
-	// longer interested in the upper portion of the YMM register.
-
-	_mm256_zeroupper(); // zeros the upper portion of YMM register
-
 	// Process the remaining samples
 	do {
 		while (nframes > 0) {
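For context on the penalty the deleted comments describe, the functions above all share the same shape: a 256-bit (ymm) main loop followed by a 128-bit (xmm) tail. A minimal sketch of that shape (a hypothetical example assuming AVX is enabled at compile time, not code from the Ardour sources) shows where the removed _mm256_zeroupper() used to sit and why it is no longer needed.

#include <immintrin.h>
#include <stdint.h>

void apply_gain_sketch(float *dst, uint32_t nframes, float gain)
{
	__m256 vgain = _mm256_set1_ps(gain);

	// 256-bit main loop: uses the full ymm registers.
	while (nframes >= 8) {
		_mm256_storeu_ps(dst, _mm256_mul_ps(_mm256_loadu_ps(dst), vgain));
		dst += 8;
		nframes -= 8;
	}

	// The removed _mm256_zeroupper() stood roughly here. With AVX enabled the
	// 128-bit intrinsics below are VEX-encoded, so no AVX-to-SSE transition
	// happens inside the function, and the compiler emits vzeroupper itself
	// before returning or calling non-AVX code (see the Stack Overflow answer
	// cited in the commit message).
	__m128 g0 = _mm256_castps256_ps128(vgain);
	while (nframes > 0) {
		_mm_store_ss(dst, _mm_mul_ss(_mm_load_ss(dst), g0));
		++dst;
		--nframes;
	}
}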