diff --git a/libs/ardour/sse_functions_avx_linux.cc b/libs/ardour/sse_functions_avx_linux.cc
index aefc32246b..a7d42c9d3a 100644
--- a/libs/ardour/sse_functions_avx_linux.cc
+++ b/libs/ardour/sse_functions_avx_linux.cc
@@ -157,14 +157,6 @@ x86_sse_avx_compute_peak(const float *src, uint32_t nframes, float current)
 
 	vmax = avx_getmax_ps(vmax);
 
-	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX
-	// instructions.
-
-	// _mm256_zeroupper();
-	// This is probably not needed in 2021 as compilers will insert them
-	// automatically. See stackoverflow reference:
-	// https://stackoverflow.com/questions/68736527/do-i-need-to-use-mm256-zeroupper-in-2021
-
 #if defined(__GNUC__) && (__GNUC__ < 5)
 	return *((float *)&vmax);
 #elif defined(__GNUC__) && (__GNUC__ < 8)
@@ -255,13 +247,6 @@ x86_sse_avx_find_peaks(const float *src, uint32_t nframes, float *minf, float *m
 	vmin = avx_getmin_ps(vmin);
 	vmax = avx_getmax_ps(vmax);
 
-	// There's a penalty going away from AVX mode to SSE mode. This can
-	// be avoided by ensuring to the CPU that rest of the routine is no
-	// longer interested in the upper portion of the YMM register.
-
-	// zero upper 128 bit of 256 bit ymm register to avoid penalties using non-AVX instructions
-	_mm256_zeroupper();
-
 	_mm_store_ss(minf, _mm256_castps256_ps128(vmin));
 	_mm_store_ss(maxf, _mm256_castps256_ps128(vmax));
 }
@@ -318,13 +303,6 @@ x86_sse_avx_apply_gain_to_buffer(float *dst, uint32_t nframes, float gain)
 
 		frames -= 8;
 	}
-
-	// There's a penalty going away from AVX mode to SSE mode. This can
-	// be avoided by ensuring to the CPU that rest of the routine is no
-	// longer interested in the upper portion of the YMM register.
-
-	_mm256_zeroupper(); // zeros the upper portion of YMM register
-
 	// Process the remaining samples
 	do {
 		__m128 g0 = _mm256_castps256_ps128(vgain);
@@ -486,13 +464,6 @@ x86_sse_avx_mix_buffers_with_gain_unaligned(float *dst, const float *src, uint32
 
 		nframes -= 8;
 	}
-
-	// There's a penalty going away from AVX mode to SSE mode. This can
-	// be avoided by ensuring the CPU that rest of the routine is no
-	// longer interested in the upper portion of the YMM register.
-
-	_mm256_zeroupper(); // zeros the upper portion of YMM register
-
 	// Process the remaining samples
 	do {
 		__m128 g0 = _mm_set_ss(gain);
@@ -586,13 +557,6 @@ x86_sse_avx_mix_buffers_with_gain_aligned(float *dst, const float *src, uint32_t
 
 		nframes -= 8;
 	}
-
-	// There's a penalty going from AVX mode to SSE mode. This can
-	// be avoided by ensuring the CPU that rest of the routine is no
-	// longer interested in the upper portion of the YMM register.
-
-	_mm256_zeroupper(); // zeros the upper portion of YMM register
-
 	// Process the remaining samples, one sample at a time.
 	do {
 		__m128 g0 = _mm256_castps256_ps128(vgain); // use the same register
@@ -676,12 +640,6 @@ x86_sse_avx_mix_buffers_no_gain_unaligned(float *dst, const float *src, uint32_t
 		nframes -= 8;
 	}
 
-	// There's a penalty going away from AVX mode to SSE mode. This can
-	// be avoided by ensuring the CPU that rest of the routine is no
-	// longer interested in the upper portion of the YMM register.
-
-	_mm256_zeroupper(); // zeros the upper portion of YMM register
-
 	// Process the remaining samples
 	do {
 		while (nframes > 0) {
@@ -807,12 +765,6 @@ x86_sse_avx_mix_buffers_no_gain_aligned(float *dst, const float *src, uint32_t n
 		nframes -= 8;
 	}
 
-	// There's a penalty going from AVX mode to SSE mode. This can
-	// be avoided by ensuring the CPU that rest of the routine is no
-	// longer interested in the upper portion of the YMM register.
-
-	_mm256_zeroupper(); // zeros the upper portion of YMM register
-
 	// Process the remaining samples
 	do {
 		while (nframes > 0) {
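For reference, the pattern this patch leaves behind looks roughly like the sketch below: an AVX loop followed by SSE/scalar tail code with no explicit _mm256_zeroupper(). Per the StackOverflow link quoted in the removed comments, a compiler building this translation unit with AVX enabled encodes the 128-bit tail intrinsics as VEX instructions and emits vzeroupper before returning to non-AVX code on its own. avx_sum_with_sse_tail is a hypothetical illustration under those assumptions, not a function from this file.

#include <immintrin.h>
#include <stdint.h>

/* Hypothetical example, not part of the patch: sum nframes floats using an
 * AVX loop for the bulk and SSE/scalar code for the tail, mirroring the
 * structure of the routines touched above. */
static float
avx_sum_with_sse_tail (const float *src, uint32_t nframes)
{
	__m256 vsum = _mm256_setzero_ps ();

	// AVX part: 8 floats per iteration (unaligned loads keep the sketch simple)
	while (nframes >= 8) {
		vsum = _mm256_add_ps (vsum, _mm256_loadu_ps (src));
		src += 8;
		nframes -= 8;
	}

	// No explicit _mm256_zeroupper() here: with AVX code generation enabled,
	// the intrinsics below are VEX-encoded and the compiler adds vzeroupper
	// before leaving AVX code, which is what the removal above relies on.

	// Reduce the 8 partial sums to one scalar via SSE
	__m128 lo = _mm256_castps256_ps128 (vsum);
	__m128 hi = _mm256_extractf128_ps (vsum, 1);
	__m128 s  = _mm_add_ps (lo, hi);
	s = _mm_hadd_ps (s, s);
	s = _mm_hadd_ps (s, s);
	float sum = _mm_cvtss_f32 (s);

	// Scalar tail for the remaining 0-7 samples
	while (nframes > 0) {
		sum += *src++;
		--nframes;
	}

	return sum;
}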