/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#pragma once

#include <assert.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>  /* printf() used by Vvalidate_simd() below */
#include <stdlib.h> /* malloc()/free() used by Valigned_malloc()/Valigned_free() */

/*
 * SIMD reference material:
 *
 * general SIMD introduction:
 * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
 *
 * SSE 1:
 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
 *
 * ARM NEON:
 * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
 *
 * Altivec:
 * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
 * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
 * better one?
 *
 */

/*
  Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && ((defined(__ppc__) || defined(__ppc64__)) && defined(__ALTIVEC__))
#include <altivec.h>
typedef vector float v4sf;
# define SIMD_SZ 4
# define VREQUIRES_ALIGN 1 /* not sure if really required */
# define VZERO() ((vector float) vec_splat_u8(0))
# define VMUL(a,b) vec_madd(a,b, VZERO())
# define VADD(a,b) vec_add(a,b)
# define VMADD(a,b,c) vec_madd(a,b,c)
# define VSUB(a,b) vec_sub(a,b)
inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
# define LD_PS1(p) ld_ps1(&p)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) {                                                          \
    vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);   \
    vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__;           \
  }
# define VTRANSPOSE4(x0,x1,x2,x3) {    \
    v4sf y0 = vec_mergeh(x0, x2);      \
    v4sf y1 = vec_mergel(x0, x2);      \
    v4sf y2 = vec_mergeh(x1, x3);      \
    v4sf y3 = vec_mergel(x1, x3);      \
    x0 = vec_mergeh(y0, y2);           \
    x1 = vec_mergel(y0, y2);           \
    x2 = vec_mergeh(y1, y3);           \
    x3 = vec_mergel(y1, y3);           \
  }
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || defined(_M_IX86))

#include <xmmintrin.h>
typedef __m128 v4sf;
/* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions
 * anyway, so you will have to do some work if you want to enable AVX with its 256-bit vectors. */
# define SIMD_SZ 4
# define VREQUIRES_ALIGN 1
# define VZERO() _mm_setzero_ps()
# define VMUL(a,b) _mm_mul_ps(a,b)
# define VADD(a,b) _mm_add_ps(a,b)
# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
# define VSUB(a,b) _mm_sub_ps(a,b)
# define LD_PS1(p) _mm_set1_ps(p)
# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr)
# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
/* reverse/flip all floats */
# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
/* reverse/flip complex floats */
# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64))
# include <arm_neon.h>
typedef float32x4_t v4sf;
# define SIMD_SZ 4
# define VREQUIRES_ALIGN 0 /* usually no alignment required */
# define VZERO() vdupq_n_f32(0)
# define VMUL(a,b) vmulq_f32(a,b)
# define VADD(a,b) vaddq_f32(a,b)
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
# define VSUB(a,b) vsubq_f32(a,b)
# define LD_PS1(p) vld1q_dup_f32(&(p))
# define VLOAD_UNALIGNED(ptr) (*(ptr))
# define VLOAD_ALIGNED(ptr) (*(ptr))
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define VTRANSPOSE4(x0,x1,x2,x3) {                                     \
    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
  }
// marginally faster version using 32-bit ARM NEON inline asm (kept for reference, not compiled)
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#else
# if !defined(PFFFT_SIMD_DISABLE)
# warning "building with simd disabled !\n";
# define PFFFT_SIMD_DISABLE // fallback to scalar code
# endif
#endif

// fallback mode for situations where SSE/Altivec/NEON are not available, use scalar mode instead
#ifdef PFFFT_SIMD_DISABLE
typedef float v4sf;
# define SIMD_SZ 1
# define VREQUIRES_ALIGN 0
# define VZERO() 0.f
# define VMUL(a,b) ((a)*(b))
# define VADD(a,b) ((a)+(b))
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VLOAD_UNALIGNED(ptr) (*(ptr))
# define VLOAD_ALIGNED(ptr) (*(ptr))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#endif
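
/* Illustrative sketch, not part of the original pffft sources: whichever branch
 * above was selected, the same macro vocabulary (v4sf, SIMD_SZ, VZERO, VMUL,
 * VADD, VMADD, VSUB, LD_PS1, ...) is now defined, so a kernel written purely in
 * terms of these macros compiles unchanged for Altivec, SSE, NEON and the scalar
 * fallback. The helper name and signature below are hypothetical. */
static inline void example_vmadd_kernel(v4sf *dst, const v4sf *x, const v4sf *y, float g, int n)
{
  v4sf vg = LD_PS1(g); /* broadcast the scalar gain into all SIMD_SZ lanes */
  int i;
  for (i = 0; i < n; ++i) {
    dst[i] = VMADD(x[i], vg, y[i]); /* dst = x*g + y, SIMD_SZ floats per iteration */
  }
}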

// shortcuts for complex multiplications
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
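
/* Illustrative note and sketch, not part of the original pffft sources:
 * VCPLXMUL updates (ar,ai) in place with the lane-wise complex product
 * (ar + i*ai) * (br + i*bi), i.e. ar' = ar*br - ai*bi and ai' = ai*br + ar*bi.
 * VCPLXMULCONJ multiplies by the conjugate of (br,bi) instead:
 * ar' = ar*br + ai*bi, ai' = ai*br - ar*bi.
 * A hypothetical wrapper showing the intended split-format usage (real parts
 * in one vector, imaginary parts in another): */
static inline void example_cplx_mul(v4sf *re, v4sf *im, v4sf br, v4sf bi)
{
  v4sf ar = *re, ai = *im;
  VCPLXMUL(ar, ai, br, bi); /* SIMD_SZ independent complex multiplications */
  *re = ar;
  *im = ai;
}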

#ifndef SVMUL
// multiply a scalar with a vector
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif

typedef union v4sf_union {
  v4sf  v;
  float f[SIMD_SZ];
} v4sf_union;
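
/* Illustrative helper, not part of the original pffft sources: v4sf_union gives
 * scalar access to the individual lanes of a vector, which is what the
 * validation code below relies on. A hypothetical debug printer: */
static inline void example_print_v4sf(const char *name, v4sf v)
{
  v4sf_union u;
  int i;
  u.v = v;
  printf("%s = [", name);
  for (i = 0; i < SIMD_SZ; ++i) {
    printf(" %g", u.f[i]);
  }
  printf(" ]\n");
}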

#if !defined(PFFFT_SIMD_DISABLE)

#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))

/* detect bugs with the vector support macros */
static void Vvalidate_simd() {
  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
  v4sf_union a0, a1, a2, a3, t, u;
  memcpy(a0.f, f, 4*sizeof(float));
  memcpy(a1.f, f+4, 4*sizeof(float));
  memcpy(a2.f, f+8, 4*sizeof(float));
  memcpy(a3.f, f+12, 4*sizeof(float));

  t = a0; u = a1; t.v = VZERO();
  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
  t.v = VADD(a1.v, a2.v);
  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
  t.v = VMUL(a1.v, a2.v);
  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
  t.v = VMADD(a1.v, a2.v,a0.v);
  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);

  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);

  t.v=LD_PS1(f[15]);
  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 15, 15, 15, 15);
  t.v = VSWAPHL(a1.v, a2.v);
  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 8, 9, 6, 7);
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
#endif //!PFFFT_SIMD_DISABLE

/* SSE and co like 16-byte aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...

static
void *Valigned_malloc(size_t nb_bytes) {
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  *((void **) p - 1) = p0;
  return p;
}

static
void Valigned_free(void *p) {
  if (p) free(*((void **) p - 1));
}
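
/* Illustrative sketch, not part of the original pffft sources: Valigned_malloc()
 * over-allocates by MALLOC_V4SF_ALIGNMENT bytes, rounds the returned pointer up
 * to the next 64-byte boundary and stashes the original malloc() pointer in the
 * slot just below it, which is how Valigned_free() recovers it. Buffers obtained
 * this way always satisfy VALIGNED(). The helper name below is hypothetical. */
static inline v4sf *example_alloc_simd_buffer(size_t nfloats)
{
  v4sf *buf = (v4sf *) Valigned_malloc(nfloats * sizeof(float));
  if (buf) {
    assert(VALIGNED(buf)); /* 64-byte aligned, hence also 16-byte aligned */
  }
  return buf; /* release with Valigned_free(buf), never plain free() */
}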