From 31bb303c55fde6d93aa52c9083e64fd1b2b7be54 Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Thu, 24 Jan 2019 10:17:49 +0300
Subject: [PATCH] simd: organise SIMD-functions

* optimize dot product
---
 .gitignore                 |   1 +
 CREDITS                    |   3 +
 include/cglm/quat.h        |   2 +-
 include/cglm/simd/arm.h    |  41 +++++++++++
 include/cglm/simd/intrin.h | 114 ++++++++++---------------------
 include/cglm/simd/x86.h    | 136 +++++++++++++++++++++++++++++++++++++
 include/cglm/vec4.h        |  28 ++------
 makefile.am                |  52 +++++++-------
 win/cglm.vcxproj           |   2 +
 win/cglm.vcxproj.filters   |   6 ++
 10 files changed, 259 insertions(+), 126 deletions(-)
 create mode 100644 include/cglm/simd/arm.h
 create mode 100644 include/cglm/simd/x86.h

diff --git a/.gitignore b/.gitignore
index d500b97..195a82c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,3 +69,4 @@ win/cglm_test_*
 win/x64
 win/x85
 win/Debug
+cglm-test-ios*
diff --git a/CREDITS b/CREDITS
index 0488bad..263dd2d 100644
--- a/CREDITS
+++ b/CREDITS
@@ -52,3 +52,6 @@ https://gamedev.stackexchange.com/questions/28395/rotating-vector3-by-a-quaterni
 
 9. Sphere AABB intersect
 https://github.com/erich666/GraphicsGems/blob/master/gems/BoxSphere.c
+
+10. Horizontal add
+https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
diff --git a/include/cglm/quat.h b/include/cglm/quat.h
index 1db0161..f5f29af 100644
--- a/include/cglm/quat.h
+++ b/include/cglm/quat.h
@@ -218,7 +218,7 @@ glm_quat_normalize_to(versor q, versor dest) {
   float  dot;
 
   x0   = glmm_load(q);
-  xdot = glmm_dot(x0, x0);
+  xdot = glmm_vdot(x0, x0);
   dot  = _mm_cvtss_f32(xdot);
 
   if (dot <= 0.0f) {
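The quat.h hunk above relies on the naming split this patch introduces: glmm_vdot keeps the dot product in an SIMD register (broadcast to every lane), while the new glmm_dot returns a plain float. A minimal sketch of when each variant is the right tool, assuming an SSE-enabled x86 build; vec4_len_sq and vec4_scale_by_dot are hypothetical helpers, not part of cglm:

    #include <cglm/simd/intrin.h>

    /* hypothetical helper: scalar result is fine when the value
       leaves the SIMD domain anyway */
    static inline float
    vec4_len_sq(const float *v) {  /* v: four floats, 16-byte aligned */
      __m128 x0 = glmm_load(v);
      return glmm_dot(x0, x0);
    }

    /* hypothetical helper: glmm_vdot leaves dot(a, b) in every lane,
       so the result keeps feeding SIMD math with no scalar round trip */
    static inline __m128
    vec4_scale_by_dot(__m128 a, __m128 b) {
      return _mm_mul_ps(a, glmm_vdot(a, b));
    }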
diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h
new file mode 100644
index 0000000..5412461
--- /dev/null
+++ b/include/cglm/simd/arm.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#ifndef cglm_simd_arm_h
+#define cglm_simd_arm_h
+#include "intrin.h"
+#ifdef CGLM_SIMD_ARM
+
+#define glmm_load(p)      vld1q_f32(p)
+#define glmm_store(p, a)  vst1q_f32(p, a)
+
+static inline
+float
+glmm_hadd(float32x4_t v) {
+#if defined(__aarch64__)
+  return vaddvq_f32(v);
+#else
+  v = vaddq_f32(v, vrev64q_f32(v));
+  v = vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
+  return vgetq_lane_f32(v, 0);
+#endif
+}
+
+static inline
+float
+glmm_dot(float32x4_t a, float32x4_t b) {
+  return glmm_hadd(vmulq_f32(a, b));
+}
+
+static inline
+float
+glmm_norm(float32x4_t a) {
+  return sqrtf(glmm_dot(a, a));
+}
+
+#endif
+#endif /* cglm_simd_arm_h */
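On AArch64 glmm_hadd is a single vaddvq_f32; the fallback path is the classic pairwise reduction: one vrev64q_f32 add sums neighbors within each 64-bit half, then swapping the halves and adding leaves the total in every lane. A self-contained check of that reduction against a scalar sum (illustrative test, not part of the patch; build with any NEON-capable compiler):

    #include <stdio.h>
    #include <arm_neon.h>

    int main(void) {
      float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      float32x4_t v = vld1q_f32(data);

      /* [a b c d] + [b a d c] -> [a+b a+b c+d c+d] */
      float32x4_t x = vaddq_f32(v, vrev64q_f32(v));
      /* add the swapped halves -> every lane holds a+b+c+d */
      x = vaddq_f32(x, vcombine_f32(vget_high_f32(x), vget_low_f32(x)));

      printf("neon: %f  scalar: %f\n",
             vgetq_lane_f32(x, 0),
             data[0] + data[1] + data[2] + data[3]);  /* both 10.0 */
      return 0;
    }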
diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h
index fb577ea..a44b905 100644
--- a/include/cglm/simd/intrin.h
+++ b/include/cglm/simd/intrin.h
@@ -27,94 +27,39 @@
 #if defined( __SSE__ ) || defined( __SSE2__ )
 #  include <xmmintrin.h>
 #  include <emmintrin.h>
-
-/* OPTIONAL: You may save some instructions but latency (not sure) */
-#ifdef CGLM_USE_INT_DOMAIN
-#  define glmm_shuff1(xmm, z, y, x, w)                                        \
-     _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),                \
-                                        _MM_SHUFFLE(z, y, x, w)))
-#else
-#  define glmm_shuff1(xmm, z, y, x, w)                                        \
-     _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
-#endif
-
-#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
-#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1)                     \
-    glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)),            \
-                z1, y1, x1, w1)
-
-static inline
-__m128
-glmm_dot(__m128 a, __m128 b) {
-  __m128 x0;
-  x0 = _mm_mul_ps(a, b);
-  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
-  return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1));
-}
-
-static inline
-__m128
-glmm_norm(__m128 a) {
-  return _mm_sqrt_ps(glmm_dot(a, a));
-}
-
-static inline
-__m128
-glmm_load3(float v[3]) {
-  __m128i xy;
-  __m128  z;
-
-  xy = _mm_loadl_epi64((const __m128i *)v);
-  z  = _mm_load_ss(&v[2]);
-
-  return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
-}
-
-static inline
-void
-glmm_store3(__m128 vx, float v[3]) {
-  _mm_storel_pi((__m64 *)&v[0], vx);
-  _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
-}
-
-#ifdef CGLM_ALL_UNALIGNED
-#  define glmm_load(p)      _mm_loadu_ps(p)
-#  define glmm_store(p, a)  _mm_storeu_ps(p, a)
-#else
-#  define glmm_load(p)      _mm_load_ps(p)
-#  define glmm_store(p, a)  _mm_store_ps(p, a)
-#endif
-
-#endif
-
-/* x86, x64 */
-#if defined( __SSE__ ) || defined( __SSE2__ )
 #  define CGLM_SSE_FP 1
-#endif
-
-#ifdef __AVX__
-#  define CGLM_AVX_FP 1
-
-#ifdef CGLM_ALL_UNALIGNED
-#  define glmm_load256(p)      _mm256_loadu_ps(p)
-#  define glmm_store256(p, a)  _mm256_storeu_ps(p, a)
-#else
-#  define glmm_load256(p)      _mm256_load_ps(p)
-#  define glmm_store256(p, a)  _mm256_store_ps(p, a)
-#endif
-
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
 #endif
 
 #if defined(__SSE3__)
 #  include <pmmintrin.h>
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
 #endif
 
 #if defined(__SSE4_1__)
 #  include <smmintrin.h>
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
 #endif
 
 #if defined(__SSE4_2__)
 #  include <nmmintrin.h>
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
+#endif
+
+#ifdef __AVX__
+#  include <immintrin.h>
+#  define CGLM_AVX_FP 1
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
 #endif
 
 /* ARM Neon */
@@ -122,9 +67,24 @@ glmm_store3(__m128 vx, float v[3]) {
 #  include <arm_neon.h>
 #  if defined(__ARM_NEON_FP)
 #    define CGLM_NEON_FP 1
+#    ifndef CGLM_SIMD_ARM
+#      define CGLM_SIMD_ARM
+#    endif
 #  endif
-#else
-#  undef CGLM_NEON_FP
+#endif
+
+#if defined(CGLM_SIMD_x86) || defined(CGLM_NEON_FP)
+#  ifndef CGLM_SIMD
+#    define CGLM_SIMD
+#  endif
+#endif
+
+#if defined(CGLM_SIMD_x86)
+#  include "x86.h"
+#endif
+
+#if defined(CGLM_SIMD_ARM)
+#  include "arm.h"
 #endif
 
 #endif /* cglm_intrin_h */
diff --git a/include/cglm/simd/x86.h b/include/cglm/simd/x86.h
new file mode 100644
index 0000000..520a834
--- /dev/null
+++ b/include/cglm/simd/x86.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#ifndef cglm_simd_x86_h
+#define cglm_simd_x86_h
+#include "intrin.h"
+#ifdef CGLM_SIMD_x86
+
+#ifdef CGLM_ALL_UNALIGNED
+#  define glmm_load(p)      _mm_loadu_ps(p)
+#  define glmm_store(p, a)  _mm_storeu_ps(p, a)
+#else
+#  define glmm_load(p)      _mm_load_ps(p)
+#  define glmm_store(p, a)  _mm_store_ps(p, a)
+#endif
+
+#ifdef CGLM_USE_INT_DOMAIN
+#  define glmm_shuff1(xmm, z, y, x, w)                                        \
+     _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),                \
+                                        _MM_SHUFFLE(z, y, x, w)))
+#else
+#  define glmm_shuff1(xmm, z, y, x, w)                                        \
+     _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
+#endif
+
+#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
+#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1)                     \
+    glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)),            \
+                z1, y1, x1, w1)
+
+#ifdef __AVX__
+#  ifdef CGLM_ALL_UNALIGNED
+#    define glmm_load256(p)      _mm256_loadu_ps(p)
+#    define glmm_store256(p, a)  _mm256_storeu_ps(p, a)
+#  else
+#    define glmm_load256(p)      _mm256_load_ps(p)
+#    define glmm_store256(p, a)  _mm256_store_ps(p, a)
+#  endif
+#endif
+
+static inline
+__m128
+glmm_vhadds(__m128 v) {
+#if defined(__SSE3__)
+  __m128 shuf, sums;
+  shuf = _mm_movehdup_ps(v);
+  sums = _mm_add_ps(v, shuf);
+  shuf = _mm_movehl_ps(shuf, sums);
+  sums = _mm_add_ss(sums, shuf);
+  return sums;
+#else
+  __m128 shuf, sums;
+  shuf = glmm_shuff1(v, 2, 3, 0, 1);
+  sums = _mm_add_ps(v, shuf);
+  shuf = _mm_movehl_ps(shuf, sums);
+  sums = _mm_add_ss(sums, shuf);
+  return sums;
+#endif
+}
+
+static inline
+float
+glmm_hadd(__m128 v) {
+  return _mm_cvtss_f32(glmm_vhadds(v));
+}
+
+static inline
+__m128
+glmm_vdots(__m128 a, __m128 b) {
+#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
+  return _mm_dp_ps(a, b, 0xFF);
+#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
+  __m128 x0, x1;
+  x0 = _mm_mul_ps(a, b);
+  x1 = _mm_hadd_ps(x0, x0);
+  return _mm_hadd_ps(x1, x1);
+#else
+  return glmm_vhadds(_mm_mul_ps(a, b));
+#endif
+}
+
+static inline
+__m128
+glmm_vdot(__m128 a, __m128 b) {
+#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
+  return _mm_dp_ps(a, b, 0xFF);
+#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
+  __m128 x0, x1;
+  x0 = _mm_mul_ps(a, b);
+  x1 = _mm_hadd_ps(x0, x0);
+  return _mm_hadd_ps(x1, x1);
+#else
+  __m128 x0;
+  x0 = _mm_mul_ps(a, b);
+  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
+  return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1));
+#endif
+}
+
+static inline
+float
+glmm_dot(__m128 a, __m128 b) {
+  return _mm_cvtss_f32(glmm_vdots(a, b));
+}
+
+static inline
+float
+glmm_norm(__m128 a) {
+  return _mm_cvtss_f32(_mm_sqrt_ss(glmm_vhadds(_mm_mul_ps(a, a))));
+}
+
+static inline
+__m128
+glmm_load3(float v[3]) {
+  __m128i xy;
+  __m128  z;
+
+  xy = _mm_loadl_epi64((const __m128i *)v);
+  z  = _mm_load_ss(&v[2]);
+
+  return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
+}
+
+static inline
+void
+glmm_store3(__m128 vx, float v[3]) {
+  _mm_storel_pi((__m64 *)&v[0], vx);
+  _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
+}
+
+#endif
+#endif /* cglm_simd_x86_h */
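glmm_vhadds is the horizontal-add sequence from the Stack Overflow answer cited in CREDITS: the SSE3 variant uses _mm_movehdup_ps to duplicate the odd lanes, the plain-SSE variant builds the same pairs with glmm_shuff1. Either way only lane 0 of the result is guaranteed, which is why glmm_vdots only feeds the scalar glmm_dot while glmm_vdot broadcasts the sum to all lanes. A quick correctness check of the new scalar entry point (illustrative test, not part of the patch; assumes an SSE-enabled build, and alignas keeps the default aligned glmm_load legal):

    #include <stdio.h>
    #include <stdalign.h>
    #include <cglm/simd/intrin.h>

    int main(void) {
      alignas(16) float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      alignas(16) float b[4] = {5.0f, 6.0f, 7.0f, 8.0f};

      float simd = glmm_dot(glmm_load(a), glmm_load(b));
      float ref  = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];

      printf("simd: %f  scalar: %f\n", simd, ref);  /* both 70.0 */
      return 0;
    }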
diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h
index 03dc405..9da0d96 100644
--- a/include/cglm/vec4.h
+++ b/include/cglm/vec4.h
@@ -200,24 +200,8 @@ glm_vec4_one(vec4 v) {
 CGLM_INLINE
 float
 glm_vec4_dot(vec4 a, vec4 b) {
-#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
-  return _mm_cvtss_f32(_mm_dp_ps(glmm_load(a), glmm_load(b), 0xFF));
-#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
-  __m128 x0, x1;
-  x0 = _mm_mul_ps(glmm_load(a), glmm_load(b));
-  x1 = _mm_hadd_ps(x0, x0);
-  return _mm_cvtss_f32(_mm_hadd_ps(x1, x1));
-#elif defined(__SSE__) || defined(__SSE2__)
-  __m128 x0;
-  x0 = _mm_mul_ps(glmm_load(a), glmm_load(b));
-  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
-  return _mm_cvtss_f32(_mm_add_ss(x0, glmm_shuff1(x0, 0, 1, 0, 1)));
-#elif defined(CGLM_NEON_FP)
-  float32x4_t v0, v1, v2;
-  v0 = vmulq_f32(vld1q_f32(a), vld1q_f32(b));
-  v1 = vaddq_f32(v0, vrev64q_f32(v0));
-  v2 = vaddq_f32(v1, vcombine_f32(vget_high_f32(v1), vget_low_f32(v1)));
-  return vgetq_lane_f32(v2, 0);
+#if defined(CGLM_SIMD)
+  return glmm_dot(glmm_load(a), glmm_load(b));
 #else
   return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 #endif
@@ -250,10 +234,8 @@ glm_vec4_norm2(vec4 v) {
 CGLM_INLINE
 float
 glm_vec4_norm(vec4 v) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  __m128 x0;
-  x0 = glmm_load(v);
-  return _mm_cvtss_f32(_mm_sqrt_ss(glmm_dot(x0, x0)));
+#if defined(CGLM_SIMD)
+  return glmm_norm(glmm_load(v));
 #else
   return sqrtf(glm_vec4_dot(v, v));
 #endif
@@ -663,7 +645,7 @@ glm_vec4_normalize_to(vec4 v, vec4 dest) {
   float  dot;
 
   x0   = glmm_load(v);
-  xdot = glmm_dot(x0, x0);
+  xdot = glmm_vdot(x0, x0);
   dot  = _mm_cvtss_f32(xdot);
 
   if (dot == 0.0f) {
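With the per-ISA branches collapsed into a single CGLM_SIMD check, glm_vec4_dot and glm_vec4_norm compile to the same glmm_* calls on both x86 and ARM, and to the scalar fallback everywhere else. Typical usage is unchanged (minimal sketch; cglm's vec4 typedef carries the 16-byte alignment that the default aligned glmm_load path relies on unless CGLM_ALL_UNALIGNED is defined):

    #include <stdio.h>
    #include <cglm/cglm.h>

    int main(void) {
      vec4 a = {1.0f, 2.0f, 3.0f, 4.0f};
      vec4 b = {5.0f, 6.0f, 7.0f, 8.0f};

      printf("dot:  %f\n", glm_vec4_dot(a, b));  /* 70.0             */
      printf("norm: %f\n", glm_vec4_norm(a));    /* sqrt(30) ~ 5.477 */
      return 0;
    }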
diff --git a/makefile.am b/makefile.am
index 2e9336c..63fa285 100644
--- a/makefile.am
+++ b/makefile.am
@@ -34,30 +34,30 @@ test_tests_CFLAGS = $(checkCFLAGS)
 
 cglmdir=$(includedir)/cglm
 cglm_HEADERS = include/cglm/version.h \
-               include/cglm/cglm.h \
-               include/cglm/call.h \
-               include/cglm/cam.h \
-               include/cglm/io.h \
-               include/cglm/mat4.h \
-               include/cglm/mat3.h \
-               include/cglm/types.h \
-               include/cglm/common.h \
-               include/cglm/affine.h \
-               include/cglm/vec3.h \
-               include/cglm/vec3-ext.h \
-               include/cglm/vec4.h \
-               include/cglm/vec4-ext.h \
-               include/cglm/euler.h \
-               include/cglm/util.h \
-               include/cglm/quat.h \
-               include/cglm/affine-mat.h \
-               include/cglm/plane.h \
-               include/cglm/frustum.h \
-               include/cglm/box.h \
-               include/cglm/color.h \
-               include/cglm/project.h \
-               include/cglm/sphere.h \
-               include/cglm/ease.h
+               include/cglm/cglm.h       \
+               include/cglm/call.h       \
+               include/cglm/cam.h        \
+               include/cglm/io.h         \
+               include/cglm/mat4.h       \
+               include/cglm/mat3.h       \
+               include/cglm/types.h      \
+               include/cglm/common.h     \
+               include/cglm/affine.h     \
+               include/cglm/vec3.h       \
+               include/cglm/vec3-ext.h   \
+               include/cglm/vec4.h       \
+               include/cglm/vec4-ext.h   \
+               include/cglm/euler.h      \
+               include/cglm/util.h       \
+               include/cglm/quat.h       \
+               include/cglm/affine-mat.h \
+               include/cglm/plane.h      \
+               include/cglm/frustum.h    \
+               include/cglm/box.h        \
+               include/cglm/color.h      \
+               include/cglm/project.h    \
+               include/cglm/sphere.h     \
+               include/cglm/ease.h
 
 cglm_calldir=$(includedir)/cglm/call
 cglm_call_HEADERS = include/cglm/call/mat4.h \
@@ -77,7 +77,9 @@ cglm_call_HEADERS = include/cglm/call/mat4.h \
                     include/cglm/call/ease.h
 
 cglm_simddir=$(includedir)/cglm/simd
-cglm_simd_HEADERS = include/cglm/simd/intrin.h
+cglm_simd_HEADERS = include/cglm/simd/intrin.h \
+                    include/cglm/simd/x86.h    \
+                    include/cglm/simd/arm.h
 
 cglm_simd_sse2dir=$(includedir)/cglm/simd/sse2
 cglm_simd_sse2_HEADERS = include/cglm/simd/sse2/affine.h \
diff --git a/win/cglm.vcxproj b/win/cglm.vcxproj
index 5678688..97d9d08 100644
--- a/win/cglm.vcxproj
+++ b/win/cglm.vcxproj
@@ -69,6 +69,7 @@
+    <ClInclude Include="..\include\cglm\simd\arm.h" />
@@ -77,6 +78,7 @@
+    <ClInclude Include="..\include\cglm\simd\x86.h" />
diff --git a/win/cglm.vcxproj.filters b/win/cglm.vcxproj.filters
index 5e65853..a668242 100644
--- a/win/cglm.vcxproj.filters
+++ b/win/cglm.vcxproj.filters
@@ -233,5 +233,11 @@
       <Filter>include\cglm</Filter>
     </ClInclude>
+    <ClInclude Include="..\include\cglm\simd\arm.h">
+      <Filter>include\cglm\simd</Filter>
+    </ClInclude>
+    <ClInclude Include="..\include\cglm\simd\x86.h">
+      <Filter>include\cglm\simd</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
\ No newline at end of file
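The SSE3 and SSE4.1 dot-product branches remain opt-in: besides compiling with -msse3/-msse4.1, CGLM_SSE3_DOT or CGLM_SSE4_DOT must be defined, otherwise the shuffle-based path is used. A minimal sketch of opting in from a single translation unit, assuming a build of this patched tree (defining the macro before the include is equivalent to passing -DCGLM_SSE4_DOT):

    /* build: cc -msse4.1 normalize.c -Iinclude -lm */
    #define CGLM_SSE4_DOT  /* route glmm_vdot through _mm_dp_ps */
    #include <stdio.h>
    #include <cglm/cglm.h>

    int main(void) {
      vec4 v = {0.0f, 3.0f, 4.0f, 0.0f};
      vec4 n;

      glm_vec4_normalize_to(v, n);  /* calls glmm_vdot internally */
      printf("%f %f %f %f\n", n[0], n[1], n[2], n[3]);  /* 0.0 0.6 0.8 0.0 */
      return 0;
    }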