From d13842e7dece4de1877344b14eee945f91a96c3d Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sat, 24 Apr 2021 00:51:09 +0300 Subject: [PATCH] arm: optimize vec4 div with NEON --- CREDITS | 4 ++++ include/cglm/simd/arm.h | 17 +++++++++++++++++ include/cglm/simd/x86.h | 6 ++++++ include/cglm/vec4.h | 4 ++-- 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index 94d9d40..96341fc 100644 --- a/CREDITS +++ b/CREDITS @@ -79,3 +79,7 @@ https://stackoverflow.com/a/57793352/2676533 https://stackoverflow.com/questions/32536265/how-to-convert-mm-shuffle-ps-sse-intrinsic-to-neon-intrinsic http://github.com/microsoft/DirectXMath + +16. ARM NEON Div + +http://github.com/microsoft/DirectXMath diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h index 17ce15e..c980e54 100644 --- a/include/cglm/simd/arm.h +++ b/include/cglm/simd/arm.h @@ -131,6 +131,23 @@ glmm_norm_inf(float32x4_t a) { return glmm_hmax(glmm_abs(a)); } +static inline +float32x4_t +glmm_div(float32x4_t a, float32x4_t b) { +#if CGLM_ARM64 + return vdivq_f32(a, b); +#else + /* 2 iterations of Newton-Raphson refinement of reciprocal */ + float32x4_t r0, r1; + r0 = vrecpeq_f32(b); + r1 = vrecpsq_f32(r0, b); + r0 = vmulq_f32(r1, r0); + r1 = vrecpsq_f32(r0, b); + r0 = vmulq_f32(r1, r0); + return vmulq_f32(a, r0); +#endif +} + static inline float32x4_t glmm_fmadd(float32x4_t a, float32x4_t b, float32x4_t c) { diff --git a/include/cglm/simd/x86.h b/include/cglm/simd/x86.h index df32491..dbbd0f8 100644 --- a/include/cglm/simd/x86.h +++ b/include/cglm/simd/x86.h @@ -209,6 +209,12 @@ glmm_store3(float v[3], __m128 vx) { _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2)); } +static inline +__m128 +glmm_div(__m128 a, __m128 b) { + return _mm_div_ps(a, b); +} + /* enable FMA macro for MSVC? 
*/ #if defined(_MSC_VER) && !defined(__FMA__) && defined(__AVX2__) # define __FMA__ 1 diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index ce3f017..8e95ec5 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -473,8 +473,8 @@ glm_vec4_scale_as(vec4 v, float s, vec4 dest) { CGLM_INLINE void glm_vec4_div(vec4 a, vec4 b, vec4 dest) { -#if defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(dest, _mm_div_ps(glmm_load(a), glmm_load(b))); +#if defined(CGLM_SIMD) + glmm_store(dest, glmm_div(glmm_load(a), glmm_load(b))); #else dest[0] = a[0] / b[0]; dest[1] = a[1] / b[1];