From 010dcc9837b2ebf5a9b0c4272ac87e5c307bd610 Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Wed, 11 Apr 2018 00:17:41 +0300
Subject: [PATCH] optimize quaternion normalization with SIMD

* provide a _to version for storing the result into another quaternion
---
 include/cglm/quat.h        | 48 ++++++++++++++++++++++++++++++--------
 include/cglm/simd/intrin.h | 11 +++++++++++
 2 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/include/cglm/quat.h b/include/cglm/quat.h
index fabce2d..970b7f6 100644
--- a/include/cglm/quat.h
+++ b/include/cglm/quat.h
@@ -158,6 +158,43 @@ glm_quat_norm(versor q) {
   return glm_vec4_norm(q);
 }
 
+/*!
+ * @brief normalize quaternion and store result in dest
+ *
+ * @param[in]  q    quaternion to normalize
+ * @param[out] dest destination quaternion
+ */
+CGLM_INLINE
+void
+glm_quat_normalize_to(versor q, versor dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  __m128 xdot, x0;
+  float  dot;
+
+  x0   = _mm_load_ps(q);
+  xdot = glm_simd_dot(x0, x0);
+  dot  = _mm_cvtss_f32(xdot);
+
+  if (dot <= 0.0f) {
+    glm_quat_identity(dest);
+    return;
+  }
+
+  _mm_store_ps(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
+#else
+  float dot;
+
+  dot = glm_vec4_norm2(q);
+
+  if (dot <= 0.0f) {
+    glm_quat_identity(dest);
+    return;
+  }
+
+  glm_vec4_scale(q, 1.0f / sqrtf(dot), dest);
+#endif
+}
+
 /*!
  * @brief normalize quaternion
  *
@@ -166,16 +203,7 @@
 CGLM_INLINE
 void
 glm_quat_normalize(versor q) {
-  float sum;
-
-  sum = glm_vec4_norm2(q);
-
-  if (sum <= 0.0f) {
-    glm_quat_identity(q);
-    return;
-  }
-
-  glm_vec4_scale(q, 1.0f / sqrtf(sum), q);
+  glm_quat_normalize_to(q, q);
 }
 
 /*!
diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h
index c0f2e53..4c27d90 100644
--- a/include/cglm/simd/intrin.h
+++ b/include/cglm/simd/intrin.h
@@ -30,6 +30,17 @@
 #  define _mm_shuffle2_ps(a, b, z0, y0, x0, w0, z1, y1, x1, w1) \
      _mm_shuffle1_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \
                      z1, y1, x1, w1)
+
+/* dot product of a and b; the result is broadcast to all four lanes */
+CGLM_INLINE
+__m128
+glm_simd_dot(__m128 a, __m128 b) {
+  __m128 x0;
+  x0 = _mm_mul_ps(a, b);
+  x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2));
+  return _mm_add_ps(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1));
+}
+
 #endif /* x86, x64 */
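
Note (reviewer sketch, not part of the patch): minimal usage of the new
out-of-place API, assuming the patch is applied and cglm's headers are on
the include path; the function name and values below are illustrative:

    #include "cglm/cglm.h"

    void demo(void) {
      versor q = {3.0f, 0.0f, 0.0f, 4.0f}; /* norm 5, not yet unit length */
      versor n;

      /* out-of-place: q stays untouched, n receives the unit quaternion */
      glm_quat_normalize_to(q, n);

      /* the in-place variant now simply forwards to the _to version */
      glm_quat_normalize(q);
    }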
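
Note (reviewer sketch, not part of the patch): with x0 = [d0, d1, d2, d3],
the first shuffled add in glm_simd_dot yields [d0+d2, d1+d3, d0+d2, d1+d3]
and the second one leaves d0+d1+d2+d3 in every lane, so callers can read the
dot product from any lane (e.g. via _mm_cvtss_f32). A small check, assuming
SSE is enabled; main and the input values are illustrative:

    #include <stdio.h>
    #include "cglm/cglm.h"

    int main(void) {
      float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      float r[4];

      /* 1 + 4 + 9 + 16 = 30, broadcast to all four lanes */
      _mm_storeu_ps(r, glm_simd_dot(_mm_loadu_ps(a), _mm_loadu_ps(a)));
      printf("%f %f %f %f\n", r[0], r[1], r[2], r[3]);
      return 0;
    }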