From 010dcc9837b2ebf5a9b0c4272ac87e5c307bd610 Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Wed, 11 Apr 2018 00:17:41 +0300
Subject: [PATCH] optimize quaternion normalization with SIMD

* provide a _to version for storing the result into another quaternion
---
 include/cglm/quat.h        | 48 ++++++++++++++++++++++++++++++--------
 include/cglm/simd/intrin.h | 11 +++++++++++
 2 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/include/cglm/quat.h b/include/cglm/quat.h
index fabce2d..970b7f6 100644
--- a/include/cglm/quat.h
+++ b/include/cglm/quat.h
@@ -158,6 +158,43 @@ glm_quat_norm(versor q) {
   return glm_vec4_norm(q);
 }
 
+/*!
+ * @brief normalize quaternion and store result in dest
+ *
+ * @param[in]  q    quaternion to normalize
+ * @param[out] dest destination quaternion
+ */
+CGLM_INLINE
+void
+glm_quat_normalize_to(versor q, versor dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  __m128 xdot, x0;
+  float  dot;
+
+  x0   = _mm_load_ps(q);
+  xdot = glm_simd_dot(x0, x0);
+  dot  = _mm_cvtss_f32(xdot);
+
+  if (dot <= 0.0f) {
+    glm_quat_identity(dest);
+    return;
+  }
+
+  _mm_store_ps(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
+#else
+  float dot;
+
+  dot = glm_vec4_norm2(q);
+
+  if (dot <= 0.0f) {
+    glm_quat_identity(dest);
+    return;
+  }
+
+  glm_vec4_scale(q, 1.0f / sqrtf(dot), dest);
+#endif
+}
+
 /*!
  * @brief normalize quaternion
  *
@@ -166,16 +203,7 @@
 CGLM_INLINE
 void
 glm_quat_normalize(versor q) {
-  float sum;
-
-  sum = glm_vec4_norm2(q);
-
-  if (sum <= 0.0f) {
-    glm_quat_identity(q);
-    return;
-  }
-
-  glm_vec4_scale(q, 1.0f / sqrtf(sum), q);
+  glm_quat_normalize_to(q, q);
 }
 
 /*!
diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h
index c0f2e53..4c27d90 100644
--- a/include/cglm/simd/intrin.h
+++ b/include/cglm/simd/intrin.h
@@ -30,6 +30,17 @@
 #  define _mm_shuffle2_ps(a, b, z0, y0, x0, w0, z1, y1, x1, w1) \
      _mm_shuffle1_ps(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)), \
                      z1, y1, x1, w1)
+
+/* dot product of a and b; the result is broadcast to all four lanes */
+CGLM_INLINE
+__m128
+glm_simd_dot(__m128 a, __m128 b) {
+  __m128 x0;
+  x0 = _mm_mul_ps(a, b);
+  x0 = _mm_add_ps(x0, _mm_shuffle1_ps(x0, 1, 0, 3, 2));
+  return _mm_add_ps(x0, _mm_shuffle1_ps(x0, 0, 1, 0, 1));
+}
+
 #endif /* x86, x64 */
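
Note (reviewer sketch, not part of the patch): minimal usage of the new
out-of-place API, assuming the patch is applied and cglm's headers are on
the include path; the function name and values below are illustrative:

    #include "cglm/cglm.h"

    void demo(void) {
      versor q = {3.0f, 0.0f, 0.0f, 4.0f}; /* norm 5, not yet unit length */
      versor n;

      /* out-of-place: q stays untouched, n receives the unit quaternion */
      glm_quat_normalize_to(q, n);

      /* the in-place variant now simply forwards to the _to version */
      glm_quat_normalize(q);
    }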
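
Note (reviewer sketch, not part of the patch): with x0 = [d0, d1, d2, d3],
the first shuffled add in glm_simd_dot yields [d0+d2, d1+d3, d0+d2, d1+d3]
and the second one leaves d0+d1+d2+d3 in every lane, so callers can read the
dot product from any lane (e.g. via _mm_cvtss_f32). A small check, assuming
SSE is enabled; main and the input values are illustrative:

    #include <stdio.h>
    #include "cglm/cglm.h"

    int main(void) {
      float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      float r[4];

      /* 1 + 4 + 9 + 16 = 30, broadcast to all four lanes */
      _mm_storeu_ps(r, glm_simd_dot(_mm_loadu_ps(a), _mm_loadu_ps(a)));
      printf("%f %f %f %f\n", r[0], r[1], r[2], r[3]);
      return 0;
    }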