From 31bb303c55fde6d93aa52c9083e64fd1b2b7be54 Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Thu, 24 Jan 2019 10:17:49 +0300
Subject: [PATCH] simd: organise SIMD-functions

* optimize dot product
---
 .gitignore                 |   1 +
 CREDITS                    |   3 +
 include/cglm/quat.h        |   2 +-
 include/cglm/simd/arm.h    |  41 +++++++++++
 include/cglm/simd/intrin.h | 114 ++++++++++---------------------
 include/cglm/simd/x86.h    | 136 +++++++++++++++++++++++++++++++++++++
 include/cglm/vec4.h        |  28 ++------
 makefile.am                |  52 +++++++-------
 win/cglm.vcxproj           |   2 +
 win/cglm.vcxproj.filters   |   6 ++
 10 files changed, 259 insertions(+), 126 deletions(-)
 create mode 100644 include/cglm/simd/arm.h
 create mode 100644 include/cglm/simd/x86.h

diff --git a/.gitignore b/.gitignore
index d500b97..195a82c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,3 +69,4 @@ win/cglm_test_*
 win/x64
 win/x85
 win/Debug
+cglm-test-ios*
diff --git a/CREDITS b/CREDITS
index 0488bad..263dd2d 100644
--- a/CREDITS
+++ b/CREDITS
@@ -52,3 +52,6 @@ https://gamedev.stackexchange.com/questions/28395/rotating-vector3-by-a-quaterni
 
 9. Sphere AABB intersect
 https://github.com/erich666/GraphicsGems/blob/master/gems/BoxSphere.c
+
+10. Horizontal add
+https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
diff --git a/include/cglm/quat.h b/include/cglm/quat.h
index 1db0161..f5f29af 100644
--- a/include/cglm/quat.h
+++ b/include/cglm/quat.h
@@ -218,7 +218,7 @@ glm_quat_normalize_to(versor q, versor dest) {
   float  dot;
 
   x0   = glmm_load(q);
-  xdot = glmm_dot(x0, x0);
+  xdot = glmm_vdot(x0, x0);
   dot  = _mm_cvtss_f32(xdot);
 
   if (dot <= 0.0f) {
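The quat.h hunk above relies on the naming split this patch introduces: glmm_vdot keeps the dot product in an SIMD register (broadcast to every lane), while the new glmm_dot returns a plain float. A minimal sketch of when each variant is the right tool, assuming an SSE-enabled x86 build; vec4_len_sq and vec4_scale_by_dot are hypothetical helpers, not part of cglm:

    #include <cglm/simd/intrin.h>

    /* hypothetical helper: scalar result is fine when the value
       leaves the SIMD domain anyway */
    static inline float
    vec4_len_sq(const float *v) {  /* v: four floats, 16-byte aligned */
      __m128 x0 = glmm_load(v);
      return glmm_dot(x0, x0);
    }

    /* hypothetical helper: glmm_vdot leaves dot(a, b) in every lane,
       so the result keeps feeding SIMD math with no scalar round trip */
    static inline __m128
    vec4_scale_by_dot(__m128 a, __m128 b) {
      return _mm_mul_ps(a, glmm_vdot(a, b));
    }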
diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h
new file mode 100644
index 0000000..5412461
--- /dev/null
+++ b/include/cglm/simd/arm.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#ifndef cglm_simd_arm_h
+#define cglm_simd_arm_h
+#include "intrin.h"
+#ifdef CGLM_SIMD_ARM
+
+#define glmm_load(p)      vld1q_f32(p)
+#define glmm_store(p, a)  vst1q_f32(p, a)
+
+static inline
+float
+glmm_hadd(float32x4_t v) {
+#if defined(__aarch64__)
+  return vaddvq_f32(v);
+#else
+  v = vaddq_f32(v, vrev64q_f32(v));
+  v = vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
+  return vgetq_lane_f32(v, 0);
+#endif
+}
+
+static inline
+float
+glmm_dot(float32x4_t a, float32x4_t b) {
+  return glmm_hadd(vmulq_f32(a, b));
+}
+
+static inline
+float
+glmm_norm(float32x4_t a) {
+  return sqrtf(glmm_dot(a, a));
+}
+
+#endif
+#endif /* cglm_simd_arm_h */
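On AArch64 glmm_hadd is a single vaddvq_f32; the fallback path is the classic pairwise reduction: one vrev64q_f32 add sums neighbors within each 64-bit half, then swapping the halves and adding leaves the total in every lane. A self-contained check of that reduction against a scalar sum (illustrative test, not part of the patch; build with any NEON-capable compiler):

    #include <stdio.h>
    #include <arm_neon.h>

    int main(void) {
      float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      float32x4_t v = vld1q_f32(data);

      /* [a b c d] + [b a d c] -> [a+b a+b c+d c+d] */
      float32x4_t x = vaddq_f32(v, vrev64q_f32(v));
      /* add the swapped halves -> every lane holds a+b+c+d */
      x = vaddq_f32(x, vcombine_f32(vget_high_f32(x), vget_low_f32(x)));

      printf("neon: %f  scalar: %f\n",
             vgetq_lane_f32(x, 0),
             data[0] + data[1] + data[2] + data[3]);  /* both 10.0 */
      return 0;
    }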
diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h
index fb577ea..a44b905 100644
--- a/include/cglm/simd/intrin.h
+++ b/include/cglm/simd/intrin.h
@@ -27,94 +27,39 @@
 #if defined( __SSE__ ) || defined( __SSE2__ )
 #  include <xmmintrin.h>
 #  include <emmintrin.h>
-
-/* OPTIONAL: You may save some instructions but latency (not sure) */
-#ifdef CGLM_USE_INT_DOMAIN
-#  define glmm_shuff1(xmm, z, y, x, w)                                        \
-     _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),                \
-                                        _MM_SHUFFLE(z, y, x, w)))
-#else
-#  define glmm_shuff1(xmm, z, y, x, w)                                        \
-     _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
-#endif
-
-#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
-#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1)                     \
-    glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)),            \
-                z1, y1, x1, w1)
-
-static inline
-__m128
-glmm_dot(__m128 a, __m128 b) {
-  __m128 x0;
-  x0 = _mm_mul_ps(a, b);
-  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
-  return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1));
-}
-
-static inline
-__m128
-glmm_norm(__m128 a) {
-  return _mm_sqrt_ps(glmm_dot(a, a));
-}
-
-static inline
-__m128
-glmm_load3(float v[3]) {
-  __m128i xy;
-  __m128  z;
-
-  xy = _mm_loadl_epi64((const __m128i *)v);
-  z  = _mm_load_ss(&v[2]);
-
-  return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
-}
-
-static inline
-void
-glmm_store3(__m128 vx, float v[3]) {
-  _mm_storel_pi((__m64 *)&v[0], vx);
-  _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
-}
-
-#ifdef CGLM_ALL_UNALIGNED
-#  define glmm_load(p)      _mm_loadu_ps(p)
-#  define glmm_store(p, a)  _mm_storeu_ps(p, a)
-#else
-#  define glmm_load(p)      _mm_load_ps(p)
-#  define glmm_store(p, a)  _mm_store_ps(p, a)
-#endif
-
-#endif
-
-/* x86, x64 */
-#if defined( __SSE__ ) || defined( __SSE2__ )
 #  define CGLM_SSE_FP 1
-#endif
-
-#ifdef __AVX__
-#  define CGLM_AVX_FP 1
-
-#ifdef CGLM_ALL_UNALIGNED
-#  define glmm_load256(p)      _mm256_loadu_ps(p)
-#  define glmm_store256(p, a)  _mm256_storeu_ps(p, a)
-#else
-#  define glmm_load256(p)      _mm256_load_ps(p)
-#  define glmm_store256(p, a)  _mm256_store_ps(p, a)
-#endif
-
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
 #endif
 
 #if defined(__SSE3__)
 #  include <pmmintrin.h>
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
 #endif
 
 #if defined(__SSE4_1__)
 #  include <smmintrin.h>
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
 #endif
 
 #if defined(__SSE4_2__)
 #  include <nmmintrin.h>
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
+#endif
+
+#ifdef __AVX__
+#  include <immintrin.h>
+#  define CGLM_AVX_FP 1
+#  ifndef CGLM_SIMD_x86
+#    define CGLM_SIMD_x86
+#  endif
 #endif
 
 /* ARM Neon */
@@ -122,9 +67,24 @@ glmm_store3(__m128 vx, float v[3]) {
 #  include <arm_neon.h>
 #  if defined(__ARM_NEON_FP)
 #    define CGLM_NEON_FP 1
+#    ifndef CGLM_SIMD_ARM
+#      define CGLM_SIMD_ARM
+#    endif
 #  endif
-#else
-#  undef CGLM_NEON_FP
+#endif
+
+#if defined(CGLM_SIMD_x86) || defined(CGLM_NEON_FP)
+#  ifndef CGLM_SIMD
+#    define CGLM_SIMD
+#  endif
+#endif
+
+#if defined(CGLM_SIMD_x86)
+#  include "x86.h"
+#endif
+
+#if defined(CGLM_SIMD_ARM)
+#  include "arm.h"
 #endif
 
 #endif /* cglm_intrin_h */
diff --git a/include/cglm/simd/x86.h b/include/cglm/simd/x86.h
new file mode 100644
index 0000000..520a834
--- /dev/null
+++ b/include/cglm/simd/x86.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c), Recep Aslantas.
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#ifndef cglm_simd_x86_h
+#define cglm_simd_x86_h
+#include "intrin.h"
+#ifdef CGLM_SIMD_x86
+
+#ifdef CGLM_ALL_UNALIGNED
+#  define glmm_load(p)      _mm_loadu_ps(p)
+#  define glmm_store(p, a)  _mm_storeu_ps(p, a)
+#else
+#  define glmm_load(p)      _mm_load_ps(p)
+#  define glmm_store(p, a)  _mm_store_ps(p, a)
+#endif
+
+#ifdef CGLM_USE_INT_DOMAIN
+#  define glmm_shuff1(xmm, z, y, x, w)                                        \
+     _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),                \
+                                        _MM_SHUFFLE(z, y, x, w)))
+#else
+#  define glmm_shuff1(xmm, z, y, x, w)                                        \
+     _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
+#endif
+
+#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)
+#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1)                     \
+    glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)),            \
+                z1, y1, x1, w1)
+
+#ifdef __AVX__
+#  ifdef CGLM_ALL_UNALIGNED
+#    define glmm_load256(p)      _mm256_loadu_ps(p)
+#    define glmm_store256(p, a)  _mm256_storeu_ps(p, a)
+#  else
+#    define glmm_load256(p)      _mm256_load_ps(p)
+#    define glmm_store256(p, a)  _mm256_store_ps(p, a)
+#  endif
+#endif
+
+static inline
+__m128
+glmm_vhadds(__m128 v) {
+#if defined(__SSE3__)
+  __m128 shuf, sums;
+  shuf = _mm_movehdup_ps(v);
+  sums = _mm_add_ps(v, shuf);
+  shuf = _mm_movehl_ps(shuf, sums);
+  sums = _mm_add_ss(sums, shuf);
+  return sums;
+#else
+  __m128 shuf, sums;
+  shuf = glmm_shuff1(v, 2, 3, 0, 1);
+  sums = _mm_add_ps(v, shuf);
+  shuf = _mm_movehl_ps(shuf, sums);
+  sums = _mm_add_ss(sums, shuf);
+  return sums;
+#endif
+}
+
+static inline
+float
+glmm_hadd(__m128 v) {
+  return _mm_cvtss_f32(glmm_vhadds(v));
+}
+
+static inline
+__m128
+glmm_vdots(__m128 a, __m128 b) {
+#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
+  return _mm_dp_ps(a, b, 0xFF);
+#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
+  __m128 x0, x1;
+  x0 = _mm_mul_ps(a, b);
+  x1 = _mm_hadd_ps(x0, x0);
+  return _mm_hadd_ps(x1, x1);
+#else
+  return glmm_vhadds(_mm_mul_ps(a, b));
+#endif
+}
+
+static inline
+__m128
+glmm_vdot(__m128 a, __m128 b) {
+#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
+  return _mm_dp_ps(a, b, 0xFF);
+#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
+  __m128 x0, x1;
+  x0 = _mm_mul_ps(a, b);
+  x1 = _mm_hadd_ps(x0, x0);
+  return _mm_hadd_ps(x1, x1);
+#else
+  __m128 x0;
+  x0 = _mm_mul_ps(a, b);
+  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
+  return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1));
+#endif
+}
+
+static inline
+float
+glmm_dot(__m128 a, __m128 b) {
+  return _mm_cvtss_f32(glmm_vdots(a, b));
+}
+
+static inline
+float
+glmm_norm(__m128 a) {
+  return _mm_cvtss_f32(_mm_sqrt_ss(glmm_vhadds(_mm_mul_ps(a, a))));
+}
+
+static inline
+__m128
+glmm_load3(float v[3]) {
+  __m128i xy;
+  __m128  z;
+
+  xy = _mm_loadl_epi64((const __m128i *)v);
+  z  = _mm_load_ss(&v[2]);
+
+  return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
+}
+
+static inline
+void
+glmm_store3(__m128 vx, float v[3]) {
+  _mm_storel_pi((__m64 *)&v[0], vx);
+  _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
+}
+
+#endif
+#endif /* cglm_simd_x86_h */
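glmm_vhadds is the horizontal-add sequence from the Stack Overflow answer cited in CREDITS: the SSE3 variant uses _mm_movehdup_ps to duplicate the odd lanes, the plain-SSE variant builds the same pairs with glmm_shuff1. Either way only lane 0 of the result is guaranteed, which is why glmm_vdots only feeds the scalar glmm_dot while glmm_vdot broadcasts the sum to all lanes. A quick correctness check of the new scalar entry point (illustrative test, not part of the patch; assumes an SSE-enabled build, and alignas keeps the default aligned glmm_load legal):

    #include <stdio.h>
    #include <stdalign.h>
    #include <cglm/simd/intrin.h>

    int main(void) {
      alignas(16) float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      alignas(16) float b[4] = {5.0f, 6.0f, 7.0f, 8.0f};

      float simd = glmm_dot(glmm_load(a), glmm_load(b));
      float ref  = a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];

      printf("simd: %f  scalar: %f\n", simd, ref);  /* both 70.0 */
      return 0;
    }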
diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h
index 03dc405..9da0d96 100644
--- a/include/cglm/vec4.h
+++ b/include/cglm/vec4.h
@@ -200,24 +200,8 @@ glm_vec4_one(vec4 v) {
 CGLM_INLINE
 float
 glm_vec4_dot(vec4 a, vec4 b) {
-#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
-  return _mm_cvtss_f32(_mm_dp_ps(glmm_load(a), glmm_load(b), 0xFF));
-#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
-  __m128 x0, x1;
-  x0 = _mm_mul_ps(glmm_load(a), glmm_load(b));
-  x1 = _mm_hadd_ps(x0, x0);
-  return _mm_cvtss_f32(_mm_hadd_ps(x1, x1));
-#elif defined(__SSE__) || defined(__SSE2__)
-  __m128 x0;
-  x0 = _mm_mul_ps(glmm_load(a), glmm_load(b));
-  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
-  return _mm_cvtss_f32(_mm_add_ss(x0, glmm_shuff1(x0, 0, 1, 0, 1)));
-#elif defined(CGLM_NEON_FP)
-  float32x4_t v0, v1, v2;
-  v0 = vmulq_f32(vld1q_f32(a), vld1q_f32(b));
-  v1 = vaddq_f32(v0, vrev64q_f32(v0));
-  v2 = vaddq_f32(v1, vcombine_f32(vget_high_f32(v1), vget_low_f32(v1)));
-  return vgetq_lane_f32(v2, 0);
+#if defined(CGLM_SIMD)
+  return glmm_dot(glmm_load(a), glmm_load(b));
 #else
   return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 #endif
@@ -250,10 +234,8 @@ glm_vec4_norm2(vec4 v) {
 CGLM_INLINE
 float
 glm_vec4_norm(vec4 v) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  __m128 x0;
-  x0 = glmm_load(v);
-  return _mm_cvtss_f32(_mm_sqrt_ss(glmm_dot(x0, x0)));
+#if defined(CGLM_SIMD)
+  return glmm_norm(glmm_load(v));
 #else
   return sqrtf(glm_vec4_dot(v, v));
 #endif
@@ -663,7 +645,7 @@ glm_vec4_normalize_to(vec4 v, vec4 dest) {
   float  dot;
 
   x0   = glmm_load(v);
-  xdot = glmm_dot(x0, x0);
+  xdot = glmm_vdot(x0, x0);
   dot  = _mm_cvtss_f32(xdot);
 
   if (dot == 0.0f) {
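With the per-ISA branches collapsed into a single CGLM_SIMD check, glm_vec4_dot and glm_vec4_norm compile to the same glmm_* calls on both x86 and ARM, and to the scalar fallback everywhere else. Typical usage is unchanged (minimal sketch; cglm's vec4 typedef carries the 16-byte alignment that the default aligned glmm_load path relies on unless CGLM_ALL_UNALIGNED is defined):

    #include <stdio.h>
    #include <cglm/cglm.h>

    int main(void) {
      vec4 a = {1.0f, 2.0f, 3.0f, 4.0f};
      vec4 b = {5.0f, 6.0f, 7.0f, 8.0f};

      printf("dot:  %f\n", glm_vec4_dot(a, b));  /* 70.0             */
      printf("norm: %f\n", glm_vec4_norm(a));    /* sqrt(30) ~ 5.477 */
      return 0;
    }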
diff --git a/makefile.am b/makefile.am
index 2e9336c..63fa285 100644
--- a/makefile.am
+++ b/makefile.am
@@ -34,30 +34,30 @@ test_tests_CFLAGS = $(checkCFLAGS)
 
 cglmdir=$(includedir)/cglm
 cglm_HEADERS = include/cglm/version.h \
-               include/cglm/cglm.h \
-               include/cglm/call.h \
-               include/cglm/cam.h \
-               include/cglm/io.h \
-               include/cglm/mat4.h \
-               include/cglm/mat3.h \
-               include/cglm/types.h \
-               include/cglm/common.h \
-               include/cglm/affine.h \
-               include/cglm/vec3.h \
-               include/cglm/vec3-ext.h \
-               include/cglm/vec4.h \
-               include/cglm/vec4-ext.h \
-               include/cglm/euler.h \
-               include/cglm/util.h \
-               include/cglm/quat.h \
-               include/cglm/affine-mat.h \
-               include/cglm/plane.h \
-               include/cglm/frustum.h \
-               include/cglm/box.h \
-               include/cglm/color.h \
-               include/cglm/project.h \
-               include/cglm/sphere.h \
-               include/cglm/ease.h
+               include/cglm/cglm.h       \
+               include/cglm/call.h       \
+               include/cglm/cam.h        \
+               include/cglm/io.h         \
+               include/cglm/mat4.h       \
+               include/cglm/mat3.h       \
+               include/cglm/types.h      \
+               include/cglm/common.h     \
+               include/cglm/affine.h     \
+               include/cglm/vec3.h       \
+               include/cglm/vec3-ext.h   \
+               include/cglm/vec4.h       \
+               include/cglm/vec4-ext.h   \
+               include/cglm/euler.h      \
+               include/cglm/util.h       \
+               include/cglm/quat.h       \
+               include/cglm/affine-mat.h \
+               include/cglm/plane.h      \
+               include/cglm/frustum.h    \
+               include/cglm/box.h        \
+               include/cglm/color.h      \
+               include/cglm/project.h    \
+               include/cglm/sphere.h     \
+               include/cglm/ease.h
 
 cglm_calldir=$(includedir)/cglm/call
 cglm_call_HEADERS = include/cglm/call/mat4.h \
@@ -77,7 +77,9 @@ cglm_call_HEADERS = include/cglm/call/mat4.h \
                     include/cglm/call/ease.h
 
 cglm_simddir=$(includedir)/cglm/simd
-cglm_simd_HEADERS = include/cglm/simd/intrin.h
+cglm_simd_HEADERS = include/cglm/simd/intrin.h \
+                    include/cglm/simd/x86.h    \
+                    include/cglm/simd/arm.h
 
 cglm_simd_sse2dir=$(includedir)/cglm/simd/sse2
 cglm_simd_sse2_HEADERS = include/cglm/simd/sse2/affine.h \
diff --git a/win/cglm.vcxproj b/win/cglm.vcxproj
index 5678688..97d9d08 100644
--- a/win/cglm.vcxproj
+++ b/win/cglm.vcxproj
@@ -69,6 +69,7 @@
+    <ClInclude Include="..\include\cglm\simd\arm.h" />
@@ -77,6 +78,7 @@
+    <ClInclude Include="..\include\cglm\simd\x86.h" />
diff --git a/win/cglm.vcxproj.filters b/win/cglm.vcxproj.filters
index 5e65853..a668242 100644
--- a/win/cglm.vcxproj.filters
+++ b/win/cglm.vcxproj.filters
@@ -233,5 +233,11 @@
       <Filter>include\cglm</Filter>
     </ClInclude>
+    <ClInclude Include="..\include\cglm\simd\arm.h">
+      <Filter>include\cglm\simd</Filter>
+    </ClInclude>
+    <ClInclude Include="..\include\cglm\simd\x86.h">
+      <Filter>include\cglm\simd</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
\ No newline at end of file
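The SSE3 and SSE4.1 dot-product branches remain opt-in: besides compiling with -msse3/-msse4.1, CGLM_SSE3_DOT or CGLM_SSE4_DOT must be defined, otherwise the shuffle-based path is used. A minimal sketch of opting in from a single translation unit, assuming a build of this patched tree (defining the macro before the include is equivalent to passing -DCGLM_SSE4_DOT):

    /* build: cc -msse4.1 normalize.c -Iinclude -lm */
    #define CGLM_SSE4_DOT  /* route glmm_vdot through _mm_dp_ps */
    #include <stdio.h>
    #include <cglm/cglm.h>

    int main(void) {
      vec4 v = {0.0f, 3.0f, 4.0f, 0.0f};
      vec4 n;

      glm_vec4_normalize_to(v, n);  /* calls glmm_vdot internally */
      printf("%f %f %f %f\n", n[0], n[1], n[2], n[3]);  /* 0.0 0.6 0.8 0.0 */
      return 0;
    }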