From a111693b6b42f71e410d3edc0e420fb9d2bab5cb Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Sat, 24 Apr 2021 15:45:36 +0300
Subject: [PATCH] arm, neon: implement mat4 determinant with neon

---
 include/cglm/mat4.h           |  2 +
 include/cglm/simd/arm.h       | 13 ++++++-
 include/cglm/simd/neon/mat4.h | 70 +++++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h
index c099574..697f510 100644
--- a/include/cglm/mat4.h
+++ b/include/cglm/mat4.h
@@ -562,6 +562,8 @@ float
 glm_mat4_det(mat4 mat) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
   return glm_mat4_det_sse2(mat);
+#elif defined(CGLM_NEON_FP)
+  return glm_mat4_det_neon(mat);
 #else
   /* [square] det(A) = det(At) */
   float t[6];
diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h
index 791f4cb..cd4f2b8 100644
--- a/include/cglm/simd/arm.h
+++ b/include/cglm/simd/arm.h
@@ -69,12 +69,23 @@ SWIZZLE(glmm_2103)
 { return vextq_f32(v, v, 3); }
 #undef SWIZZLE
 
+#define glmm_xor(a, b)                                                        \
+  vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a),                   \
+                                  vreinterpretq_s32_f32(b)))
+
 static inline
 float32x4_t
 glmm_abs(float32x4_t v) {
   return vabsq_f32(v);
 }
 
+static inline
+float32x4_t
+glmm_vhadd(float32x4_t v) {
+  v = vaddq_f32(v, vrev64q_f32(v));
+  return vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
+}
+
 static inline
 float
 glmm_hadd(float32x4_t v) {
@@ -138,7 +149,7 @@ glmm_norm_inf(float32x4_t a) {
 static inline
 float32x4_t
 glmm_div(float32x4_t a, float32x4_t b) {
-#if CGLM_ARM641
+#if CGLM_ARM64
   return vdivq_f32(a, b);
 #else
   /* 2 iterations of Newton-Raphson refinement of reciprocal */
diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h
index 36d347b..ad32339 100644
--- a/include/cglm/simd/neon/mat4.h
+++ b/include/cglm/simd/neon/mat4.h
@@ -101,5 +101,75 @@ glm_mat4_mulv_neon(mat4 m, vec4 v, vec4 dest) {
   vst1q_f32(dest, l0);
 }
 
+CGLM_INLINE
+float
+glm_mat4_det_neon(mat4 mat) {
+  float32x4_t   r0, r1, r2, r3, x0, x1, x2;
+  float32x2_t   ij, op, mn, kl, nn, mm, jj, ii, gh, ef, t12, t34;
+  float32x4x2_t a1;
+  float32x4_t   x3 = { 0.f, -0.f, 0.f, -0.f };
+
+  /* 127 <- 0, [square] det(A) = det(At) */
+  r0 = glmm_load(mat[0]);              /* d c b a */
+  r1 = vrev64q_f32(glmm_load(mat[1])); /* g h e f */
+  r2 = vrev64q_f32(glmm_load(mat[2])); /* l k i j */
+  r3 = vrev64q_f32(glmm_load(mat[3])); /* o p m n */
+
+  gh = vget_high_f32(r1);
+  ef = vget_low_f32(r1);
+  kl = vget_high_f32(r2);
+  ij = vget_low_f32(r2);
+  op = vget_high_f32(r3);
+  mn = vget_low_f32(r3);
+  mm = vdup_lane_f32(mn, 1);
+  nn = vdup_lane_f32(mn, 0);
+  ii = vdup_lane_f32(ij, 1);
+  jj = vdup_lane_f32(ij, 0);
+
+  /*
+   t[1] = j * p - n * l;
+   t[2] = j * o - n * k;
+   t[3] = i * p - m * l;
+   t[4] = i * o - m * k;
+   */
+  x0 = glmm_fnmadd(vcombine_f32(kl, kl), vcombine_f32(nn, mm),
+                   vmulq_f32(vcombine_f32(op, op), vcombine_f32(jj, ii)));
+
+  t12 = vget_low_f32(x0);
+  t34 = vget_high_f32(x0);
+
+  /* 1 3 1 3 2 4 2 4 */
+  a1 = vuzpq_f32(x0, x0);
+
+  /*
+   t[0] = k * p - o * l;
+   t[0] = k * p - o * l;
+   t[5] = i * n - m * j;
+   t[5] = i * n - m * j;
+   */
+  x1 = glmm_fnmadd(vcombine_f32(vdup_lane_f32(kl, 0), jj),
+                   vcombine_f32(vdup_lane_f32(op, 1), mm),
+                   vmulq_f32(vcombine_f32(vdup_lane_f32(op, 0), nn),
+                             vcombine_f32(vdup_lane_f32(kl, 1), ii)));
+
+  /*
+     a * (f * t[0] - g * t[1] + h * t[2])
+   - b * (e * t[0] - g * t[3] + h * t[4])
+   + c * (e * t[1] - f * t[3] + h * t[5])
+   - d * (e * t[2] - f * t[4] + g * t[5])
+   */
+  x2 = glmm_fnmadd(vcombine_f32(vdup_lane_f32(gh, 1), vdup_lane_f32(ef, 0)),
+                   vcombine_f32(vget_low_f32(a1.val[0]), t34),
+                   vmulq_f32(vcombine_f32(ef, vdup_lane_f32(ef, 1)),
+                             vcombine_f32(vget_low_f32(x1), t12)));
+
+  x2 = glmm_fmadd(vcombine_f32(vdup_lane_f32(gh, 0), gh),
+                  vcombine_f32(vget_low_f32(a1.val[1]), vget_high_f32(x1)), x2);
+
+  x2 = glmm_xor(x2, x3);
+
+  return glmm_hadd(vmulq_f32(x2, r0));
+}
+
 #endif
 #endif /* cglm_mat4_neon_h */
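Note (reviewer aid, not part of the patch): the NEON routine computes the same cofactor
expansion as the scalar #else branch of glm_mat4_det, just spread across lanes; the
t[0]..t[5] comments in glm_mat4_det_neon spell out the shared 2x2 sub-determinants. Below
is a minimal scalar sketch of that expansion for cross-checking the SIMD path on an ARM
build; mat4_det_ref is a hypothetical helper name for this check, not cglm API.

#include <cglm/cglm.h>

/* hypothetical reference helper: scalar cofactor expansion matching the
   t[0]..t[5] comments in glm_mat4_det_neon */
static float
mat4_det_ref(mat4 mat) {
  float t[6];
  float a = mat[0][0], b = mat[0][1], c = mat[0][2], d = mat[0][3],
        e = mat[1][0], f = mat[1][1], g = mat[1][2], h = mat[1][3],
        i = mat[2][0], j = mat[2][1], k = mat[2][2], l = mat[2][3],
        m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3];

  /* 2x2 sub-determinants shared by the four cofactors */
  t[0] = k * p - o * l;
  t[1] = j * p - n * l;
  t[2] = j * o - n * k;
  t[3] = i * p - m * l;
  t[4] = i * o - m * k;
  t[5] = i * n - m * j;

  /* expansion along the first row of the transpose */
  return a * (f * t[0] - g * t[1] + h * t[2])
       - b * (e * t[0] - g * t[3] + h * t[4])
       + c * (e * t[1] - f * t[3] + h * t[5])
       - d * (e * t[2] - f * t[4] + g * t[5]);
}

Since det(A) = det(At), reading the column-major mat4 as if it were row-major
(a = mat[0][0] ... p = mat[3][3]) yields the same determinant, which is why both the
scalar branch and the NEON code skip the transpose. As a quick sanity check, glm_mat4_det
on GLM_MAT4_IDENTITY_INIT should return 1.0f and agree with mat4_det_ref.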