From 65292a94a6d3ab7b0d0b2d91668a69a588feb0c1 Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Sat, 24 Apr 2021 00:00:00 +0300
Subject: [PATCH] swizzling functions for NEON

---
 CREDITS                 |  5 +++++
 include/cglm/simd/arm.h | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/CREDITS b/CREDITS
index daeb27e..94d9d40 100644
--- a/CREDITS
+++ b/CREDITS
@@ -74,3 +74,8 @@ Link to paper: http://webserver2.tecgraf.puc-rio.br/~mgattass/cg/trbRR/Fast%20Mi
 14. ARM NEON: Matrix Vector Multiplication
 
 https://stackoverflow.com/a/57793352/2676533
+
+15. ARM NEON Vector Swizzling and Permute
+
+https://stackoverflow.com/questions/32536265/how-to-convert-mm-shuffle-ps-sse-intrinsic-to-neon-intrinsic
+http://github.com/microsoft/DirectXMath
diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h
index 4c8a3b4..1a5adc9 100644
--- a/include/cglm/simd/arm.h
+++ b/include/cglm/simd/arm.h
@@ -21,6 +21,46 @@
 #define glmm_splat_z(x) vdupq_lane_f32(vget_high_f32(x), 0)
 #define glmm_splat_w(x) vdupq_lane_f32(vget_high_f32(x), 1)
 
+#define SWIZZLE(NAME) \
+  static inline float32x4_t NAME(glmm_128 v)
+
+/* TODO:
+ * I'm not sure if glmm_xxxx(), glmm_yyyy()... is better than glmm_0000()...
+ */
+
+SWIZZLE(glmm_0000) { return vdupq_lane_f32(vget_low_f32(v), 0); }
+SWIZZLE(glmm_1111) { return vdupq_lane_f32(vget_low_f32(v), 1); }
+SWIZZLE(glmm_2222) { return vdupq_lane_f32(vget_high_f32(v), 0); }
+SWIZZLE(glmm_3333) { return vdupq_lane_f32(vget_high_f32(v), 1); }
+
+SWIZZLE(glmm_1032) { return vrev64q_f32(v); }
+
+SWIZZLE(glmm_0101) { float32x2_t vt = vget_low_f32(v); return vcombine_f32(vt, vt); }
+SWIZZLE(glmm_2323) { float32x2_t vt = vget_high_f32(v); return vcombine_f32(vt, vt); }
+SWIZZLE(glmm_1010) { float32x2_t vt = vrev64_f32(vget_low_f32(v)); return vcombine_f32(vt, vt); }
+SWIZZLE(glmm_3232) { float32x2_t vt = vrev64_f32(vget_high_f32(v)); return vcombine_f32(vt, vt); }
+
+SWIZZLE(glmm_0132) { return vcombine_f32(vget_low_f32(v), vrev64_f32(vget_high_f32(v))); }
+SWIZZLE(glmm_1023) { return vcombine_f32(vrev64_f32(vget_low_f32(v)), vget_high_f32(v)); }
+SWIZZLE(glmm_2310) { return vcombine_f32(vget_high_f32(v), vrev64_f32(vget_low_f32(v))); }
+SWIZZLE(glmm_3201) { return vcombine_f32(vrev64_f32(vget_high_f32(v)), vget_low_f32(v)); }
+SWIZZLE(glmm_3210) { return vcombine_f32(vrev64_f32(vget_high_f32(v)), vrev64_f32(vget_low_f32(v))); }
+
+SWIZZLE(glmm_0022) { return vtrnq_f32(v, v).val[0]; }
+SWIZZLE(glmm_1133) { return vtrnq_f32(v, v).val[1]; }
+
+SWIZZLE(glmm_0011) { return vzipq_f32(v, v).val[0]; }
+SWIZZLE(glmm_2233) { return vzipq_f32(v, v).val[1]; }
+
+SWIZZLE(glmm_0202) { return vuzpq_f32(v, v).val[0]; }
+SWIZZLE(glmm_1313) { return vuzpq_f32(v, v).val[1]; }
+
+SWIZZLE(glmm_1230) { return vextq_f32(v, v, 1); }
+SWIZZLE(glmm_2301) { return vextq_f32(v, v, 2); }
+SWIZZLE(glmm_3012) { return vextq_f32(v, v, 3); }
+
+#undef SWIZZLE
+
 static inline float32x4_t
 glmm_abs(float32x4_t v) {
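
Review note (not part of the patch): in the helpers above, the four digits
name the source lanes of the result from lane 0 to lane 3, e.g.
glmm_2301(v) yields { v[2], v[3], v[0], v[1] }. The minimal standalone
sketch below demonstrates that lane semantics on an AArch64/NEON target.
The neon_2301()/neon_0022() helpers are hypothetical stand-ins that mirror
two of the patched functions so the snippet compiles without cglm (in
cglm, glmm_128 is float32x4_t on NEON, so the bodies match):

  #include <arm_neon.h>
  #include <stdio.h>

  /* stand-in for glmm_2301: vext on (v, v) rotates the lanes down by two,
     giving { v2, v3, v0, v1 } */
  static inline float32x4_t neon_2301(float32x4_t v) {
    return vextq_f32(v, v, 2);
  }

  /* stand-in for glmm_0022: the even-lane half of a self-transpose,
     giving { v0, v0, v2, v2 } */
  static inline float32x4_t neon_0022(float32x4_t v) {
    return vtrnq_f32(v, v).val[0];
  }

  int main(void) {
    float in[4] = {0.0f, 1.0f, 2.0f, 3.0f}, out[4];
    float32x4_t v = vld1q_f32(in);

    vst1q_f32(out, neon_2301(v));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 2 3 0 1 */

    vst1q_f32(out, neon_0022(v));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 0 0 2 2 */
    return 0;
  }

Only these fixed patterns are provided because NEON has no immediate-
controlled shuffle comparable to SSE's _mm_shuffle_ps; each pattern is
built from vdupq/vrev64q/vcombine/vtrnq/vzipq/vuzpq/vextq instead, which
is the approach described in the two references added to CREDITS.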