From e34601f5789cf211a137230c3a0b301965a0a84b Mon Sep 17 00:00:00 2001
From: Recep Aslantas <info@recp.me>
Date: Sat, 29 Aug 2020 11:51:07 +0300
Subject: [PATCH] arm neon: multiply mat4 with vec4

---
 include/cglm/mat4.h           |  2 ++
 include/cglm/simd/neon/mat4.h | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h
index 483c9dd..cda5285 100644
--- a/include/cglm/mat4.h
+++ b/include/cglm/mat4.h
@@ -358,6 +358,8 @@ void
 glm_mat4_mulv(mat4 m, vec4 v, vec4 dest) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
   glm_mat4_mulv_sse2(m, v, dest);
+#elif defined(CGLM_NEON_FP)
+  glm_mat4_mulv_neon(m, v, dest);
 #else
   vec4 res;
   res[0] = m[0][0] * v[0] + m[1][0] * v[1] + m[2][0] * v[2] + m[3][0] * v[3];
diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h
index 955cb80..36d347b 100644
--- a/include/cglm/simd/neon/mat4.h
+++ b/include/cglm/simd/neon/mat4.h
@@ -79,5 +79,27 @@ glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) {
   vst1q_f32(dest[3], d3);
 }
 
+CGLM_INLINE
+void
+glm_mat4_mulv_neon(mat4 m, vec4 v, vec4 dest) {
+  /* dest = m[0]*v[0] + m[1]*v[1] + m[2]*v[2] + m[3]*v[3] (4 lanes each) */
+  float32x4_t col0, col1, col2, col3, acc;
+  float32x2_t v01, v23;
+
+  col0 = vld1q_f32(m[0]);
+  col1 = vld1q_f32(m[1]);
+  col2 = vld1q_f32(m[2]);
+  col3 = vld1q_f32(m[3]);
+  v01  = vld1_f32(&v[0]); /* lanes: v[0], v[1] */
+  v23  = vld1_f32(&v[2]); /* lanes: v[2], v[3] */
+
+  acc  = vmulq_lane_f32(col0, v01, 0);
+  acc  = vmlaq_lane_f32(acc, col1, v01, 1);
+  acc  = vmlaq_lane_f32(acc, col2, v23, 0);
+  acc  = vmlaq_lane_f32(acc, col3, v23, 1);
+
+  vst1q_f32(dest, acc);
+}
+
 #endif
 #endif /* cglm_mat4_neon_h */