From f0e09776d772668287453d8738c8727936765da9 Mon Sep 17 00:00:00 2001
From: Recep Aslantas <info@recp.me>
Date: Tue, 2 Apr 2024 02:36:16 +0300
Subject: [PATCH] arm, neon: optimize glmm_vhadd and add glmm_vdot

---
 include/cglm/simd/arm.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h
index 1578390..eb999f1 100644
--- a/include/cglm/simd/arm.h
+++ b/include/cglm/simd/arm.h
@@ -63,8 +63,17 @@ static inline float32x4_t glmm_max(float32x4_t a, float32x4_t b) { return vmaxq_
 static inline
 float32x4_t
 glmm_vhadd(float32x4_t v) {
+  float32x4_t p;
+  p = vpaddq_f32(v, v); /* [a+b, c+d, a+b, c+d] */
+  return vpaddq_f32(p, p); /* [t, t, t, t] */;
+
+  /* TODO: measure the speed of this compared to the version above */
+  /* return vdupq_n_f32(vaddvq_f32(v)); */
+
+  /*
   return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)),
                    vaddq_f32(glmm_splat_z(v), glmm_splat_w(v)));
+   */
   /*
    this seems slower:
    v = vaddq_f32(v, vrev64q_f32(v));
@@ -108,6 +117,12 @@ glmm_dot(float32x4_t a, float32x4_t b) {
   return glmm_hadd(vmulq_f32(a, b));
 }
 
+static inline
+float32x4_t
+glmm_vdot(float32x4_t a, float32x4_t b) {
+  return glmm_vhadd(vmulq_f32(a, b)); /* sum of a[i] * b[i], in every lane */
+}
+
 static inline
 float
 glmm_norm(float32x4_t a) {