From f0e09776d772668287453d8738c8727936765da9 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 2 Apr 2024 02:36:16 +0300 Subject: [PATCH] arm, neon: optimize glmm_vhadd and add glmm_vdot --- include/cglm/simd/arm.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h index 1578390..eb999f1 100644 --- a/include/cglm/simd/arm.h +++ b/include/cglm/simd/arm.h @@ -63,8 +63,17 @@ static inline float32x4_t glmm_max(float32x4_t a, float32x4_t b) { return vmaxq_ static inline float32x4_t glmm_vhadd(float32x4_t v) { + float32x4_t p; + p = vpaddq_f32(v, v); /* [a+b, c+d, a+b, c+d] */ + return vpaddq_f32(p, p); /* [t, t, t, t] */; + + /* TODO: measure speed of this compared to the above */ + /* return vdupq_n_f32(vaddvq_f32(v)); */ + + /* return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)), vaddq_f32(glmm_splat_z(v), glmm_splat_w(v))); + */ /* this seems slower: v = vaddq_f32(v, vrev64q_f32(v)); @@ -108,6 +117,12 @@ glmm_dot(float32x4_t a, float32x4_t b) { return glmm_hadd(vmulq_f32(a, b)); } +static inline +float32x4_t +glmm_vdot(float32x4_t a, float32x4_t b) { + return glmm_vhadd(vmulq_f32(a, b)); +} + static inline float glmm_norm(float32x4_t a) {