From 0ab50f720809e91c1d4f1edc5849ed0fc920d871 Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Sun, 25 Apr 2021 03:41:39 +0300
Subject: [PATCH] arm, neon: update mat4_mul to use FMA

---
 include/cglm/simd/arm.h       |  2 +-
 include/cglm/simd/neon/mat4.h | 57 +++++++++++++++++------------------
 2 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h
index a7009e5..e943ce3 100644
--- a/include/cglm/simd/arm.h
+++ b/include/cglm/simd/arm.h
@@ -123,7 +123,7 @@ static inline
 float32x4_t
 glmm_fmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
 #if CGLM_ARM64
-  return vfmaq_f32(c, a, b);
+  return vfmaq_f32(c, a, b); /* why vfmaq_f32 is slower than vmlaq_f32 ??? */
 #else
   return vmlaq_f32(c, a, b);
 #endif
diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h
index 2fecebe..f1b3b60 100644
--- a/include/cglm/simd/neon/mat4.h
+++ b/include/cglm/simd/neon/mat4.h
@@ -41,42 +41,39 @@ glm_mat4_transp_neon(mat4 m, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) {
-  /* D = R * L (Column-Major) */
-  float32x4_t l0, l1, l2, l3, r, d0, d1, d2, d3;
+  /* D = R * L (Column-Major) */
 
-  l0 = vld1q_f32(m2[0]);
-  l1 = vld1q_f32(m2[1]);
-  l2 = vld1q_f32(m2[2]);
-  l3 = vld1q_f32(m2[3]);
+  glmm_128 l0, l1, l2, l3, r0, r1, r2, r3, v0, v1, v2, v3;
 
-  r  = vld1q_f32(m1[0]);
-  d0 = vmulq_lane_f32(r, vget_low_f32(l0), 0);
-  d1 = vmulq_lane_f32(r, vget_low_f32(l1), 0);
-  d2 = vmulq_lane_f32(r, vget_low_f32(l2), 0);
-  d3 = vmulq_lane_f32(r, vget_low_f32(l3), 0);
+  l0 = glmm_load(m1[0]); r0 = glmm_load(m2[0]);
+  l1 = glmm_load(m1[1]); r1 = glmm_load(m2[1]);
+  l2 = glmm_load(m1[2]); r2 = glmm_load(m2[2]);
+  l3 = glmm_load(m1[3]); r3 = glmm_load(m2[3]);
 
-  r  = vld1q_f32(m1[1]);
-  d0 = vmlaq_lane_f32(d0, r, vget_low_f32(l0), 1);
-  d1 = vmlaq_lane_f32(d1, r, vget_low_f32(l1), 1);
-  d2 = vmlaq_lane_f32(d2, r, vget_low_f32(l2), 1);
-  d3 = vmlaq_lane_f32(d3, r, vget_low_f32(l3), 1);
+  v0 = vmulq_f32(glmm_splat_x(r0), l0);
+  v1 = vmulq_f32(glmm_splat_x(r1), l0);
+  v2 = vmulq_f32(glmm_splat_x(r2), l0);
+  v3 = vmulq_f32(glmm_splat_x(r3), l0);
 
-  r  = vld1q_f32(m1[2]);
-  d0 = vmlaq_lane_f32(d0, r, vget_high_f32(l0), 0);
-  d1 = vmlaq_lane_f32(d1, r, vget_high_f32(l1), 0);
-  d2 = vmlaq_lane_f32(d2, r, vget_high_f32(l2), 0);
-  d3 = vmlaq_lane_f32(d3, r, vget_high_f32(l3), 0);
+  v0 = glmm_fmadd(glmm_splat_y(r0), l1, v0);
+  v1 = glmm_fmadd(glmm_splat_y(r1), l1, v1);
+  v2 = glmm_fmadd(glmm_splat_y(r2), l1, v2);
+  v3 = glmm_fmadd(glmm_splat_y(r3), l1, v3);
 
-  r  = vld1q_f32(m1[3]);
-  d0 = vmlaq_lane_f32(d0, r, vget_high_f32(l0), 1);
-  d1 = vmlaq_lane_f32(d1, r, vget_high_f32(l1), 1);
-  d2 = vmlaq_lane_f32(d2, r, vget_high_f32(l2), 1);
-  d3 = vmlaq_lane_f32(d3, r, vget_high_f32(l3), 1);
+  v0 = glmm_fmadd(glmm_splat_z(r0), l2, v0);
+  v1 = glmm_fmadd(glmm_splat_z(r1), l2, v1);
+  v2 = glmm_fmadd(glmm_splat_z(r2), l2, v2);
+  v3 = glmm_fmadd(glmm_splat_z(r3), l2, v3);
 
-  vst1q_f32(dest[0], d0);
-  vst1q_f32(dest[1], d1);
-  vst1q_f32(dest[2], d2);
-  vst1q_f32(dest[3], d3);
+  v0 = glmm_fmadd(glmm_splat_w(r0), l3, v0);
+  v1 = glmm_fmadd(glmm_splat_w(r1), l3, v1);
+  v2 = glmm_fmadd(glmm_splat_w(r2), l3, v2);
+  v3 = glmm_fmadd(glmm_splat_w(r3), l3, v3);
+
+  glmm_store(dest[0], v0);
+  glmm_store(dest[1], v1);
+  glmm_store(dest[2], v2);
+  glmm_store(dest[3], v3);
 }
 
 CGLM_INLINE
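
Note (not part of the patch): the rewritten glm_mat4_mul_neon computes each destination column as a
linear combination of m1's columns, weighted by the entries of the corresponding m2 column; each
glmm_splat_* broadcasts one of those weights across a lane and glmm_fmadd accumulates it into one of
the four independent accumulators v0..v3, which on AArch64 maps to fused vfmaq_f32. Below is a
minimal scalar sketch of that same column-major product; the mat4 typedef and the mat4_mul_ref name
are illustrative only, not cglm API.

    /* Scalar reference: dest[c][r] = sum_k m2[c][k] * m1[k][r] (column-major). */
    #include <stdio.h>

    typedef float mat4[4][4];   /* column-major: m[col][row], matching cglm's layout */

    static void
    mat4_mul_ref(mat4 m1, mat4 m2, mat4 dest) {
      int c, r, k;
      for (c = 0; c < 4; c++) {        /* one destination column per iteration      */
        for (r = 0; r < 4; r++) {
          float sum = 0.0f;
          for (k = 0; k < 4; k++)      /* splat(m2[c][k]) times column k of m1      */
            sum += m2[c][k] * m1[k][r];
          dest[c][r] = sum;
        }
      }
    }

    int main(void) {
      mat4 a = {{1, 0, 0, 0}, {0, 1, 0, 0}, {0, 0, 1, 0}, {2, 3, 4, 1}}; /* translate by (2,3,4) */
      mat4 b = {{2, 0, 0, 0}, {0, 2, 0, 0}, {0, 0, 2, 0}, {0, 0, 0, 1}}; /* uniform scale by 2   */
      mat4 d;
      mat4_mul_ref(a, b, d);
      printf("%.1f %.1f %.1f\n", d[3][0], d[3][1], d[3][2]); /* prints 2.0 3.0 4.0 */
      return 0;
    }

Keeping four separate accumulators, as the patched NEON code does, lets the FMA chains for the four
output columns proceed independently instead of serializing on a single running register.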