From e34601f5789cf211a137230c3a0b301965a0a84b Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Sat, 29 Aug 2020 11:51:07 +0300
Subject: [PATCH] arm neon: multiply mat4 with vec4

---
 include/cglm/mat4.h           |  2 ++
 include/cglm/simd/neon/mat4.h | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h
index 483c9dd..cda5285 100644
--- a/include/cglm/mat4.h
+++ b/include/cglm/mat4.h
@@ -358,6 +358,8 @@ void
 glm_mat4_mulv(mat4 m, vec4 v, vec4 dest) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
   glm_mat4_mulv_sse2(m, v, dest);
+#elif defined(CGLM_NEON_FP)
+  glm_mat4_mulv_neon(m, v, dest);
 #else
   vec4 res;
   res[0] = m[0][0] * v[0] + m[1][0] * v[1] + m[2][0] * v[2] + m[3][0] * v[3];
diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h
index 955cb80..36d347b 100644
--- a/include/cglm/simd/neon/mat4.h
+++ b/include/cglm/simd/neon/mat4.h
@@ -79,5 +79,27 @@ glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) {
   vst1q_f32(dest[3], d3);
 }
 
+CGLM_INLINE
+void
+glm_mat4_mulv_neon(mat4 m, vec4 v, vec4 dest) { /* dest = m[0]*v[0] + m[1]*v[1] + m[2]*v[2] + m[3]*v[3] */
+  float32x4_t l0, l1, l2, l3; /* m[0]..m[3], four floats each; l0 is reused as the accumulator */
+  float32x2_t vlo, vhi; /* vlo = { v[0], v[1] }, vhi = { v[2], v[3] } */
+
+  l0 = vld1q_f32(m[0]);
+  l1 = vld1q_f32(m[1]);
+  l2 = vld1q_f32(m[2]);
+  l3 = vld1q_f32(m[3]);
+
+  vlo = vld1_f32(&v[0]);
+  vhi = vld1_f32(&v[2]);
+
+  l0 = vmulq_lane_f32(l0, vlo, 0); /* l0 = m[0] * v[0] (every lane scaled by lane 0 of vlo) */
+  l0 = vmlaq_lane_f32(l0, l1, vlo, 1); /* l0 += m[1] * v[1] */
+  l0 = vmlaq_lane_f32(l0, l2, vhi, 0); /* l0 += m[2] * v[2] */
+  l0 = vmlaq_lane_f32(l0, l3, vhi, 1); /* l0 += m[3] * v[3] */
+
+  vst1q_f32(dest, l0); /* single store; every read of m and v happened above */
+}
+
 #endif
 #endif /* cglm_mat4_neon_h */