/*
 * Patch summary (git unified diff touching two cglm headers).
 *
 * include/cglm/mat4.h:
 *   In glm_mat4_scale(mat4 m, float s), the CGLM_NEON_FP branch no longer
 *   carries the NEON statements inline; it now calls
 *   glm_mat4_scale_neon(m, s). The SSE branch (glm_mat4_scale_sse2) and the
 *   fallback branch (glm_mat4_scale_p) are unchanged context lines.
 *
 * include/cglm/simd/neon/mat4.h:
 *   Adds CGLM_INLINE void glm_mat4_scale_neon(mat4 m, float s), inserted
 *   ahead of glm_mat4_mul_neon. Its body is the same sequence that was
 *   removed from mat4.h: v0 = vdupq_n_f32(s), then for each of m[0]..m[3]
 *   a vld1q_f32 load, a vmulq_f32 by v0, and a vst1q_f32 store back to the
 *   same row, so m is overwritten in place and nothing is returned.
 *
 * NOTE(review): the hunks below sit on a single physical line, i.e. the
 * line breaks of the diff appear to have been flattened to spaces.
 * Presumably they must be restored before the patch can be applied; confirm
 * against the original commit.
 */
diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h index 7a72f97..8971678 100644 --- a/include/cglm/mat4.h +++ b/include/cglm/mat4.h @@ -536,12 +536,7 @@ glm_mat4_scale(mat4 m, float s) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_scale_sse2(m, s); #elif defined(CGLM_NEON_FP) - float32x4_t v0; - v0 = vdupq_n_f32(s); - vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), v0)); - vst1q_f32(m[1], vmulq_f32(vld1q_f32(m[1]), v0)); - vst1q_f32(m[2], vmulq_f32(vld1q_f32(m[2]), v0)); - vst1q_f32(m[3], vmulq_f32(vld1q_f32(m[3]), v0)); + glm_mat4_scale_neon(m, s); #else glm_mat4_scale_p(m, s); #endif diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index 0623dc0..205ca69 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -12,6 +12,19 @@ #include "../../common.h" #include "../intrin.h" +CGLM_INLINE +void +glm_mat4_scale_neon(mat4 m, float s) { + float32x4_t v0; + + v0 = vdupq_n_f32(s); + + vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), v0)); + vst1q_f32(m[1], vmulq_f32(vld1q_f32(m[1]), v0)); + vst1q_f32(m[2], vmulq_f32(vld1q_f32(m[2]), v0)); + vst1q_f32(m[3], vmulq_f32(vld1q_f32(m[3]), v0)); +} + CGLM_INLINE void glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) {