diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h
index 7a72f97..8971678 100644
--- a/include/cglm/mat4.h
+++ b/include/cglm/mat4.h
@@ -536,12 +536,7 @@ glm_mat4_scale(mat4 m, float s) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
   glm_mat4_scale_sse2(m, s);
 #elif defined(CGLM_NEON_FP)
-  float32x4_t v0;
-  v0 = vdupq_n_f32(s);
-  vst1q_f32(m[0], vmulq_f32(vld1q_f32(m[0]), v0));
-  vst1q_f32(m[1], vmulq_f32(vld1q_f32(m[1]), v0));
-  vst1q_f32(m[2], vmulq_f32(vld1q_f32(m[2]), v0));
-  vst1q_f32(m[3], vmulq_f32(vld1q_f32(m[3]), v0));
+  glm_mat4_scale_neon(m, s);
 #else
   glm_mat4_scale_p(m, s);
 #endif
diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h
index 0623dc0..205ca69 100644
--- a/include/cglm/simd/neon/mat4.h
+++ b/include/cglm/simd/neon/mat4.h
@@ -12,6 +12,19 @@
 #include "../../common.h"
 #include "../intrin.h"
 
+CGLM_INLINE
+void
+glm_mat4_scale_neon(mat4 m, float s) {
+  /* splat s into all four lanes so one multiply scales four floats */
+  float32x4_t factor = vdupq_n_f32(s);
+  int         i;
+
+  /* for each of m[0]..m[3]: load 4 floats, scale, store back in place */
+  for (i = 0; i < 4; i++) {
+    vst1q_f32(m[i], vmulq_f32(vld1q_f32(m[i]), factor));
+  }
+}
+
 CGLM_INLINE
 void
 glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) {