diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h index 8971678..483c9dd 100644 --- a/include/cglm/mat4.h +++ b/include/cglm/mat4.h @@ -476,6 +476,8 @@ void glm_mat4_transpose_to(mat4 m, mat4 dest) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_transp_sse2(m, dest); +#elif defined(CGLM_NEON_FP) + glm_mat4_transp_neon(m, dest); #else dest[0][0] = m[0][0]; dest[1][0] = m[0][1]; dest[0][1] = m[1][0]; dest[1][1] = m[1][1]; @@ -498,6 +500,8 @@ void glm_mat4_transpose(mat4 m) { #if defined( __SSE__ ) || defined( __SSE2__ ) glm_mat4_transp_sse2(m, m); +#elif defined(CGLM_NEON_FP) + glm_mat4_transp_neon(m, m); #else mat4 d; glm_mat4_transpose_to(m, d); diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index 205ca69..955cb80 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -25,6 +25,19 @@ glm_mat4_scale_neon(mat4 m, float s) { vst1q_f32(m[3], vmulq_f32(vld1q_f32(m[3]), v0)); } +CGLM_INLINE +void +glm_mat4_transp_neon(mat4 m, mat4 dest) { /* NEON transpose: writes dest[k][i] = m[i][k]; the whole of m is loaded before any store, so the in-place call with dest == m is safe */ + float32x4x4_t vmat; + + vmat = vld4q_f32(m[0]); /* de-interleaving load of 16 floats starting at m[0]: val[k] = (m[0][k], m[1][k], m[2][k], m[3][k]); assumes mat4 is 16 contiguous floats - confirm against the mat4 typedef */ + + vst1q_f32(dest[0], vmat.val[0]); + vst1q_f32(dest[1], vmat.val[1]); + vst1q_f32(dest[2], vmat.val[2]); + vst1q_f32(dest[3], vmat.val[3]); +} + CGLM_INLINE void glm_mat4_mul_neon(mat4 m1, mat4 m2, mat4 dest) {