From 99c8aeab77ac1f245b1446ba37637cf851345ce0 Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Tue, 24 Jan 2017 23:07:14 +0300
Subject: [PATCH] fix mat4_mulv and implement sse2 version

---
 include/cglm-mat-simd-sse2.h | 19 +++++++++++++++++++
 include/cglm-mat.h           | 14 ++++++++++----
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/include/cglm-mat-simd-sse2.h b/include/cglm-mat-simd-sse2.h
index db42d3a..5803e39 100644
--- a/include/cglm-mat-simd-sse2.h
+++ b/include/cglm-mat-simd-sse2.h
@@ -81,6 +81,25 @@ glm_mat4_mul_sse2(mat4 m1, mat4 m2, mat4 dest) {
                                      _mm_mul_ps(_mm_shuffle1_ps1(r, 3), l3))));
 }
 
+CGLM_INLINE
+void
+glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) {
+  __m128 x0, x1, x2;
+
+  x0 = _mm_load_ps(v);
+  x1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]),
+                             _mm_shuffle1_ps1(x0, 0)),
+                  _mm_mul_ps(_mm_load_ps(m[1]),
+                             _mm_shuffle1_ps1(x0, 1)));
+
+  x2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]),
+                             _mm_shuffle1_ps1(x0, 2)),
+                  _mm_mul_ps(_mm_load_ps(m[3]),
+                             _mm_shuffle1_ps1(x0, 3)));
+
+  _mm_store_ps(dest, _mm_add_ps(x1, x2));
+}
+
 CGLM_INLINE
 float
 glm_mat4_det_sse2(mat4 mat) {
diff --git a/include/cglm-mat.h b/include/cglm-mat.h
index 2398ec1..65784f7 100644
--- a/include/cglm-mat.h
+++ b/include/cglm-mat.h
@@ -234,10 +234,16 @@ glm_mat4_mulN(mat4 * __restrict matrices[], int len, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_mulv(mat4 m, vec4 v, vec4 dest) {
-  dest[0] = m[0][0] * v[0] + m[1][0] * v[1] + m[2][0] * v[2] + m[3][0] * v[3];
-  dest[1] = m[0][1] * v[0] + m[1][1] * v[1] + m[2][1] * v[2] + m[3][1] * v[3];
-  dest[2] = m[0][2] * v[0] + m[1][2] * v[1] + m[2][2] * v[2] + m[3][2] * v[3];
-  dest[3] = m[0][3] * v[0] + m[1][3] * v[1] + m[2][3] * v[2] + m[3][3] * v[3];
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  glm_mat4_mulv_sse2(m, v, dest);
+#else
+  vec4 res;
+  res[0] = m[0][0] * v[0] + m[1][0] * v[1] + m[2][0] * v[2] + m[3][0] * v[3];
+  res[1] = m[0][1] * v[0] + m[1][1] * v[1] + m[2][1] * v[2] + m[3][1] * v[3];
+  res[2] = m[0][2] * v[0] + m[1][2] * v[1] + m[2][2] * v[2] + m[3][2] * v[3];
+  res[3] = m[0][3] * v[0] + m[1][3] * v[1] + m[2][3] * v[2] + m[3][3] * v[3];
+  glm_vec4_dup(res, dest);
+#endif
 }
 
 /*!