From c48026746350920966aff76f12cf24970f001204 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Tue, 13 Sep 2016 12:17:25 +0300 Subject: [PATCH] optimize affine translate with SSE --- include/cglm-affine.h | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/include/cglm-affine.h b/include/cglm-affine.h index 5517efd..f0a3c3a 100644 --- a/include/cglm-affine.h +++ b/include/cglm-affine.h @@ -15,11 +15,25 @@ CGLM_INLINE void glm_translate_to(mat4 m, vec3 v, mat4 dest) { - vec4 v1; - vec4 v2; - vec4 v3; mat4 t = GLM_MAT_IDENTITY_4F; +#if defined( __SSE__ ) || defined( __SSE2__ ) + _mm_store_ps(dest[3], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps(t[0]), + _mm_set1_ps(v[0])), + _mm_mul_ps(_mm_load_ps(t[1]), + _mm_set1_ps(v[1]))), + _mm_add_ps(_mm_mul_ps(_mm_load_ps(t[2]), + _mm_set1_ps(v[2])), + _mm_load_ps(t[3])))) + ; + + _mm_store_ps(dest[0], _mm_load_ps(m[0])); + _mm_store_ps(dest[1], _mm_load_ps(m[1])); + _mm_store_ps(dest[2], _mm_load_ps(m[2])); +#else + vec4 v1, v2, v3; + glm_vec4_scale(t[0], v[0], v1); glm_vec4_scale(t[1], v[1], v2); glm_vec4_scale(t[2], v[2], v3); @@ -29,14 +43,24 @@ glm_translate_to(mat4 m, vec3 v, mat4 dest) { glm_vec4_add(v3, t[3], t[3]); glm__memcpy(float, dest, t, sizeof(mat4)); +#endif } CGLM_INLINE void glm_translate(mat4 m, vec3 v) { - vec4 v1; - vec4 v2; - vec4 v3; +#if defined( __SSE__ ) || defined( __SSE2__ ) + _mm_store_ps(m[3], + _mm_add_ps(_mm_add_ps(_mm_mul_ps(_mm_load_ps(m[0]), + _mm_set1_ps(v[0])), + _mm_mul_ps(_mm_load_ps(m[1]), + _mm_set1_ps(v[1]))), + _mm_add_ps(_mm_mul_ps(_mm_load_ps(m[2]), + _mm_set1_ps(v[2])), + _mm_load_ps(m[3])))) + ; +#else + vec4 v1, v2, v3; glm_vec4_scale(m[0], v[0], v1); glm_vec4_scale(m[1], v[1], v2); @@ -45,6 +69,7 @@ glm_translate(mat4 m, vec3 v) { glm_vec4_add(v1, m[3], m[3]); glm_vec4_add(v2, m[3], m[3]); glm_vec4_add(v3, m[3], m[3]); +#endif } CGLM_INLINE