affine.h: move glm_translate and glm_translate_x/y/z from raw SSE intrinsics
to the glmm_* wrappers, selected by CGLM_SIMD instead of __SSE__/__SSE2__.

In glm_translate the components of the vec3 argument are broadcast one at a
time with glmm_set1(v[i]). A vec3 is three floats, so it must not be read
with the four-float glmm_load. The scalar fallback accumulates straight into
m[3] with glm_vec4_muladds.

Note: the blob ids on the index line were not regenerated for this revision
of the patch; apply it by content.

diff --git a/include/cglm/affine.h b/include/cglm/affine.h
index 15f2853..504e13b 100644
--- a/include/cglm/affine.h
+++ b/include/cglm/affine.h
@@ -50,26 +50,23 @@
 CGLM_INLINE
 void
 glm_translate(mat4 m, vec3 v) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
+#if defined(CGLM_SIMD)
+  glmm_128 m0, m1, m2, m3;
+
+  m0 = glmm_load(m[0]);
+  m1 = glmm_load(m[1]);
+  m2 = glmm_load(m[2]);
+  m3 = glmm_load(m[3]);
+
+  /* v is a 3-float vec3: broadcast per component, never a 4-float load */
   glmm_store(m[3],
-             _mm_add_ps(_mm_add_ps(_mm_mul_ps(glmm_load(m[0]),
-                                              _mm_set1_ps(v[0])),
-                                   _mm_mul_ps(glmm_load(m[1]),
-                                              _mm_set1_ps(v[1]))),
-                        _mm_add_ps(_mm_mul_ps(glmm_load(m[2]),
-                                              _mm_set1_ps(v[2])),
-                                   glmm_load(m[3]))))
-  ;
+             glmm_fmadd(m0, glmm_set1(v[0]),
+                        glmm_fmadd(m1, glmm_set1(v[1]),
+                                   glmm_fmadd(m2, glmm_set1(v[2]), m3))));
 #else
-  vec4 v1, v2, v3;
-
-  glm_vec4_scale(m[0], v[0], v1);
-  glm_vec4_scale(m[1], v[1], v2);
-  glm_vec4_scale(m[2], v[2], v3);
-
-  glm_vec4_add(v1, m[3], m[3]);
-  glm_vec4_add(v2, m[3], m[3]);
-  glm_vec4_add(v3, m[3], m[3]);
+  glm_vec4_muladds(m[0], v[0], m[3]);
+  glm_vec4_muladds(m[1], v[1], m[3]);
+  glm_vec4_muladds(m[2], v[2], m[3]);
 #endif
 }

@@ -99,12 +96,8 @@ glm_translate_to(mat4 m, vec3 v, mat4 dest) {
 CGLM_INLINE
 void
 glm_translate_x(mat4 m, float x) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(m[3],
-             _mm_add_ps(_mm_mul_ps(glmm_load(m[0]),
-                                   _mm_set1_ps(x)),
-                        glmm_load(m[3])))
-  ;
+#if defined(CGLM_SIMD)
+  glmm_store(m[3], glmm_fmadd(glmm_load(m[0]), glmm_set1(x), glmm_load(m[3])));
 #else
   vec4 v1;
   glm_vec4_scale(m[0], x, v1);
@@ -121,12 +114,8 @@ glm_translate_x(mat4 m, float x) {
 CGLM_INLINE
 void
 glm_translate_y(mat4 m, float y) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(m[3],
-             _mm_add_ps(_mm_mul_ps(glmm_load(m[1]),
-                                   _mm_set1_ps(y)),
-                        glmm_load(m[3])))
-  ;
+#if defined(CGLM_SIMD)
+  glmm_store(m[3], glmm_fmadd(glmm_load(m[1]), glmm_set1(y), glmm_load(m[3])));
 #else
   vec4 v1;
   glm_vec4_scale(m[1], y, v1);
@@ -143,12 +132,8 @@ glm_translate_y(mat4 m, float y) {
 CGLM_INLINE
 void
 glm_translate_z(mat4 m, float z) {
-#if defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(m[3],
-             _mm_add_ps(_mm_mul_ps(glmm_load(m[2]),
-                                   _mm_set1_ps(z)),
-                        glmm_load(m[3])))
-  ;
+#if defined(CGLM_SIMD)
+  glmm_store(m[3], glmm_fmadd(glmm_load(m[2]), glmm_set1(z), glmm_load(m[3])));
 #else
   vec4 v1;
   glm_vec4_scale(m[2], z, v1);