From 2be6ac949bdfa389cdc8465b6fb7bb0db9498b6e Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sat, 1 May 2021 03:18:26 +0300 Subject: [PATCH] sse: optimize glm_quat_mul with sse --- include/cglm/simd/sse2/quat.h | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/include/cglm/simd/sse2/quat.h b/include/cglm/simd/sse2/quat.h index ae82885..3f7405f 100644 --- a/include/cglm/simd/sse2/quat.h +++ b/include/cglm/simd/sse2/quat.h @@ -22,21 +22,30 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) { a1 a2 − b1 b2 − c1 c2 − d1 d2 */ - __m128 xp, xq, x0, r; + __m128 xp, xq, x1, x2, x3, r, x, y, z; xp = glmm_load(p); /* 3 2 1 0 */ xq = glmm_load(q); + x1 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); + r = _mm_mul_ps(glmm_splat_w(xp), xq); + + x2 = _mm_unpackhi_ps(x1, x1); + x3 = glmm_shuff1(x1, 3, 2, 0, 1); + x = glmm_splat_x(xp); + y = glmm_splat_y(xp); + z = glmm_splat_z(xp); - r = _mm_mul_ps(glmm_splat(xp, 3), xq); - - x0 = _mm_xor_ps(glmm_splat(xp, 0), _mm_set_ps(-0.f, 0.f, -0.f, 0.f)); - r = _mm_add_ps(r, _mm_mul_ps(x0, glmm_shuff1(xq, 0, 1, 2, 3))); - - x0 = _mm_xor_ps(glmm_splat(xp, 1), _mm_set_ps(-0.f, -0.f, 0.f, 0.f)); - r = _mm_add_ps(r, _mm_mul_ps(x0, glmm_shuff1(xq, 1, 0, 3, 2))); - - x0 = _mm_xor_ps(glmm_splat(xp, 2), _mm_set_ps(-0.f, 0.f, 0.f, -0.f)); - r = _mm_add_ps(r, _mm_mul_ps(x0, glmm_shuff1(xq, 2, 3, 0, 1))); + x = _mm_xor_ps(x, x1); + y = _mm_xor_ps(y, x2); + z = _mm_xor_ps(z, x3); + + x1 = glmm_shuff1(xq, 0, 1, 2, 3); + x2 = glmm_shuff1(xq, 1, 0, 3, 2); + x3 = glmm_shuff1(xq, 2, 3, 0, 1); + + r = glmm_fmadd(x, x1, r); + r = glmm_fmadd(y, x2, r); + r = glmm_fmadd(z, x3, r); glmm_store(dest, r); }