From 932f638d5a6b3bcb0897b059b74ddace2a204308 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Sun, 8 Apr 2018 12:31:32 +0300 Subject: [PATCH] optimize mat4 to quaternion * add SSE2 version and optimize scalar version --- include/cglm/mat4.h | 34 +++++++++++++++-------- include/cglm/simd/sse2/mat4.h | 51 +++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 11 deletions(-) diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h index ec7ac39..782a24b 100644 --- a/include/cglm/mat4.h +++ b/include/cglm/mat4.h @@ -343,25 +343,38 @@ glm_mat4_mulq(mat4 m, versor q, mat4 dest) { CGLM_INLINE void glm_mat4_quat(mat4 m, versor dest) { +#if defined( __SSE__ ) || defined( __SSE2__ ) + glm_mat4_quat_sse2(m, dest); +#else + vec4 vsgn, vzero = GLM_VEC4_ZERO_INIT; versor q; float m00, m10, m20, m01, m11, m21, m02, m12, m22; - m00 = m[0][0]; m10 = m[1][0]; m20 = m[2][0]; - m01 = m[0][1]; m11 = m[1][1]; m21 = m[2][1]; - m02 = m[0][2]; m12 = m[1][2]; m22 = m[2][2]; + m00 = m[0][0]; m01 = m[0][1]; m02 = m[0][2]; + m10 = m[1][0]; m11 = m[1][1]; m12 = m[1][2]; + m20 = m[2][0]; m21 = m[2][1]; m22 = m[2][2]; - q[0] = sqrtf(glm_max(0.0f, 1.0f + m00 + m11 + m22)) * 0.5f; /* w */ - q[1] = sqrtf(glm_max(0.0f, 1.0f + m00 - m11 - m22)) * 0.5f; /* x */ - q[2] = sqrtf(glm_max(0.0f, 1.0f - m00 + m11 - m22)) * 0.5f; /* y */ - q[3] = sqrtf(glm_max(0.0f, 1.0f - m00 - m11 + m22)) * 0.5f; /* z */ + q[0] = 1.0f + m00 + m11 + m22; /* w */ + q[1] = 1.0f + m00 - m11 - m22; /* x */ + q[2] = 1.0f - m00 + m11 - m22; /* y */ + q[3] = 1.0f - m00 - m11 + m22; /* z */ - q[1] *= glm_signf(m12 - m21); - q[2] *= glm_signf(m20 - m02); - q[3] *= glm_signf(m01 - m10); + glm_vec4_maxv(q, vzero, q); + glm_vec4_sqrt(q, q); + glm_vec4_scale(q, 0.5f, q); + + vsgn[0] = 1.0f; + vsgn[1] = m12 - m21; + vsgn[2] = m20 - m02; + vsgn[3] = m01 - m10; + + glm_vec4_sign(vsgn, vsgn); + glm_vec4_mulv(q, vsgn, q); glm_vec4_copy(q, dest); +#endif } /*! @@ -614,5 +627,4 @@ glm_mat4_swap_row(mat4 mat, int row1, int row2) { mat[3][row2] = tmp[3]; } -#else #endif /* cglm_mat_h */ diff --git a/include/cglm/simd/sse2/mat4.h b/include/cglm/simd/sse2/mat4.h index 77874a8..0476176 100644 --- a/include/cglm/simd/sse2/mat4.h +++ b/include/cglm/simd/sse2/mat4.h @@ -102,6 +102,57 @@ glm_mat4_mulv_sse2(mat4 m, vec4 v, vec4 dest) { _mm_store_ps(dest, _mm_add_ps(x1, x2)); } +CGLM_INLINE +void +glm_mat4_quat_sse2(mat4 m, versor dest) { + __m128 c0, c1, c2, x0, x1, x2, x3, m00, m11, m22, r; + __m128 zero, one, half, ngone; + + c0 = _mm_load_ps(m[0]); + c1 = _mm_load_ps(m[1]); + c2 = _mm_load_ps(m[2]); + + m00 = _mm_xor_ps(_mm_shuffle1_ps1(c0, 0), _mm_set_ps(-0.f, -0.f, 0.f, 0.f)); + m11 = _mm_xor_ps(_mm_shuffle1_ps1(c1, 1), _mm_set_ps(-0.f, 0.f, -0.f, 0.f)); + m22 = _mm_xor_ps(_mm_shuffle1_ps1(c2, 2), _mm_set_ps( 0.f, -0.f, -0.f, 0.f)); + + x0 = _mm_set_ps(-1.0f, 0.0f, 0.5, 1.0f); + one = _mm_shuffle1_ps1(x0, 0); + half = _mm_shuffle1_ps1(x0, 1); + zero = _mm_shuffle1_ps1(x0, 2); + ngone = _mm_shuffle1_ps1(x0, 3); + + /* + q[0] = sqrtf(glm_max(0.0f, 1.0f + m00 + m11 + m22)) * 0.5f; + q[1] = sqrtf(glm_max(0.0f, 1.0f + m00 - m11 - m22)) * 0.5f; + q[2] = sqrtf(glm_max(0.0f, 1.0f - m00 + m11 - m22)) * 0.5f; + q[3] = sqrtf(glm_max(0.0f, 1.0f - m00 - m11 + m22)) * 0.5f; + */ + x0 = _mm_add_ps(one, _mm_add_ps(_mm_add_ps(m00, m11), m22)); + r = _mm_mul_ps(_mm_sqrt_ps(_mm_max_ps(zero, x0)), half); + + /* + q[1] *= glm_signf(m12 - m21); + q[2] *= glm_signf(m20 - m02); + q[3] *= glm_signf(m01 - m10); + */ + x1 = _mm_shuffle_ps(c1, c2, _MM_SHUFFLE(0, 0, 2, 2)); /* m20 m20 m12 m12 */ + x1 = _mm_shuffle_ps(x1, c0, _MM_SHUFFLE(1, 1, 2, 0)); /* m01 m01 m20 m12 */ + + x2 = _mm_shuffle_ps(c2, c0, _MM_SHUFFLE(2, 2, 1, 1)); /* m02 m02 m21 m21 */ + x2 = _mm_shuffle_ps(x2, c1, _MM_SHUFFLE(0, 0, 2, 0)); /* m10 m10 m02 m21 */ + + x1 = _mm_sub_ps(x1, x2); + x2 = _mm_or_ps(_mm_and_ps(_mm_cmpgt_ps(x1, zero), one), + _mm_and_ps(_mm_cmplt_ps(x1, zero), ngone)); + x2 = _mm_shuffle1_ps(x2, 2, 1, 0, 0); + + x3 = _mm_shuffle_ps(one, x2, _MM_SHUFFLE(0, 0, 0, 0)); /* q1 q1 1 1 */ + x3 = _mm_shuffle_ps(x3, x2, _MM_SHUFFLE(3, 2, 2, 0)); /* q3 q2 q1 1 */ + + _mm_store_ps(dest, _mm_mul_ps(r, x3)); +} + CGLM_INLINE float glm_mat4_det_sse2(mat4 mat) {