From 0eb37da8bbcefc722ad4098b47b742d615c85fb1 Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Fri, 13 Apr 2018 11:01:07 +0300
Subject: [PATCH] vec4: optimize vec4 normalize with SIMD

---
/* Review notes (commentary placed after the three-dash line; not part of
 * the diff below):
 *
 * - glm_vec4_normalize() is removed from its old position and re-added
 *   below glm_vec4_normalize_to() as a one-line call to
 *   glm_vec4_normalize_to(v, v), so the in-place form no longer carries
 *   its own copy of the logic.
 * - SSE path of glm_vec4_normalize_to(): loads vec with _mm_load_ps,
 *   computes glm_simd_dot(x0, x0), stores all zeros when the extracted
 *   dot is exactly 0.0f, otherwise stores x0 / _mm_sqrt_ps(xdot).
 *   The load happens before the store, so dest == vec is fine here.
 *   NOTE(review): _mm_load_ps / _mm_store_ps take 16-byte aligned
 *   addresses; this assumes vec4 storage is aligned that way - confirm.
 *   NOTE(review): dividing every lane by _mm_sqrt_ps(xdot) presumes
 *   glm_simd_dot leaves the sum in all four lanes - confirm against its
 *   definition, which this patch does not show.
 * - Scalar (#else) path: the element-wise zero assignment is kept as a
 *   context line and glm_vec4_broadcast(0.0f, dest) is added directly
 *   after it, so the zero-norm branch writes dest twice; presumably
 *   only one of the two statements was meant to remain.
 * - Tests: adds test_assert_vec4_eq (per-component tolerance 0.000009)
 *   and, inside the existing 100-iteration loop, compares a scalar
 *   reference normalize against glm_vec4_normalize_to() and
 *   glm_vec4_normalize().
 * - Whitespace in this copy is reconstructed; column alignment may
 *   differ from the original files.
 */
 include/cglm/vec4.h    | 48 ++++++++++++++++++++++++------------------
 test/src/test_common.c |  8 +++++++
 test/src/test_common.h |  3 +++
 test/src/test_vec4.c   | 29 +++++++++++++++++++++++--
 4 files changed, 66 insertions(+), 22 deletions(-)

diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h
index eaa4eaf..0ef6bff 100644
--- a/include/cglm/vec4.h
+++ b/include/cglm/vec4.h
@@ -312,26 +312,6 @@ glm_vec4_inv_to(vec4 v, vec4 dest) {
   glm_vec4_flipsign(dest);
 }
 
-/*!
- * @brief normalize vec4 and store result in same vec
- *
- * @param[in, out] v vector
- */
-CGLM_INLINE
-void
-glm_vec4_normalize(vec4 v) {
-  float norm;
-
-  norm = glm_vec4_norm(v);
-
-  if (norm == 0.0f) {
-    v[0] = v[1] = v[2] = v[3] = 0.0f;
-    return;
-  }
-
-  glm_vec4_scale(v, 1.0f / norm, v);
-}
-
 /*!
  * @brief normalize vec4 to dest
  *
@@ -341,16 +321,44 @@ glm_vec4_normalize(vec4 v) {
 CGLM_INLINE
 void
 glm_vec4_normalize_to(vec4 vec, vec4 dest) {
+#if defined( __SSE__ ) || defined( __SSE2__ )
+  __m128 xdot, x0;
+  float dot;
+
+  x0 = _mm_load_ps(vec);
+  xdot = glm_simd_dot(x0, x0);
+  dot = _mm_cvtss_f32(xdot);
+
+  if (dot == 0.0f) {
+    _mm_store_ps(dest, _mm_setzero_ps());
+    return;
+  }
+
+  _mm_store_ps(dest, _mm_div_ps(x0, _mm_sqrt_ps(xdot)));
+#else
   float norm;
 
   norm = glm_vec4_norm(vec);
 
   if (norm == 0.0f) {
     dest[0] = dest[1] = dest[2] = dest[3] = 0.0f;
+    glm_vec4_broadcast(0.0f, dest);
     return;
   }
 
   glm_vec4_scale(vec, 1.0f / norm, dest);
+#endif
+}
+
+/*!
+ * @brief normalize vec4 and store result in same vec
+ *
+ * @param[in, out] v vector
+ */
+CGLM_INLINE
+void
+glm_vec4_normalize(vec4 v) {
+  glm_vec4_normalize_to(v, v);
 }
 
 /**
diff --git a/test/src/test_common.c b/test/src/test_common.c
index 514c006..a559087 100644
--- a/test/src/test_common.c
+++ b/test/src/test_common.c
@@ -91,6 +91,14 @@ test_assert_vec3_eq(vec3 v1, vec3 v2) {
   assert_true(fabsf(v1[2] - v2[2]) <= 0.000009);
 }
 
+void
+test_assert_vec4_eq(vec4 v1, vec4 v2) {
+  assert_true(fabsf(v1[0] - v2[0]) <= 0.000009); /* rounding errors */
+  assert_true(fabsf(v1[1] - v2[1]) <= 0.000009);
+  assert_true(fabsf(v1[2] - v2[2]) <= 0.000009);
+  assert_true(fabsf(v1[3] - v2[3]) <= 0.000009);
+}
+
 void
 test_assert_quat_eq_abs(versor v1, versor v2) {
   assert_true(fabsf(fabsf(v1[0]) - fabsf(v2[0])) <= 0.0009); /* rounding errors */
diff --git a/test/src/test_common.h b/test/src/test_common.h
index c95405e..50e84d6 100644
--- a/test/src/test_common.h
+++ b/test/src/test_common.h
@@ -34,6 +34,9 @@ test_assert_mat4_eq2(mat4 m1, mat4 m2, float eps);
 void
 test_assert_vec3_eq(vec3 v1, vec3 v2);
 
+void
+test_assert_vec4_eq(vec4 v1, vec4 v2);
+
 void
 test_assert_quat_eq(versor v1, versor v2);
 
diff --git a/test/src/test_vec4.c b/test/src/test_vec4.c
index a45a700..994bab7 100644
--- a/test/src/test_vec4.c
+++ b/test/src/test_vec4.c
@@ -13,18 +13,43 @@ test_vec4_dot(vec4 a, vec4 b) {
   return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
 }
 
+CGLM_INLINE
+void
+test_vec4_normalize_to(vec4 vec, vec4 dest) {
+  float norm;
+
+  norm = glm_vec4_norm(vec);
+
+  if (norm == 0.0f) {
+    dest[0] = dest[1] = dest[2] = dest[3] = 0.0f;
+    return;
+  }
+
+  glm_vec4_scale(vec, 1.0f / norm, dest);
+}
+
 void
 test_vec4(void **state) {
-  vec4 v;
+  vec4 v, v1, v2;
   int i;
   float d1, d2;
 
-  /* test SSE/SIMD dot product */
+
   for (i = 0; i < 100; i++) {
+    /* 1. test SSE/SIMD dot product */
     test_rand_vec4(v);
     d1 = glm_vec4_dot(v, v);
     d2 = test_vec4_dot(v, v);
 
     assert_true(fabsf(d1 - d2) <= 0.000009);
+
+    /* 2. test SIMD normalize */
+    test_vec4_normalize_to(v, v1);
+    glm_vec4_normalize_to(v, v2);
+    glm_vec4_normalize(v);
+
+    /* all must be same */
+    test_assert_vec4_eq(v1, v2);
+    test_assert_vec4_eq(v, v2);
   }
 }