From 45c1beff516a5305e1a7d6c5138b3e3b3af22f7c Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Fri, 12 Apr 2024 21:53:20 +0300 Subject: [PATCH] simd: fix glmm_set1, glmm_splat --- include/cglm/simd/neon/mat4.h | 2 +- include/cglm/simd/sse2/mat4.h | 12 ++++++------ include/cglm/simd/x86.h | 31 ++++++++++++++++++------------- include/cglm/vec4.h | 2 +- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index 92442e3..6cf9811 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -299,7 +299,7 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) { vget_low_f32(vzipq_f32(v2, v3).val[0])); /* - x0 = glmm_div(glmm_set1(1.0f), glmm_vhadd(vmulq_f32(x0, r0))); + x0 = glmm_div(glmm_set1_rval(1.0f), glmm_vhadd(vmulq_f32(x0, r0))); glmm_store(dest[0], vmulq_f32(v0, x0)); glmm_store(dest[1], vmulq_f32(v1, x0)); diff --git a/include/cglm/simd/sse2/mat4.h b/include/cglm/simd/sse2/mat4.h index 5df7254..2127e72 100644 --- a/include/cglm/simd/sse2/mat4.h +++ b/include/cglm/simd/sse2/mat4.h @@ -471,15 +471,15 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) { x8 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(3, 1, 3, 1)); /* k c j b */ x9 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(2, 0, 2, 0)); /* o g n f */ - x10 = glmm_shuff1(x2, 2, 0, 2, 0); /* p h p h */ - x11 = glmm_shuff1(x2, 3, 1, 3, 1); /* l d l d */ + x10 = glmm_shuff1(x2, 2, 0, 2, 0); /* p h p h */ + x11 = glmm_shuff1(x2, 3, 1, 3, 1); /* l d l d */ -#if 1 /* TODO measure both */ +#if 0 /* TODO measure both */ x12 = _mm_shuffle_ps(x4, x5, _MM_SHUFFLE(1, 0, 1, 0)); /* i a k c */ x13 = _mm_shuffle_ps(x6, x7, _MM_SHUFFLE(1, 0, 1, 0)); /* m e o g */ #else - x12 = _mm_movelh_ps(x4, x5); /* i a k c */ - x13 = _mm_movelh_ps(x6, x7); /* m e o g */ + x12 = _mm_movelh_ps(x4, x5); /* i a k c */ + x13 = _mm_movelh_ps(x6, x7); /* m e o g */ #endif t0 = _mm_mul_ps(x12, x10); @@ -494,7 +494,7 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) { /* v0: c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */ /* v1: c5 * c12 + c6 * c11 */ - v5 = glmm_set1(1.0f); + v5 = glmm_set1_rval(1.0f); v0 = glmm_shuff1(t2, 2, 3, 0, 1); v1 = glmm_shuff1(t1, 0, 1, 2, 3); v0 = _mm_mul_ps(t0, v0); diff --git a/include/cglm/simd/x86.h b/include/cglm/simd/x86.h index ea8577e..2410d0f 100644 --- a/include/cglm/simd/x86.h +++ b/include/cglm/simd/x86.h @@ -37,21 +37,26 @@ #define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane) #ifdef __AVX__ -# define glmm_set1(x) _mm_broadcast_ss(&x) -# define glmm_set1_ptr(x) _mm_broadcast_ss(x) - -# define glmm_splat_x(x) _mm_broadcastss_ps(x) -# define glmm_splat_y(x) _mm_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)) -# define glmm_splat_z(x) _mm_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)) -# define glmm_splat_w(x) _mm_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)) +# define glmm_set1(x) _mm_broadcast_ss(&x) +# define glmm_set1_ptr(x) _mm_broadcast_ss(x) +# define glmm_set1_rval(x) _mm_set1_ps(x) +# ifdef __AVX2__ +# define glmm_splat_x(x) _mm_broadcastss_ps(x) +# else +# define glmm_splat_x(x) _mm_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0)) +# endif +# define glmm_splat_y(x) _mm_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1)) +# define glmm_splat_z(x) _mm_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2)) +# define glmm_splat_w(x) _mm_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3)) #else -# define glmm_set1(x) _mm_set1_ps(x) -# define glmm_set1_ptr(x) _mm_set1_ps(*x) +# define glmm_set1(x) _mm_set1_ps(x) +# define glmm_set1_ptr(x) _mm_set1_ps(*x) +# define glmm_set1_rval(x) _mm_set1_ps(x) -# define glmm_splat_x(x) glmm_splat(x, 0) -# define glmm_splat_y(x) glmm_splat(x, 1) -# define glmm_splat_z(x) glmm_splat(x, 2) -# define glmm_splat_w(x) glmm_splat(x, 3) +# define glmm_splat_x(x) glmm_splat(x, 0) +# define glmm_splat_y(x) glmm_splat(x, 1) +# define glmm_splat_z(x) glmm_splat(x, 2) +# define glmm_splat_w(x) glmm_splat(x, 3) #endif #ifdef __AVX__ diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index a85419b..2d18d8c 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -215,7 +215,7 @@ glm_vec4_one(vec4 v) { #if defined(__wasm__) && defined(__wasm_simd128__) glmm_store(v, wasm_f32x4_const_splat(1.0f)); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(v, glmm_set1(1.0f)); + glmm_store(v, glmm_set1_rval(1.0f)); #elif defined(CGLM_NEON_FP) vst1q_f32(v, vdupq_n_f32(1.0f)); #else