diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h index 8ba5494..328c7e3 100644 --- a/include/cglm/simd/arm.h +++ b/include/cglm/simd/arm.h @@ -52,6 +52,10 @@ glmm_float32x4_init(float x, float y, float z, float w) { # define glmm_float32x4_init(x, y, z, w) { x, y, z, w } #endif +#define glmm_float32x4_SIGNMASK_PNPN glmm_float32x4_init( 0.f, -0.f, 0.f, -0.f) +#define glmm_float32x4_SIGNMASK_NPNP glmm_float32x4_init(-0.f, 0.f, -0.f, 0.f) +#define glmm_float32x4_SIGNMASK_NPPN glmm_float32x4_init(-0.f, 0.f, 0.f, -0.f) + static inline float32x4_t glmm_abs(float32x4_t v) { diff --git a/include/cglm/simd/intrin.h b/include/cglm/simd/intrin.h index bf8d119..8fb51d7 100644 --- a/include/cglm/simd/intrin.h +++ b/include/cglm/simd/intrin.h @@ -99,7 +99,6 @@ # endif #endif - /* WebAssembly */ #if defined(__wasm__) && defined(__wasm_simd128__) # ifndef CGLM_SIMD_WASM diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h index 2d1184e..d76a454 100644 --- a/include/cglm/simd/neon/mat4.h +++ b/include/cglm/simd/neon/mat4.h @@ -108,7 +108,7 @@ glm_mat4_det_neon(mat4 mat) { float32x4_t r0, r1, r2, r3, x0, x1, x2; float32x2_t ij, op, mn, kl, nn, mm, jj, ii, gh, ef, t12, t34; float32x4x2_t a1; - float32x4_t x3 = glmm_float32x4_init(0.f, -0.f, 0.f, -0.f); + float32x4_t x3 = glmm_float32x4_SIGNMASK_PNPN; /* 127 <- 0, [square] det(A) = det(At) */ r0 = glmm_load(mat[0]); /* d c b a */ @@ -181,7 +181,7 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) { x0, x1, x2, x3, x4, x5, x6, x7, x8; float32x4x2_t a1; float32x2_t lp, ko, hg, jn, im, fe, ae, bf, cg, dh; - float32x4_t x9 = glmm_float32x4_init(-0.f, 0.f, -0.f, 0.f); + float32x4_t x9 = glmm_float32x4_SIGNMASK_NPNP; x8 = vrev64q_f32(x9); diff --git a/include/cglm/simd/neon/quat.h b/include/cglm/simd/neon/quat.h index fbaf390..55dc1da 100644 --- a/include/cglm/simd/neon/quat.h +++ b/include/cglm/simd/neon/quat.h @@ -23,7 +23,8 @@ glm_quat_mul_neon(versor p, versor q, versor dest) { */ glmm_128 xp, xq, xqr, r, x, 
y, z, s2, s3; - glmm_128 s1 = glmm_float32x4_init(-0.f, 0.f, 0.f, -0.f); + glmm_128 s1 = glmm_float32x4_SIGNMASK_NPPN; + float32x2_t qh, ql; xp = glmm_load(p); /* 3 2 1 0 */ diff --git a/include/cglm/simd/sse2/affine.h b/include/cglm/simd/sse2/affine.h index 99edaa0..0619995 100644 --- a/include/cglm/simd/sse2/affine.h +++ b/include/cglm/simd/sse2/affine.h @@ -98,7 +98,7 @@ glm_inv_tr_sse2(mat4 mat) { x2 = glmm_shuff1(r3, 0, 0, 0, 0); x3 = glmm_shuff1(r3, 1, 1, 1, 1); x4 = glmm_shuff1(r3, 2, 2, 2, 2); - x5 = _mm_set1_ps(-0.f); + x5 = glmm_float32x4_SIGNMASK_NEG; x0 = glmm_fmadd(r0, x2, glmm_fmadd(r1, x3, _mm_mul_ps(r2, x4))); x0 = _mm_xor_ps(x0, x5); diff --git a/include/cglm/simd/sse2/mat4.h b/include/cglm/simd/sse2/mat4.h index 5c78499..05e7efe 100644 --- a/include/cglm/simd/sse2/mat4.h +++ b/include/cglm/simd/sse2/mat4.h @@ -153,7 +153,7 @@ glm_mat4_det_sse2(mat4 mat) { _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 3, 1)), x2); - x2 = _mm_xor_ps(x2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f)); + x2 = _mm_xor_ps(x2, glmm_float32x4_SIGNMASK_NPNP); return glmm_hadd(_mm_mul_ps(x2, r0)); } @@ -166,7 +166,8 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) { t0, t1, t2, t3, t4, t5, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9; - x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); + /* x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */ + x8 = glmm_float32x4_SIGNMASK_NPNP; x9 = glmm_shuff1(x8, 2, 1, 2, 1); /* 127 <- 0 */ @@ -302,7 +303,8 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) { t0, t1, t2, t3, t4, t5, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9; - x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); + /* x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */ + x8 = glmm_float32x4_SIGNMASK_NPNP; x9 = glmm_shuff1(x8, 2, 1, 2, 1); /* 127 <- 0 */ diff --git a/include/cglm/simd/sse2/quat.h b/include/cglm/simd/sse2/quat.h index 94850cc..def0fe2 100644 --- a/include/cglm/simd/sse2/quat.h +++ b/include/cglm/simd/sse2/quat.h @@ -26,9 +26,9 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) { xp = glmm_load(p); /* 3 2 1 0 */ xq = glmm_load(q); - x1 = 
_mm_set_ps(-0.f, 0.f, -0.f, 0.f); /* TODO: _mm_set1_ss() + shuff ? */ + x1 = glmm_float32x4_SIGNMASK_NPNP; /* TODO: _mm_set1_ss() + shuff ? */ r = _mm_mul_ps(glmm_splat_w(xp), xq); - + x2 = _mm_unpackhi_ps(x1, x1); x3 = glmm_shuff1(x1, 3, 2, 0, 1); x = glmm_splat_x(xp); diff --git a/include/cglm/simd/x86.h b/include/cglm/simd/x86.h index dbbd0f8..77c289e 100644 --- a/include/cglm/simd/x86.h +++ b/include/cglm/simd/x86.h @@ -54,10 +54,23 @@ # endif #endif +#define GLMM_NEGZEROf ((int)0x80000000) /* bit pattern of -0.0f (sign bit only); cast because _mm_set*_epi32 take int */ + +#define GLMM__SIGNMASKf(X, Y, Z, W) \ + _mm_castsi128_ps(_mm_set_epi32(X, Y, Z, W)) + /* _mm_set_ps(X, Y, Z, W); */ + +#define glmm_float32x4_SIGNMASK_PNPN GLMM__SIGNMASKf(0, GLMM_NEGZEROf, 0, GLMM_NEGZEROf) +#define glmm_float32x4_SIGNMASK_NPNP GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, GLMM_NEGZEROf, 0) +#define glmm_float32x4_SIGNMASK_NPPN GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, 0, GLMM_NEGZEROf) + +#define glmm_float32x4_SIGNMASK_NEG _mm_castsi128_ps(_mm_set1_epi32(GLMM_NEGZEROf)) /* _mm_set1_ps(-0.0f) */ +#define glmm_float32x8_SIGNMASK_NEG _mm256_castsi256_ps(_mm256_set1_epi32(GLMM_NEGZEROf)) + static inline __m128 glmm_abs(__m128 x) { - return _mm_andnot_ps(_mm_set1_ps(-0.0f), x); + return _mm_andnot_ps(glmm_float32x4_SIGNMASK_NEG, x); } static inline @@ -256,7 +269,8 @@ glmm_fnmsub(__m128 a, __m128 b, __m128 c) { #ifdef __FMA__ return _mm_fnmsub_ps(a, b, c); #else - return _mm_xor_ps(_mm_add_ps(_mm_mul_ps(a, b), c), _mm_set1_ps(-0.0f)); + return _mm_xor_ps(_mm_add_ps(_mm_mul_ps(a, b), c), + glmm_float32x4_SIGNMASK_NEG); #endif } @@ -298,7 +312,7 @@ glmm256_fnmsub(__m256 a, __m256 b, __m256 c) { return _mm256_fmsub_ps(a, b, c); #else return _mm256_xor_ps(_mm256_sub_ps(_mm256_mul_ps(a, b), c), - _mm256_set1_ps(-0.0f)); + glmm_float32x8_SIGNMASK_NEG); #endif } #endif diff --git a/include/cglm/vec4.h b/include/cglm/vec4.h index de29941..2d4daed 100644 --- a/include/cglm/vec4.h +++ b/include/cglm/vec4.h @@ -709,7 +709,7 @@ glm_vec4_negate_to(vec4 v, vec4 dest) { #if 
defined(__wasm__) && defined(__wasm_simd128__) glmm_store(dest, wasm_f32x4_neg(glmm_load(v))); #elif defined( __SSE__ ) || defined( __SSE2__ ) - glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f))); + glmm_store(dest, _mm_xor_ps(glmm_load(v), glmm_float32x4_SIGNMASK_NEG)); #elif defined(CGLM_NEON_FP) vst1q_f32(dest, vnegq_f32(vld1q_f32(v))); #else