Merge pull request #291 from recp/fix-simd-signmask

simd, sse: use 0x80000000 instead of -0.f to fix fastmath on msvc
Author:    Recep Aslantas
Date:      2023-04-22 10:11:17 +03:00
Committer: GitHub
9 changed files with 34 additions and 14 deletions

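Why the change matters: under MSVC's /fp:fast, -0.0f compares equal to 0.0f and the optimizer may constant-fold the sign away, turning a sign mask built from -0.f literals into all-zero bits. Constructing the mask from the integer pattern 0x80000000 and bit-casting it to float leaves nothing for floating-point folding to touch. A minimal standalone sketch of the idea (illustrative code, not part of the patch):

#include <stdio.h>
#include <string.h>
#include <emmintrin.h> /* SSE2 */

int main(void) {
  /* -0.0f and 0x80000000 share one bit pattern: just the IEEE-754 sign bit */
  float nz = -0.0f;
  unsigned bits;
  memcpy(&bits, &nz, sizeof bits);
  printf("-0.0f bits: 0x%08X\n", bits);                    /* 0x80000000 */

  /* Built from an integer constant, the mask survives fast-math. */
  __m128 mask = _mm_castsi128_ps(_mm_set1_epi32((int)0x80000000));

  /* XOR with the sign bit negates every lane. */
  float out[4];
  _mm_storeu_ps(out, _mm_xor_ps(_mm_set_ps(4.f, 3.f, 2.f, 1.f), mask));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* -1 -2 -3 -4 */
  return 0;
}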
include/cglm/simd/arm.h

@@ -52,6 +52,10 @@ glmm_float32x4_init(float x, float y, float z, float w) {
 # define glmm_float32x4_init(x, y, z, w) { x, y, z, w }
 #endif

+#define glmm_float32x4_SIGNMASK_PNPN glmm_float32x4_init( 0.f, -0.f,  0.f, -0.f)
+#define glmm_float32x4_SIGNMASK_NPNP glmm_float32x4_init(-0.f,  0.f, -0.f,  0.f)
+#define glmm_float32x4_SIGNMASK_NPPN glmm_float32x4_init(-0.f,  0.f,  0.f, -0.f)
+
 static inline
 float32x4_t
 glmm_abs(float32x4_t v) {

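The NEON initializers above keep their -0.f literals; this patch only moves the SSE masks to integer patterns. If the same fast-math precaution were ever wanted on ARM, an integer-pattern equivalent could look like this sketch (hypothetical helper name, standard NEON intrinsics, assumes a NEON-capable target):

#include <arm_neon.h>

/* Sketch: a +,-,+,- sign mask built from integer bit patterns and
   applied with XOR, flipping the sign of lanes 1 and 3 only. */
static inline float32x4_t flip_alt_signs(float32x4_t v) {
  static const uint32_t bits[4] = {0u, 0x80000000u, 0u, 0x80000000u};
  uint32x4_t mask = vld1q_u32(bits);
  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v), mask));
}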
include/cglm/simd/intrin.h

@@ -99,7 +99,6 @@
 # endif
 #endif

-
 /* WebAssembly */
 #if defined(__wasm__) && defined(__wasm_simd128__)
 # ifndef CGLM_SIMD_WASM

include/cglm/simd/neon/mat4.h

@@ -108,7 +108,7 @@ glm_mat4_det_neon(mat4 mat) {
   float32x4_t r0, r1, r2, r3, x0, x1, x2;
   float32x2_t ij, op, mn, kl, nn, mm, jj, ii, gh, ef, t12, t34;
   float32x4x2_t a1;
-  float32x4_t x3 = glmm_float32x4_init(0.f, -0.f, 0.f, -0.f);
+  float32x4_t x3 = glmm_float32x4_SIGNMASK_PNPN;

   /* 127 <- 0, [square] det(A) = det(At) */
   r0 = glmm_load(mat[0]); /* d c b a */
@@ -181,7 +181,7 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) {
               x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
   float32x4x2_t a1;
   float32x2_t lp, ko, hg, jn, im, fe, ae, bf, cg, dh;
-  float32x4_t x9 = glmm_float32x4_init(-0.f, 0.f, -0.f, 0.f);
+  float32x4_t x9 = glmm_float32x4_SIGNMASK_NPNP;

   x8 = vrev64q_f32(x9);

include/cglm/simd/neon/quat.h

@@ -23,7 +23,8 @@ glm_quat_mul_neon(versor p, versor q, versor dest) {
    */
   glmm_128 xp, xq, xqr, r, x, y, z, s2, s3;
-  glmm_128 s1 = glmm_float32x4_init(-0.f, 0.f, 0.f, -0.f);
+  glmm_128 s1 = glmm_float32x4_SIGNMASK_NPPN;
+
   float32x2_t qh, ql;

   xp = glmm_load(p); /* 3 2 1 0 */

include/cglm/simd/sse2/affine.h

@@ -98,7 +98,7 @@ glm_inv_tr_sse2(mat4 mat) {
   x2 = glmm_shuff1(r3, 0, 0, 0, 0);
   x3 = glmm_shuff1(r3, 1, 1, 1, 1);
   x4 = glmm_shuff1(r3, 2, 2, 2, 2);
-  x5 = _mm_set1_ps(-0.f);
+  x5 = glmm_float32x4_SIGNMASK_NEG;

   x0 = glmm_fmadd(r0, x2, glmm_fmadd(r1, x3, _mm_mul_ps(r2, x4)));
   x0 = _mm_xor_ps(x0, x5);

include/cglm/simd/sse2/mat4.h

@@ -153,7 +153,7 @@ glm_mat4_det_sse2(mat4 mat) {
                    _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 3, 1)),
                    x2);
-  x2 = _mm_xor_ps(x2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
+  x2 = _mm_xor_ps(x2, glmm_float32x4_SIGNMASK_NPNP);

   return glmm_hadd(_mm_mul_ps(x2, r0));
 }
@@ -166,7 +166,8 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
         t0, t1, t2, t3, t4, t5,
         x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
-  x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f);
+  /* x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */
+  x8 = glmm_float32x4_SIGNMASK_NPNP;
   x9 = glmm_shuff1(x8, 2, 1, 2, 1);

   /* 127 <- 0 */
@@ -302,7 +303,8 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
         t0, t1, t2, t3, t4, t5,
         x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
-  x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f);
+  /* x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */
+  x8 = glmm_float32x4_SIGNMASK_NPNP;
   x9 = glmm_shuff1(x8, 2, 1, 2, 1);

   /* 127 <- 0 */

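In these determinant/inverse hunks the NPNP mask supplies the alternating cofactor signs: XORing it in negates lanes 1 and 3 (lowest lane first) without a multiply. A standalone check of that lane behavior (illustrative code, not from the patch):

#include <stdio.h>
#include <emmintrin.h>

int main(void) {
  /* Same bits as _mm_set_ps(-0.f, 0.f, -0.f, 0.f), fast-math-proof. */
  __m128 npnp = _mm_castsi128_ps(
      _mm_set_epi32((int)0x80000000, 0, (int)0x80000000, 0));
  float out[4];
  _mm_storeu_ps(out, _mm_xor_ps(_mm_set_ps(4.f, 3.f, 2.f, 1.f), npnp));
  /* lanes 1 and 3 flip sign: prints 1 -2 3 -4 */
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}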
include/cglm/simd/sse2/quat.h

@@ -26,9 +26,9 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) {
   xp = glmm_load(p); /* 3 2 1 0 */
   xq = glmm_load(q);
-  x1 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); /* TODO: _mm_set1_ss() + shuff ? */
+  x1 = glmm_float32x4_SIGNMASK_NPNP; /* TODO: _mm_set1_ss() + shuff ? */
   r  = _mm_mul_ps(glmm_splat_w(xp), xq);

   x2 = _mm_unpackhi_ps(x1, x1);
   x3 = glmm_shuff1(x1, 3, 2, 0, 1);
   x  = glmm_splat_x(xp);

include/cglm/simd/x86.h

@@ -54,10 +54,23 @@
 # endif
 #endif

+#define GLMM_NEGZEROf 0x80000000 /* 0x80000000 ---> -0.0f */
+
+#define GLMM__SIGNMASKf(X, Y, Z, W)                                           \
+  _mm_castsi128_ps(_mm_set_epi32(X, Y, Z, W))
+  /* _mm_set_ps(X, Y, Z, W); */
+
+#define glmm_float32x4_SIGNMASK_PNPN GLMM__SIGNMASKf(0, GLMM_NEGZEROf, 0, GLMM_NEGZEROf)
+#define glmm_float32x4_SIGNMASK_NPNP GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, GLMM_NEGZEROf, 0)
+#define glmm_float32x4_SIGNMASK_NPPN GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, 0, GLMM_NEGZEROf)
+
+#define glmm_float32x4_SIGNMASK_NEG _mm_castsi128_ps(_mm_set1_epi32(0x80000000)) /* _mm_set1_ps(-0.0f) */
+#define glmm_float32x8_SIGNMASK_NEG _mm256_castsi256_ps(_mm256_set1_epi32(GLMM_NEGZEROf))
+
 static inline
 __m128
 glmm_abs(__m128 x) {
-  return _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
+  return _mm_andnot_ps(glmm_float32x4_SIGNMASK_NEG, x);
 }

 static inline
@@ -256,7 +269,8 @@ glmm_fnmsub(__m128 a, __m128 b, __m128 c) {
 #ifdef __FMA__
   return _mm_fnmsub_ps(a, b, c);
 #else
-  return _mm_xor_ps(_mm_add_ps(_mm_mul_ps(a, b), c), _mm_set1_ps(-0.0f));
+  return _mm_xor_ps(_mm_add_ps(_mm_mul_ps(a, b), c),
+                    glmm_float32x4_SIGNMASK_NEG);
 #endif
 }
@@ -298,7 +312,7 @@ glmm256_fnmsub(__m256 a, __m256 b, __m256 c) {
   return _mm256_fmsub_ps(a, b, c);
 #else
   return _mm256_xor_ps(_mm256_sub_ps(_mm256_mul_ps(a, b), c),
-                       _mm256_set1_ps(-0.0f));
+                       glmm_float32x8_SIGNMASK_NEG);
 #endif
 }
 #endif

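Two bit identities carry this file: ANDN with the sign mask clears the sign bit, giving |x|, and XOR with it flips the sign, so the non-FMA fnmsub fallback can compute -(a*b) - c as (a*b + c) XOR signbit. A scalar cross-check of both (the local SIGNMASK_NEG macro mirrors glmm_float32x4_SIGNMASK_NEG; the test itself is ours):

#include <assert.h>
#include <emmintrin.h>

#define SIGNMASK_NEG _mm_castsi128_ps(_mm_set1_epi32((int)0x80000000))

int main(void) {
  float r[4];

  /* andnot(mask, x) = (~mask) & x: keeps every bit of x but the sign -> |x| */
  _mm_storeu_ps(r, _mm_andnot_ps(SIGNMASK_NEG,
                                 _mm_set_ps(-4.f, 3.f, -2.f, 1.f)));
  assert(r[0] == 1.f && r[1] == 2.f && r[2] == 3.f && r[3] == 4.f);

  /* fnmsub: -(a*b) - c == -(a*b + c) == (a*b + c) ^ signbit */
  _mm_storeu_ps(r, _mm_xor_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(2.f),
                                                    _mm_set1_ps(3.f)),
                                         _mm_set1_ps(4.f)),
                              SIGNMASK_NEG));
  assert(r[0] == -10.f); /* -(2*3) - 4 */
  return 0;
}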
include/cglm/vec4.h

@@ -709,7 +709,7 @@ glm_vec4_negate_to(vec4 v, vec4 dest) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_neg(glmm_load(v)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f)));
+  glmm_store(dest, _mm_xor_ps(glmm_load(v), glmm_float32x4_SIGNMASK_NEG));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vnegq_f32(vld1q_f32(v)));
 #else
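
Caller-visible behavior is unchanged: glm_vec4_negate_to still writes -v into dest, only the mask construction behind the SSE path differs. Typical usage, for reference:

#include <stdio.h>
#include <cglm/cglm.h>

int main(void) {
  vec4 v = {1.0f, -2.0f, 3.0f, -4.0f}, dest;
  glm_vec4_negate_to(v, dest); /* SIMD path when available, scalar otherwise */
  printf("%g %g %g %g\n", dest[0], dest[1], dest[2], dest[3]); /* -1 2 -3 4 */
  return 0;
}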