Merge pull request #291 from recp/fix-simd-signmask

simd, sse: use 0x80000000 instead of -0.f to fix fastmath on msvc
Author:    Recep Aslantas
Date:      2023-04-22 10:11:17 +03:00
Committer: GitHub
9 changed files with 34 additions and 14 deletions

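Why the change matters: under MSVC's /fp:fast, -0.0f compares equal to 0.0f and the optimizer may constant-fold the sign away, turning a sign mask built from -0.f literals into all-zero bits. Constructing the mask from the integer pattern 0x80000000 and bit-casting it to float leaves nothing for floating-point folding to touch. A minimal standalone sketch of the idea (illustrative code, not part of the patch):

#include <stdio.h>
#include <string.h>
#include <emmintrin.h> /* SSE2 */

int main(void) {
  /* -0.0f and 0x80000000 share one bit pattern: just the IEEE-754 sign bit */
  float nz = -0.0f;
  unsigned bits;
  memcpy(&bits, &nz, sizeof bits);
  printf("-0.0f bits: 0x%08X\n", bits);                    /* 0x80000000 */

  /* Built from an integer constant, the mask survives fast-math. */
  __m128 mask = _mm_castsi128_ps(_mm_set1_epi32((int)0x80000000));

  /* XOR with the sign bit negates every lane. */
  float out[4];
  _mm_storeu_ps(out, _mm_xor_ps(_mm_set_ps(4.f, 3.f, 2.f, 1.f), mask));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* -1 -2 -3 -4 */
  return 0;
}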
include/cglm/simd/arm.h

@@ -52,6 +52,10 @@ glmm_float32x4_init(float x, float y, float z, float w) {
 # define glmm_float32x4_init(x, y, z, w) { x, y, z, w }
 #endif

+#define glmm_float32x4_SIGNMASK_PNPN glmm_float32x4_init( 0.f, -0.f,  0.f, -0.f)
+#define glmm_float32x4_SIGNMASK_NPNP glmm_float32x4_init(-0.f,  0.f, -0.f,  0.f)
+#define glmm_float32x4_SIGNMASK_NPPN glmm_float32x4_init(-0.f,  0.f,  0.f, -0.f)
+
 static inline
 float32x4_t
 glmm_abs(float32x4_t v) {

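The NEON initializers above keep their -0.f literals; this patch only moves the SSE masks to integer patterns. If the same fast-math precaution were ever wanted on ARM, an integer-pattern equivalent could look like this sketch (hypothetical helper name, standard NEON intrinsics, assumes a NEON-capable target):

#include <arm_neon.h>

/* Sketch: a +,-,+,- sign mask built from integer bit patterns and
   applied with XOR, flipping the sign of lanes 1 and 3 only. */
static inline float32x4_t flip_alt_signs(float32x4_t v) {
  static const uint32_t bits[4] = {0u, 0x80000000u, 0u, 0x80000000u};
  uint32x4_t mask = vld1q_u32(bits);
  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v), mask));
}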
include/cglm/simd/intrin.h

@@ -99,7 +99,6 @@
 # endif
 #endif

-
 /* WebAssembly */
 #if defined(__wasm__) && defined(__wasm_simd128__)
 # ifndef CGLM_SIMD_WASM

include/cglm/simd/neon/mat4.h

@@ -108,7 +108,7 @@ glm_mat4_det_neon(mat4 mat) {
   float32x4_t r0, r1, r2, r3, x0, x1, x2;
   float32x2_t ij, op, mn, kl, nn, mm, jj, ii, gh, ef, t12, t34;
   float32x4x2_t a1;
-  float32x4_t x3 = glmm_float32x4_init(0.f, -0.f, 0.f, -0.f);
+  float32x4_t x3 = glmm_float32x4_SIGNMASK_PNPN;

   /* 127 <- 0, [square] det(A) = det(At) */
   r0 = glmm_load(mat[0]); /* d c b a */
@@ -181,7 +181,7 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) {
               x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
   float32x4x2_t a1;
   float32x2_t lp, ko, hg, jn, im, fe, ae, bf, cg, dh;
-  float32x4_t x9 = glmm_float32x4_init(-0.f, 0.f, -0.f, 0.f);
+  float32x4_t x9 = glmm_float32x4_SIGNMASK_NPNP;

   x8 = vrev64q_f32(x9);

include/cglm/simd/neon/quat.h

@@ -23,7 +23,8 @@ glm_quat_mul_neon(versor p, versor q, versor dest) {
    */
   glmm_128 xp, xq, xqr, r, x, y, z, s2, s3;
-  glmm_128 s1 = glmm_float32x4_init(-0.f, 0.f, 0.f, -0.f);
+  glmm_128 s1 = glmm_float32x4_SIGNMASK_NPPN;
+
   float32x2_t qh, ql;

   xp = glmm_load(p); /* 3 2 1 0 */

include/cglm/simd/sse2/affine.h

@@ -98,7 +98,7 @@ glm_inv_tr_sse2(mat4 mat) {
   x2 = glmm_shuff1(r3, 0, 0, 0, 0);
   x3 = glmm_shuff1(r3, 1, 1, 1, 1);
   x4 = glmm_shuff1(r3, 2, 2, 2, 2);
-  x5 = _mm_set1_ps(-0.f);
+  x5 = glmm_float32x4_SIGNMASK_NEG;

   x0 = glmm_fmadd(r0, x2, glmm_fmadd(r1, x3, _mm_mul_ps(r2, x4)));
   x0 = _mm_xor_ps(x0, x5);

include/cglm/simd/sse2/mat4.h

@@ -153,7 +153,7 @@ glm_mat4_det_sse2(mat4 mat) {
                    _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 3, 1)),
                    x2);
-  x2 = _mm_xor_ps(x2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
+  x2 = _mm_xor_ps(x2, glmm_float32x4_SIGNMASK_NPNP);

   return glmm_hadd(_mm_mul_ps(x2, r0));
 }
@@ -166,7 +166,8 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
         t0, t1, t2, t3, t4, t5,
         x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
-  x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f);
+  /* x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */
+  x8 = glmm_float32x4_SIGNMASK_NPNP;
   x9 = glmm_shuff1(x8, 2, 1, 2, 1);

   /* 127 <- 0 */
@@ -302,7 +303,8 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
         t0, t1, t2, t3, t4, t5,
         x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
-  x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f);
+  /* x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */
+  x8 = glmm_float32x4_SIGNMASK_NPNP;
   x9 = glmm_shuff1(x8, 2, 1, 2, 1);

   /* 127 <- 0 */

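In these determinant/inverse hunks the NPNP mask supplies the alternating cofactor signs: XORing it in negates lanes 1 and 3 (lowest lane first) without a multiply. A standalone check of that lane behavior (illustrative code, not from the patch):

#include <stdio.h>
#include <emmintrin.h>

int main(void) {
  /* Same bits as _mm_set_ps(-0.f, 0.f, -0.f, 0.f), fast-math-proof. */
  __m128 npnp = _mm_castsi128_ps(
      _mm_set_epi32((int)0x80000000, 0, (int)0x80000000, 0));
  float out[4];
  _mm_storeu_ps(out, _mm_xor_ps(_mm_set_ps(4.f, 3.f, 2.f, 1.f), npnp));
  /* lanes 1 and 3 flip sign: prints 1 -2 3 -4 */
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}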
include/cglm/simd/sse2/quat.h

@@ -26,9 +26,9 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) {
   xp = glmm_load(p); /* 3 2 1 0 */
   xq = glmm_load(q);
-  x1 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); /* TODO: _mm_set1_ss() + shuff ? */
+  x1 = glmm_float32x4_SIGNMASK_NPNP; /* TODO: _mm_set1_ss() + shuff ? */
   r  = _mm_mul_ps(glmm_splat_w(xp), xq);

   x2 = _mm_unpackhi_ps(x1, x1);
   x3 = glmm_shuff1(x1, 3, 2, 0, 1);
   x  = glmm_splat_x(xp);

include/cglm/simd/x86.h

@@ -54,10 +54,23 @@
 # endif
 #endif

+#define GLMM_NEGZEROf 0x80000000 /* 0x80000000 ---> -0.0f */
+
+#define GLMM__SIGNMASKf(X, Y, Z, W)                                           \
+  _mm_castsi128_ps(_mm_set_epi32(X, Y, Z, W))
+  /* _mm_set_ps(X, Y, Z, W); */
+
+#define glmm_float32x4_SIGNMASK_PNPN GLMM__SIGNMASKf(0, GLMM_NEGZEROf, 0, GLMM_NEGZEROf)
+#define glmm_float32x4_SIGNMASK_NPNP GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, GLMM_NEGZEROf, 0)
+#define glmm_float32x4_SIGNMASK_NPPN GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, 0, GLMM_NEGZEROf)
+
+#define glmm_float32x4_SIGNMASK_NEG _mm_castsi128_ps(_mm_set1_epi32(0x80000000)) /* _mm_set1_ps(-0.0f) */
+#define glmm_float32x8_SIGNMASK_NEG _mm256_castsi256_ps(_mm256_set1_epi32(GLMM_NEGZEROf))
+
 static inline
 __m128
 glmm_abs(__m128 x) {
-  return _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
+  return _mm_andnot_ps(glmm_float32x4_SIGNMASK_NEG, x);
 }

 static inline
@@ -256,7 +269,8 @@ glmm_fnmsub(__m128 a, __m128 b, __m128 c) {
 #ifdef __FMA__
   return _mm_fnmsub_ps(a, b, c);
 #else
-  return _mm_xor_ps(_mm_add_ps(_mm_mul_ps(a, b), c), _mm_set1_ps(-0.0f));
+  return _mm_xor_ps(_mm_add_ps(_mm_mul_ps(a, b), c),
+                    glmm_float32x4_SIGNMASK_NEG);
 #endif
 }
@@ -298,7 +312,7 @@ glmm256_fnmsub(__m256 a, __m256 b, __m256 c) {
   return _mm256_fmsub_ps(a, b, c);
 #else
   return _mm256_xor_ps(_mm256_sub_ps(_mm256_mul_ps(a, b), c),
-                       _mm256_set1_ps(-0.0f));
+                       glmm_float32x8_SIGNMASK_NEG);
 #endif
 }
 #endif

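Two bit identities carry this file: ANDN with the sign mask clears the sign bit, giving |x|, and XOR with it flips the sign, so the non-FMA fnmsub fallback can compute -(a*b) - c as (a*b + c) XOR signbit. A scalar cross-check of both (the local SIGNMASK_NEG macro mirrors glmm_float32x4_SIGNMASK_NEG; the test itself is ours):

#include <assert.h>
#include <emmintrin.h>

#define SIGNMASK_NEG _mm_castsi128_ps(_mm_set1_epi32((int)0x80000000))

int main(void) {
  float r[4];

  /* andnot(mask, x) = (~mask) & x: keeps every bit of x but the sign -> |x| */
  _mm_storeu_ps(r, _mm_andnot_ps(SIGNMASK_NEG,
                                 _mm_set_ps(-4.f, 3.f, -2.f, 1.f)));
  assert(r[0] == 1.f && r[1] == 2.f && r[2] == 3.f && r[3] == 4.f);

  /* fnmsub: -(a*b) - c == -(a*b + c) == (a*b + c) ^ signbit */
  _mm_storeu_ps(r, _mm_xor_ps(_mm_add_ps(_mm_mul_ps(_mm_set1_ps(2.f),
                                                    _mm_set1_ps(3.f)),
                                         _mm_set1_ps(4.f)),
                              SIGNMASK_NEG));
  assert(r[0] == -10.f); /* -(2*3) - 4 */
  return 0;
}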
include/cglm/vec4.h

@@ -709,7 +709,7 @@ glm_vec4_negate_to(vec4 v, vec4 dest) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_neg(glmm_load(v)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f)));
+  glmm_store(dest, _mm_xor_ps(glmm_load(v), glmm_float32x4_SIGNMASK_NEG));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vnegq_f32(vld1q_f32(v)));
 #else
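
Caller-visible behavior is unchanged: glm_vec4_negate_to still writes -v into dest, only the mask construction behind the SSE path differs. Typical usage, for reference:

#include <stdio.h>
#include <cglm/cglm.h>

int main(void) {
  vec4 v = {1.0f, -2.0f, 3.0f, -4.0f}, dest;
  glm_vec4_negate_to(v, dest); /* SIMD path when available, scalar otherwise */
  printf("%g %g %g %g\n", dest[0], dest[1], dest[2], dest[3]); /* -1 2 -3 4 */
  return 0;
}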