mirror of https://github.com/recp/cglm.git (synced 2025-12-24 12:32:40 +00:00)
Merge pull request #291 from recp/fix-simd-signmask
simd, sse: use 0x80000000 instead of -0.f to fix fastmath on msvc
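Background on the fix: under MSVC's /fp:fast, signed zeros are not honored, so a -0.0f literal handed to _mm_set1_ps/_mm_set_ps can be folded to +0.0f. The sign masks then become all-zero bits and every XOR-based negation silently turns into a no-op. Building the masks from the integer bit pattern 0x80000000 and bit-casting with _mm_castsi128_ps keeps the constant out of reach of floating-point folding. A minimal standalone sketch of the two approaches (demo code, not taken from the patch):

#include <emmintrin.h>

/* fragile under MSVC /fp:fast: -0.0f may be folded to +0.0f,
   making the XOR mask all-zero and neg() an identity function */
static inline __m128 neg_float_literal(__m128 v) {
  return _mm_xor_ps(v, _mm_set1_ps(-0.0f));
}

/* robust: the sign bit is spelled as an integer constant and only
   reinterpreted as float lanes, so no float folding can touch it */
static inline __m128 neg_bit_pattern(__m128 v) {
  return _mm_xor_ps(v, _mm_castsi128_ps(_mm_set1_epi32((int)0x80000000)));
}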
@@ -52,6 +52,10 @@ glmm_float32x4_init(float x, float y, float z, float w) {
 # define glmm_float32x4_init(x, y, z, w) { x, y, z, w }
 #endif
 
+#define glmm_float32x4_SIGNMASK_PNPN glmm_float32x4_init( 0.f, -0.f, 0.f, -0.f)
+#define glmm_float32x4_SIGNMASK_NPNP glmm_float32x4_init(-0.f, 0.f, -0.f, 0.f)
+#define glmm_float32x4_SIGNMASK_NPPN glmm_float32x4_init(-0.f, 0.f, 0.f, -0.f)
+
 static inline
 float32x4_t
 glmm_abs(float32x4_t v) {
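Note that on the NEON side the masks keep their -0.f braced initializers (the fast-math folding was reported against the MSVC SSE intrinsics); the constants merely get names here. A standalone sketch, under the assumption that such a mask is consumed by an XOR of the sign bits (the helper name is hypothetical):

#include <arm_neon.h>

/* flip the sign of lanes 1 and 3, i.e. v * {1, -1, 1, -1},
   without any multiplies: XOR with the PNPN sign mask */
static inline float32x4_t flip_odd_signs(float32x4_t v) {
  const float32x4_t mask = { 0.f, -0.f, 0.f, -0.f }; /* PNPN */
  return vreinterpretq_f32_u32(
           veorq_u32(vreinterpretq_u32_f32(v),
                     vreinterpretq_u32_f32(mask)));
}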
@@ -99,7 +99,6 @@
 # endif
 #endif
 
-
 /* WebAssembly */
 #if defined(__wasm__) && defined(__wasm_simd128__)
 # ifndef CGLM_SIMD_WASM
@@ -108,7 +108,7 @@ glm_mat4_det_neon(mat4 mat) {
   float32x4_t r0, r1, r2, r3, x0, x1, x2;
   float32x2_t ij, op, mn, kl, nn, mm, jj, ii, gh, ef, t12, t34;
   float32x4x2_t a1;
-  float32x4_t x3 = glmm_float32x4_init(0.f, -0.f, 0.f, -0.f);
+  float32x4_t x3 = glmm_float32x4_SIGNMASK_PNPN;
 
   /* 127 <- 0, [square] det(A) = det(At) */
   r0 = glmm_load(mat[0]); /* d c b a */
@@ -181,7 +181,7 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) {
                 x0, x1, x2, x3, x4, x5, x6, x7, x8;
   float32x4x2_t a1;
   float32x2_t lp, ko, hg, jn, im, fe, ae, bf, cg, dh;
-  float32x4_t x9 = glmm_float32x4_init(-0.f, 0.f, -0.f, 0.f);
+  float32x4_t x9 = glmm_float32x4_SIGNMASK_NPNP;
 
   x8 = vrev64q_f32(x9);
 
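One nicety of the named masks: related patterns can be derived rather than spelled out. vrev64q_f32 swaps the pair of lanes inside each 64-bit half, so reversing the NPNP mask yields PNPN, which is exactly what glm_mat4_inv_neon does above with x8 = vrev64q_f32(x9). A standalone illustration (hypothetical helper name):

#include <arm_neon.h>

/* derive the {+,-,+,-} mask from the {-,+,-,+} one by swapping
   lanes (0,1) and lanes (2,3) within each 64-bit half */
static inline float32x4_t pnpn_from_npnp(void) {
  const float32x4_t npnp = { -0.f, 0.f, -0.f, 0.f };
  return vrev64q_f32(npnp); /* -> { 0.f, -0.f, 0.f, -0.f } */
}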
@@ -23,7 +23,8 @@ glm_quat_mul_neon(versor p, versor q, versor dest) {
    */
 
   glmm_128 xp, xq, xqr, r, x, y, z, s2, s3;
-  glmm_128 s1 = glmm_float32x4_init(-0.f, 0.f, 0.f, -0.f);
+  glmm_128 s1 = glmm_float32x4_SIGNMASK_NPPN;
+
   float32x2_t qh, ql;
 
   xp = glmm_load(p); /* 3 2 1 0 */
@@ -98,7 +98,7 @@ glm_inv_tr_sse2(mat4 mat) {
   x2 = glmm_shuff1(r3, 0, 0, 0, 0);
   x3 = glmm_shuff1(r3, 1, 1, 1, 1);
   x4 = glmm_shuff1(r3, 2, 2, 2, 2);
-  x5 = _mm_set1_ps(-0.f);
+  x5 = glmm_float32x4_SIGNMASK_NEG;
 
   x0 = glmm_fmadd(r0, x2, glmm_fmadd(r1, x3, _mm_mul_ps(r2, x4)));
   x0 = _mm_xor_ps(x0, x5);
@@ -153,7 +153,7 @@ glm_mat4_det_sse2(mat4 mat) {
                   _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 3, 1)),
                   x2);
 
-  x2 = _mm_xor_ps(x2, _mm_set_ps(-0.f, 0.f, -0.f, 0.f));
+  x2 = _mm_xor_ps(x2, glmm_float32x4_SIGNMASK_NPNP);
 
   return glmm_hadd(_mm_mul_ps(x2, r0));
 }
@@ -166,7 +166,8 @@ glm_mat4_inv_fast_sse2(mat4 mat, mat4 dest) {
                 t0, t1, t2, t3, t4, t5,
                 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
 
-  x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f);
+  /* x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */
+  x8 = glmm_float32x4_SIGNMASK_NPNP;
   x9 = glmm_shuff1(x8, 2, 1, 2, 1);
 
   /* 127 <- 0 */
@@ -302,7 +303,8 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
                 t0, t1, t2, t3, t4, t5,
                 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
 
-  x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f);
+  /* x8 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); */
+  x8 = glmm_float32x4_SIGNMASK_NPNP;
   x9 = glmm_shuff1(x8, 2, 1, 2, 1);
 
   /* 127 <- 0 */
@@ -26,9 +26,9 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) {
 
   xp = glmm_load(p); /* 3 2 1 0 */
   xq = glmm_load(q);
-  x1 = _mm_set_ps(-0.f, 0.f, -0.f, 0.f); /* TODO: _mm_set1_ss() + shuff ? */
+  x1 = glmm_float32x4_SIGNMASK_NPNP; /* TODO: _mm_set1_ss() + shuff ? */
   r = _mm_mul_ps(glmm_splat_w(xp), xq);
 
   x2 = _mm_unpackhi_ps(x1, x1);
   x3 = glmm_shuff1(x1, 3, 2, 0, 1);
   x = glmm_splat_x(xp);
@@ -54,10 +54,23 @@
 # endif
 #endif
 
+#define GLMM_NEGZEROf 0x80000000 /* 0x80000000 ---> -0.0f */
+
+#define GLMM__SIGNMASKf(X, Y, Z, W)                                           \
+  _mm_castsi128_ps(_mm_set_epi32(X, Y, Z, W))
+  /* _mm_set_ps(X, Y, Z, W); */
+
+#define glmm_float32x4_SIGNMASK_PNPN GLMM__SIGNMASKf(0, GLMM_NEGZEROf, 0, GLMM_NEGZEROf)
+#define glmm_float32x4_SIGNMASK_NPNP GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, GLMM_NEGZEROf, 0)
+#define glmm_float32x4_SIGNMASK_NPPN GLMM__SIGNMASKf(GLMM_NEGZEROf, 0, 0, GLMM_NEGZEROf)
+
+#define glmm_float32x4_SIGNMASK_NEG _mm_castsi128_ps(_mm_set1_epi32(0x80000000)) /* _mm_set1_ps(-0.0f) */
+#define glmm_float32x8_SIGNMASK_NEG _mm256_castsi256_ps(_mm256_set1_epi32(GLMM_NEGZEROf))
+
 static inline
 __m128
 glmm_abs(__m128 x) {
-  return _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
+  return _mm_andnot_ps(glmm_float32x4_SIGNMASK_NEG, x);
 }
 
 static inline
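The GLMM__SIGNMASKf lanes are spelled as integers and only bit-cast to float lanes, so the optimizer has nothing floating-point to fold. A quick standalone self-check (not part of the patch) that the cast mask really carries the bit pattern of -0.0f:

#include <stdio.h>
#include <string.h>
#include <emmintrin.h>

int main(void) {
  __m128   m = _mm_castsi128_ps(_mm_set1_epi32((int)0x80000000));
  float    f[4];
  unsigned u;

  _mm_storeu_ps(f, m);
  memcpy(&u, &f[0], sizeof u);        /* read the lane's raw bits */
  printf("lane 0 bits: 0x%08X\n", u); /* expect 0x80000000 (-0.0f) */
  return 0;
}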
@@ -256,7 +269,8 @@ glmm_fnmsub(__m128 a, __m128 b, __m128 c) {
 #ifdef __FMA__
   return _mm_fnmsub_ps(a, b, c);
 #else
-  return _mm_xor_ps(_mm_add_ps(_mm_mul_ps(a, b), c), _mm_set1_ps(-0.0f));
+  return _mm_xor_ps(_mm_add_ps(_mm_mul_ps(a, b), c),
+                    glmm_float32x4_SIGNMASK_NEG);
 #endif
 }
 
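The non-FMA fallback of glmm_fnmsub relies on the identity -(a*b) - c = -((a*b) + c): compute the sum with plain mul/add, then flip every sign bit with the mask. Per lane this amounts to (a sketch, not patch code):

/* scalar model of one lane of glmm_fnmsub's non-FMA path */
static inline float fnmsub_lane(float a, float b, float c) {
  return -(a * b + c); /* == -(a * b) - c */
}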
@@ -298,7 +312,7 @@ glmm256_fnmsub(__m256 a, __m256 b, __m256 c) {
   return _mm256_fmsub_ps(a, b, c);
 #else
   return _mm256_xor_ps(_mm256_sub_ps(_mm256_mul_ps(a, b), c),
-                       _mm256_set1_ps(-0.0f));
+                       glmm_float32x8_SIGNMASK_NEG);
 #endif
 }
 #endif
@@ -709,7 +709,7 @@ glm_vec4_negate_to(vec4 v, vec4 dest) {
 #if defined(__wasm__) && defined(__wasm_simd128__)
   glmm_store(dest, wasm_f32x4_neg(glmm_load(v)));
 #elif defined( __SSE__ ) || defined( __SSE2__ )
-  glmm_store(dest, _mm_xor_ps(glmm_load(v), _mm_set1_ps(-0.0f)));
+  glmm_store(dest, _mm_xor_ps(glmm_load(v), glmm_float32x4_SIGNMASK_NEG));
 #elif defined(CGLM_NEON_FP)
   vst1q_f32(dest, vnegq_f32(vld1q_f32(v)));
 #else