Mirror of https://github.com/recp/cglm.git (synced 2025-10-04 09:08:53 +00:00)
simd: fix glmm_set1, glmm_splat
This commit is contained in:
@@ -299,7 +299,7 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) {
|
|||||||
vget_low_f32(vzipq_f32(v2, v3).val[0]));
|
vget_low_f32(vzipq_f32(v2, v3).val[0]));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
x0 = glmm_div(glmm_set1(1.0f), glmm_vhadd(vmulq_f32(x0, r0)));
|
x0 = glmm_div(glmm_set1_rval(1.0f), glmm_vhadd(vmulq_f32(x0, r0)));
|
||||||
|
|
||||||
glmm_store(dest[0], vmulq_f32(v0, x0));
|
glmm_store(dest[0], vmulq_f32(v0, x0));
|
||||||
glmm_store(dest[1], vmulq_f32(v1, x0));
|
glmm_store(dest[1], vmulq_f32(v1, x0));
|
||||||
|
@@ -471,15 +471,15 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
|
|||||||
x8 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(3, 1, 3, 1)); /* k c j b */
|
x8 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(3, 1, 3, 1)); /* k c j b */
|
||||||
x9 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(2, 0, 2, 0)); /* o g n f */
|
x9 = _mm_shuffle_ps(x0, x3, _MM_SHUFFLE(2, 0, 2, 0)); /* o g n f */
|
||||||
|
|
||||||
x10 = glmm_shuff1(x2, 2, 0, 2, 0); /* p h p h */
|
x10 = glmm_shuff1(x2, 2, 0, 2, 0); /* p h p h */
|
||||||
x11 = glmm_shuff1(x2, 3, 1, 3, 1); /* l d l d */
|
x11 = glmm_shuff1(x2, 3, 1, 3, 1); /* l d l d */
|
||||||
|
|
||||||
#if 1 /* TODO measure both */
|
#if 0 /* TODO measure both */
|
||||||
x12 = _mm_shuffle_ps(x4, x5, _MM_SHUFFLE(1, 0, 1, 0)); /* i a k c */
|
x12 = _mm_shuffle_ps(x4, x5, _MM_SHUFFLE(1, 0, 1, 0)); /* i a k c */
|
||||||
x13 = _mm_shuffle_ps(x6, x7, _MM_SHUFFLE(1, 0, 1, 0)); /* m e o g */
|
x13 = _mm_shuffle_ps(x6, x7, _MM_SHUFFLE(1, 0, 1, 0)); /* m e o g */
|
||||||
#else
|
#else
|
||||||
x12 = _mm_movelh_ps(x4, x5); /* i a k c */
|
x12 = _mm_movelh_ps(x4, x5); /* i a k c */
|
||||||
x13 = _mm_movelh_ps(x6, x7); /* m e o g */
|
x13 = _mm_movelh_ps(x6, x7); /* m e o g */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
t0 = _mm_mul_ps(x12, x10);
|
t0 = _mm_mul_ps(x12, x10);
|
||||||
@@ -494,7 +494,7 @@ glm_mat4_inv_sse2(mat4 mat, mat4 dest) {
|
|||||||
/* v0: c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */
|
/* v0: c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */
|
||||||
/* v1: c5 * c12 + c6 * c11 */
|
/* v1: c5 * c12 + c6 * c11 */
|
||||||
|
|
||||||
v5 = glmm_set1(1.0f);
|
v5 = glmm_set1_rval(1.0f);
|
||||||
v0 = glmm_shuff1(t2, 2, 3, 0, 1);
|
v0 = glmm_shuff1(t2, 2, 3, 0, 1);
|
||||||
v1 = glmm_shuff1(t1, 0, 1, 2, 3);
|
v1 = glmm_shuff1(t1, 0, 1, 2, 3);
|
||||||
v0 = _mm_mul_ps(t0, v0);
|
v0 = _mm_mul_ps(t0, v0);
|
||||||
|
@@ -37,21 +37,26 @@
|
|||||||
#define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane)
|
#define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane)
|
||||||
|
|
||||||
#ifdef __AVX__
|
#ifdef __AVX__
|
||||||
# define glmm_set1(x) _mm_broadcast_ss(&x)
|
# define glmm_set1(x) _mm_broadcast_ss(&x)
|
||||||
# define glmm_set1_ptr(x) _mm_broadcast_ss(x)
|
# define glmm_set1_ptr(x) _mm_broadcast_ss(x)
|
||||||
|
# define glmm_set1_rval(x) _mm_set1_ps(x)
|
||||||
# define glmm_splat_x(x) _mm_broadcastss_ps(x)
|
# ifdef __AVX2__
|
||||||
# define glmm_splat_y(x) _mm_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1))
|
# define glmm_splat_x(x) _mm_broadcastss_ps(x)
|
||||||
# define glmm_splat_z(x) _mm_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2))
|
# else
|
||||||
# define glmm_splat_w(x) _mm_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3))
|
# define glmm_splat_x(x) _mm_permute_ps(x, _MM_SHUFFLE(0, 0, 0, 0))
|
||||||
|
# endif
|
||||||
|
# define glmm_splat_y(x) _mm_permute_ps(x, _MM_SHUFFLE(1, 1, 1, 1))
|
||||||
|
# define glmm_splat_z(x) _mm_permute_ps(x, _MM_SHUFFLE(2, 2, 2, 2))
|
||||||
|
# define glmm_splat_w(x) _mm_permute_ps(x, _MM_SHUFFLE(3, 3, 3, 3))
|
||||||
#else
|
#else
|
||||||
# define glmm_set1(x) _mm_set1_ps(x)
|
# define glmm_set1(x) _mm_set1_ps(x)
|
||||||
# define glmm_set1_ptr(x) _mm_set1_ps(*x)
|
# define glmm_set1_ptr(x) _mm_set1_ps(*x)
|
||||||
|
# define glmm_set1_rval(x) _mm_set1_ps(x)
|
||||||
|
|
||||||
# define glmm_splat_x(x) glmm_splat(x, 0)
|
# define glmm_splat_x(x) glmm_splat(x, 0)
|
||||||
# define glmm_splat_y(x) glmm_splat(x, 1)
|
# define glmm_splat_y(x) glmm_splat(x, 1)
|
||||||
# define glmm_splat_z(x) glmm_splat(x, 2)
|
# define glmm_splat_z(x) glmm_splat(x, 2)
|
||||||
# define glmm_splat_w(x) glmm_splat(x, 3)
|
# define glmm_splat_w(x) glmm_splat(x, 3)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __AVX__
|
#ifdef __AVX__
|
||||||
|
@@ -215,7 +215,7 @@ glm_vec4_one(vec4 v) {
|
|||||||
#if defined(__wasm__) && defined(__wasm_simd128__)
|
#if defined(__wasm__) && defined(__wasm_simd128__)
|
||||||
glmm_store(v, wasm_f32x4_const_splat(1.0f));
|
glmm_store(v, wasm_f32x4_const_splat(1.0f));
|
||||||
#elif defined( __SSE__ ) || defined( __SSE2__ )
|
#elif defined( __SSE__ ) || defined( __SSE2__ )
|
||||||
glmm_store(v, glmm_set1(1.0f));
|
glmm_store(v, glmm_set1_rval(1.0f));
|
||||||
#elif defined(CGLM_NEON_FP)
|
#elif defined(CGLM_NEON_FP)
|
||||||
vst1q_f32(v, vdupq_n_f32(1.0f));
|
vst1q_f32(v, vdupq_n_f32(1.0f));
|
||||||
#else
|
#else
|
||||||
|
Reference in New Issue
Block a user