From e40b477929be138f75cdcd4db43f2987fbe9aea5 Mon Sep 17 00:00:00 2001 From: myfreeer Date: Sat, 1 Apr 2023 19:19:49 +0800 Subject: [PATCH] simd128: inline _mm_movelh_ps --- include/cglm/simd/wasm.h | 6 ------ include/cglm/simd/wasm/affine.h | 6 ++++-- include/cglm/simd/wasm/mat2.h | 3 ++- include/cglm/simd/wasm/mat3.h | 10 ++++++---- include/cglm/simd/wasm/mat4.h | 20 ++++++++++++-------- 5 files changed, 24 insertions(+), 21 deletions(-) diff --git a/include/cglm/simd/wasm.h b/include/cglm/simd/wasm.h index 82bc4ae..ba621dd 100644 --- a/include/cglm/simd/wasm.h +++ b/include/cglm/simd/wasm.h @@ -28,12 +28,6 @@ _mm_movehl_ps(glmm_128 __a, glmm_128 __b) return wasm_i32x4_shuffle(__a, __b, 6, 7, 2, 3); } -static inline glmm_128 __attribute__((__always_inline__, __nodebug__)) -_mm_movelh_ps(glmm_128 __a, glmm_128 __b) -{ - return wasm_i32x4_shuffle(__a, __b, 0, 1, 4, 5); -} - static inline glmm_128 glmm_abs(glmm_128 x) { diff --git a/include/cglm/simd/wasm/affine.h b/include/cglm/simd/wasm/affine.h index 4fa2c44..9471592 100644 --- a/include/cglm/simd/wasm/affine.h +++ b/include/cglm/simd/wasm/affine.h @@ -98,9 +98,11 @@ glm_inv_tr_wasm(mat4 mat) { x3 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); x4 = wasm_i32x4_shuffle(r2, x1, 0, 4, 1, 5); x5 = wasm_i32x4_shuffle(r2, x1, 2, 6, 3, 7); - r0 = _mm_movelh_ps(x2, x4); + // r0 = _mm_movelh_ps(x2, x4); + r0 = wasm_i32x4_shuffle(x2, x4, 0, 1, 4, 5); r1 = _mm_movehl_ps(x4, x2); - r2 = _mm_movelh_ps(x3, x5); + // r2 = _mm_movelh_ps(x3, x5); + r2 = wasm_i32x4_shuffle(x3, x5, 0, 1, 4, 5); x1 = _mm_movehl_ps(x5, x3); x2 = glmm_shuff1(r3, 0, 0, 0, 0); diff --git a/include/cglm/simd/wasm/mat2.h b/include/cglm/simd/wasm/mat2.h index 42dc1f7..6c3f5fb 100644 --- a/include/cglm/simd/wasm/mat2.h +++ b/include/cglm/simd/wasm/mat2.h @@ -22,7 +22,8 @@ glm_mat2_mul_wasm(mat2 m1, mat2 m2, mat2 dest) { x3 = glmm_shuff1(x2, 2, 2, 0, 0); x4 = glmm_shuff1(x2, 3, 3, 1, 1); - x0 = _mm_movelh_ps(x1, x1); + // x0 = _mm_movelh_ps(x1, x1); + x0 = wasm_i32x4_shuffle(x1, x1, 0, 1, 4, 5); x2 = _mm_movehl_ps(x1, x1); /* diff --git a/include/cglm/simd/wasm/mat3.h b/include/cglm/simd/wasm/mat3.h index 25b911c..62d179c 100644 --- a/include/cglm/simd/wasm/mat3.h +++ b/include/cglm/simd/wasm/mat3.h @@ -25,8 +25,8 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) { x8 = glmm_shuff1(l0, 0, 2, 1, 0); /* a00 a02 a01 a00 */ x1 = glmm_shuff1(r0, 3, 0, 0, 0); /* b10 b00 b00 b00 */ - x2 = wasm_i32x4_shuffle(l0, l1, 3, 3, 4, 5); /* a12 a11 a10 a10 */ - x3 = wasm_i32x4_shuffle(r0, r1, 1, 3, 4, 6); /* b20 b11 b10 b01 */ + x2 = wasm_i32x4_shuffle(l0, l1, 3, 3, 4, 5); /* a12 a11 a10 a10 */ + x3 = wasm_i32x4_shuffle(r0, r1, 1, 3, 4, 6); /* b20 b11 b10 b01 */ x0 = wasm_f32x4_mul(x8, x1); x6 = glmm_shuff1(l0, 1, 0, 2, 1); /* a01 a00 a02 a01 */ @@ -65,8 +65,10 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) { a12 * b21 + a22 * b22 + 0 * 00 */ - x2 = _mm_movelh_ps(x8, l2); /* 0.f a22 a12 a02 */ - x3 = _mm_movelh_ps(x9, r2); /* 0.f b22 b21 b20 */ + // x2 = _mm_movelh_ps(x8, l2); + // x3 = _mm_movelh_ps(x9, r2); + x2 = wasm_i32x4_shuffle(x8, l2, 0, 1, 4, 5); /* 0.f a22 a12 a02 */ + x3 = wasm_i32x4_shuffle(x9, r2, 0, 1, 4, 5); /* 0.f b22 b21 b20 */ x2 = glmm_vdots(x2, x3); // _mm_storeu_ps(&dest[0][0], x0); diff --git a/include/cglm/simd/wasm/mat4.h b/include/cglm/simd/wasm/mat4.h index b90daee..4b3cde5 100644 --- a/include/cglm/simd/wasm/mat4.h +++ b/include/cglm/simd/wasm/mat4.h @@ -41,9 +41,11 @@ glm_mat4_transp_wasm(mat4 m, mat4 dest) { tmp1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); tmp2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5); tmp3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7); - r0 = _mm_movelh_ps(tmp0, tmp2); + // r0 = _mm_movelh_ps(tmp0, tmp2); + r0 = wasm_i32x4_shuffle(tmp0, tmp2, 0, 1, 4, 5); r1 = _mm_movehl_ps(tmp2, tmp0); - r2 = _mm_movelh_ps(tmp1, tmp3); + // r2 = _mm_movelh_ps(tmp1, tmp3); + r2 = wasm_i32x4_shuffle(tmp1, tmp3, 0, 1, 4, 5); r3 = _mm_movehl_ps(tmp3, tmp1); glmm_store(dest[0], r0); @@ -184,7 +186,8 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) { r3 = glmm_load(mat[3]); /* p o n m */ x0 = _mm_movehl_ps(r3, r2); /* p o l k */ - x3 = _mm_movelh_ps(r2, r3); /* n m j i */ + // x3 = _mm_movelh_ps(r2, r3); + x3 = wasm_i32x4_shuffle(r2, r3, 0, 1, 4, 5); /* n m j i */ x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */ x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */ x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */ @@ -237,8 +240,8 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) { t2[5] = e * n - m * f; t3[5] = e * j - i * f; */ t5 = glmm_fnmadd(x7, x5, t5); - - x4 = _mm_movelh_ps(r0, r1); /* f e b a */ + // x4 = _mm_movelh_ps(r0, r1); + x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5); /* f e b a */ x5 = _mm_movehl_ps(r1, r0); /* h g d c */ x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */ @@ -322,7 +325,8 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) { r3 = glmm_load(mat[3]); /* p o n m */ x0 = _mm_movehl_ps(r3, r2); /* p o l k */ - x3 = _mm_movelh_ps(r2, r3); /* n m j i */ + // x3 = _mm_movelh_ps(r2, r3); + x3 = wasm_i32x4_shuffle(r2, r3, 0, 1, 4, 5); /* n m j i */ x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */ x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */ x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */ @@ -375,8 +379,8 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) { t2[5] = e * n - m * f; t3[5] = e * j - i * f; */ t5 = glmm_fnmadd(x7, x5, t5); - - x4 = _mm_movelh_ps(r0, r1); /* f e b a */ + // x4 = _mm_movelh_ps(r0, r1); + x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5); /* f e b a */ x5 = _mm_movehl_ps(r1, r0); /* h g d c */ x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */