From 998d9626a242dce56bbbe8819f7a31bd0150518e Mon Sep 17 00:00:00 2001 From: myfreeer Date: Sun, 26 Mar 2023 19:22:24 +0800 Subject: [PATCH] simd128: inline _mm_unpackhi_ps and _mm_unpacklo_ps --- include/cglm/simd/wasm.h | 20 ++++---------------- include/cglm/simd/wasm/mat3.h | 6 ++++-- include/cglm/simd/wasm/quat.h | 4 ++-- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/include/cglm/simd/wasm.h b/include/cglm/simd/wasm.h index 21ba62d..4ade269 100644 --- a/include/cglm/simd/wasm.h +++ b/include/cglm/simd/wasm.h @@ -22,18 +22,6 @@ #define _mm_cvtss_f32(v) wasm_f32x4_extract_lane(v, 0) -static inline glmm_128 __attribute__((__always_inline__, __nodebug__)) -_mm_unpackhi_ps(glmm_128 __a, glmm_128 __b) -{ - return wasm_i32x4_shuffle(__a, __b, 2, 6, 3, 7); -} - -static inline glmm_128 __attribute__((__always_inline__, __nodebug__)) -_mm_unpacklo_ps(glmm_128 __a, glmm_128 __b) -{ - return wasm_i32x4_shuffle(__a, __b, 0, 4, 1, 5); -} - static inline glmm_128 __attribute__((__always_inline__, __nodebug__)) _mm_movehl_ps(glmm_128 __a, glmm_128 __b) { @@ -79,10 +67,10 @@ _mm_store_ss(float *__p, glmm_128 __a) glmm_128 __row1 = (row1); \ glmm_128 __row2 = (row2); \ glmm_128 __row3 = (row3); \ - glmm_128 __tmp0 = _mm_unpacklo_ps(__row0, __row1); \ - glmm_128 __tmp1 = _mm_unpackhi_ps(__row0, __row1); \ - glmm_128 __tmp2 = _mm_unpacklo_ps(__row2, __row3); \ - glmm_128 __tmp3 = _mm_unpackhi_ps(__row2, __row3); \ + glmm_128 __tmp0 = wasm_i32x4_shuffle(__row0, __row1, 0, 4, 1, 5); \ + glmm_128 __tmp1 = wasm_i32x4_shuffle(__row0, __row1, 2, 6, 3, 7); \ + glmm_128 __tmp2 = wasm_i32x4_shuffle(__row2, __row3, 0, 4, 1, 5); \ + glmm_128 __tmp3 = wasm_i32x4_shuffle(__row2, __row3, 2, 6, 3, 7); \ (row0) = _mm_movelh_ps(__tmp0, __tmp2); \ (row1) = _mm_movehl_ps(__tmp2, __tmp0); \ (row2) = _mm_movelh_ps(__tmp1, __tmp3); \ diff --git a/include/cglm/simd/wasm/mat3.h b/include/cglm/simd/wasm/mat3.h index 412ad8a..621d2bd 100644 --- a/include/cglm/simd/wasm/mat3.h +++ b/include/cglm/simd/wasm/mat3.h @@ -42,8 +42,10 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) { x6 = glmm_shuff1(x3, 2, 0, 0, 0); /* b11 b01 b01 b01 */ x2 = glmm_shuff1(r1, 3, 3, 0, 0); /* b21 b21 b11 b11 */ - x8 = _mm_unpackhi_ps(x8, x4); /* a10 a00 a12 a02 */ - x9 = _mm_unpackhi_ps(x7, x2); /* b21 b20 b21 b20 */ + // x8 = _mm_unpackhi_ps(x8, x4); + // x9 = _mm_unpackhi_ps(x7, x2); + x8 = wasm_i32x4_shuffle(x8, x4, 2, 6, 3, 7); /* a10 a00 a12 a02 */ + x9 = wasm_i32x4_shuffle(x7, x2, 2, 6, 3, 7); /* b21 b20 b21 b20 */ x0 = glmm_fmadd(x4, x6, x0); x1 = glmm_fmadd(x5, x2, x1); diff --git a/include/cglm/simd/wasm/quat.h b/include/cglm/simd/wasm/quat.h index ccbcec2..f8434f1 100644 --- a/include/cglm/simd/wasm/quat.h +++ b/include/cglm/simd/wasm/quat.h @@ -28,8 +28,8 @@ glm_quat_mul_wasm(versor p, versor q, versor dest) { xq = glmm_load(q); x1 = wasm_f32x4_const(0.f, -0.f, 0.f, -0.f); /* TODO: _mm_set1_ss() + shuff ? */ r = wasm_f32x4_mul(glmm_splat_w(xp), xq); - - x2 = _mm_unpackhi_ps(x1, x1); + // x2 = _mm_unpackhi_ps(x1, x1); + x2 = wasm_i32x4_shuffle(x1, x1, 2, 6, 3, 7); x3 = glmm_shuff1(x1, 3, 2, 0, 1); x = glmm_splat_x(xp); y = glmm_splat_y(xp);