From 84b482971d818ed9bd2ee6d7a0072644c249d524 Mon Sep 17 00:00:00 2001 From: myfreeer Date: Sun, 26 Mar 2023 19:17:40 +0800 Subject: [PATCH] simd128: inline _mm_shuffle_ps --- include/cglm/simd/wasm.h | 9 --------- include/cglm/simd/wasm/mat3.h | 6 +++--- include/cglm/simd/wasm/mat4.h | 32 ++++++++++++++++---------------- 3 files changed, 19 insertions(+), 28 deletions(-) diff --git a/include/cglm/simd/wasm.h b/include/cglm/simd/wasm.h index 38c992e..21ba62d 100644 --- a/include/cglm/simd/wasm.h +++ b/include/cglm/simd/wasm.h @@ -11,15 +11,6 @@ #define glmm_set1(x) wasm_f32x4_splat(x) #define glmm_128 v128_t -#define _MM_SHUFFLE(w, z, y, x) (((w) << 6) | ((z) << 4) | ((y) << 2) | (x)) - -#define _mm_shuffle_ps(__a, __b, __mask) \ - ((glmm_128)wasm_i32x4_shuffle(__a, __b, \ - (((__mask) >> 0) & 0x3) + 0, \ - (((__mask) >> 2) & 0x3) + 0, \ - (((__mask) >> 4) & 0x3) + 4, \ - (((__mask) >> 6) & 0x3) + 4)) - #define glmm_shuff1(xmm, z, y, x, w) wasm_i32x4_shuffle(xmm, xmm, w, x, y, z) #define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane) diff --git a/include/cglm/simd/wasm/mat3.h b/include/cglm/simd/wasm/mat3.h index f4d8fed..412ad8a 100644 --- a/include/cglm/simd/wasm/mat3.h +++ b/include/cglm/simd/wasm/mat3.h @@ -25,8 +25,8 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) { x8 = glmm_shuff1(l0, 0, 2, 1, 0); /* a00 a02 a01 a00 */ x1 = glmm_shuff1(r0, 3, 0, 0, 0); /* b10 b00 b00 b00 */ - x2 = _mm_shuffle_ps(l0, l1, _MM_SHUFFLE(1, 0, 3, 3)); /* a12 a11 a10 a10 */ - x3 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 0, 3, 1)); /* b20 b11 b10 b01 */ + x2 = wasm_i32x4_shuffle(l0, l1, 3, 3, 4, 5); /* a12 a11 a10 a10 */ + x3 = wasm_i32x4_shuffle(r0, r1, 1, 3, 4, 6); /* b20 b11 b10 b01 */ x0 = wasm_f32x4_mul(x8, x1); x6 = glmm_shuff1(l0, 1, 0, 2, 1); /* a01 a00 a02 a01 */ @@ -51,7 +51,7 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) { x2 = _mm_movehl_ps(l2, l1); /* a22 a22 a21 a20 */ x3 = glmm_shuff1(x2, 0, 2, 1, 0); /* a20 a22 a21 a20 */ x2 = glmm_shuff1(x2, 1, 0, 2, 1); /* a21 a20 a22 a21 */ - x4 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 1, 2, 2)); /* b12 b12 b02 b02 */ + x4 = wasm_i32x4_shuffle(r0, r1, 2, 2, 5, 5); /* b12 b12 b02 b02 */ x5 = glmm_shuff1(x4, 3, 0, 0, 0); /* b12 b02 b02 b02 */ x4 = _mm_movehl_ps(r2, x4); /* b22 b22 b12 b12 */ diff --git a/include/cglm/simd/wasm/mat4.h b/include/cglm/simd/wasm/mat4.h index 847c6c3..35d5854 100644 --- a/include/cglm/simd/wasm/mat4.h +++ b/include/cglm/simd/wasm/mat4.h @@ -148,9 +148,9 @@ glm_mat4_det_wasm(mat4 mat) { */ x2 = glmm_fnmadd(glmm_shuff1(r1, 1, 1, 2, 2), glmm_shuff1(x0, 3, 2, 2, 0), wasm_f32x4_mul(glmm_shuff1(r1, 0, 0, 0, 1), - _mm_shuffle_ps(x1, x0, _MM_SHUFFLE(1, 0, 0, 0)))); + wasm_i32x4_shuffle(x1, x0, 0, 0, 4, 5))); x2 = glmm_fmadd(glmm_shuff1(r1, 2, 3, 3, 3), - _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 3, 1)), + wasm_i32x4_shuffle(x0, x1, 1, 3, 6, 6), x2); x2 = wasm_v128_xor(x2, wasm_f32x4_const(0.f, -0.f, 0.f, -0.f)); @@ -182,10 +182,10 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) { x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */ x7 = glmm_shuff1(x3, 0, 2, 2, 2); /* i m m m */ - x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */ - x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */ - x3 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(2, 2, 2, 2)); /* g g k k */ - x0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(3, 3, 3, 3)); /* h h l l */ + x6 = wasm_i32x4_shuffle(r2, r1, 0, 0, 4, 4); /* e e i i */ + x5 = wasm_i32x4_shuffle(r2, r1, 1, 1, 5, 5); /* f f j j */ + x3 = wasm_i32x4_shuffle(r2, r1, 2, 2, 6, 6); /* g g k k */ + x0 = wasm_i32x4_shuffle(r2, r1, 3, 3, 7, 7); /* h h l l */ t0 = wasm_f32x4_mul(x3, x1); t1 = wasm_f32x4_mul(x5, x1); @@ -282,9 +282,9 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) { v3 = wasm_v128_xor(v3, x9); /* determinant */ - x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0)); - x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0)); - x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0)); + x0 = wasm_i32x4_shuffle(v0, v1, 0, 0, 4, 4); + x1 = wasm_i32x4_shuffle(v2, v3, 0, 0, 4, 4); + x0 = wasm_i32x4_shuffle(x0, x1, 0, 2, 4, 6); x0 = _mm_rcp_ps(glmm_vhadd(wasm_f32x4_mul(x0, r0))); @@ -318,10 +318,10 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) { x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */ x7 = glmm_shuff1(x3, 0, 2, 2, 2); /* i m m m */ - x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */ - x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */ - x3 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(2, 2, 2, 2)); /* g g k k */ - x0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(3, 3, 3, 3)); /* h h l l */ + x6 = wasm_i32x4_shuffle(r2, r1, 0, 0, 4, 4); /* e e i i */ + x5 = wasm_i32x4_shuffle(r2, r1, 1, 1, 5, 5); /* f f j j */ + x3 = wasm_i32x4_shuffle(r2, r1, 2, 2, 6, 6); /* g g k k */ + x0 = wasm_i32x4_shuffle(r2, r1, 3, 3, 7, 7); /* h h l l */ t0 = wasm_f32x4_mul(x3, x1); t1 = wasm_f32x4_mul(x5, x1); @@ -418,9 +418,9 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) { v3 = wasm_v128_xor(v3, x9); /* determinant */ - x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0)); - x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0)); - x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0)); + x0 = wasm_i32x4_shuffle(v0, v1, 0, 0, 4, 4); + x1 = wasm_i32x4_shuffle(v2, v3, 0, 0, 4, 4); + x0 = wasm_i32x4_shuffle(x0, x1, 0, 2, 4, 6); x0 = wasm_f32x4_div(wasm_f32x4_splat(1.0f), glmm_vhadd(wasm_f32x4_mul(x0, r0)));