simd128: inline _mm_shuffle_ps

This commit is contained in:
myfreeer
2023-03-26 19:17:40 +08:00
parent f24ec41a26
commit 84b482971d
3 changed files with 19 additions and 28 deletions

View File

@@ -11,15 +11,6 @@
/* Broadcast: expands to wasm_f32x4_splat(x). */
#define glmm_set1(x) wasm_f32x4_splat(x)
/* Vector type alias used by the glmm_* helpers. */
#define glmm_128 v128_t
/*
 * Pack four lane selectors into a single integer mask:
 * w -> bits 7:6, z -> bits 5:4, y -> bits 3:2, x -> bits 1:0.
 * NOTE(review): the fields only stay separate if each argument is in 0..3;
 * nothing here enforces that.
 */
#define _MM_SHUFFLE(w, z, y, x) (((w) << 6) | ((z) << 4) | ((y) << 2) | (x))
/*
 * Two-source shuffle driven by a mask in the _MM_SHUFFLE layout.
 * Each 2-bit field of __mask is extracted and forwarded to
 * wasm_i32x4_shuffle as one lane index: the two low fields are passed
 * unchanged (+0) and the two high fields are offset by 4. Per the
 * wasm_simd128.h shuffle convention, indices 0..3 pick from the first
 * operand and 4..7 from the second, so result lanes 0-1 come from __a and
 * lanes 2-3 come from __b. The result is cast to glmm_128.
 * __mask is expanded four times, so it should be a side-effect-free
 * constant expression.
 */
#define _mm_shuffle_ps(__a, __b, __mask) \
((glmm_128)wasm_i32x4_shuffle(__a, __b, \
(((__mask) >> 0) & 0x3) + 0, \
(((__mask) >> 2) & 0x3) + 0, \
(((__mask) >> 4) & 0x3) + 4, \
(((__mask) >> 6) & 0x3) + 4))
/*
 * Single-source shuffle: xmm is passed as both operands, and the selectors
 * are forwarded in reverse order (w, x, y, z), i.e. the macro's last
 * argument becomes the first shuffle index.
 */
#define glmm_shuff1(xmm, z, y, x, w) wasm_i32x4_shuffle(xmm, xmm, w, x, y, z)
/* Replicate one lane of x into all four lanes by using the same selector four times. */
#define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane)

View File

@@ -25,8 +25,8 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) {
x8 = glmm_shuff1(l0, 0, 2, 1, 0); /* a00 a02 a01 a00 */
x1 = glmm_shuff1(r0, 3, 0, 0, 0); /* b10 b00 b00 b00 */
x2 = _mm_shuffle_ps(l0, l1, _MM_SHUFFLE(1, 0, 3, 3)); /* a12 a11 a10 a10 */
x3 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 0, 3, 1)); /* b20 b11 b10 b01 */
x2 = wasm_i32x4_shuffle(l0, l1, 3, 3, 4, 5); /* a12 a11 a10 a10 */
x3 = wasm_i32x4_shuffle(r0, r1, 1, 3, 4, 6); /* b20 b11 b10 b01 */
x0 = wasm_f32x4_mul(x8, x1);
x6 = glmm_shuff1(l0, 1, 0, 2, 1); /* a01 a00 a02 a01 */
@@ -51,7 +51,7 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) {
x2 = _mm_movehl_ps(l2, l1); /* a22 a22 a21 a20 */
x3 = glmm_shuff1(x2, 0, 2, 1, 0); /* a20 a22 a21 a20 */
x2 = glmm_shuff1(x2, 1, 0, 2, 1); /* a21 a20 a22 a21 */
x4 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 1, 2, 2)); /* b12 b12 b02 b02 */
x4 = wasm_i32x4_shuffle(r0, r1, 2, 2, 5, 5); /* b12 b12 b02 b02 */
x5 = glmm_shuff1(x4, 3, 0, 0, 0); /* b12 b02 b02 b02 */
x4 = _mm_movehl_ps(r2, x4); /* b22 b22 b12 b12 */

View File

@@ -148,9 +148,9 @@ glm_mat4_det_wasm(mat4 mat) {
*/
x2 = glmm_fnmadd(glmm_shuff1(r1, 1, 1, 2, 2), glmm_shuff1(x0, 3, 2, 2, 0),
wasm_f32x4_mul(glmm_shuff1(r1, 0, 0, 0, 1),
_mm_shuffle_ps(x1, x0, _MM_SHUFFLE(1, 0, 0, 0))));
wasm_i32x4_shuffle(x1, x0, 0, 0, 4, 5)));
x2 = glmm_fmadd(glmm_shuff1(r1, 2, 3, 3, 3),
_mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 2, 3, 1)),
wasm_i32x4_shuffle(x0, x1, 1, 3, 6, 6),
x2);
x2 = wasm_v128_xor(x2, wasm_f32x4_const(0.f, -0.f, 0.f, -0.f));
@@ -182,10 +182,10 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) {
x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */
x7 = glmm_shuff1(x3, 0, 2, 2, 2); /* i m m m */
x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */
x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */
x3 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(2, 2, 2, 2)); /* g g k k */
x0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(3, 3, 3, 3)); /* h h l l */
x6 = wasm_i32x4_shuffle(r2, r1, 0, 0, 4, 4); /* e e i i */
x5 = wasm_i32x4_shuffle(r2, r1, 1, 1, 5, 5); /* f f j j */
x3 = wasm_i32x4_shuffle(r2, r1, 2, 2, 6, 6); /* g g k k */
x0 = wasm_i32x4_shuffle(r2, r1, 3, 3, 7, 7); /* h h l l */
t0 = wasm_f32x4_mul(x3, x1);
t1 = wasm_f32x4_mul(x5, x1);
@@ -282,9 +282,9 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) {
v3 = wasm_v128_xor(v3, x9);
/* determinant */
x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0));
x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0));
x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0));
x0 = wasm_i32x4_shuffle(v0, v1, 0, 0, 4, 4);
x1 = wasm_i32x4_shuffle(v2, v3, 0, 0, 4, 4);
x0 = wasm_i32x4_shuffle(x0, x1, 0, 2, 4, 6);
x0 = _mm_rcp_ps(glmm_vhadd(wasm_f32x4_mul(x0, r0)));
@@ -318,10 +318,10 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) {
x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */
x7 = glmm_shuff1(x3, 0, 2, 2, 2); /* i m m m */
x6 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 0)); /* e e i i */
x5 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(1, 1, 1, 1)); /* f f j j */
x3 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(2, 2, 2, 2)); /* g g k k */
x0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(3, 3, 3, 3)); /* h h l l */
x6 = wasm_i32x4_shuffle(r2, r1, 0, 0, 4, 4); /* e e i i */
x5 = wasm_i32x4_shuffle(r2, r1, 1, 1, 5, 5); /* f f j j */
x3 = wasm_i32x4_shuffle(r2, r1, 2, 2, 6, 6); /* g g k k */
x0 = wasm_i32x4_shuffle(r2, r1, 3, 3, 7, 7); /* h h l l */
t0 = wasm_f32x4_mul(x3, x1);
t1 = wasm_f32x4_mul(x5, x1);
@@ -418,9 +418,9 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) {
v3 = wasm_v128_xor(v3, x9);
/* determinant */
x0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(0, 0, 0, 0));
x1 = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 0, 0));
x0 = _mm_shuffle_ps(x0, x1, _MM_SHUFFLE(2, 0, 2, 0));
x0 = wasm_i32x4_shuffle(v0, v1, 0, 0, 4, 4);
x1 = wasm_i32x4_shuffle(v2, v3, 0, 0, 4, 4);
x0 = wasm_i32x4_shuffle(x0, x1, 0, 2, 4, 6);
x0 = wasm_f32x4_div(wasm_f32x4_splat(1.0f), glmm_vhadd(wasm_f32x4_mul(x0, r0)));