simd128: inline _mm_movehl_ps

This commit is contained in:
myfreeer
2023-04-01 19:28:45 +08:00
parent e40b477929
commit 48d6ab79bd
5 changed files with 45 additions and 40 deletions

View File

@@ -22,12 +22,6 @@
#define _mm_cvtss_f32(v) wasm_f32x4_extract_lane(v, 0)
/*
 * WebAssembly SIMD128 stand-in for the SSE intrinsic of the same name.
 *
 * Result lanes, low to high:
 *   lane 0 = __b lane 2
 *   lane 1 = __b lane 3
 *   lane 2 = __a lane 2
 *   lane 3 = __a lane 3
 *
 * In wasm_i32x4_shuffle, indices 0..3 pick lanes from the first operand
 * and indices 4..7 pick lanes from the second operand, so (6, 7, 2, 3)
 * takes the upper half of __b followed by the upper half of __a.
 */
static inline glmm_128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(glmm_128 __a, glmm_128 __b)
{
  glmm_128 upper_halves = wasm_i32x4_shuffle(__a, __b, 6, 7, 2, 3);

  return upper_halves;
}
static inline
glmm_128
glmm_abs(glmm_128 x) {
@@ -49,7 +43,8 @@ glmm_vhadds(glmm_128 v) {
glmm_128 shuf, sums;
shuf = glmm_shuff1(v, 2, 3, 0, 1);
sums = wasm_f32x4_add(v, shuf);
shuf = _mm_movehl_ps(shuf, sums);
// shuf = _mm_movehl_ps(shuf, sums);
shuf = wasm_i32x4_shuffle(shuf, sums, 6, 7, 2, 3);
sums = wasm_i32x4_shuffle(sums, wasm_f32x4_add(sums, shuf), 4, 1, 2, 3);
return sums;
}

View File

@@ -100,17 +100,20 @@ glm_inv_tr_wasm(mat4 mat) {
x5 = wasm_i32x4_shuffle(r2, x1, 2, 6, 3, 7);
// r0 = _mm_movelh_ps(x2, x4);
r0 = wasm_i32x4_shuffle(x2, x4, 0, 1, 4, 5);
r1 = _mm_movehl_ps(x4, x2);
// r1 = _mm_movehl_ps(x4, x2);
r1 = wasm_i32x4_shuffle(x4, x2, 6, 7, 2, 3);
// r2 = _mm_movelh_ps(x3, x5);
r2 = wasm_i32x4_shuffle(x3, x5, 0, 1, 4, 5);
x1 = _mm_movehl_ps(x5, x3);
// x1 = _mm_movehl_ps(x5, x3);
x1 = wasm_i32x4_shuffle(x5, x3, 6, 7, 2, 3);
x2 = glmm_shuff1(r3, 0, 0, 0, 0);
x3 = glmm_shuff1(r3, 1, 1, 1, 1);
x4 = glmm_shuff1(r3, 2, 2, 2, 2);
x5 = wasm_f32x4_const_splat(-0.f);
x0 = glmm_fmadd(r0, x2, glmm_fmadd(r1, x3, wasm_f32x4_mul(r2, x4)));
x0 = glmm_fmadd(r0, x2,
glmm_fmadd(r1, x3, wasm_f32x4_mul(r2, x4)));
x0 = wasm_v128_xor(x0, x5);
x0 = wasm_f32x4_add(x0, x1);

View File

@@ -24,7 +24,8 @@ glm_mat2_mul_wasm(mat2 m1, mat2 m2, mat2 dest) {
x4 = glmm_shuff1(x2, 3, 3, 1, 1);
// x0 = _mm_movelh_ps(x1, x1);
x0 = wasm_i32x4_shuffle(x1, x1, 0, 1, 4, 5);
x2 = _mm_movehl_ps(x1, x1);
// x2 = _mm_movehl_ps(x1, x1);
x2 = wasm_i32x4_shuffle(x1, x1, 6, 7, 2, 3);
/*
dest[0][0] = a * e + c * f;

View File

@@ -50,13 +50,15 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) {
x0 = glmm_fmadd(x4, x6, x0);
x1 = glmm_fmadd(x5, x2, x1);
x2 = _mm_movehl_ps(l2, l1); /* a22 a22 a21 a20 */
// x2 = _mm_movehl_ps(l2, l1);
x2 = wasm_i32x4_shuffle(l2, l1, 6, 7, 2, 3); /* a22 a22 a21 a20 */
x3 = glmm_shuff1(x2, 0, 2, 1, 0); /* a20 a22 a21 a20 */
x2 = glmm_shuff1(x2, 1, 0, 2, 1); /* a21 a20 a22 a21 */
x4 = wasm_i32x4_shuffle(r0, r1, 2, 2, 5, 5); /* b12 b12 b02 b02 */
x5 = glmm_shuff1(x4, 3, 0, 0, 0); /* b12 b02 b02 b02 */
x4 = _mm_movehl_ps(r2, x4); /* b22 b22 b12 b12 */
// x4 = _mm_movehl_ps(r2, x4);
x4 = wasm_i32x4_shuffle(r2, x4, 6, 7, 2, 3); /* b22 b22 b12 b12 */
x0 = glmm_fmadd(x3, x5, x0);
x1 = glmm_fmadd(x2, x4, x1);

View File

@@ -43,10 +43,12 @@ glm_mat4_transp_wasm(mat4 m, mat4 dest) {
tmp3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7);
// r0 = _mm_movelh_ps(tmp0, tmp2);
r0 = wasm_i32x4_shuffle(tmp0, tmp2, 0, 1, 4, 5);
r1 = _mm_movehl_ps(tmp2, tmp0);
// r1 = _mm_movehl_ps(tmp2, tmp0);
r1 = wasm_i32x4_shuffle(tmp2, tmp0, 6, 7, 2, 3);
// r2 = _mm_movelh_ps(tmp1, tmp3);
r2 = wasm_i32x4_shuffle(tmp1, tmp3, 0, 1, 4, 5);
r3 = _mm_movehl_ps(tmp3, tmp1);
// r3 = _mm_movehl_ps(tmp3, tmp1);
r3 = wasm_i32x4_shuffle(tmp3, tmp1, 6, 7, 2, 3);
glmm_store(dest[0], r0);
glmm_store(dest[1], r1);
@@ -184,8 +186,8 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) {
r1 = glmm_load(mat[1]); /* h g f e */
r2 = glmm_load(mat[2]); /* l k j i */
r3 = glmm_load(mat[3]); /* p o n m */
x0 = _mm_movehl_ps(r3, r2); /* p o l k */
// x0 = _mm_movehl_ps(r3, r2);
x0 = wasm_i32x4_shuffle(r3, r2, 6, 7, 2, 3); /* p o l k */
// x3 = _mm_movelh_ps(r2, r3);
x3 = wasm_i32x4_shuffle(r2, r3, 0, 1, 4, 5); /* n m j i */
x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */
@@ -242,7 +244,8 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) {
t5 = glmm_fnmadd(x7, x5, t5);
// x4 = _mm_movelh_ps(r0, r1);
x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5); /* f e b a */
x5 = _mm_movehl_ps(r1, r0); /* h g d c */
// x5 = _mm_movehl_ps(r1, r0);
x5 = wasm_i32x4_shuffle(r1, r0, 6, 7, 2, 3); /* h g d c */
x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */
x1 = glmm_shuff1(x4, 1, 1, 1, 3); /* b b b f */
@@ -323,8 +326,8 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) {
r1 = glmm_load(mat[1]); /* h g f e */
r2 = glmm_load(mat[2]); /* l k j i */
r3 = glmm_load(mat[3]); /* p o n m */
x0 = _mm_movehl_ps(r3, r2); /* p o l k */
// x0 = _mm_movehl_ps(r3, r2);
x0 = wasm_i32x4_shuffle(r3, r2, 6, 7, 2, 3); /* p o l k */
// x3 = _mm_movelh_ps(r2, r3);
x3 = wasm_i32x4_shuffle(r2, r3, 0, 1, 4, 5); /* n m j i */
x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */
@@ -381,7 +384,8 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) {
t5 = glmm_fnmadd(x7, x5, t5);
// x4 = _mm_movelh_ps(r0, r1);
x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5); /* f e b a */
x5 = _mm_movehl_ps(r1, r0); /* h g d c */
// x5 = _mm_movehl_ps(r1, r0);
x5 = wasm_i32x4_shuffle(r1, r0, 6, 7, 2, 3); /* h g d c */
x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */
x1 = glmm_shuff1(x4, 1, 1, 1, 3); /* b b b f */