simd128: inline _mm_movelh_ps

This commit is contained in:
myfreeer
2023-04-01 19:19:49 +08:00
parent 5e05eec6d6
commit e40b477929
5 changed files with 24 additions and 21 deletions

View File

@@ -28,12 +28,6 @@ _mm_movehl_ps(glmm_128 __a, glmm_128 __b)
return wasm_i32x4_shuffle(__a, __b, 6, 7, 2, 3); return wasm_i32x4_shuffle(__a, __b, 6, 7, 2, 3);
} }
static inline glmm_128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(glmm_128 __a, glmm_128 __b)
{
return wasm_i32x4_shuffle(__a, __b, 0, 1, 4, 5);
}
static inline static inline
glmm_128 glmm_128
glmm_abs(glmm_128 x) { glmm_abs(glmm_128 x) {

View File

@@ -98,9 +98,11 @@ glm_inv_tr_wasm(mat4 mat) {
x3 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); x3 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7);
x4 = wasm_i32x4_shuffle(r2, x1, 0, 4, 1, 5); x4 = wasm_i32x4_shuffle(r2, x1, 0, 4, 1, 5);
x5 = wasm_i32x4_shuffle(r2, x1, 2, 6, 3, 7); x5 = wasm_i32x4_shuffle(r2, x1, 2, 6, 3, 7);
r0 = _mm_movelh_ps(x2, x4); // r0 = _mm_movelh_ps(x2, x4);
r0 = wasm_i32x4_shuffle(x2, x4, 0, 1, 4, 5);
r1 = _mm_movehl_ps(x4, x2); r1 = _mm_movehl_ps(x4, x2);
r2 = _mm_movelh_ps(x3, x5); // r2 = _mm_movelh_ps(x3, x5);
r2 = wasm_i32x4_shuffle(x3, x5, 0, 1, 4, 5);
x1 = _mm_movehl_ps(x5, x3); x1 = _mm_movehl_ps(x5, x3);
x2 = glmm_shuff1(r3, 0, 0, 0, 0); x2 = glmm_shuff1(r3, 0, 0, 0, 0);

View File

@@ -22,7 +22,8 @@ glm_mat2_mul_wasm(mat2 m1, mat2 m2, mat2 dest) {
x3 = glmm_shuff1(x2, 2, 2, 0, 0); x3 = glmm_shuff1(x2, 2, 2, 0, 0);
x4 = glmm_shuff1(x2, 3, 3, 1, 1); x4 = glmm_shuff1(x2, 3, 3, 1, 1);
x0 = _mm_movelh_ps(x1, x1); // x0 = _mm_movelh_ps(x1, x1);
x0 = wasm_i32x4_shuffle(x1, x1, 0, 1, 4, 5);
x2 = _mm_movehl_ps(x1, x1); x2 = _mm_movehl_ps(x1, x1);
/* /*

View File

@@ -65,8 +65,10 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) {
a12 * b21 + a12 * b21 +
a22 * b22 + a22 * b22 +
0 * 00 */ 0 * 00 */
x2 = _mm_movelh_ps(x8, l2); /* 0.f a22 a12 a02 */ // x2 = _mm_movelh_ps(x8, l2);
x3 = _mm_movelh_ps(x9, r2); /* 0.f b22 b21 b20 */ // x3 = _mm_movelh_ps(x9, r2);
x2 = wasm_i32x4_shuffle(x8, l2, 0, 1, 4, 5); /* 0.f a22 a12 a02 */
x3 = wasm_i32x4_shuffle(x9, r2, 0, 1, 4, 5); /* 0.f b22 b21 b20 */
x2 = glmm_vdots(x2, x3); x2 = glmm_vdots(x2, x3);
// _mm_storeu_ps(&dest[0][0], x0); // _mm_storeu_ps(&dest[0][0], x0);

View File

@@ -41,9 +41,11 @@ glm_mat4_transp_wasm(mat4 m, mat4 dest) {
tmp1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); tmp1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7);
tmp2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5); tmp2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5);
tmp3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7); tmp3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7);
r0 = _mm_movelh_ps(tmp0, tmp2); // r0 = _mm_movelh_ps(tmp0, tmp2);
r0 = wasm_i32x4_shuffle(tmp0, tmp2, 0, 1, 4, 5);
r1 = _mm_movehl_ps(tmp2, tmp0); r1 = _mm_movehl_ps(tmp2, tmp0);
r2 = _mm_movelh_ps(tmp1, tmp3); // r2 = _mm_movelh_ps(tmp1, tmp3);
r2 = wasm_i32x4_shuffle(tmp1, tmp3, 0, 1, 4, 5);
r3 = _mm_movehl_ps(tmp3, tmp1); r3 = _mm_movehl_ps(tmp3, tmp1);
glmm_store(dest[0], r0); glmm_store(dest[0], r0);
@@ -184,7 +186,8 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) {
r3 = glmm_load(mat[3]); /* p o n m */ r3 = glmm_load(mat[3]); /* p o n m */
x0 = _mm_movehl_ps(r3, r2); /* p o l k */ x0 = _mm_movehl_ps(r3, r2); /* p o l k */
x3 = _mm_movelh_ps(r2, r3); /* n m j i */ // x3 = _mm_movelh_ps(r2, r3);
x3 = wasm_i32x4_shuffle(r2, r3, 0, 1, 4, 5); /* n m j i */
x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */ x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */
x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */ x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */
x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */ x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */
@@ -237,8 +240,8 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) {
t2[5] = e * n - m * f; t2[5] = e * n - m * f;
t3[5] = e * j - i * f; */ t3[5] = e * j - i * f; */
t5 = glmm_fnmadd(x7, x5, t5); t5 = glmm_fnmadd(x7, x5, t5);
// x4 = _mm_movelh_ps(r0, r1);
x4 = _mm_movelh_ps(r0, r1); /* f e b a */ x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5); /* f e b a */
x5 = _mm_movehl_ps(r1, r0); /* h g d c */ x5 = _mm_movehl_ps(r1, r0); /* h g d c */
x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */ x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */
@@ -322,7 +325,8 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) {
r3 = glmm_load(mat[3]); /* p o n m */ r3 = glmm_load(mat[3]); /* p o n m */
x0 = _mm_movehl_ps(r3, r2); /* p o l k */ x0 = _mm_movehl_ps(r3, r2); /* p o l k */
x3 = _mm_movelh_ps(r2, r3); /* n m j i */ // x3 = _mm_movelh_ps(r2, r3);
x3 = wasm_i32x4_shuffle(r2, r3, 0, 1, 4, 5); /* n m j i */
x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */ x1 = glmm_shuff1(x0, 1, 3, 3 ,3); /* l p p p */
x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */ x2 = glmm_shuff1(x0, 0, 2, 2, 2); /* k o o o */
x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */ x4 = glmm_shuff1(x3, 1, 3, 3, 3); /* j n n n */
@@ -375,8 +379,8 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) {
t2[5] = e * n - m * f; t2[5] = e * n - m * f;
t3[5] = e * j - i * f; */ t3[5] = e * j - i * f; */
t5 = glmm_fnmadd(x7, x5, t5); t5 = glmm_fnmadd(x7, x5, t5);
// x4 = _mm_movelh_ps(r0, r1);
x4 = _mm_movelh_ps(r0, r1); /* f e b a */ x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5); /* f e b a */
x5 = _mm_movehl_ps(r1, r0); /* h g d c */ x5 = _mm_movehl_ps(r1, r0); /* h g d c */
x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */ x0 = glmm_shuff1(x4, 0, 0, 0, 2); /* a a a e */