From e40b477929be138f75cdcd4db43f2987fbe9aea5 Mon Sep 17 00:00:00 2001
From: myfreeer <myfreeer@users.noreply.github.com>
Date: Sat, 1 Apr 2023 19:19:49 +0800
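Subject: [PATCH] simd128: inline _mm_movelh_ps

Remove the _mm_movelh_ps() helper from include/cglm/simd/wasm.h and
replace each call in the wasm affine, mat2, mat3 and mat4 kernels with
the equivalent wasm_i32x4_shuffle(a, b, 0, 1, 4, 5). The original
SSE-style call is kept as a comment above each replacement for
reference.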

---
 include/cglm/simd/wasm.h        |  6 ------
 include/cglm/simd/wasm/affine.h |  6 ++++--
 include/cglm/simd/wasm/mat2.h   |  3 ++-
 include/cglm/simd/wasm/mat3.h   | 10 ++++++----
 include/cglm/simd/wasm/mat4.h   | 20 ++++++++++++--------
 5 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/include/cglm/simd/wasm.h b/include/cglm/simd/wasm.h
index 82bc4ae..ba621dd 100644
--- a/include/cglm/simd/wasm.h
+++ b/include/cglm/simd/wasm.h
@@ -28,12 +28,6 @@ _mm_movehl_ps(glmm_128 __a, glmm_128 __b)
   return wasm_i32x4_shuffle(__a, __b, 6, 7, 2, 3);
 }
 
-static inline glmm_128 __attribute__((__always_inline__, __nodebug__))
-_mm_movelh_ps(glmm_128 __a, glmm_128 __b)
-{
-  return wasm_i32x4_shuffle(__a, __b, 0, 1, 4, 5);
-}
-
 static inline
 glmm_128
 glmm_abs(glmm_128 x) {
diff --git a/include/cglm/simd/wasm/affine.h b/include/cglm/simd/wasm/affine.h
index 4fa2c44..9471592 100644
--- a/include/cglm/simd/wasm/affine.h
+++ b/include/cglm/simd/wasm/affine.h
@@ -98,9 +98,11 @@ glm_inv_tr_wasm(mat4 mat) {
   x3 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7);
   x4 = wasm_i32x4_shuffle(r2, x1, 0, 4, 1, 5);
   x5 = wasm_i32x4_shuffle(r2, x1, 2, 6, 3, 7);
-  r0 = _mm_movelh_ps(x2, x4);
+  // r0 = _mm_movelh_ps(x2, x4);
+  r0 = wasm_i32x4_shuffle(x2, x4, 0, 1, 4, 5);
   r1 = _mm_movehl_ps(x4, x2);
-  r2 = _mm_movelh_ps(x3, x5);
+  // r2 = _mm_movelh_ps(x3, x5);
+  r2 = wasm_i32x4_shuffle(x3, x5, 0, 1, 4, 5);
   x1 = _mm_movehl_ps(x5, x3);
 
   x2 = glmm_shuff1(r3, 0, 0, 0, 0);
diff --git a/include/cglm/simd/wasm/mat2.h b/include/cglm/simd/wasm/mat2.h
index 42dc1f7..6c3f5fb 100644
--- a/include/cglm/simd/wasm/mat2.h
+++ b/include/cglm/simd/wasm/mat2.h
@@ -22,7 +22,8 @@ glm_mat2_mul_wasm(mat2 m1, mat2 m2, mat2 dest) {
 
   x3 = glmm_shuff1(x2, 2, 2, 0, 0);
   x4 = glmm_shuff1(x2, 3, 3, 1, 1);
-  x0 = _mm_movelh_ps(x1, x1);
+  // x0 = _mm_movelh_ps(x1, x1);
+  x0 = wasm_i32x4_shuffle(x1, x1, 0, 1, 4, 5);
   x2 = _mm_movehl_ps(x1, x1);
 
   /*
diff --git a/include/cglm/simd/wasm/mat3.h b/include/cglm/simd/wasm/mat3.h
index 25b911c..62d179c 100644
--- a/include/cglm/simd/wasm/mat3.h
+++ b/include/cglm/simd/wasm/mat3.h
@@ -25,8 +25,8 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) {
 
   x8 = glmm_shuff1(l0, 0, 2, 1, 0);                     /* a00 a02 a01 a00 */
   x1 = glmm_shuff1(r0, 3, 0, 0, 0);                     /* b10 b00 b00 b00 */
-  x2 = wasm_i32x4_shuffle(l0, l1, 3, 3, 4, 5); /* a12 a11 a10 a10 */
-  x3 = wasm_i32x4_shuffle(r0, r1, 1, 3, 4, 6); /* b20 b11 b10 b01 */
+  x2 = wasm_i32x4_shuffle(l0, l1, 3, 3, 4, 5);          /* a12 a11 a10 a10 */
+  x3 = wasm_i32x4_shuffle(r0, r1, 1, 3, 4, 6);          /* b20 b11 b10 b01 */
   x0 = wasm_f32x4_mul(x8, x1);
 
   x6 = glmm_shuff1(l0, 1, 0, 2, 1);                     /* a01 a00 a02 a01 */
@@ -65,8 +65,10 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) {
                                a12 * b21 +
                                a22 * b22 +
                                0   * 00                                    */
-  x2 = _mm_movelh_ps(x8, l2);                           /* 0.f a22 a12 a02 */
-  x3 = _mm_movelh_ps(x9, r2);                           /* 0.f b22 b21 b20 */
+  // x2 = _mm_movelh_ps(x8, l2);
+  // x3 = _mm_movelh_ps(x9, r2);
+  x2 = wasm_i32x4_shuffle(x8, l2, 0, 1, 4, 5);          /* 0.f a22 a12 a02 */
+  x3 = wasm_i32x4_shuffle(x9, r2, 0, 1, 4, 5);          /* 0.f b22 b21 b20 */
   x2 = glmm_vdots(x2, x3);
 
   // _mm_storeu_ps(&dest[0][0], x0);
diff --git a/include/cglm/simd/wasm/mat4.h b/include/cglm/simd/wasm/mat4.h
index b90daee..4b3cde5 100644
--- a/include/cglm/simd/wasm/mat4.h
+++ b/include/cglm/simd/wasm/mat4.h
@@ -41,9 +41,11 @@ glm_mat4_transp_wasm(mat4 m, mat4 dest) {
   tmp1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7);
   tmp2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5);
   tmp3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7);
-  r0 = _mm_movelh_ps(tmp0, tmp2);
+  // r0 = _mm_movelh_ps(tmp0, tmp2);
+  r0 = wasm_i32x4_shuffle(tmp0, tmp2, 0, 1, 4, 5);
   r1 = _mm_movehl_ps(tmp2, tmp0);
-  r2 = _mm_movelh_ps(tmp1, tmp3);
+  // r2 = _mm_movelh_ps(tmp1, tmp3);
+  r2 = wasm_i32x4_shuffle(tmp1, tmp3, 0, 1, 4, 5);
   r3 = _mm_movehl_ps(tmp3, tmp1);
 
   glmm_store(dest[0], r0);
@@ -184,7 +186,8 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) {
   r3 = glmm_load(mat[3]); /* p o n m */
   
   x0 = _mm_movehl_ps(r3, r2);                            /* p o l k */
-  x3 = _mm_movelh_ps(r2, r3);                            /* n m j i */
+  // x3 = _mm_movelh_ps(r2, r3);
+  x3 = wasm_i32x4_shuffle(r2, r3, 0, 1, 4, 5);           /* n m j i */
   x1 = glmm_shuff1(x0, 1, 3, 3 ,3);                      /* l p p p */
   x2 = glmm_shuff1(x0, 0, 2, 2, 2);                      /* k o o o */
   x4 = glmm_shuff1(x3, 1, 3, 3, 3);                      /* j n n n */
@@ -237,8 +240,8 @@ glm_mat4_inv_fast_wasm(mat4 mat, mat4 dest) {
      t2[5] = e * n - m * f;
      t3[5] = e * j - i * f; */
   t5 = glmm_fnmadd(x7, x5, t5);
-  
-  x4 = _mm_movelh_ps(r0, r1);        /* f e b a */
+  // x4 = _mm_movelh_ps(r0, r1);
+  x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5);        /* f e b a */
   x5 = _mm_movehl_ps(r1, r0);        /* h g d c */
   
   x0 = glmm_shuff1(x4, 0, 0, 0, 2);  /* a a a e */
@@ -322,7 +325,8 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) {
   r3 = glmm_load(mat[3]); /* p o n m */
   
   x0 = _mm_movehl_ps(r3, r2);                            /* p o l k */
-  x3 = _mm_movelh_ps(r2, r3);                            /* n m j i */
+  // x3 = _mm_movelh_ps(r2, r3);
+  x3 = wasm_i32x4_shuffle(r2, r3, 0, 1, 4, 5);           /* n m j i */
   x1 = glmm_shuff1(x0, 1, 3, 3 ,3);                      /* l p p p */
   x2 = glmm_shuff1(x0, 0, 2, 2, 2);                      /* k o o o */
   x4 = glmm_shuff1(x3, 1, 3, 3, 3);                      /* j n n n */
@@ -375,8 +379,8 @@ glm_mat4_inv_wasm(mat4 mat, mat4 dest) {
      t2[5] = e * n - m * f;
      t3[5] = e * j - i * f; */
   t5 = glmm_fnmadd(x7, x5, t5);
-  
-  x4 = _mm_movelh_ps(r0, r1);        /* f e b a */
+  // x4 = _mm_movelh_ps(r0, r1);
+  x4 = wasm_i32x4_shuffle(r0, r1, 0, 1, 4, 5);        /* f e b a */
   x5 = _mm_movehl_ps(r1, r0);        /* h g d c */
   
   x0 = glmm_shuff1(x4, 0, 0, 0, 2);  /* a a a e */