simd128: inline _mm_unpackhi_ps and _mm_unpacklo_ps

2025-12-25 04:44:58 +00:00 · 2023-03-26 19:22:24 +08:00
parent 84b482971d
commit 998d9626a2
3 changed files with 10 additions and 20 deletions
--- a/include/cglm/simd/wasm.h
+++ b/include/cglm/simd/wasm.h
@@ -22,18 +22,6 @@

 #define _mm_cvtss_f32(v) wasm_f32x4_extract_lane(v, 0)

-static inline glmm_128 __attribute__((__always_inline__, __nodebug__))
-_mm_unpackhi_ps(glmm_128 __a, glmm_128 __b)
-{
-  return wasm_i32x4_shuffle(__a, __b, 2, 6, 3, 7);
-}
-
-static inline glmm_128 __attribute__((__always_inline__, __nodebug__))
-_mm_unpacklo_ps(glmm_128 __a, glmm_128 __b)
-{
-  return wasm_i32x4_shuffle(__a, __b, 0, 4, 1, 5);
-}
-
 static inline glmm_128 __attribute__((__always_inline__, __nodebug__))
 _mm_movehl_ps(glmm_128 __a, glmm_128 __b)
 {
@@ -79,10 +67,10 @@ _mm_store_ss(float *__p, glmm_128 __a)
    glmm_128 __row1 = (row1); \
    glmm_128 __row2 = (row2); \
    glmm_128 __row3 = (row3); \
-    glmm_128 __tmp0 = _mm_unpacklo_ps(__row0, __row1); \
-    glmm_128 __tmp1 = _mm_unpackhi_ps(__row0, __row1); \
-    glmm_128 __tmp2 = _mm_unpacklo_ps(__row2, __row3); \
-    glmm_128 __tmp3 = _mm_unpackhi_ps(__row2, __row3); \
+    glmm_128 __tmp0 = wasm_i32x4_shuffle(__row0, __row1, 0, 4, 1, 5); \
+    glmm_128 __tmp1 = wasm_i32x4_shuffle(__row0, __row1, 2, 6, 3, 7); \
+    glmm_128 __tmp2 = wasm_i32x4_shuffle(__row2, __row3, 0, 4, 1, 5); \
+    glmm_128 __tmp3 = wasm_i32x4_shuffle(__row2, __row3, 2, 6, 3, 7); \
    (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
    (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
    (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
--- a/include/cglm/simd/wasm/mat3.h
+++ b/include/cglm/simd/wasm/mat3.h
@@ -42,8 +42,10 @@ glm_mat3_mul_wasm(mat3 m1, mat3 m2, mat3 dest) {
  x6 = glmm_shuff1(x3, 2, 0, 0, 0);                     /* b11 b01 b01 b01 */
  x2 = glmm_shuff1(r1, 3, 3, 0, 0);                     /* b21 b21 b11 b11 */

-  x8 = _mm_unpackhi_ps(x8, x4);                         /* a10 a00 a12 a02 */
-  x9 = _mm_unpackhi_ps(x7, x2);                         /* b21 b20 b21 b20 */
+  // x8 = _mm_unpackhi_ps(x8, x4);
+  // x9 = _mm_unpackhi_ps(x7, x2);
+  x8 = wasm_i32x4_shuffle(x8, x4, 2, 6, 3, 7);          /* a10 a00 a12 a02 */
+  x9 = wasm_i32x4_shuffle(x7, x2, 2, 6, 3, 7);          /* b21 b20 b21 b20 */

  x0 = glmm_fmadd(x4, x6, x0);
  x1 = glmm_fmadd(x5, x2, x1);
--- a/include/cglm/simd/wasm/quat.h
+++ b/include/cglm/simd/wasm/quat.h
@@ -28,8 +28,8 @@ glm_quat_mul_wasm(versor p, versor q, versor dest) {
  xq = glmm_load(q);
  x1 = wasm_f32x4_const(0.f, -0.f, 0.f, -0.f); /* TODO: _mm_set1_ss() + shuff ? */
  r  = wasm_f32x4_mul(glmm_splat_w(xp), xq);
-  
-  x2 = _mm_unpackhi_ps(x1, x1);
+  // x2 = _mm_unpackhi_ps(x1, x1);
+  x2 = wasm_i32x4_shuffle(x1, x1, 2, 6, 3, 7);
  x3 = glmm_shuff1(x1, 3, 2, 0, 1);
  x  = glmm_splat_x(xp);
  y  = glmm_splat_y(xp);