neon: mat4_inv, save 1 mul at the cost of 2 extra xor

2025-12-24 12:32:40 +00:00 · 2024-04-03 00:03:55 +03:00
parent c528ca1095
commit 5b772d0eb4
1 changed files with 8 additions and 5 deletions
--- a/include/cglm/simd/neon/mat4.h
+++ b/include/cglm/simd/neon/mat4.h
@@ -370,7 +370,9 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
  /* c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */
  v0 = vextq_f32(t2, t1, 2); /* c8 c7 c10 c9 */
  v0 = vrev64q_f32(v0);      /* c7 c8 c9 c10 */
-  v0 = glmm_vdot(t0, v0);
+  v0 = vmulq_f32(t0, v0);
+  v0 = vpaddq_f32(v0, v0);
+  v0 = vpaddq_f32(v0, v0);

  /* c5 * c12 + c6 * c11 */
  l1 = vget_low_f32(t2);
@@ -394,8 +396,11 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
  /* inv div */
  v1 = vdupq_n_f32(1.0f);
  v0 = glmm_div(v1, v0);     /* inv div */
+  
+  //  v1 = glmm_xor(v0, s1);     /* idt ndt idt ndt */
+  //  v2 = glmm_xor(v0, s2);     /* ndt idt ndt idt */

-  /* multiply t0, t1, t2 to reduce 1mul below: 2 eor + 34mul vs 3mul + 4eor */
+  /* [*] multiply t0, t1, t2 to reduce 1mul below: 2 eor + 4mul vs 3mul + 4eor */
  t0 = vmulq_f32(t0, v0);
  t1 = vmulq_f32(t1, v0);
  t2 = vmulq_f32(t2, v0);
@@ -404,9 +409,6 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
  a2 = vzipq_f32(t1, t1);    /* c6  c6  c5 c5, c8  c8  c7  c7  */
  a3 = vzipq_f32(t2, t2);    /* c10 c10 c9 c9, c12 c12 c11 c11 */

-//  v1 = glmm_xor(v0, s1);     /* idt ndt idt ndt */
-//  v2 = glmm_xor(v0, s2);     /* ndt idt ndt idt */
-
  /* result */

  /* dest[0][0] = (f * c1  - g * c5  + h * c9)  * idt;
@@ -444,6 +446,7 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
  r2 = glmm_fmadd(r8, a2.val[0], r2);
  r3 = glmm_fmadd(r6, a2.val[0], r3);

+  /* 4 xor may be faster than 4 mul, see above [*] */
  r0 = glmm_xor(r0, s1);
  r1 = glmm_xor(r1, s2);
  r2 = glmm_xor(r2, s1);