diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h
index b3f07fe..e6b4f8f 100644
--- a/include/cglm/simd/neon/mat4.h
+++ b/include/cglm/simd/neon/mat4.h
@@ -370,7 +370,9 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
   /* c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */
   v0 = vextq_f32(t2, t1, 2);     /* c8 c7 c10 c9 */
   v0 = vrev64q_f32(v0);          /* c7 c8 c9 c10 */
-  v0 = glmm_vdot(t0, v0);
+  v0 = vmulq_f32(t0, v0);
+  v0 = vpaddq_f32(v0, v0);
+  v0 = vpaddq_f32(v0, v0);
 
   /* c5 * c12 + c6 * c11 */
   l1 = vget_low_f32(t2);
@@ -394,8 +396,11 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
   /* inv div */
   v1 = vdupq_n_f32(1.0f);
   v0 = glmm_div(v1, v0);         /* inv div */
+
+  // v1 = glmm_xor(v0, s1);      /* idt ndt idt ndt */
+  // v2 = glmm_xor(v0, s2);      /* ndt idt ndt idt */
 
-  /* multiply t0, t1, t2 to reduce 1mul below: 2 eor + 34mul vs 3mul + 4eor */
+  /* [*] multiply t0, t1, t2 to reduce 1mul below: 2 eor + 34mul vs 3mul + 4eor */
   t0 = vmulq_f32(t0, v0);
   t1 = vmulq_f32(t1, v0);
   t2 = vmulq_f32(t2, v0);
@@ -404,9 +409,6 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
   a2 = vzipq_f32(t1, t1);        /* c6 c6 c5 c5, c8 c8 c7 c7 */
   a3 = vzipq_f32(t2, t2);        /* c10 c10 c9 c9, c12 c12 c11 c11 */
 
-// v1 = glmm_xor(v0, s1);        /* idt ndt idt ndt */
-// v2 = glmm_xor(v0, s2);        /* ndt idt ndt idt */
-
   /* result */
   /*
   dest[0][0] = (f * c1 - g * c5 + h * c9) * idt;
@@ -444,6 +446,7 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
   r2 = glmm_fmadd(r8, a2.val[0], r2);
   r3 = glmm_fmadd(r6, a2.val[0], r3);
 
+  /* 4 xor may be faster than 4 mul, see above [*] */
   r0 = glmm_xor(r0, s1);
   r1 = glmm_xor(r1, s2);
   r2 = glmm_xor(r2, s1);
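
Note (not part of the patch): a minimal standalone sketch of what the vmulq_f32 + double vpaddq_f32 sequence that replaces glmm_vdot computes. The lane-wise products are reduced by two pairwise adds, which leaves the full dot product broadcast to every lane. vpaddq_f32 is an A64 intrinsic, so this sketch assumes an AArch64 target; the array values are illustrative only.

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
      float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      float b[4] = {5.0f, 6.0f, 7.0f, 8.0f};

      float32x4_t t0 = vld1q_f32(a);
      float32x4_t v0 = vld1q_f32(b);

      v0 = vmulq_f32(t0, v0);  /* lane-wise products: p0 p1 p2 p3 */
      v0 = vpaddq_f32(v0, v0); /* pairwise add: p0+p1 p2+p3 p0+p1 p2+p3 */
      v0 = vpaddq_f32(v0, v0); /* pairwise add again: dot product in all four lanes */

      /* prints 70.000000 = 1*5 + 2*6 + 3*7 + 4*8 */
      printf("%f\n", vgetq_lane_f32(v0, 0));
      return 0;
    }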