diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h
index 9712a71..a15f1b2 100644
--- a/include/cglm/simd/neon/mat4.h
+++ b/include/cglm/simd/neon/mat4.h
@@ -319,13 +319,16 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) {
 CGLM_INLINE
 void
 glm_mat4_inv_neon(mat4 mat, mat4 dest) {
-  float32x4_t   r0, r1, r2, r3, r5, r6, r7, r8,
-                v0, v1, v2, v3,
+  float32x4_t   r0, r1, r2, r3,
+                v0, v1, v2, v3, v4, v5,
                 t0, t1, t2;
-  float32x4x2_t a1, a2, a3, a4, a5, a6;
-  float32x2_t   l0, l1;
+  float32x4x2_t a0, a1, a2, a3, a4;
   float32x4_t   s1 = glmm_float32x4_SIGNMASK_PNPN, s2;
+#if !CGLM_ARM64
+  float32x2_t   l0, l1;
+#endif
+
 
   s2 = vrev64q_f32(s1);
 
   /* 127 <- 0 */
@@ -334,83 +337,83 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) {
   r2 = glmm_load(mat[2]); /* l k j i */
   r3 = glmm_load(mat[3]); /* p o n m */
 
-  a1 = vzipq_f32(r2, r0); /* d l c k, b j a i */
-  a2 = vzipq_f32(r3, r1); /* h p g o, f n e m */
-  a3 = vtrnq_f32(r3, r1); /* h p f n, g o e m */
-  a4 = vtrnq_f32(r2, r0); /* d l b j, c k a i */
+  a1 = vzipq_f32(r0, r2); /* l d k c, j b i a */
+  a2 = vzipq_f32(r1, r3); /* p h o g, n f m e */
+  a3 = vzipq_f32(a2.val[0], a1.val[0]); /* j n b f, i m a e */
+  a4 = vzipq_f32(a2.val[1], a1.val[1]); /* l p d h, k o c g */
 
-  a5 = vzipq_f32(a3.val[0], a4.val[0]); /* c g k o, a e i m */
-  a6 = vzipq_f32(a3.val[1], a4.val[1]); /* d h l p, b f j n */
+  v0 = vextq_f32(a1.val[0], a1.val[1], 2); /* k c j b */
+  v1 = vextq_f32(a2.val[0], a2.val[1], 2); /* o g n f */
+  v2 = vextq_f32(a1.val[1], a2.val[0], 2); /* m e l d */
+  v3 = vextq_f32(a2.val[1], a1.val[0], 2); /* i a p h */
+  v4 = vextq_f32(v1, v2, 2);               /* l d o g */
+  v5 = vextq_f32(v0, v3, 2);               /* p h k c */
 
-  r5 = vextq_f32(a5.val[0], a5.val[0], 2); /* i m a e */
-  r6 = vextq_f32(a5.val[1], a5.val[1], 2); /* k o c g */
-
-  r7 = vextq_f32(a6.val[0], a6.val[0], 2); /* j n b f */
-  r8 = vextq_f32(a6.val[1], a6.val[1], 2); /* l p d h */
-
-  l0 = vget_high_f32(a2.val[1]); /* h p */
-  l1 = vget_high_f32(a1.val[1]); /* d l */
-
-  v0 = vcombine_f32(l0, l0); /* h p h p */
-  v2 = vcombine_f32(l1, l1); /* d l d l */
-  v1 = vextq_f32(a3.val[0], a2.val[1], 2); /* g o g o */
-  v3 = vextq_f32(a4.val[0], a1.val[1], 2); /* c k c k */
-
-  t0 = vmulq_f32(a4.val[0], v0);
+  /* c2  = c * h - g * d   c12 = a * g - c * e   c8  = a * f - b * e
+     c1  = k * p - o * l   c11 = i * o - k * m   c7  = i * n - j * m
+     c4  = h * a - d * e   c6  = b * h - d * f   c10 = b * g - c * f
+     c3  = p * i - l * m   c5  = j * p - l * n   c9  = j * o - k * n */
+  t0 = vmulq_f32(v5, v3);
+  t1 = vmulq_f32(a1.val[0], a2.val[1]);
   t2 = vmulq_f32(a1.val[0], v1);
-  t1 = vmulq_f32(a1.val[0], a3.val[1]);
-
-  /* c3  = i * p - l * m   c7  = i * n - j * m   c11 = i * o - k * m
-     c4  = a * h - d * e   c8  = a * f - b * e   c12 = a * g - c * e
-     c1  = k * p - l * o   c5  = j * p - l * n   c9  = j * o - k * n
-     c2  = c * h - d * g   c6  = b * h - d * f   c10 = b * g - c * f */
-  t0 = glmm_fnmadd(v2, a3.val[0], t0);
-  t1 = glmm_fnmadd(a4.val[1], a2.val[0], t1);
-  t2 = glmm_fnmadd(v3, a2.val[0], t2);
+
+  t0 = glmm_fnmadd(v4, v2, t0);
+  t1 = glmm_fnmadd(a1.val[1], a2.val[0], t1);
+  t2 = glmm_fnmadd(v0, a2.val[0], t2);
+
+  t0 = vrev64q_f32(t0);
+  t1 = vrev64q_f32(t1);
+  t2 = vrev64q_f32(t2);
 
   /* det */
+  v0 = vrev64q_f32(t2);
+  v1 = vextq_f32(t1, t1, 2);
+  v0 = vmulq_f32(t0, v0);
+  v1 = vrev64q_f32(v1);
+  v1 = vmulq_f32(v1, t1);
+
   /* c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */
-  v0 = vextq_f32(t2, t1, 2); /* c8 c7 c10 c9 */
-  v0 = vrev64q_f32(v0);      /* c7 c8 c9 c10 */
-  v0 = vmulq_f32(t0, v0);
+#if CGLM_ARM64
   v0 = vpaddq_f32(v0, v0);
   v0 = vpaddq_f32(v0, v0);
+#else
+  l0 = vget_low_f32(v0);
+  l1 = vget_high_f32(v0);
+
+  l0 = vpadd_f32(l0, l0); /* [a+b, a+b] */
+  l1 = vpadd_f32(l1, l1); /* [c+d, c+d] */
+  l0 = vadd_f32(l0, l1);  /* [sum, sum] */
+
+  v0 = vcombine_f32(l0, l0);
+#endif
 
   /* c5 * c12 + c6 * c11 */
-  l1 = vget_low_f32(t2);
-  l0 = vget_high_f32(t1);
-  l1 = vrev64_f32(l1);
+#if CGLM_ARM64
+  v1 = vpaddq_f32(v1, v1);
+#else
+  l0 = vget_low_f32(v1);
+  l1 = vget_high_f32(v1);
 
-  l0 = vmul_f32(l0, l1);
-  l0 = vpadd_f32(l0, l0);
-  v1 = vdupq_lane_f32(l0, 0);
+  l0 = vpadd_f32(l0, l0); /* [a+b, a+b] */
+  l1 = vpadd_f32(l1, l1); /* [c+d, c+d] */
 
-//  v1 = vextq_f32(t1, t2, 2); /* c12 c11 c6 c5 */
-//  v2 = vrev64q_f32(v1);      /* c11 c12 c5 c6 */
-//  v2 = vextq_f32(v2, v2, 2); /* c5 c6 c11 c12 */
-//  v1 = vmulq_f32(v1, v2);
-//  v1 = vpaddq_f32(v1, v1);
-//  /* v2 = vrev64q_f32(v1);
-//     v1 = vaddq_f32(v1, v2); */
+  v1 = vcombine_f32(l0, l1);
+#endif
 
   v0 = vsubq_f32(v0, v1); /* det */
 
   /* inv div */
   v1 = vdupq_n_f32(1.0f);
   v0 = glmm_div(v1, v0); /* inv div */
-
-  // v1 = glmm_xor(v0, s1); /* idt ndt idt ndt */
-  // v2 = glmm_xor(v0, s2); /* ndt idt ndt idt */
 
-  /* [*] multiply t0, t1, t2 to reduce 1mul below: 2 eor + 34mul vs 3mul + 4eor */
+  /* multiply t0, t1, t2 by idt to save 1 mul below: 2 eor + 4 mul vs 3 mul + 4 eor */
   t0 = vmulq_f32(t0, v0);
   t1 = vmulq_f32(t1, v0);
   t2 = vmulq_f32(t2, v0);
 
-  a1 = vzipq_f32(t0, t0); /* c2 c2 c1 c1, c4 c4 c3 c3 */
-  a2 = vzipq_f32(t1, t1); /* c6 c6 c5 c5, c8 c8 c7 c7 */
-  a3 = vzipq_f32(t2, t2); /* c10 c10 c9 c9, c12 c12 c11 c11 */
+  a0 = vzipq_f32(t0, t0); /* c4 c4 c3 c3, c2 c2 c1 c1 */
+  a1 = vzipq_f32(t1, t1); /* c6 c6 c5 c5, c12 c12 c11 c11 */
+  a2 = vzipq_f32(t2, t2); /* c10 c10 c9 c9, c8 c8 c7 c7 */
 
   /* result */
@@ -434,32 +437,27 @@ glm_mat4_inv_neon(mat4 mat, mat4 dest) {
      dest[3][2] = (m * c10 - n * c12 + o * c8) * ndt;
      dest[3][3] = (i * c10 - j * c12 + k * c8) * idt; */
 
-  r0 = vmulq_f32(r7, a1.val[1]);
-  r1 = vmulq_f32(r5, a1.val[1]);
-  r2 = vmulq_f32(r5, a2.val[1]);
-  r3 = vmulq_f32(r5, a3.val[1]);
+  r0 = vmulq_f32(a3.val[1], a0.val[0]);
+  r1 = vmulq_f32(a3.val[0], a0.val[0]);
+  r2 = vmulq_f32(a3.val[0], a1.val[1]);
+  r3 = vmulq_f32(a3.val[0], a2.val[1]);
 
-  r0 = glmm_fnmadd(r6, a2.val[1], r0);
-  r1 = glmm_fnmadd(r6, a1.val[0], r1);
-  r2 = glmm_fnmadd(r7, a1.val[0], r2);
-  r3 = glmm_fnmadd(r7, a3.val[0], r3);
+  r0 = glmm_fnmadd(a4.val[0], a1.val[1], r0);
+  r1 = glmm_fnmadd(a4.val[0], a0.val[1], r1);
+  r2 = glmm_fnmadd(a3.val[1], a0.val[1], r2);
+  r3 = glmm_fnmadd(a3.val[1], a1.val[0], r3);
 
-  r0 = glmm_fmadd(r8, a3.val[1], r0);
-  r1 = glmm_fmadd(r8, a3.val[0], r1);
-  r2 = glmm_fmadd(r8, a2.val[0], r2);
-  r3 = glmm_fmadd(r6, a2.val[0], r3);
+  r0 = glmm_fmadd(a4.val[1], a2.val[1], r0);
+  r1 = glmm_fmadd(a4.val[1], a1.val[0], r1);
+  r2 = glmm_fmadd(a4.val[1], a2.val[0], r2);
+  r3 = glmm_fmadd(a4.val[0], a2.val[0], r3);
 
-  /* 4 xor may be fastart then 4mul, see aboe [**] */
+  /* 4 xor may be faster than 4 mul, see above */
   r0 = glmm_xor(r0, s1);
  r1 = glmm_xor(r1, s2);
   r2 = glmm_xor(r2, s1);
   r3 = glmm_xor(r3, s2);
 
-//  r0 = vmulq_f32(r0, v1);
-//  r1 = vmulq_f32(r1, v2);
-//  r2 = vmulq_f32(r2, v1);
-//  r3 = vmulq_f32(r3, v2);
-
   glmm_store(dest[0], r0);
   glmm_store(dest[1], r1);
   glmm_store(dest[2], r2);
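
Reviewer note (not part of the patch): the #else branches above exist because
vpaddq_f32 is an AArch64-only intrinsic; on 32-bit ARM the horizontal add is
emulated with 64-bit vpadd_f32/vadd_f32 pairs. A quick way to sanity-check the
new shuffle and cofactor layout on either target is a round-trip test against
the public API, which dispatches to glm_mat4_inv_neon when cglm is built with
NEON enabled. The sketch below is illustrative only; the file name, test
matrix, and tolerance are arbitrary choices, not part of this change.

/* test_mat4_inv_neon.c -- reviewer-side sanity check (hypothetical file).
   build e.g.: cc -O2 test_mat4_inv_neon.c -Iinclude -lm (flags illustrative) */
#include <cglm/cglm.h>
#include <assert.h>
#include <math.h>

int main(void) {
  mat4 m, inv, prod;

  /* an invertible matrix: 45-degree rotation about Y, then a translation */
  glm_rotate_make(m, glm_rad(45.0f), (vec3){0.0f, 1.0f, 0.0f});
  glm_translate(m, (vec3){1.0f, 2.0f, 3.0f});

  glm_mat4_inv(m, inv);       /* NEON path on ARM builds */
  glm_mat4_mul(m, inv, prod); /* m * inv(m) should be ~identity */

  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 4; j++)
      assert(fabsf(prod[i][j] - (i == j ? 1.0f : 0.0f)) < 1e-5f);

  return 0;
}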