neon: mat4_inv, reduce 1mul for two extra 2xor

This commit is contained in:
Recep Aslantas
2024-04-03 00:03:55 +03:00
parent c528ca1095
commit 5b772d0eb4

View File

@@ -370,7 +370,9 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
/* c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */
v0 = vextq_f32(t2, t1, 2); /* c8 c7 c10 c9 */
v0 = vrev64q_f32(v0); /* c7 c8 c9 c10 */
v0 = glmm_vdot(t0, v0);
v0 = vmulq_f32(t0, v0);
v0 = vpaddq_f32(v0, v0);
v0 = vpaddq_f32(v0, v0);
/* c5 * c12 + c6 * c11 */
l1 = vget_low_f32(t2);
@@ -394,8 +396,11 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
/* inv div */
v1 = vdupq_n_f32(1.0f);
v0 = glmm_div(v1, v0); /* inv div */
// v1 = glmm_xor(v0, s1); /* idt ndt idt ndt */
// v2 = glmm_xor(v0, s2); /* ndt idt ndt idt */
/* multiply t0, t1, t2 to reduce 1mul below: 2 eor + 34mul vs 3mul + 4eor */
/* [*] multiply t0, t1, t2 to reduce 1mul below: 2 eor + 34mul vs 3mul + 4eor */
t0 = vmulq_f32(t0, v0);
t1 = vmulq_f32(t1, v0);
t2 = vmulq_f32(t2, v0);
@@ -404,9 +409,6 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
a2 = vzipq_f32(t1, t1); /* c6 c6 c5 c5, c8 c8 c7 c7 */
a3 = vzipq_f32(t2, t2); /* c10 c10 c9 c9, c12 c12 c11 c11 */
// v1 = glmm_xor(v0, s1); /* idt ndt idt ndt */
// v2 = glmm_xor(v0, s2); /* ndt idt ndt idt */
/* result */
/* dest[0][0] = (f * c1 - g * c5 + h * c9) * idt;
@@ -444,6 +446,7 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
r2 = glmm_fmadd(r8, a2.val[0], r2);
r3 = glmm_fmadd(r6, a2.val[0], r3);
/* 4 xor may be faster than 4mul, see above [**] */
r0 = glmm_xor(r0, s1);
r1 = glmm_xor(r1, s2);
r2 = glmm_xor(r2, s1);