mirror of
https://github.com/recp/cglm.git
synced 2025-12-24 12:32:40 +00:00
neon: mat4_inv, reduce 1mul for two extra 2xor
This commit is contained in:
@@ -370,7 +370,9 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
|
||||
/* c3 * c10 + c4 * c9 + c1 * c8 + c2 * c7 */
|
||||
v0 = vextq_f32(t2, t1, 2); /* c8 c7 c10 c9 */
|
||||
v0 = vrev64q_f32(v0); /* c7 c8 c9 c10 */
|
||||
v0 = glmm_vdot(t0, v0);
|
||||
v0 = vmulq_f32(t0, v0);
|
||||
v0 = vpaddq_f32(v0, v0);
|
||||
v0 = vpaddq_f32(v0, v0);
|
||||
|
||||
/* c5 * c12 + c6 * c11 */
|
||||
l1 = vget_low_f32(t2);
|
||||
@@ -394,8 +396,11 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
|
||||
/* inv div */
|
||||
v1 = vdupq_n_f32(1.0f);
|
||||
v0 = glmm_div(v1, v0); /* inv div */
|
||||
|
||||
// v1 = glmm_xor(v0, s1); /* idt ndt idt ndt */
|
||||
// v2 = glmm_xor(v0, s2); /* ndt idt ndt idt */
|
||||
|
||||
/* multiply t0, t1, t2 to reduce 1mul below: 2 eor + 34mul vs 3mul + 4eor */
|
||||
/* [*] multiply t0, t1, t2 by the inverse determinant to save 1 mul below: 2 eor + 4 mul vs 3 mul + 4 eor */
|
||||
t0 = vmulq_f32(t0, v0);
|
||||
t1 = vmulq_f32(t1, v0);
|
||||
t2 = vmulq_f32(t2, v0);
|
||||
@@ -404,9 +409,6 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
|
||||
a2 = vzipq_f32(t1, t1); /* c6 c6 c5 c5, c8 c8 c7 c7 */
|
||||
a3 = vzipq_f32(t2, t2); /* c10 c10 c9 c9, c12 c12 c11 c11 */
|
||||
|
||||
// v1 = glmm_xor(v0, s1); /* idt ndt idt ndt */
|
||||
// v2 = glmm_xor(v0, s2); /* ndt idt ndt idt */
|
||||
|
||||
/* result */
|
||||
|
||||
/* dest[0][0] = (f * c1 - g * c5 + h * c9) * idt;
|
||||
@@ -444,6 +446,7 @@ glm_mat4_inv_neon_2(mat4 mat, mat4 dest) {
|
||||
r2 = glmm_fmadd(r8, a2.val[0], r2);
|
||||
r3 = glmm_fmadd(r6, a2.val[0], r3);
|
||||
|
||||
/* 4 xor may be faster than 4 mul, see above [*] */
|
||||
r0 = glmm_xor(r0, s1);
|
||||
r1 = glmm_xor(r1, s2);
|
||||
r2 = glmm_xor(r2, s1);
|
||||
|
||||
Reference in New Issue
Block a user