From a111693b6b42f71e410d3edc0e420fb9d2bab5cb Mon Sep 17 00:00:00 2001
From: Recep Aslantas
Date: Sat, 24 Apr 2021 15:45:36 +0300
Subject: [PATCH] arm, neon: implement mat4 determinant with neon

---
 include/cglm/mat4.h           |  2 +
 include/cglm/simd/arm.h       | 13 ++++++-
 include/cglm/simd/neon/mat4.h | 70 +++++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/include/cglm/mat4.h b/include/cglm/mat4.h
index c099574..697f510 100644
--- a/include/cglm/mat4.h
+++ b/include/cglm/mat4.h
@@ -562,6 +562,8 @@ float
 glm_mat4_det(mat4 mat) {
 #if defined( __SSE__ ) || defined( __SSE2__ )
   return glm_mat4_det_sse2(mat);
+#elif defined(CGLM_NEON_FP)
+  return glm_mat4_det_neon(mat);
 #else
   /* [square] det(A) = det(At) */
   float t[6];
diff --git a/include/cglm/simd/arm.h b/include/cglm/simd/arm.h
index 791f4cb..cd4f2b8 100644
--- a/include/cglm/simd/arm.h
+++ b/include/cglm/simd/arm.h
@@ -69,12 +69,23 @@ SWIZZLE(glmm_2103)
 { return vextq_f32(v, v, 3); }
 #undef SWIZZLE
 
+#define glmm_xor(a, b)                                                        \
+  vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a),                   \
+                                  vreinterpretq_s32_f32(b)))
+
 static inline
 float32x4_t
 glmm_abs(float32x4_t v) {
   return vabsq_f32(v);
 }
 
+static inline
+float32x4_t
+glmm_vhadd(float32x4_t v) {
+  v = vaddq_f32(v, vrev64q_f32(v));
+  return vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
+}
+
 static inline
 float
 glmm_hadd(float32x4_t v) {
@@ -138,7 +149,7 @@ glmm_norm_inf(float32x4_t a) {
 static inline
 float32x4_t
 glmm_div(float32x4_t a, float32x4_t b) {
-#if CGLM_ARM641
+#if CGLM_ARM64
   return vdivq_f32(a, b);
 #else
   /* 2 iterations of Newton-Raphson refinement of reciprocal */
diff --git a/include/cglm/simd/neon/mat4.h b/include/cglm/simd/neon/mat4.h
index 36d347b..ad32339 100644
--- a/include/cglm/simd/neon/mat4.h
+++ b/include/cglm/simd/neon/mat4.h
@@ -101,5 +101,75 @@ glm_mat4_mulv_neon(mat4 m, vec4 v, vec4 dest) {
   vst1q_f32(dest, l0);
 }
 
+CGLM_INLINE
+float
+glm_mat4_det_neon(mat4 mat) {
+  float32x4_t   r0, r1, r2, r3, x0, x1, x2;
+  float32x2_t   ij, op, mn, kl, nn, mm, jj, ii, gh, ef, t12, t34;
+  float32x4x2_t a1;
+  float32x4_t   x3 = { 0.f, -0.f, 0.f, -0.f };
+
+  /* 127 <- 0, [square] det(A) = det(At) */
+  r0 = glmm_load(mat[0]);              /* d c b a */
+  r1 = vrev64q_f32(glmm_load(mat[1])); /* g h e f */
+  r2 = vrev64q_f32(glmm_load(mat[2])); /* l k i j */
+  r3 = vrev64q_f32(glmm_load(mat[3])); /* o p m n */
+
+  gh = vget_high_f32(r1);
+  ef = vget_low_f32(r1);
+  kl = vget_high_f32(r2);
+  ij = vget_low_f32(r2);
+  op = vget_high_f32(r3);
+  mn = vget_low_f32(r3);
+  mm = vdup_lane_f32(mn, 1);
+  nn = vdup_lane_f32(mn, 0);
+  ii = vdup_lane_f32(ij, 1);
+  jj = vdup_lane_f32(ij, 0);
+
+  /*
+   t[1] = j * p - n * l;
+   t[2] = j * o - n * k;
+   t[3] = i * p - m * l;
+   t[4] = i * o - m * k;
+   */
+  x0 = glmm_fnmadd(vcombine_f32(kl, kl), vcombine_f32(nn, mm),
+                   vmulq_f32(vcombine_f32(op, op), vcombine_f32(jj, ii)));
+
+  t12 = vget_low_f32(x0);
+  t34 = vget_high_f32(x0);
+
+  /* 1 3 1 3 2 4 2 4 */
+  a1 = vuzpq_f32(x0, x0);
+
+  /*
+   t[0] = k * p - o * l;
+   t[0] = k * p - o * l;
+   t[5] = i * n - m * j;
+   t[5] = i * n - m * j;
+   */
+  x1 = glmm_fnmadd(vcombine_f32(vdup_lane_f32(kl, 0), jj),
+                   vcombine_f32(vdup_lane_f32(op, 1), mm),
+                   vmulq_f32(vcombine_f32(vdup_lane_f32(op, 0), nn),
+                             vcombine_f32(vdup_lane_f32(kl, 1), ii)));
+
+  /*
+     a * (f * t[0] - g * t[1] + h * t[2])
+   - b * (e * t[0] - g * t[3] + h * t[4])
+   + c * (e * t[1] - f * t[3] + h * t[5])
+   - d * (e * t[2] - f * t[4] + g * t[5])
+   */
+  x2 = glmm_fnmadd(vcombine_f32(vdup_lane_f32(gh, 1), vdup_lane_f32(ef, 0)),
+                   vcombine_f32(vget_low_f32(a1.val[0]), t34),
+                   vmulq_f32(vcombine_f32(ef, vdup_lane_f32(ef, 1)),
+                             vcombine_f32(vget_low_f32(x1), t12)));
+
+  x2 = glmm_fmadd(vcombine_f32(vdup_lane_f32(gh, 0), gh),
+                  vcombine_f32(vget_low_f32(a1.val[1]), vget_high_f32(x1)), x2);
+
+  x2 = glmm_xor(x2, x3);
+
+  return glmm_hadd(vmulq_f32(x2, r0));
+}
+
 #endif
 #endif /* cglm_mat4_neon_h */
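Note (reviewer aid, not part of the patch): the NEON routine computes the same cofactor
expansion as the scalar #else branch of glm_mat4_det, just spread across lanes; the
t[0]..t[5] comments in glm_mat4_det_neon spell out the shared 2x2 sub-determinants. Below
is a minimal scalar sketch of that expansion for cross-checking the SIMD path on an ARM
build; mat4_det_ref is a hypothetical helper name for this check, not cglm API.

#include <cglm/cglm.h>

/* hypothetical reference helper: scalar cofactor expansion matching the
   t[0]..t[5] comments in glm_mat4_det_neon */
static float
mat4_det_ref(mat4 mat) {
  float t[6];
  float a = mat[0][0], b = mat[0][1], c = mat[0][2], d = mat[0][3],
        e = mat[1][0], f = mat[1][1], g = mat[1][2], h = mat[1][3],
        i = mat[2][0], j = mat[2][1], k = mat[2][2], l = mat[2][3],
        m = mat[3][0], n = mat[3][1], o = mat[3][2], p = mat[3][3];

  /* 2x2 sub-determinants shared by the four cofactors */
  t[0] = k * p - o * l;
  t[1] = j * p - n * l;
  t[2] = j * o - n * k;
  t[3] = i * p - m * l;
  t[4] = i * o - m * k;
  t[5] = i * n - m * j;

  /* expansion along the first row of the transpose */
  return a * (f * t[0] - g * t[1] + h * t[2])
       - b * (e * t[0] - g * t[3] + h * t[4])
       + c * (e * t[1] - f * t[3] + h * t[5])
       - d * (e * t[2] - f * t[4] + g * t[5]);
}

Since det(A) = det(At), reading the column-major mat4 as if it were row-major
(a = mat[0][0] ... p = mat[3][3]) yields the same determinant, which is why both the
scalar branch and the NEON code skip the transpose. As a quick sanity check, glm_mat4_det
on GLM_MAT4_IDENTITY_INIT should return 1.0f and agree with mat4_det_ref.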