mirror of
https://github.com/recp/cglm.git
synced 2025-10-04 01:00:46 +00:00
arm, neon: implement mat4 determinant with neon
This commit is contained in:
@@ -562,6 +562,8 @@ float
|
|||||||
glm_mat4_det(mat4 mat) {
|
glm_mat4_det(mat4 mat) {
|
||||||
#if defined( __SSE__ ) || defined( __SSE2__ )
|
#if defined( __SSE__ ) || defined( __SSE2__ )
|
||||||
return glm_mat4_det_sse2(mat);
|
return glm_mat4_det_sse2(mat);
|
||||||
|
#elif defined(CGLM_NEON_FP)
|
||||||
|
return glm_mat4_det_neon(mat);
|
||||||
#else
|
#else
|
||||||
/* [square] det(A) = det(At) */
|
/* [square] det(A) = det(At) */
|
||||||
float t[6];
|
float t[6];
|
||||||
|
@@ -69,12 +69,23 @@ SWIZZLE(glmm_2103) { return vextq_f32(v, v, 3); }
|
|||||||
|
|
||||||
#undef SWIZZLE
|
#undef SWIZZLE
|
||||||
|
|
||||||
|
#define glmm_xor(a, b) \
|
||||||
|
vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a), \
|
||||||
|
vreinterpretq_s32_f32(b)))
|
||||||
|
|
||||||
static inline
|
static inline
|
||||||
float32x4_t
|
float32x4_t
|
||||||
glmm_abs(float32x4_t v) {
|
glmm_abs(float32x4_t v) {
|
||||||
return vabsq_f32(v);
|
return vabsq_f32(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline
|
||||||
|
float32x4_t
|
||||||
|
glmm_vhadd(float32x4_t v) {
|
||||||
|
v = vaddq_f32(v, vrev64q_f32(v));
|
||||||
|
return vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
|
||||||
|
}
|
||||||
|
|
||||||
static inline
|
static inline
|
||||||
float
|
float
|
||||||
glmm_hadd(float32x4_t v) {
|
glmm_hadd(float32x4_t v) {
|
||||||
@@ -138,7 +149,7 @@ glmm_norm_inf(float32x4_t a) {
|
|||||||
static inline
|
static inline
|
||||||
float32x4_t
|
float32x4_t
|
||||||
glmm_div(float32x4_t a, float32x4_t b) {
|
glmm_div(float32x4_t a, float32x4_t b) {
|
||||||
#if CGLM_ARM641
|
#if CGLM_ARM64
|
||||||
return vdivq_f32(a, b);
|
return vdivq_f32(a, b);
|
||||||
#else
|
#else
|
||||||
/* 2 iterations of Newton-Raphson refinement of reciprocal */
|
/* 2 iterations of Newton-Raphson refinement of reciprocal */
|
||||||
|
@@ -101,5 +101,75 @@ glm_mat4_mulv_neon(mat4 m, vec4 v, vec4 dest) {
|
|||||||
vst1q_f32(dest, l0);
|
vst1q_f32(dest, l0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
CGLM_INLINE
|
||||||
|
float
|
||||||
|
glm_mat4_det_neon(mat4 mat) {
|
||||||
|
float32x4_t r0, r1, r2, r3, x0, x1, x2;
|
||||||
|
float32x2_t ij, op, mn, kl, nn, mm, jj, ii, gh, ef, t12, t34;
|
||||||
|
float32x4x2_t a1;
|
||||||
|
float32x4_t x3 = { 0.f, -0.f, 0.f, -0.f };
|
||||||
|
|
||||||
|
/* 127 <- 0, [square] det(A) = det(At) */
|
||||||
|
r0 = glmm_load(mat[0]); /* d c b a */
|
||||||
|
r1 = vrev64q_f32(glmm_load(mat[1])); /* g h e f */
|
||||||
|
r2 = vrev64q_f32(glmm_load(mat[2])); /* l k i j */
|
||||||
|
r3 = vrev64q_f32(glmm_load(mat[3])); /* o p m n */
|
||||||
|
|
||||||
|
gh = vget_high_f32(r1);
|
||||||
|
ef = vget_low_f32(r1);
|
||||||
|
kl = vget_high_f32(r2);
|
||||||
|
ij = vget_low_f32(r2);
|
||||||
|
op = vget_high_f32(r3);
|
||||||
|
mn = vget_low_f32(r3);
|
||||||
|
mm = vdup_lane_f32(mn, 1);
|
||||||
|
nn = vdup_lane_f32(mn, 0);
|
||||||
|
ii = vdup_lane_f32(ij, 1);
|
||||||
|
jj = vdup_lane_f32(ij, 0);
|
||||||
|
|
||||||
|
/*
|
||||||
|
t[1] = j * p - n * l;
|
||||||
|
t[2] = j * o - n * k;
|
||||||
|
t[3] = i * p - m * l;
|
||||||
|
t[4] = i * o - m * k;
|
||||||
|
*/
|
||||||
|
x0 = glmm_fnmadd(vcombine_f32(kl, kl), vcombine_f32(nn, mm),
|
||||||
|
vmulq_f32(vcombine_f32(op, op), vcombine_f32(jj, ii)));
|
||||||
|
|
||||||
|
t12 = vget_low_f32(x0);
|
||||||
|
t34 = vget_high_f32(x0);
|
||||||
|
|
||||||
|
/* 1 3 1 3 2 4 2 4 */
|
||||||
|
a1 = vuzpq_f32(x0, x0);
|
||||||
|
|
||||||
|
/*
|
||||||
|
t[0] = k * p - o * l;
|
||||||
|
t[0] = k * p - o * l;
|
||||||
|
t[5] = i * n - m * j;
|
||||||
|
t[5] = i * n - m * j;
|
||||||
|
*/
|
||||||
|
x1 = glmm_fnmadd(vcombine_f32(vdup_lane_f32(kl, 0), jj),
|
||||||
|
vcombine_f32(vdup_lane_f32(op, 1), mm),
|
||||||
|
vmulq_f32(vcombine_f32(vdup_lane_f32(op, 0), nn),
|
||||||
|
vcombine_f32(vdup_lane_f32(kl, 1), ii)));
|
||||||
|
|
||||||
|
/*
|
||||||
|
a * (f * t[0] - g * t[1] + h * t[2])
|
||||||
|
- b * (e * t[0] - g * t[3] + h * t[4])
|
||||||
|
+ c * (e * t[1] - f * t[3] + h * t[5])
|
||||||
|
- d * (e * t[2] - f * t[4] + g * t[5])
|
||||||
|
*/
|
||||||
|
x2 = glmm_fnmadd(vcombine_f32(vdup_lane_f32(gh, 1), vdup_lane_f32(ef, 0)),
|
||||||
|
vcombine_f32(vget_low_f32(a1.val[0]), t34),
|
||||||
|
vmulq_f32(vcombine_f32(ef, vdup_lane_f32(ef, 1)),
|
||||||
|
vcombine_f32(vget_low_f32(x1), t12)));
|
||||||
|
|
||||||
|
x2 = glmm_fmadd(vcombine_f32(vdup_lane_f32(gh, 0), gh),
|
||||||
|
vcombine_f32(vget_low_f32(a1.val[1]), vget_high_f32(x1)), x2);
|
||||||
|
|
||||||
|
x2 = glmm_xor(x2, x3);
|
||||||
|
|
||||||
|
return glmm_hadd(vmulq_f32(x2, r0));
|
||||||
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#endif /* cglm_mat4_neon_h */
|
#endif /* cglm_mat4_neon_h */
|
||||||
|
Reference in New Issue
Block a user