From 04eaf9c535d586deacb56f6f7f6e088bda2bfdb0 Mon Sep 17 00:00:00 2001 From: Recep Aslantas Date: Thu, 29 Apr 2021 01:12:00 +0300 Subject: [PATCH] arm, neon: neon/fma support for glm_quat_mul() --- Makefile.am | 3 +- include/cglm/quat.h | 6 ++++ include/cglm/simd/neon/quat.h | 56 +++++++++++++++++++++++++++++++++++ include/cglm/simd/sse2/quat.h | 1 - win/cglm.vcxproj | 1 + win/cglm.vcxproj.filters | 3 ++ 6 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 include/cglm/simd/neon/quat.h diff --git a/Makefile.am b/Makefile.am index 3dcf155..e85faa4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -111,7 +111,8 @@ cglm_simd_avx_HEADERS = include/cglm/simd/avx/mat4.h \ cglm_simd_neondir=$(includedir)/cglm/simd/neon cglm_simd_neon_HEADERS = include/cglm/simd/neon/mat4.h \ include/cglm/simd/neon/mat2.h \ - include/cglm/simd/neon/affine.h + include/cglm/simd/neon/affine.h \ + include/cglm/simd/neon/quat.h cglm_structdir=$(includedir)/cglm/struct cglm_struct_HEADERS = include/cglm/struct/mat4.h \ diff --git a/include/cglm/quat.h b/include/cglm/quat.h index 6d38f27..8560ec8 100644 --- a/include/cglm/quat.h +++ b/include/cglm/quat.h @@ -63,6 +63,10 @@ # include "simd/sse2/quat.h" #endif +#ifdef CGLM_NEON_FP +# include "simd/neon/quat.h" +#endif + CGLM_INLINE void glm_mat4_mulv(mat4 m, vec4 v, vec4 dest); @@ -412,6 +416,8 @@ glm_quat_mul(versor p, versor q, versor dest) { */ #if defined( __SSE__ ) || defined( __SSE2__ ) glm_quat_mul_sse2(p, q, dest); +#elif defined(CGLM_NEON_FP) + glm_quat_mul_neon(p, q, dest); #else dest[0] = p[3] * q[0] + p[0] * q[3] + p[1] * q[2] - p[2] * q[1]; dest[1] = p[3] * q[1] - p[0] * q[2] + p[1] * q[3] + p[2] * q[0]; diff --git a/include/cglm/simd/neon/quat.h b/include/cglm/simd/neon/quat.h new file mode 100644 index 0000000..f6b9e99 --- /dev/null +++ b/include/cglm/simd/neon/quat.h @@ -0,0 +1,56 @@ +/* + * Copyright (c), Recep Aslantas. 
+ *
+ * MIT License (MIT), http://opensource.org/licenses/MIT
+ * Full license can be found in the LICENSE file
+ */
+
+#ifndef cglm_quat_neon_h
+#define cglm_quat_neon_h
+#if defined(__ARM_NEON_FP) /* NOTE(review): cglm/quat.h gates the include and the call on CGLM_NEON_FP - confirm both macros always agree */
+
+#include "../../common.h"
+#include "../intrin.h"
+
+CGLM_INLINE
+void
+glm_quat_mul_neon(versor p, versor q, versor dest) {
+  /* dest = p * q; index 3 is the real part a, indices 0..2 are b, c, d (same layout as the scalar path):
+   + (a1 b2 + b1 a2 + c1 d2 − d1 c2)i
+   + (a1 c2 − b1 d2 + c1 a2 + d1 b2)j
+   + (a1 d2 + b1 c2 − c1 b2 + d1 a2)k
+     a1 a2 − b1 b2 − c1 c2 − d1 d2
+     (lane notes below assume glmm_splat_* broadcasts one lane, glmm_xor is a bitwise XOR and glmm_fmadd(a, b, c) = a * b + c; these helpers are defined outside this file - TODO confirm) */
+
+  glmm_128 xp, xq, xqr, r, x, y, z, s2, s3;
+  glmm_128 s1 = {-0.f, 0.f, 0.f, -0.f};       /* sign bits per lane: - + + -  (applied to the p[2] term) */
+  float32x2_t qh, ql;
+
+  xp  = glmm_load(p); /* 3 2 1 0 */
+  xq  = glmm_load(q);
+
+  r   = vmulq_f32(glmm_splat_w(xp), xq);      /* presumably p[3] * {q0, q1, q2, q3}: the real-part term of every lane */
+  x   = glmm_splat_x(xp);
+  y   = glmm_splat_y(xp);
+  z   = glmm_splat_z(xp);
+
+  ql  = vget_high_f32(s1);                    /* {+0, -0}; ql is only a scratch here and is overwritten with q lanes below */
+  s3  = vcombine_f32(ql, ql);                 /* sign bits per lane: + - + -  (applied to the p[0] term) */
+  s2  = vzipq_f32(s3, s3).val[0];             /* sign bits per lane: + + - -  (applied to the p[1] term) */
+
+  xqr = vrev64q_f32(xq);                      /* {q1, q0, q3, q2}: lanes swapped within each 64-bit half */
+  qh  = vget_high_f32(xqr);                   /* {q3, q2} */
+  ql  = vget_low_f32(xqr);                    /* {q1, q0} */
+
+  r = glmm_fmadd(glmm_xor(x, s3), vcombine_f32(qh, ql), r); /* q as {q3, q2, q1, q0}, weighted by sign-flipped p[0] */
+
+  r = glmm_fmadd(glmm_xor(y, s2), vcombine_f32(vget_high_f32(xq),
+                                               vget_low_f32(xq)), r); /* q as {q2, q3, q0, q1}, weighted by sign-flipped p[1] */
+
+  r = glmm_fmadd(glmm_xor(z, s1), vcombine_f32(ql, qh), r); /* q as {q1, q0, q3, q2} (same lanes as xqr), weighted by sign-flipped p[2] */
+
+  glmm_store(dest, r);
+}
+
+#endif
+#endif /* cglm_quat_neon_h */
diff --git a/include/cglm/simd/sse2/quat.h b/include/cglm/simd/sse2/quat.h
index 894d492..ae82885 100644
--- a/include/cglm/simd/sse2/quat.h
+++ b/include/cglm/simd/sse2/quat.h
@@ -41,6 +41,5 @@ glm_quat_mul_sse2(versor p, versor q, versor dest) {
   glmm_store(dest, r);
 }
 
-
 #endif
 #endif /* cglm_quat_simd_h */
diff --git a/win/cglm.vcxproj b/win/cglm.vcxproj
index b166fa7..336ff0b 100644
--- a/win/cglm.vcxproj
+++ b/win/cglm.vcxproj
@@ -93,6 +93,7 @@
+
diff --git a/win/cglm.vcxproj.filters b/win/cglm.vcxproj.filters
index d9f38bb..97c3270 100644
--- a/win/cglm.vcxproj.filters
+++ b/win/cglm.vcxproj.filters
@@ -376,5 +376,8 @@
 include\cglm\simd\neon
+
+ include\cglm\simd\neon
+
\ No newline at end of file